element.
# For black navbar, do "navbar navbar-inverse"
- 'navbar_class': "navbar",
-
+ "navbar_class": "navbar",
# Fix navigation bar to top of page?
# Values: "true" (default) or "false"
- 'navbar_fixed_top': "true",
-
+ "navbar_fixed_top": "true",
# Location of link to source.
# Options are "nav" (default), "footer" or anything else to exclude.
- 'source_link_position': "footer",
-
+ "source_link_position": "footer",
# Bootswatch (http://bootswatch.com/) theme.
#
# Options are nothing with "" (default) or the name of a valid theme
# such as "amelia" or "cosmo".
- 'bootswatch_theme': "cosmo",
-
+ "bootswatch_theme": "cosmo",
# Choose Bootstrap version.
# Values: "3" (default) or "2" (in quotes)
- 'bootstrap_version': "3",
+ "bootstrap_version": "3",
}
# Add any paths that contain custom themes here, relative to this directory.
@@ -288,7 +287,7 @@
# html_use_smartypants = True
# Custom sidebar templates, maps document names to template names.
-html_sidebars = {'**': ['localtoc.html']}
+html_sidebars = {"**": ["localtoc.html"]}
# Additional templates that should be rendered to pages, maps page names to
# template names.
@@ -321,7 +320,7 @@
# html_file_suffix = None
# Output file base name for HTML help builder.
-htmlhelp_basename = 'AutoSklearndoc'
+htmlhelp_basename = "AutoSklearndoc"
# -- Options for LaTeX output ---------------------------------------------
@@ -337,9 +336,15 @@
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
-latex_documents = [('index', 'AutoSklearn.tex', u'AutoSklearn Documentation',
- u'Matthias Feurer, Aaron Klein, Katharina Eggensperger',
- 'manual'), ]
+latex_documents = [
+ (
+ "index",
+ "AutoSklearn.tex",
+ "AutoSklearn Documentation",
+ "Matthias Feurer, Aaron Klein, Katharina Eggensperger",
+ "manual",
+ ),
+]
# The name of an image file (relative to this directory) to place at the top of
# the title page.
@@ -365,8 +370,15 @@
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
-man_pages = [('index', 'autosklearn', u'AutoSklearn Documentation',
- [u'Matthias Feurer, Aaron Klein, Katharina Eggensperger'], 1)]
+man_pages = [
+ (
+ "index",
+ "autosklearn",
+ "AutoSklearn Documentation",
+ ["Matthias Feurer, Aaron Klein, Katharina Eggensperger"],
+ 1,
+ )
+]
# If true, show URL addresses after external links.
# man_show_urls = False
@@ -376,10 +388,17 @@
# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
-texinfo_documents = [('index', 'AutoSklearn', u'AutoSklearn Documentation',
- u'Matthias Feurer, Aaron Klein, Katharina Eggensperger',
- 'AutoSklearn', 'One line description of project.',
- 'Miscellaneous'), ]
+texinfo_documents = [
+ (
+ "index",
+ "AutoSklearn",
+ "AutoSklearn Documentation",
+ "Matthias Feurer, Aaron Klein, Katharina Eggensperger",
+ "AutoSklearn",
+ "One line description of project.",
+ "Miscellaneous",
+ ),
+]
# Documents to append as an appendix to all manuals.
# texinfo_appendices = []
@@ -396,12 +415,12 @@
# Only the class’ docstring is inserted. This is the default.
# You can still document __init__ as a separate method using automethod or
# the members option to autoclass.
-#"both"
+# "both"
# Both the class’ and the __init__ method’s docstring are concatenated and
# inserted.
# "init"
# Only the __init__ method’s docstring is inserted.
-autoclass_content = 'both'
+autoclass_content = "both"
def setup(app):
diff --git a/examples/20_basic/example_classification.py b/examples/20_basic/example_classification.py
index fcb99b65ef..621dcf4f86 100644
--- a/examples/20_basic/example_classification.py
+++ b/examples/20_basic/example_classification.py
@@ -20,8 +20,9 @@
# ============
X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
-X_train, X_test, y_train, y_test = \
- sklearn.model_selection.train_test_split(X, y, random_state=1)
+X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
+ X, y, random_state=1
+)
############################################################################
# Build and fit a classifier
@@ -30,9 +31,9 @@
automl = autosklearn.classification.AutoSklearnClassifier(
time_left_for_this_task=120,
per_run_time_limit=30,
- tmp_folder='/tmp/autosklearn_classification_example_tmp',
+ tmp_folder="/tmp/autosklearn_classification_example_tmp",
)
-automl.fit(X_train, y_train, dataset_name='breast_cancer')
+automl.fit(X_train, y_train, dataset_name="breast_cancer")
############################################################################
# View the models found by auto-sklearn
@@ -52,4 +53,3 @@
predictions = automl.predict(X_test)
print("Accuracy score:", sklearn.metrics.accuracy_score(y_test, predictions))
-
diff --git a/examples/20_basic/example_multilabel_classification.py b/examples/20_basic/example_multilabel_classification.py
index 835b110ea6..bedf974868 100644
--- a/examples/20_basic/example_multilabel_classification.py
+++ b/examples/20_basic/example_multilabel_classification.py
@@ -29,8 +29,8 @@
# This is to comply with the Scikit-learn requirement:
# "Positive classes are indicated with 1 and negative classes with 0 or -1."
# More information on: https://scikit-learn.org/stable/modules/multiclass.html
-y[y == 'TRUE'] = 1
-y[y == 'FALSE'] = 0
+y[y == "TRUE"] = 1
+y[y == "FALSE"] = 0
y = y.astype(int)
# Using type of target is a good way to make sure your data
@@ -51,9 +51,9 @@
    # Below two flags are provided to speed up calculations
# Not recommended for a real implementation
initial_configurations_via_metalearning=0,
- smac_scenario_args={'runcount_limit': 1},
+ smac_scenario_args={"runcount_limit": 1},
)
-automl.fit(X_train, y_train, dataset_name='reuters')
+automl.fit(X_train, y_train, dataset_name="reuters")
############################################################################
# View the models found by auto-sklearn
diff --git a/examples/20_basic/example_multioutput_regression.py b/examples/20_basic/example_multioutput_regression.py
index a2e345fcac..cb12643adb 100644
--- a/examples/20_basic/example_multioutput_regression.py
+++ b/examples/20_basic/example_multioutput_regression.py
@@ -32,9 +32,9 @@
automl = AutoSklearnRegressor(
time_left_for_this_task=120,
per_run_time_limit=30,
- tmp_folder='/tmp/autosklearn_multioutput_regression_example_tmp',
+ tmp_folder="/tmp/autosklearn_multioutput_regression_example_tmp",
)
-automl.fit(X_train, y_train, dataset_name='synthetic')
+automl.fit(X_train, y_train, dataset_name="synthetic")
############################################################################
# View the models found by auto-sklearn
diff --git a/examples/20_basic/example_regression.py b/examples/20_basic/example_regression.py
index 6b47607db0..5ade1c2866 100644
--- a/examples/20_basic/example_regression.py
+++ b/examples/20_basic/example_regression.py
@@ -21,8 +21,9 @@
X, y = sklearn.datasets.load_diabetes(return_X_y=True)
-X_train, X_test, y_train, y_test = \
- sklearn.model_selection.train_test_split(X, y, random_state=1)
+X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
+ X, y, random_state=1
+)
###########################
# Build and fit a regressor
@@ -31,9 +32,9 @@
automl = autosklearn.regression.AutoSklearnRegressor(
time_left_for_this_task=120,
per_run_time_limit=30,
- tmp_folder='/tmp/autosklearn_regression_example_tmp',
+ tmp_folder="/tmp/autosklearn_regression_example_tmp",
)
-automl.fit(X_train, y_train, dataset_name='diabetes')
+automl.fit(X_train, y_train, dataset_name="diabetes")
############################################################################
# View the models found by auto-sklearn
@@ -69,12 +70,12 @@
# than the true value), points above the diagonal were underestimated (predicted value is lower than
# the true value).
-plt.scatter(train_predictions, y_train, label="Train samples", c='#d95f02')
-plt.scatter(test_predictions, y_test, label="Test samples", c='#7570b3')
+plt.scatter(train_predictions, y_train, label="Train samples", c="#d95f02")
+plt.scatter(test_predictions, y_test, label="Test samples", c="#7570b3")
plt.xlabel("Predicted value")
plt.ylabel("True value")
plt.legend()
-plt.plot([30, 400], [30, 400], c='k', zorder=0)
+plt.plot([30, 400], [30, 400], c="k", zorder=0)
plt.xlim([30, 400])
plt.ylim([30, 400])
plt.tight_layout()
diff --git a/examples/40_advanced/custom_metrics.py b/examples/40_advanced/custom_metrics.py
index 6b548e5718..c6ad14efdd 100644
--- a/examples/40_advanced/custom_metrics.py
+++ b/examples/40_advanced/custom_metrics.py
@@ -9,6 +9,7 @@
# Custom metrics definition
# =========================
+
def accuracy(solution, prediction):
# custom function defining accuracy
return np.mean(solution == prediction)
diff --git a/examples/40_advanced/example_calc_multiple_metrics.py b/examples/40_advanced/example_calc_multiple_metrics.py
index c7a4e78503..fa4d17cc1e 100644
--- a/examples/40_advanced/example_calc_multiple_metrics.py
+++ b/examples/40_advanced/example_calc_multiple_metrics.py
@@ -25,9 +25,9 @@ def error(solution, prediction):
def get_metric_result(cv_results):
results = pd.DataFrame.from_dict(cv_results)
- results = results[results['status'] == "Success"]
- cols = ['rank_test_scores', 'param_classifier:__choice__', 'mean_test_score']
- cols.extend([key for key in cv_results.keys() if key.startswith('metric_')])
+ results = results[results["status"] == "Success"]
+ cols = ["rank_test_scores", "param_classifier:__choice__", "mean_test_score"]
+ cols.extend([key for key in cv_results.keys() if key.startswith("metric_")])
return results[cols]
@@ -36,25 +36,26 @@ def get_metric_result(cv_results):
# ============
X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
-X_train, X_test, y_train, y_test = \
- sklearn.model_selection.train_test_split(X, y, random_state=1)
+X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
+ X, y, random_state=1
+)
############################################################################
# Build and fit a classifier
# ==========================
error_rate = autosklearn.metrics.make_scorer(
- name='custom_error',
+ name="custom_error",
score_func=error,
optimum=0,
greater_is_better=False,
needs_proba=False,
- needs_threshold=False
+ needs_threshold=False,
)
cls = autosklearn.classification.AutoSklearnClassifier(
time_left_for_this_task=120,
per_run_time_limit=30,
- scoring_functions=[balanced_accuracy, precision, recall, f1, error_rate]
+ scoring_functions=[balanced_accuracy, precision, recall, f1, error_rate],
)
cls.fit(X_train, y_train, X_test, y_test)
diff --git a/examples/40_advanced/example_debug_logging.py b/examples/40_advanced/example_debug_logging.py
index 07e2e3ed99..664ce0b461 100644
--- a/examples/40_advanced/example_debug_logging.py
+++ b/examples/40_advanced/example_debug_logging.py
@@ -28,8 +28,9 @@
# Load kr-vs-kp dataset from https://www.openml.org/d/3
X, y = data = sklearn.datasets.fetch_openml(data_id=3, return_X_y=True, as_frame=True)
-X_train, X_test, y_train, y_test = \
- sklearn.model_selection.train_test_split(X, y, random_state=1)
+X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
+ X, y, random_state=1
+)
############################################################################
@@ -40,33 +41,31 @@
# We will instead create a custom one as follows:
logging_config = {
- 'version': 1,
- 'disable_existing_loggers': True,
- 'formatters': {
- 'custom': {
+ "version": 1,
+ "disable_existing_loggers": True,
+ "formatters": {
+ "custom": {
# More format options are available in the official
            # `documentation`_
- 'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
}
},
-
# Any INFO level msg will be printed to the console
- 'handlers': {
- 'console': {
- 'level': 'INFO',
- 'formatter': 'custom',
- 'class': 'logging.StreamHandler',
- 'stream': 'ext://sys.stdout',
+ "handlers": {
+ "console": {
+ "level": "INFO",
+ "formatter": "custom",
+ "class": "logging.StreamHandler",
+ "stream": "ext://sys.stdout",
},
},
-
- 'loggers': {
- '': { # root logger
- 'level': 'DEBUG',
+ "loggers": {
+ "": { # root logger
+ "level": "DEBUG",
},
- 'Client-EnsembleBuilder': {
- 'level': 'DEBUG',
- 'handlers': ['console'],
+ "Client-EnsembleBuilder": {
+ "level": "DEBUG",
+ "handlers": ["console"],
},
},
}
@@ -80,11 +79,11 @@
    # Below two flags are provided to speed up calculations
# Not recommended for a real implementation
initial_configurations_via_metalearning=0,
- smac_scenario_args={'runcount_limit': 2},
+ smac_scenario_args={"runcount_limit": 2},
# Pass the config file we created
logging_config=logging_config,
    # *auto-sklearn* generates temporary files under tmp_folder
- tmp_folder='./tmp_folder',
+ tmp_folder="./tmp_folder",
# By default tmp_folder is deleted. We will preserve it
# for debug purposes
delete_tmp_folder_after_terminate=False,
@@ -101,5 +100,5 @@
# * tmp_folder/smac3-output
# Auto-sklearn always outputs to this log file
# tmp_folder/AutoML*.log
-for filename in pathlib.Path('./tmp_folder').glob('*'):
+for filename in pathlib.Path("./tmp_folder").glob("*"):
print(filename)
diff --git a/examples/40_advanced/example_feature_types.py b/examples/40_advanced/example_feature_types.py
index 6317eb5a46..7d22edd715 100644
--- a/examples/40_advanced/example_feature_types.py
+++ b/examples/40_advanced/example_feature_types.py
@@ -4,9 +4,10 @@
Feature Types
=============
-In *auto-sklearn* it is possible to specify the feature types of a dataset when calling the method
-:meth:`fit() ` by specifying the argument
-``feat_type``. The following example demonstrates a way it can be done.
+In *auto-sklearn* it is possible to specify the feature types of a dataset when calling
+the method :meth:`fit() ` by
+specifying the argument ``feat_type``.
+The following example demonstrates a way it can be done.
Additionally, you can provide a properly formatted pandas DataFrame, and the feature
types will be automatically inferred, as demonstrated in
@@ -26,11 +27,12 @@
# ============
# Load Australian dataset from https://www.openml.org/d/40981
bunch = data = sklearn.datasets.fetch_openml(data_id=40981, as_frame=True)
-y = bunch['target'].to_numpy()
-X = bunch['data'].to_numpy(np.float)
+y = bunch["target"].to_numpy()
+X = bunch["data"].to_numpy(np.float)
-X_train, X_test, y_train, y_test = \
- sklearn.model_selection.train_test_split(X, y, random_state=1)
+X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
+ X, y, random_state=1
+)
# Auto-sklearn can automatically recognize categorical/numerical data from a pandas
# DataFrame. This example highlights how the user can provide the feature types,
@@ -38,8 +40,7 @@
# feat_type is a list that tags each column from a DataFrame/ numpy array / list
# with the case-insensitive string categorical or numerical, accordingly.
feat_type = [
- 'Categorical' if x.name == 'category' else 'Numerical'
- for x in bunch['data'].dtypes
+ "Categorical" if x.name == "category" else "Numerical" for x in bunch["data"].dtypes
]
############################################################################
@@ -51,7 +52,7 @@
    # Below two flags are provided to speed up calculations
# Not recommended for a real implementation
initial_configurations_via_metalearning=0,
- smac_scenario_args={'runcount_limit': 1},
+ smac_scenario_args={"runcount_limit": 1},
)
cls.fit(X_train, y_train, X_test, y_test, feat_type=feat_type)
diff --git a/examples/40_advanced/example_get_pipeline_components.py b/examples/40_advanced/example_get_pipeline_components.py
index f7a97ead27..80686889ac 100644
--- a/examples/40_advanced/example_get_pipeline_components.py
+++ b/examples/40_advanced/example_get_pipeline_components.py
@@ -27,8 +27,9 @@
# ============
X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
-X_train, X_test, y_train, y_test = \
- sklearn.model_selection.train_test_split(X, y, random_state=1)
+X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
+ X, y, random_state=1
+)
############################################################################
# Build and fit the classifier
@@ -40,20 +41,16 @@
disable_evaluator_output=False,
# To simplify querying the models in the final ensemble, we
# restrict auto-sklearn to use only pca as a preprocessor
- include={
- 'feature_preprocessor': ['pca']
- },
+ include={"feature_preprocessor": ["pca"]},
)
-automl.fit(X_train, y_train, dataset_name='breast_cancer')
+automl.fit(X_train, y_train, dataset_name="breast_cancer")
############################################################################
# Predict using the model
# =======================
predictions = automl.predict(X_test)
-print("Accuracy score:{}".format(
- sklearn.metrics.accuracy_score(y_test, predictions))
-)
+print("Accuracy score:{}".format(sklearn.metrics.accuracy_score(y_test, predictions)))
############################################################################
@@ -104,7 +101,7 @@
# Let's iterate over all entries
for run_key in automl.automl_.runhistory_.data:
- print('#########')
+ print("#########")
print(run_key)
print(automl.automl_.runhistory_.data[run_key])
@@ -166,7 +163,7 @@
print("Lowest loss:", losses_and_configurations[0][0])
print(
"Best configuration:",
- automl.automl_.runhistory_.ids_config[losses_and_configurations[0][1]]
+ automl.automl_.runhistory_.ids_config[losses_and_configurations[0][1]],
)
############################################################################
@@ -188,7 +185,7 @@
# The explained variance ratio per stage
for i, (weight, pipeline) in enumerate(automl.get_models_with_weights()):
for stage_name, component in pipeline.named_steps.items():
- if 'feature_preprocessor' in stage_name:
+ if "feature_preprocessor" in stage_name:
            print(
                "The {}th pipeline has an explained variance of {}".format(
i,
@@ -196,6 +193,6 @@
# Access the sklearn object via the choice attribute
                    # We want the explained variance attribute of
# each principal component
- component.choice.preprocessor.explained_variance_ratio_
+ component.choice.preprocessor.explained_variance_ratio_,
)
)
diff --git a/examples/40_advanced/example_inspect_predictions.py b/examples/40_advanced/example_inspect_predictions.py
index 24e149a37b..cf6de2476f 100644
--- a/examples/40_advanced/example_inspect_predictions.py
+++ b/examples/40_advanced/example_inspect_predictions.py
@@ -36,9 +36,9 @@
automl = autosklearn.classification.AutoSklearnClassifier(
time_left_for_this_task=120,
per_run_time_limit=30,
- tmp_folder='/tmp/autosklearn_inspect_predictions_example_tmp',
+ tmp_folder="/tmp/autosklearn_inspect_predictions_example_tmp",
)
-automl.fit(X_train, y_train, dataset_name='Run_or_walk_information')
+automl.fit(X_train, y_train, dataset_name="Run_or_walk_information")
s = automl.score(X_train, y_train)
print(f"Train score {s}")
@@ -61,16 +61,19 @@
r = permutation_importance(automl, X_test, y_test, n_repeats=10, random_state=0)
sort_idx = r.importances_mean.argsort()[::-1]
-plt.boxplot(r.importances[sort_idx].T,
- labels=[dataset.feature_names[i] for i in sort_idx])
+plt.boxplot(
+ r.importances[sort_idx].T, labels=[dataset.feature_names[i] for i in sort_idx]
+)
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()
for i in sort_idx[::-1]:
- print(f"{dataset.feature_names[i]:10s}: {r.importances_mean[i]:.3f} +/- "
- f"{r.importances_std[i]:.3f}")
+ print(
+ f"{dataset.feature_names[i]:10s}: {r.importances_mean[i]:.3f} +/- "
+ f"{r.importances_std[i]:.3f}"
+ )
############################################################################################
# Create partial dependence (PD) and individual conditional expectation (ICE) plots - part 2
@@ -90,11 +93,14 @@
# combining ICE (thin lines) and PD (thick line)
features = [1, 2]
-plot_partial_dependence(automl, dataset.data,
- features=features,
- grid_resolution=5,
- kind="both",
- feature_names=dataset.feature_names)
+plot_partial_dependence(
+ automl,
+ dataset.data,
+ features=features,
+ grid_resolution=5,
+ kind="both",
+ feature_names=dataset.feature_names,
+)
plt.tight_layout()
plt.show()
@@ -106,9 +112,12 @@
# these features. Again, we'll look at acceleration_y and acceleration_z.
features = [[1, 2]]
-plot_partial_dependence(automl, dataset.data,
- features=features,
- grid_resolution=5,
- feature_names=dataset.feature_names)
+plot_partial_dependence(
+ automl,
+ dataset.data,
+ features=features,
+ grid_resolution=5,
+ feature_names=dataset.feature_names,
+)
plt.tight_layout()
plt.show()
diff --git a/examples/40_advanced/example_interpretable_models.py b/examples/40_advanced/example_interpretable_models.py
index a78695082c..7b551de7b8 100644
--- a/examples/40_advanced/example_interpretable_models.py
+++ b/examples/40_advanced/example_interpretable_models.py
@@ -29,7 +29,9 @@
# Show available preprocessors
# ============================
-from autosklearn.pipeline.components.feature_preprocessing import FeaturePreprocessorChoice
+from autosklearn.pipeline.components.feature_preprocessing import (
+ FeaturePreprocessorChoice,
+)
for name in FeaturePreprocessorChoice.get_components():
print(name)
@@ -39,8 +41,9 @@
# ============
X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
-X_train, X_test, y_train, y_test = \
- sklearn.model_selection.train_test_split(X, y, random_state=1)
+X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
+ X, y, random_state=1
+)
############################################################################
# Build and fit a classifier
@@ -55,18 +58,18 @@
automl = autosklearn.classification.AutoSklearnClassifier(
time_left_for_this_task=120,
per_run_time_limit=30,
- tmp_folder='/tmp/autosklearn_interpretable_models_example_tmp',
+ tmp_folder="/tmp/autosklearn_interpretable_models_example_tmp",
include={
- 'classifier': [
- 'decision_tree', 'lda', 'sgd'
+ "classifier": ["decision_tree", "lda", "sgd"],
+ "feature_preprocessor": [
+ "no_preprocessing",
+ "polynomial",
+ "select_percentile_classification",
],
- 'feature_preprocessor': [
- 'no_preprocessing', 'polynomial', 'select_percentile_classification'
- ]
},
ensemble_size=1,
)
-automl.fit(X_train, y_train, dataset_name='breast_cancer')
+automl.fit(X_train, y_train, dataset_name="breast_cancer")
############################################################################
# Print the final ensemble constructed by auto-sklearn
diff --git a/examples/40_advanced/example_metrics.py b/examples/40_advanced/example_metrics.py
index 2cf39f1553..33d0f678fd 100644
--- a/examples/40_advanced/example_metrics.py
+++ b/examples/40_advanced/example_metrics.py
@@ -51,8 +51,9 @@ def error_wk(solution, prediction, extra_argument):
# ============
X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
-X_train, X_test, y_train, y_test = \
- sklearn.model_selection.train_test_split(X, y, random_state=1)
+X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
+ X, y, random_state=1
+)
############################################################################
# Print a list of available metrics
@@ -68,7 +69,7 @@ def error_wk(solution, prediction, extra_argument):
# First example: Use predefined accuracy metric
# =============================================
-print("#"*80)
+print("#" * 80)
print("Use predefined accuracy metric")
cls = autosklearn.classification.AutoSklearnClassifier(
time_left_for_this_task=60,
@@ -87,7 +88,7 @@ def error_wk(solution, prediction, extra_argument):
# Second example: Use own accuracy metric
# =======================================
-print("#"*80)
+print("#" * 80)
print("Use self defined accuracy metric")
accuracy_scorer = autosklearn.metrics.make_scorer(
name="accu",
@@ -114,15 +115,15 @@ def error_wk(solution, prediction, extra_argument):
# Third example: Use own error metric
# ===================================
-print("#"*80)
+print("#" * 80)
print("Use self defined error metric")
error_rate = autosklearn.metrics.make_scorer(
- name='error',
+ name="error",
score_func=error,
optimum=0,
greater_is_better=False,
needs_proba=False,
- needs_threshold=False
+ needs_threshold=False,
)
cls = autosklearn.classification.AutoSklearnClassifier(
time_left_for_this_task=60,
@@ -141,7 +142,7 @@ def error_wk(solution, prediction, extra_argument):
# Fourth example: Use own accuracy metric with additional argument
# ================================================================
-print("#"*80)
+print("#" * 80)
print("Use self defined accuracy with additional argument")
accuracy_scorer = autosklearn.metrics.make_scorer(
name="accu_add",
@@ -153,10 +154,7 @@ def error_wk(solution, prediction, extra_argument):
extra_argument=None,
)
cls = autosklearn.classification.AutoSklearnClassifier(
- time_left_for_this_task=60,
- per_run_time_limit=30,
- seed=1,
- metric=accuracy_scorer
+ time_left_for_this_task=60, per_run_time_limit=30, seed=1, metric=accuracy_scorer
)
cls.fit(X_train, y_train)
@@ -169,7 +167,7 @@ def error_wk(solution, prediction, extra_argument):
# Fifth example: Use own accuracy metric with additional argument
# ===============================================================
-print("#"*80)
+print("#" * 80)
print("Use self defined error with additional argument")
error_rate = autosklearn.metrics.make_scorer(
name="error_add",
diff --git a/examples/40_advanced/example_pandas_train_test.py b/examples/40_advanced/example_pandas_train_test.py
index 910cac4c31..7e584fd8aa 100644
--- a/examples/40_advanced/example_pandas_train_test.py
+++ b/examples/40_advanced/example_pandas_train_test.py
@@ -58,22 +58,19 @@
# Targets for classification are also automatically encoded
# If using fetch_openml, data is already properly encoded, below
# is an example for user reference
-X = pd.DataFrame(
- data=X,
- columns=['A' + str(i) for i in range(1, 15)]
-)
-desired_boolean_columns = ['A1']
-desired_categorical_columns = ['A4', 'A5', 'A6', 'A8', 'A9', 'A11', 'A12']
-desired_numerical_columns = ['A2', 'A3', 'A7', 'A10', 'A13', 'A14']
+X = pd.DataFrame(data=X, columns=["A" + str(i) for i in range(1, 15)])
+desired_boolean_columns = ["A1"]
+desired_categorical_columns = ["A4", "A5", "A6", "A8", "A9", "A11", "A12"]
+desired_numerical_columns = ["A2", "A3", "A7", "A10", "A13", "A14"]
for column in X.columns:
if column in desired_boolean_columns:
- X[column] = X[column].astype('bool')
+ X[column] = X[column].astype("bool")
elif column in desired_categorical_columns:
- X[column] = X[column].astype('category')
+ X[column] = X[column].astype("category")
else:
X[column] = pd.to_numeric(X[column])
-y = pd.DataFrame(y, dtype='category')
+y = pd.DataFrame(y, dtype="category")
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
X, y, test_size=0.5, random_state=3
@@ -100,15 +97,15 @@
############################################################################
# Plot the ensemble performance
# ===================================
-# The *performance_over_time_* attribute returns a pandas dataframe, which can
+# The *performance_over_time_* attribute returns a pandas dataframe, which can
# be directly used for plotting
poT = cls.performance_over_time_
poT.plot(
- x='Timestamp',
- kind='line',
+ x="Timestamp",
+ kind="line",
legend=True,
- title='Auto-sklearn accuracy over time',
+ title="Auto-sklearn accuracy over time",
grid=True,
)
plt.show()
diff --git a/examples/40_advanced/example_resampling.py b/examples/40_advanced/example_resampling.py
index 124316a60a..aa6a272373 100644
--- a/examples/40_advanced/example_resampling.py
+++ b/examples/40_advanced/example_resampling.py
@@ -22,8 +22,9 @@
# ============
X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
-X_train, X_test, y_train, y_test = \
- sklearn.model_selection.train_test_split(X, y, random_state=1)
+X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
+ X, y, random_state=1
+)
############################################################################
# Holdout
@@ -32,15 +33,15 @@
automl = autosklearn.classification.AutoSklearnClassifier(
time_left_for_this_task=120,
per_run_time_limit=30,
- tmp_folder='/tmp/autosklearn_resampling_example_tmp',
+ tmp_folder="/tmp/autosklearn_resampling_example_tmp",
disable_evaluator_output=False,
# 'holdout' with 'train_size'=0.67 is the default argument setting
# for AutoSklearnClassifier. It is explicitly specified in this example
    # for demonstration purposes.
- resampling_strategy='holdout',
- resampling_strategy_arguments={'train_size': 0.67},
+ resampling_strategy="holdout",
+ resampling_strategy_arguments={"train_size": 0.67},
)
-automl.fit(X_train, y_train, dataset_name='breast_cancer')
+automl.fit(X_train, y_train, dataset_name="breast_cancer")
############################################################################
# Get the Score of the final ensemble
@@ -57,18 +58,18 @@
automl = autosklearn.classification.AutoSklearnClassifier(
time_left_for_this_task=120,
per_run_time_limit=30,
- tmp_folder='/tmp/autosklearn_resampling_example_tmp',
+ tmp_folder="/tmp/autosklearn_resampling_example_tmp",
disable_evaluator_output=False,
- resampling_strategy='cv',
- resampling_strategy_arguments={'folds': 5},
+ resampling_strategy="cv",
+ resampling_strategy_arguments={"folds": 5},
)
-automl.fit(X_train, y_train, dataset_name='breast_cancer')
+automl.fit(X_train, y_train, dataset_name="breast_cancer")
# One can use models trained during cross-validation directly to predict
# for unseen data. For this, all k models trained during k-fold
# cross-validation are considered as a single soft-voting ensemble inside
# the ensemble constructed with ensemble selection.
-print('Before re-fit')
+print("Before re-fit")
predictions = automl.predict(X_test)
print("Accuracy score CV", sklearn.metrics.accuracy_score(y_test, predictions))
@@ -78,7 +79,7 @@
# During fit(), models are fit on individual cross-validation folds. To use
# all available data, we call refit() which trains all models in the
# final ensemble on the whole dataset.
-print('After re-fit')
+print("After re-fit")
automl.refit(X_train.copy(), y_train.copy())
predictions = automl.predict(X_test)
print("Accuracy score CV", sklearn.metrics.accuracy_score(y_test, predictions))
@@ -106,11 +107,11 @@
automl = autosklearn.classification.AutoSklearnClassifier(
time_left_for_this_task=120,
per_run_time_limit=30,
- tmp_folder='/tmp/autosklearn_resampling_example_tmp',
+ tmp_folder="/tmp/autosklearn_resampling_example_tmp",
disable_evaluator_output=False,
resampling_strategy=resampling_strategy,
)
-automl.fit(X_train, y_train, dataset_name='breast_cancer')
+automl.fit(X_train, y_train, dataset_name="breast_cancer")
print(automl.sprint_statistics())
@@ -126,4 +127,6 @@
# Obviously, this score is pretty bad as we "destroyed" the dataset by
# splitting it on the first feature.
predictions = automl.predict(X_test)
-print("Accuracy score custom split", sklearn.metrics.accuracy_score(y_test, predictions))
+print(
+ "Accuracy score custom split", sklearn.metrics.accuracy_score(y_test, predictions)
+)
diff --git a/examples/40_advanced/example_single_configuration.py b/examples/40_advanced/example_single_configuration.py
index 3d230f4ab0..d216caef7c 100644
--- a/examples/40_advanced/example_single_configuration.py
+++ b/examples/40_advanced/example_single_configuration.py
@@ -46,9 +46,7 @@
# We will limit the configuration space only to
# have RandomForest as a valid model. We recommend enabling all
    # possible models to get better performance.
- include={
- 'classifier': ['random_forest']
- },
+ include={"classifier": ["random_forest"]},
delete_tmp_folder_after_terminate=False,
)
@@ -60,17 +58,21 @@
# min_samples_split in the Random Forest. We recommend looking into
# how the ConfigSpace package works here:
# https://automl.github.io/ConfigSpace/master/
-cs = cls.get_configuration_space(X, y, dataset_name='kr-vs-kp')
+cs = cls.get_configuration_space(X, y, dataset_name="kr-vs-kp")
config = cs.sample_configuration()
-config._values['classifier:random_forest:min_samples_split'] = 11
+config._values["classifier:random_forest:min_samples_split"] = 11
# Make sure that your changed configuration complies with the configuration space
config.is_valid_configuration()
-pipeline, run_info, run_value = cls.fit_pipeline(X=X_train, y=y_train,
- dataset_name='kr-vs-kp',
- config=config,
- X_test=X_test, y_test=y_test)
+pipeline, run_info, run_value = cls.fit_pipeline(
+ X=X_train,
+ y=y_train,
+ dataset_name="kr-vs-kp",
+ config=config,
+ X_test=X_test,
+ y_test=y_test,
+)
# This object complies with Scikit-Learn Pipeline API.
# https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
@@ -84,9 +86,9 @@
# We can make sure that our pipeline configuration was honored as follows
print("Passed Configuration:", pipeline.config)
-print("Random Forest:", pipeline.named_steps['classifier'].choice.estimator)
+print("Random Forest:", pipeline.named_steps["classifier"].choice.estimator)
# We can also search for new configurations using the fit() method
# Any configurations found by Auto-Sklearn -- even the ones created using
# fit_pipeline() are stored to disk and can be used for Ensemble Selection
-cs = cls.fit(X, y, dataset_name='kr-vs-kp')
+cs = cls.fit(X, y, dataset_name="kr-vs-kp")
diff --git a/examples/40_advanced/example_text_preprocessing.py b/examples/40_advanced/example_text_preprocessing.py
index f60188781b..76c2d91cfc 100644
--- a/examples/40_advanced/example_text_preprocessing.py
+++ b/examples/40_advanced/example_text_preprocessing.py
@@ -25,20 +25,28 @@
print(f"{X.info()}\n")
# manually convert these to string columns
-X = X.astype({'name': 'string', 'ticket': 'string', 'cabin': 'string', 'boat': 'string',
- 'home.dest': 'string'})
+X = X.astype(
+ {
+ "name": "string",
+ "ticket": "string",
+ "cabin": "string",
+ "boat": "string",
+ "home.dest": "string",
+ }
+)
# now *auto-sklearn* handles the string columns with its text feature preprocessing pipeline
-X_train, X_test, y_train, y_test = \
- sklearn.model_selection.train_test_split(X, y, random_state=1)
+X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
+ X, y, random_state=1
+)
cls = autosklearn.classification.AutoSklearnClassifier(
time_left_for_this_task=30,
    # Below two flags are provided to speed up calculations
# Not recommended for a real implementation
initial_configurations_via_metalearning=0,
- smac_scenario_args={'runcount_limit': 1},
+ smac_scenario_args={"runcount_limit": 1},
)
cls.fit(X_train, y_train, X_test, y_test)
@@ -48,20 +56,24 @@
X, y = sklearn.datasets.fetch_openml(data_id=40945, return_X_y=True, as_frame=True)
-X = X.select_dtypes(exclude=['object'])
+X = X.select_dtypes(exclude=["object"])
-X_train, X_test, y_train, y_test = \
- sklearn.model_selection.train_test_split(X, y, random_state=1)
+X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
+ X, y, random_state=1
+)
cls = autosklearn.classification.AutoSklearnClassifier(
time_left_for_this_task=30,
    # Below two flags are provided to speed up calculations
# Not recommended for a real implementation
initial_configurations_via_metalearning=0,
- smac_scenario_args={'runcount_limit': 1},
+ smac_scenario_args={"runcount_limit": 1},
)
cls.fit(X_train, y_train, X_test, y_test)
predictions = cls.predict(X_test)
-print("Accuracy score without text preprocessing", sklearn.metrics.accuracy_score(y_test, predictions))
+print(
+ "Accuracy score without text preprocessing",
+ sklearn.metrics.accuracy_score(y_test, predictions),
+)
diff --git a/examples/60_search/example_parallel_manual_spawning_cli.py b/examples/60_search/example_parallel_manual_spawning_cli.py
index 41200cd78c..fa2bff375b 100644
--- a/examples/60_search/example_parallel_manual_spawning_cli.py
+++ b/examples/60_search/example_parallel_manual_spawning_cli.py
@@ -68,7 +68,7 @@
from autosklearn.classification import AutoSklearnClassifier
from autosklearn.constants import MULTICLASS_CLASSIFICATION
-tmp_folder = '/tmp/autosklearn_parallel_3_example_tmp'
+tmp_folder = "/tmp/autosklearn_parallel_3_example_tmp"
worker_processes = []
@@ -83,7 +83,7 @@
# location. This filename is also given to the workers so they can find all
# relevant information to connect to the scheduler.
-scheduler_file_name = 'scheduler-file.json'
+scheduler_file_name = "scheduler-file.json"
############################################################################
@@ -99,12 +99,16 @@
# We will now execute this bash command from within Python to have a
# self-contained example:
+
def cli_start_scheduler(scheduler_file_name):
- command = (
- f"dask-scheduler --scheduler-file {scheduler_file_name} --idle-timeout 10"
+ command = f"dask-scheduler --scheduler-file {scheduler_file_name} --idle-timeout 10"
+ proc = subprocess.run(
+ command,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.STDOUT,
+ shell=True,
+ check=True,
)
- proc = subprocess.run(command, stdout=subprocess.PIPE,
- stderr=subprocess.STDOUT, shell=True, check=True)
while proc.returncode is None:
time.sleep(1)
@@ -112,7 +116,7 @@ def cli_start_scheduler(scheduler_file_name):
if __name__ == "__main__":
process_python_worker = multiprocessing.Process(
target=cli_start_scheduler,
- args=(scheduler_file_name, ),
+ args=(scheduler_file_name,),
)
process_python_worker.start()
worker_processes.append(process_python_worker)
@@ -141,22 +145,25 @@ def cli_start_scheduler(scheduler_file_name):
# We disable dask's memory management by passing ``--memory-limit`` as
# Auto-sklearn does the memory management itself.
+
def cli_start_worker(scheduler_file_name):
command = (
"DASK_DISTRIBUTED__WORKER__DAEMON=False "
"dask-worker --nthreads 1 --lifetime 35 --memory-limit 0 "
f"--scheduler-file {scheduler_file_name}"
)
- proc = subprocess.run(command, stdout=subprocess.PIPE,
- stderr=subprocess.STDOUT, shell=True)
+ proc = subprocess.run(
+ command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True
+ )
while proc.returncode is None:
time.sleep(1)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
for _ in range(2):
process_cli_worker = multiprocessing.Process(
target=cli_start_worker,
- args=(scheduler_file_name, ),
+ args=(scheduler_file_name,),
)
process_cli_worker.start()
worker_processes.append(process_cli_worker)
@@ -178,8 +185,9 @@ def cli_start_worker(scheduler_file_name):
# ~~~~~~~~~~~~~~~~~~
if __name__ == "__main__":
X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
- X_train, X_test, y_train, y_test = \
- sklearn.model_selection.train_test_split(X, y, random_state=1)
+ X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
+ X, y, random_state=1
+ )
automl = AutoSklearnClassifier(
delete_tmp_folder_after_terminate=False,
@@ -198,7 +206,7 @@ def cli_start_worker(scheduler_file_name):
automl.fit_ensemble(
y_train,
task=MULTICLASS_CLASSIFICATION,
- dataset_name='digits',
+ dataset_name="digits",
ensemble_size=20,
ensemble_nbest=50,
)
@@ -215,7 +223,7 @@ def cli_start_worker(scheduler_file_name):
# This is only necessary if the workers are started from within this python
# script. In a real application one would start them directly from the command
# line.
-if __name__ == '__main__':
+if __name__ == "__main__":
process_python_worker.join()
for process in worker_processes:
process.join()
diff --git a/examples/60_search/example_parallel_manual_spawning_python.py b/examples/60_search/example_parallel_manual_spawning_python.py
index ed723598a9..75c5bcee30 100644
--- a/examples/60_search/example_parallel_manual_spawning_python.py
+++ b/examples/60_search/example_parallel_manual_spawning_python.py
@@ -58,7 +58,7 @@
from autosklearn.classification import AutoSklearnClassifier
from autosklearn.constants import MULTICLASS_CLASSIFICATION
-tmp_folder = '/tmp/autosklearn_parallel_2_example_tmp'
+tmp_folder = "/tmp/autosklearn_parallel_2_example_tmp"
############################################################################
@@ -73,8 +73,9 @@
# https://docs.dask.org/en/latest/setup/python-advanced.html for further
# information.
+
def start_python_worker(scheduler_address):
- dask.config.set({'distributed.worker.daemon': False})
+ dask.config.set({"distributed.worker.daemon": False})
async def do_work():
async with dask.distributed.Nanny(
@@ -97,14 +98,17 @@ async def do_work():
# To use auto-sklearn in parallel we must guard the code with
# ``if __name__ == '__main__'``. We then start a dask cluster as a context,
# which means that it is automatically stopped once all computation is done.
-if __name__ == '__main__':
+if __name__ == "__main__":
X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
- X_train, X_test, y_train, y_test = \
- sklearn.model_selection.train_test_split(X, y, random_state=1)
+ X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
+ X, y, random_state=1
+ )
# 1. Create a dask scheduler (LocalCluster)
with dask.distributed.LocalCluster(
- n_workers=0, processes=True, threads_per_worker=1,
+ n_workers=0,
+ processes=True,
+ threads_per_worker=1,
) as cluster:
# 2. Start the workers
@@ -114,7 +118,7 @@ async def do_work():
for _ in range(2):
process_python_worker = multiprocessing.Process(
target=start_python_worker,
- args=(cluster.scheduler_address, ),
+ args=(cluster.scheduler_address,),
)
process_python_worker.start()
worker_processes.append(process_python_worker)
@@ -141,7 +145,7 @@ async def do_work():
automl.fit_ensemble(
y_train,
task=MULTICLASS_CLASSIFICATION,
- dataset_name='digits',
+ dataset_name="digits",
ensemble_size=20,
ensemble_nbest=50,
)
diff --git a/examples/60_search/example_parallel_n_jobs.py b/examples/60_search/example_parallel_n_jobs.py
index b7265ce3fa..1cb5014ca8 100644
--- a/examples/60_search/example_parallel_n_jobs.py
+++ b/examples/60_search/example_parallel_n_jobs.py
@@ -27,26 +27,27 @@
# Data Loading
# ============
X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
-X_train, X_test, y_train, y_test = \
- sklearn.model_selection.train_test_split(X, y, random_state=1)
+X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
+ X, y, random_state=1
+)
############################################################################
# Build and fit a classifier
# ==========================
#
# To use ``n_jobs_`` we must guard the code
-if __name__ == '__main__':
+if __name__ == "__main__":
automl = autosklearn.classification.AutoSklearnClassifier(
time_left_for_this_task=120,
per_run_time_limit=30,
- tmp_folder='/tmp/autosklearn_parallel_1_example_tmp',
+ tmp_folder="/tmp/autosklearn_parallel_1_example_tmp",
n_jobs=4,
# Each one of the 4 jobs is allocated 3GB
memory_limit=3072,
seed=5,
)
- automl.fit(X_train, y_train, dataset_name='breast_cancer')
+ automl.fit(X_train, y_train, dataset_name="breast_cancer")
# Print statistics about the auto-sklearn run such as number of
    # iterations, number of models that failed with a timeout.
diff --git a/examples/60_search/example_random_search.py b/examples/60_search/example_random_search.py
index 2c9cc76695..520c8c18b0 100644
--- a/examples/60_search/example_random_search.py
+++ b/examples/60_search/example_random_search.py
@@ -29,8 +29,9 @@
# ============
X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
-X_train, X_test, y_train, y_test = \
- sklearn.model_selection.train_test_split(X, y, random_state=1)
+X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
+ X, y, random_state=1
+)
############################################################################
@@ -48,8 +49,10 @@ def get_roar_object_callback(
"""Random online adaptive racing."""
if n_jobs > 1 or (dask_client and len(dask_client.nthreads()) > 1):
- raise ValueError("Please make sure to guard the code invoking Auto-sklearn by "
- "`if __name__ == '__main__'` and remove this exception.")
+ raise ValueError(
+ "Please make sure to guard the code invoking Auto-sklearn by "
+ "`if __name__ == '__main__'` and remove this exception."
+ )
scenario = Scenario(scenario_dict)
return ROAR(
@@ -66,15 +69,15 @@ def get_roar_object_callback(
automl = autosklearn.classification.AutoSklearnClassifier(
time_left_for_this_task=60,
per_run_time_limit=15,
- tmp_folder='/tmp/autosklearn_random_search_example_tmp',
+ tmp_folder="/tmp/autosklearn_random_search_example_tmp",
initial_configurations_via_metalearning=0,
# The callback to get the SMAC object
get_smac_object_callback=get_roar_object_callback,
)
-automl.fit(X_train, y_train, dataset_name='breast_cancer')
+automl.fit(X_train, y_train, dataset_name="breast_cancer")
-print('#' * 80)
-print('Results for ROAR.')
+print("#" * 80)
+print("Results for ROAR.")
# Print the final ensemble constructed by auto-sklearn via ROAR.
pprint(automl.show_models(), indent=4)
predictions = automl.predict(X_test)
@@ -88,22 +91,18 @@ def get_roar_object_callback(
# Fit a classifier using Random Search
# ====================================
def get_random_search_object_callback(
- scenario_dict,
- seed,
- ta,
- ta_kwargs,
- metalearning_configurations,
- n_jobs,
- dask_client
+ scenario_dict, seed, ta, ta_kwargs, metalearning_configurations, n_jobs, dask_client
):
- """ Random search """
+ """Random search"""
if n_jobs > 1 or (dask_client and len(dask_client.nthreads()) > 1):
- raise ValueError("Please make sure to guard the code invoking Auto-sklearn by "
- "`if __name__ == '__main__'` and remove this exception.")
+ raise ValueError(
+ "Please make sure to guard the code invoking Auto-sklearn by "
+ "`if __name__ == '__main__'` and remove this exception."
+ )
- scenario_dict['minR'] = len(scenario_dict['instances'])
- scenario_dict['initial_incumbent'] = 'RANDOM'
+ scenario_dict["minR"] = len(scenario_dict["instances"])
+ scenario_dict["initial_incumbent"] = "RANDOM"
scenario = Scenario(scenario_dict)
return ROAR(
scenario=scenario,
@@ -119,15 +118,15 @@ def get_random_search_object_callback(
automl = autosklearn.classification.AutoSklearnClassifier(
time_left_for_this_task=60,
per_run_time_limit=15,
- tmp_folder='/tmp/autosklearn_random_search_example_tmp',
+ tmp_folder="/tmp/autosklearn_random_search_example_tmp",
initial_configurations_via_metalearning=0,
# Passing the callback to get the SMAC object
get_smac_object_callback=get_random_search_object_callback,
)
-automl.fit(X_train, y_train, dataset_name='breast_cancer')
+automl.fit(X_train, y_train, dataset_name="breast_cancer")
-print('#' * 80)
-print('Results for random search.')
+print("#" * 80)
+print("Results for random search.")
# Print the final ensemble constructed by auto-sklearn via random search.
pprint(automl.show_models(), indent=4)
diff --git a/examples/60_search/example_sequential.py b/examples/60_search/example_sequential.py
index fad088396d..1ff63649da 100644
--- a/examples/60_search/example_sequential.py
+++ b/examples/60_search/example_sequential.py
@@ -22,8 +22,9 @@
# ======================================
X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
-X_train, X_test, y_train, y_test = \
- sklearn.model_selection.train_test_split(X, y, random_state=1)
+X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
+ X, y, random_state=1
+)
############################################################################
# Build and fit the classifier
@@ -32,14 +33,14 @@
automl = autosklearn.classification.AutoSklearnClassifier(
time_left_for_this_task=120,
per_run_time_limit=30,
- tmp_folder='/tmp/autosklearn_sequential_example_tmp',
+ tmp_folder="/tmp/autosklearn_sequential_example_tmp",
# Do not construct ensembles in parallel to avoid using more than one
# core at a time. The ensemble will be constructed after auto-sklearn
# finished fitting all machine learning models.
ensemble_size=0,
delete_tmp_folder_after_terminate=False,
)
-automl.fit(X_train, y_train, dataset_name='breast_cancer')
+automl.fit(X_train, y_train, dataset_name="breast_cancer")
# This call to fit_ensemble uses all models trained in the previous call
# to fit to build an ensemble which can be used with automl.predict()
diff --git a/examples/60_search/example_successive_halving.py b/examples/60_search/example_successive_halving.py
index fdb29da6e0..e57be7f157 100644
--- a/examples/60_search/example_successive_halving.py
+++ b/examples/60_search/example_successive_halving.py
@@ -27,6 +27,7 @@
# Define a callback that instantiates SuccessiveHalving
# =====================================================
+
def get_smac_object_callback(budget_type):
def get_smac_object(
scenario_dict,
@@ -43,8 +44,10 @@ def get_smac_object(
from smac.scenario.scenario import Scenario
if n_jobs > 1 or (dask_client and len(dask_client.nthreads()) > 1):
- raise ValueError("Please make sure to guard the code invoking Auto-sklearn by "
- "`if __name__ == '__main__'` and remove this exception.")
+ raise ValueError(
+ "Please make sure to guard the code invoking Auto-sklearn by "
+ "`if __name__ == '__main__'` and remove this exception."
+ )
scenario = Scenario(scenario_dict)
if len(metalearning_configurations) > 0:
@@ -54,7 +57,7 @@ def get_smac_object(
initial_configurations = None
rh2EPM = RunHistory2EPM4LogCost
- ta_kwargs['budget_type'] = budget_type
+ ta_kwargs["budget_type"] = budget_type
return SMAC4AC(
scenario=scenario,
@@ -66,14 +69,15 @@ def get_smac_object(
run_id=seed,
intensifier=SuccessiveHalving,
intensifier_kwargs={
- 'initial_budget': 10.0,
- 'max_budget': 100,
- 'eta': 2,
- 'min_chall': 1
+ "initial_budget": 10.0,
+ "max_budget": 100,
+ "eta": 2,
+ "min_chall": 1,
},
n_jobs=n_jobs,
dask_client=dask_client,
)
+
return get_smac_object
@@ -82,8 +86,9 @@ def get_smac_object(
# ============
X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
-X_train, X_test, y_train, y_test = \
- sklearn.model_selection.train_test_split(X, y, random_state=1, shuffle=True)
+X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
+ X, y, random_state=1, shuffle=True
+)
############################################################################
# Build and fit a classifier
@@ -92,23 +97,26 @@ def get_smac_object(
automl = autosklearn.classification.AutoSklearnClassifier(
time_left_for_this_task=40,
per_run_time_limit=10,
- tmp_folder='/tmp/autosklearn_sh_example_tmp',
+ tmp_folder="/tmp/autosklearn_sh_example_tmp",
disable_evaluator_output=False,
# 'holdout' with 'train_size'=0.67 is the default argument setting
# for AutoSklearnClassifier. It is explicitly specified in this example
    # for demonstration purposes.
- resampling_strategy='holdout',
- resampling_strategy_arguments={'train_size': 0.67},
+ resampling_strategy="holdout",
+ resampling_strategy_arguments={"train_size": 0.67},
include={
- 'classifier': [
- 'extra_trees', 'gradient_boosting', 'random_forest',
- 'sgd', 'passive_aggressive'
+ "classifier": [
+ "extra_trees",
+ "gradient_boosting",
+ "random_forest",
+ "sgd",
+ "passive_aggressive",
],
- 'feature_preprocessor': ['no_preprocessing']
+ "feature_preprocessor": ["no_preprocessing"],
},
- get_smac_object_callback=get_smac_object_callback('iterations'),
+ get_smac_object_callback=get_smac_object_callback("iterations"),
)
-automl.fit(X_train, y_train, dataset_name='breast_cancer')
+automl.fit(X_train, y_train, dataset_name="breast_cancer")
pprint(automl.show_models(), indent=4)
predictions = automl.predict(X_test)
@@ -122,25 +130,29 @@ def get_smac_object(
# ========================================================
X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
-X_train, X_test, y_train, y_test = \
- sklearn.model_selection.train_test_split(X, y, random_state=1, shuffle=True)
+X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
+ X, y, random_state=1, shuffle=True
+)
automl = autosklearn.classification.AutoSklearnClassifier(
time_left_for_this_task=40,
per_run_time_limit=10,
- tmp_folder='/tmp/autosklearn_sh_example_tmp_01',
+ tmp_folder="/tmp/autosklearn_sh_example_tmp_01",
disable_evaluator_output=False,
- resampling_strategy='cv',
+ resampling_strategy="cv",
include={
- 'classifier': [
- 'extra_trees', 'gradient_boosting', 'random_forest',
- 'sgd', 'passive_aggressive'
+ "classifier": [
+ "extra_trees",
+ "gradient_boosting",
+ "random_forest",
+ "sgd",
+ "passive_aggressive",
],
- 'feature_preprocessor': ['no_preprocessing']
+ "feature_preprocessor": ["no_preprocessing"],
},
- get_smac_object_callback=get_smac_object_callback('iterations'),
+ get_smac_object_callback=get_smac_object_callback("iterations"),
)
-automl.fit(X_train, y_train, dataset_name='breast_cancer')
+automl.fit(X_train, y_train, dataset_name="breast_cancer")
# Print the final ensemble constructed by auto-sklearn.
pprint(automl.show_models(), indent=4)
@@ -156,25 +168,29 @@ def get_smac_object(
# =============================================================
X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
-X_train, X_test, y_train, y_test = \
- sklearn.model_selection.train_test_split(X, y, random_state=1, shuffle=True)
+X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
+ X, y, random_state=1, shuffle=True
+)
automl = autosklearn.classification.AutoSklearnClassifier(
time_left_for_this_task=40,
per_run_time_limit=10,
- tmp_folder='/tmp/autosklearn_sh_example_tmp_cv_02',
+ tmp_folder="/tmp/autosklearn_sh_example_tmp_cv_02",
disable_evaluator_output=False,
- resampling_strategy='cv-iterative-fit',
+ resampling_strategy="cv-iterative-fit",
include={
- 'classifier': [
- 'extra_trees', 'gradient_boosting', 'random_forest',
- 'sgd', 'passive_aggressive'
+ "classifier": [
+ "extra_trees",
+ "gradient_boosting",
+ "random_forest",
+ "sgd",
+ "passive_aggressive",
],
- 'feature_preprocessor': ['no_preprocessing']
+ "feature_preprocessor": ["no_preprocessing"],
},
- get_smac_object_callback=get_smac_object_callback('iterations'),
+ get_smac_object_callback=get_smac_object_callback("iterations"),
)
-automl.fit(X_train, y_train, dataset_name='breast_cancer')
+automl.fit(X_train, y_train, dataset_name="breast_cancer")
# Print the final ensemble constructed by auto-sklearn.
pprint(automl.show_models(), indent=4)
@@ -190,22 +206,23 @@ def get_smac_object(
# ===============================================================
X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
-X_train, X_test, y_train, y_test = \
- sklearn.model_selection.train_test_split(X, y, random_state=1, shuffle=True)
+X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
+ X, y, random_state=1, shuffle=True
+)
automl = autosklearn.classification.AutoSklearnClassifier(
time_left_for_this_task=40,
per_run_time_limit=10,
- tmp_folder='/tmp/autosklearn_sh_example_tmp_03',
+ tmp_folder="/tmp/autosklearn_sh_example_tmp_03",
disable_evaluator_output=False,
# 'holdout' with 'train_size'=0.67 is the default argument setting
# for AutoSklearnClassifier. It is explicitly specified in this example
    # for demonstration purposes.
- resampling_strategy='holdout',
- resampling_strategy_arguments={'train_size': 0.67},
- get_smac_object_callback=get_smac_object_callback('subsample'),
+ resampling_strategy="holdout",
+ resampling_strategy_arguments={"train_size": 0.67},
+ get_smac_object_callback=get_smac_object_callback("subsample"),
)
-automl.fit(X_train, y_train, dataset_name='breast_cancer')
+automl.fit(X_train, y_train, dataset_name="breast_cancer")
# Print the final ensemble constructed by auto-sklearn.
pprint(automl.show_models(), indent=4)
@@ -222,27 +239,26 @@ def get_smac_object(
# subsamples otherwise
X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
-X_train, X_test, y_train, y_test = \
- sklearn.model_selection.train_test_split(X, y, random_state=1, shuffle=True)
+X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
+ X, y, random_state=1, shuffle=True
+)
automl = autosklearn.classification.AutoSklearnClassifier(
time_left_for_this_task=40,
per_run_time_limit=10,
- tmp_folder='/tmp/autosklearn_sh_example_tmp_04',
+ tmp_folder="/tmp/autosklearn_sh_example_tmp_04",
disable_evaluator_output=False,
# 'holdout' with 'train_size'=0.67 is the default argument setting
# for AutoSklearnClassifier. It is explicitly specified in this example
    # for demonstration purposes.
- resampling_strategy='holdout',
- resampling_strategy_arguments={'train_size': 0.67},
+ resampling_strategy="holdout",
+ resampling_strategy_arguments={"train_size": 0.67},
include={
- 'classifier': [
- 'extra_trees', 'gradient_boosting', 'random_forest', 'sgd'
- ]
+ "classifier": ["extra_trees", "gradient_boosting", "random_forest", "sgd"]
},
- get_smac_object_callback=get_smac_object_callback('mixed'),
+ get_smac_object_callback=get_smac_object_callback("mixed"),
)
-automl.fit(X_train, y_train, dataset_name='breast_cancer')
+automl.fit(X_train, y_train, dataset_name="breast_cancer")
# Print the final ensemble constructed by auto-sklearn.
pprint(automl.show_models(), indent=4)
diff --git a/examples/80_extending/example_extending_classification.py b/examples/80_extending/example_extending_classification.py
index b6132f4c18..b5112c022b 100644
--- a/examples/80_extending/example_extending_classification.py
+++ b/examples/80_extending/example_extending_classification.py
@@ -9,16 +9,22 @@
from pprint import pprint
from ConfigSpace.configuration_space import ConfigurationSpace
-from ConfigSpace.hyperparameters import CategoricalHyperparameter, \
- UniformIntegerHyperparameter, UniformFloatHyperparameter
+from ConfigSpace.hyperparameters import (
+ CategoricalHyperparameter,
+ UniformIntegerHyperparameter,
+ UniformFloatHyperparameter,
+)
import sklearn.metrics
import autosklearn.classification
import autosklearn.pipeline.components.classification
-from autosklearn.pipeline.components.base \
- import AutoSklearnClassificationAlgorithm
-from autosklearn.pipeline.constants import DENSE, SIGNED_DATA, UNSIGNED_DATA, \
- PREDICTIONS
+from autosklearn.pipeline.components.base import AutoSklearnClassificationAlgorithm
+from autosklearn.pipeline.constants import (
+ DENSE,
+ SIGNED_DATA,
+ UNSIGNED_DATA,
+ PREDICTIONS,
+)
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
@@ -28,8 +34,8 @@
# Create MLP classifier component for auto-sklearn
# ================================================
-class MLPClassifier(AutoSklearnClassificationAlgorithm):
+class MLPClassifier(AutoSklearnClassificationAlgorithm):
def __init__(
self,
hidden_layer_depth,
@@ -52,15 +58,18 @@ def fit(self, X, y):
self.alpha = float(self.alpha)
from sklearn.neural_network import MLPClassifier
+
hidden_layer_sizes = tuple(
self.num_nodes_per_layer for i in range(self.hidden_layer_depth)
)
- self.estimator = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes,
- activation=self.activation,
- alpha=self.alpha,
- solver=self.solver,
- random_state=self.random_state)
+ self.estimator = MLPClassifier(
+ hidden_layer_sizes=hidden_layer_sizes,
+ activation=self.activation,
+ alpha=self.alpha,
+ solver=self.solver,
+ random_state=self.random_state,
+ )
self.estimator.fit(X, y)
return self
@@ -77,17 +86,17 @@ def predict_proba(self, X):
@staticmethod
def get_properties(dataset_properties=None):
return {
- 'shortname': 'MLP Classifier',
- 'name': 'MLP CLassifier',
- 'handles_regression': False,
- 'handles_classification': True,
- 'handles_multiclass': True,
- 'handles_multilabel': False,
- 'handles_multioutput': False,
- 'is_deterministic': False,
+ "shortname": "MLP Classifier",
+ "name": "MLP CLassifier",
+ "handles_regression": False,
+ "handles_classification": True,
+ "handles_multiclass": True,
+ "handles_multilabel": False,
+ "handles_multioutput": False,
+ "is_deterministic": False,
# Both input and output must be tuple(iterable)
- 'input': [DENSE, SIGNED_DATA, UNSIGNED_DATA],
- 'output': [PREDICTIONS]
+ "input": [DENSE, SIGNED_DATA, UNSIGNED_DATA],
+ "output": [PREDICTIONS],
}
@staticmethod
@@ -100,18 +109,25 @@ def get_hyperparameter_search_space(dataset_properties=None):
name="num_nodes_per_layer", lower=16, upper=216, default_value=32
)
activation = CategoricalHyperparameter(
- name="activation", choices=['identity', 'logistic', 'tanh', 'relu'],
- default_value='relu'
+ name="activation",
+ choices=["identity", "logistic", "tanh", "relu"],
+ default_value="relu",
)
alpha = UniformFloatHyperparameter(
name="alpha", lower=0.0001, upper=1.0, default_value=0.0001
)
solver = CategoricalHyperparameter(
- name="solver", choices=['lbfgs', 'sgd', 'adam'], default_value='adam'
+ name="solver", choices=["lbfgs", "sgd", "adam"], default_value="adam"
+ )
+ cs.add_hyperparameters(
+ [
+ hidden_layer_depth,
+ num_nodes_per_layer,
+ activation,
+ alpha,
+ solver,
+ ]
)
- cs.add_hyperparameters([
- hidden_layer_depth, num_nodes_per_layer, activation, alpha, solver,
- ])
return cs
@@ -134,13 +150,11 @@ def get_hyperparameter_search_space(dataset_properties=None):
clf = autosklearn.classification.AutoSklearnClassifier(
time_left_for_this_task=30,
per_run_time_limit=10,
- include={
- 'classifier': ['MLPClassifier']
- },
+ include={"classifier": ["MLPClassifier"]},
# Bellow two flags are provided to speed up calculations
# Not recommended for a real implementation
initial_configurations_via_metalearning=0,
- smac_scenario_args={'runcount_limit': 5},
+ smac_scenario_args={"runcount_limit": 5},
)
clf.fit(X_train, y_train)
diff --git a/examples/80_extending/example_extending_data_preprocessor.py b/examples/80_extending/example_extending_data_preprocessor.py
index 7fdd72e971..aa5c443255 100644
--- a/examples/80_extending/example_extending_data_preprocessor.py
+++ b/examples/80_extending/example_extending_data_preprocessor.py
@@ -21,9 +21,8 @@
# Create NoPreprocessing component for auto-sklearn
# =================================================
class NoPreprocessing(AutoSklearnPreprocessingAlgorithm):
-
def __init__(self, **kwargs):
- """ This preprocessors does not change the data """
+ """This preprocessors does not change the data"""
# Some internal checks makes sure parameters are set
for key, val in kwargs.items():
setattr(self, key, val)
@@ -37,16 +36,16 @@ def transform(self, X):
@staticmethod
def get_properties(dataset_properties=None):
return {
- 'shortname': 'NoPreprocessing',
- 'name': 'NoPreprocessing',
- 'handles_regression': True,
- 'handles_classification': True,
- 'handles_multiclass': True,
- 'handles_multilabel': True,
- 'handles_multioutput': True,
- 'is_deterministic': True,
- 'input': (SPARSE, DENSE, UNSIGNED_DATA),
- 'output': (INPUT,)
+ "shortname": "NoPreprocessing",
+ "name": "NoPreprocessing",
+ "handles_regression": True,
+ "handles_classification": True,
+ "handles_multiclass": True,
+ "handles_multilabel": True,
+ "handles_multioutput": True,
+ "is_deterministic": True,
+ "input": (SPARSE, DENSE, UNSIGNED_DATA),
+ "output": (INPUT,),
}
@staticmethod
@@ -70,13 +69,11 @@ def get_hyperparameter_search_space(dataset_properties=None):
clf = autosklearn.classification.AutoSklearnClassifier(
time_left_for_this_task=120,
- include={
- 'data_preprocessor': ['NoPreprocessing']
- },
+ include={"data_preprocessor": ["NoPreprocessing"]},
# Bellow two flags are provided to speed up calculations
# Not recommended for a real implementation
initial_configurations_via_metalearning=0,
- smac_scenario_args={'runcount_limit': 5},
+ smac_scenario_args={"runcount_limit": 5},
)
clf.fit(X_train, y_train)
diff --git a/examples/80_extending/example_extending_preprocessor.py b/examples/80_extending/example_extending_preprocessor.py
index 9ac93a45b3..1eb3fc1daf 100644
--- a/examples/80_extending/example_extending_preprocessor.py
+++ b/examples/80_extending/example_extending_preprocessor.py
@@ -10,16 +10,17 @@
from pprint import pprint
from ConfigSpace.configuration_space import ConfigurationSpace
-from ConfigSpace.hyperparameters import UniformFloatHyperparameter, CategoricalHyperparameter
+from ConfigSpace.hyperparameters import (
+ UniformFloatHyperparameter,
+ CategoricalHyperparameter,
+)
from ConfigSpace.conditions import InCondition
import sklearn.metrics
import autosklearn.classification
import autosklearn.pipeline.components.feature_preprocessing
-from autosklearn.pipeline.components.base \
- import AutoSklearnPreprocessingAlgorithm
-from autosklearn.pipeline.constants import DENSE, SIGNED_DATA, \
- UNSIGNED_DATA
+from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm
+from autosklearn.pipeline.constants import DENSE, SIGNED_DATA, UNSIGNED_DATA
from autosklearn.util.common import check_none
from sklearn.datasets import load_breast_cancer
@@ -30,7 +31,6 @@
# Create LDA component for auto-sklearn
# =====================================
class LDA(AutoSklearnPreprocessingAlgorithm):
-
def __init__(self, solver, tol, shrinkage=None, random_state=None):
self.solver = solver
self.shrinkage = shrinkage
@@ -46,6 +46,7 @@ def fit(self, X, y=None):
self.tol = float(self.tol)
import sklearn.discriminant_analysis
+
self.preprocessor = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
shrinkage=self.shrinkage,
solver=self.solver,
@@ -62,23 +63,23 @@ def transform(self, X):
@staticmethod
def get_properties(dataset_properties=None):
return {
- 'shortname': 'LDA',
- 'name': 'Linear Discriminant Analysis',
- 'handles_regression': False,
- 'handles_classification': True,
- 'handles_multiclass': False,
- 'handles_multilabel': False,
- 'handles_multioutput': False,
- 'is_deterministic': True,
- 'input': (DENSE, UNSIGNED_DATA, SIGNED_DATA),
- 'output': (DENSE, UNSIGNED_DATA, SIGNED_DATA)
+ "shortname": "LDA",
+ "name": "Linear Discriminant Analysis",
+ "handles_regression": False,
+ "handles_classification": True,
+ "handles_multiclass": False,
+ "handles_multilabel": False,
+ "handles_multioutput": False,
+ "is_deterministic": True,
+ "input": (DENSE, UNSIGNED_DATA, SIGNED_DATA),
+ "output": (DENSE, UNSIGNED_DATA, SIGNED_DATA),
}
@staticmethod
def get_hyperparameter_search_space(dataset_properties=None):
cs = ConfigurationSpace()
solver = CategoricalHyperparameter(
- name="solver", choices=['svd', 'lsqr', 'eigen'], default_value='svd'
+ name="solver", choices=["svd", "lsqr", "eigen"], default_value="svd"
)
shrinkage = UniformFloatHyperparameter(
name="shrinkage", lower=0.0, upper=1.0, default_value=0.5
@@ -87,7 +88,7 @@ def get_hyperparameter_search_space(dataset_properties=None):
name="tol", lower=0.0001, upper=1, default_value=0.0001
)
cs.add_hyperparameters([solver, shrinkage, tol])
- shrinkage_condition = InCondition(shrinkage, solver, ['lsqr', 'eigen'])
+ shrinkage_condition = InCondition(shrinkage, solver, ["lsqr", "eigen"])
cs.add_condition(shrinkage_condition)
return cs
@@ -115,13 +116,11 @@ def get_hyperparameter_search_space(dataset_properties=None):
clf = autosklearn.classification.AutoSklearnClassifier(
time_left_for_this_task=30,
- include={
- 'feature_preprocessor': ['LDA']
- },
+ include={"feature_preprocessor": ["LDA"]},
# Bellow two flags are provided to speed up calculations
# Not recommended for a real implementation
initial_configurations_via_metalearning=0,
- smac_scenario_args={'runcount_limit': 5},
+ smac_scenario_args={"runcount_limit": 5},
)
clf.fit(X_train, y_train)
diff --git a/examples/80_extending/example_extending_regression.py b/examples/80_extending/example_extending_regression.py
index 3bdc008d4e..4d6987a9db 100644
--- a/examples/80_extending/example_extending_regression.py
+++ b/examples/80_extending/example_extending_regression.py
@@ -9,16 +9,24 @@
from pprint import pprint
from ConfigSpace.configuration_space import ConfigurationSpace
-from ConfigSpace.hyperparameters import UniformFloatHyperparameter, \
- UniformIntegerHyperparameter, CategoricalHyperparameter
+from ConfigSpace.hyperparameters import (
+ UniformFloatHyperparameter,
+ UniformIntegerHyperparameter,
+ CategoricalHyperparameter,
+)
from ConfigSpace.conditions import EqualsCondition
import sklearn.metrics
import autosklearn.regression
import autosklearn.pipeline.components.regression
from autosklearn.pipeline.components.base import AutoSklearnRegressionAlgorithm
-from autosklearn.pipeline.constants import SPARSE, DENSE, \
- SIGNED_DATA, UNSIGNED_DATA, PREDICTIONS
+from autosklearn.pipeline.constants import (
+ SPARSE,
+ DENSE,
+ SIGNED_DATA,
+ UNSIGNED_DATA,
+ PREDICTIONS,
+)
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
@@ -28,8 +36,8 @@
# Implement kernel ridge regression component for auto-sklearn
# ============================================================
-class KernelRidgeRegression(AutoSklearnRegressionAlgorithm):
+class KernelRidgeRegression(AutoSklearnRegressionAlgorithm):
def __init__(self, alpha, kernel, gamma, degree, coef0, random_state=None):
self.alpha = alpha
self.kernel = kernel
@@ -46,12 +54,13 @@ def fit(self, X, y):
self.coef0 = float(self.coef0)
import sklearn.kernel_ridge
+
self.estimator = sklearn.kernel_ridge.KernelRidge(
alpha=self.alpha,
kernel=self.kernel,
gamma=self.gamma,
degree=self.degree,
- coef0=self.coef0
+ coef0=self.coef0,
)
self.estimator.fit(X, y)
return self
@@ -64,42 +73,46 @@ def predict(self, X):
@staticmethod
def get_properties(dataset_properties=None):
return {
- 'shortname': 'KRR',
- 'name': 'Kernel Ridge Regression',
- 'handles_regression': True,
- 'handles_classification': False,
- 'handles_multiclass': False,
- 'handles_multilabel': False,
- 'handles_multioutput': True,
- 'is_deterministic': True,
- 'input': (SPARSE, DENSE, UNSIGNED_DATA, SIGNED_DATA),
- 'output': (PREDICTIONS,)
+ "shortname": "KRR",
+ "name": "Kernel Ridge Regression",
+ "handles_regression": True,
+ "handles_classification": False,
+ "handles_multiclass": False,
+ "handles_multilabel": False,
+ "handles_multioutput": True,
+ "is_deterministic": True,
+ "input": (SPARSE, DENSE, UNSIGNED_DATA, SIGNED_DATA),
+ "output": (PREDICTIONS,),
}
@staticmethod
def get_hyperparameter_search_space(dataset_properties=None):
cs = ConfigurationSpace()
alpha = UniformFloatHyperparameter(
- name='alpha', lower=10 ** -5, upper=1, log=True, default_value=1.0
+ name="alpha", lower=10**-5, upper=1, log=True, default_value=1.0
)
kernel = CategoricalHyperparameter(
- name='kernel',
+ name="kernel",
# We restrict ourselves to two possible kernels for this example
- choices=['polynomial', 'rbf'],
- default_value='polynomial'
+ choices=["polynomial", "rbf"],
+ default_value="polynomial",
)
gamma = UniformFloatHyperparameter(
- name='gamma', lower=0.00001, upper=1, default_value=0.1, log=True
+ name="gamma", lower=0.00001, upper=1, default_value=0.1, log=True
)
degree = UniformIntegerHyperparameter(
- name='degree', lower=2, upper=5, default_value=3
+ name="degree", lower=2, upper=5, default_value=3
)
coef0 = UniformFloatHyperparameter(
- name='coef0', lower=1e-2, upper=1e2, log=True, default_value=1,
+ name="coef0",
+ lower=1e-2,
+ upper=1e2,
+ log=True,
+ default_value=1,
)
cs.add_hyperparameters([alpha, kernel, gamma, degree, coef0])
- degree_condition = EqualsCondition(degree, kernel, 'polynomial')
- coef0_condition = EqualsCondition(coef0, kernel, 'polynomial')
+ degree_condition = EqualsCondition(degree, kernel, "polynomial")
+ coef0_condition = EqualsCondition(coef0, kernel, "polynomial")
cs.add_conditions([degree_condition, coef0_condition])
return cs
@@ -123,13 +136,11 @@ def get_hyperparameter_search_space(dataset_properties=None):
reg = autosklearn.regression.AutoSklearnRegressor(
time_left_for_this_task=30,
per_run_time_limit=10,
- include={
- 'regressor': ['KernelRidgeRegression']
- },
+ include={"regressor": ["KernelRidgeRegression"]},
# Bellow two flags are provided to speed up calculations
# Not recommended for a real implementation
initial_configurations_via_metalearning=0,
- smac_scenario_args={'runcount_limit': 5},
+ smac_scenario_args={"runcount_limit": 5},
)
reg.fit(X_train, y_train)
diff --git a/examples/80_extending/example_restrict_number_of_hyperparameters.py b/examples/80_extending/example_restrict_number_of_hyperparameters.py
index 9c6ec2501f..d8bd2f4a98 100644
--- a/examples/80_extending/example_restrict_number_of_hyperparameters.py
+++ b/examples/80_extending/example_restrict_number_of_hyperparameters.py
@@ -9,15 +9,19 @@
"""
from ConfigSpace.configuration_space import ConfigurationSpace
-from ConfigSpace.hyperparameters import UniformIntegerHyperparameter, UniformFloatHyperparameter
+from ConfigSpace.hyperparameters import (
+ UniformIntegerHyperparameter,
+ UniformFloatHyperparameter,
+)
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import autosklearn.classification
import autosklearn.pipeline.components.classification
-from autosklearn.pipeline.components.classification \
- import AutoSklearnClassificationAlgorithm
+from autosklearn.pipeline.components.classification import (
+ AutoSklearnClassificationAlgorithm,
+)
from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA, PREDICTIONS, SPARSE
@@ -29,8 +33,8 @@
# default parametrization (``max_features``). Instead, it also
# tunes the number of estimators (``n_estimators``).
-class CustomRandomForest(AutoSklearnClassificationAlgorithm):
+class CustomRandomForest(AutoSklearnClassificationAlgorithm):
def __init__(self, n_estimators, max_features, random_state=None):
self.n_estimators = n_estimators
self.max_features = max_features
@@ -67,16 +71,16 @@ def predict_proba(self, X):
@staticmethod
def get_properties(dataset_properties=None):
return {
- 'shortname': 'RF',
- 'name': 'Random Forest Classifier',
- 'handles_regression': False,
- 'handles_classification': True,
- 'handles_multiclass': True,
- 'handles_multilabel': True,
- 'handles_multioutput': False,
- 'is_deterministic': True,
- 'input': (DENSE, SPARSE, UNSIGNED_DATA),
- 'output': (PREDICTIONS,)
+ "shortname": "RF",
+ "name": "Random Forest Classifier",
+ "handles_regression": False,
+ "handles_classification": True,
+ "handles_multiclass": True,
+ "handles_multilabel": True,
+ "handles_multioutput": False,
+ "is_deterministic": True,
+ "input": (DENSE, SPARSE, UNSIGNED_DATA),
+ "output": (PREDICTIONS,),
}
@staticmethod
@@ -87,8 +91,12 @@ def get_hyperparameter_search_space(dataset_properties=None):
# m is the total number of features, and max_features is the hyperparameter specified below.
# The default is 0.5, which yields sqrt(m) features as max_features in the estimator. This
# corresponds with Geurts' heuristic.
- max_features = UniformFloatHyperparameter("max_features", 0., 1., default_value=0.5)
- n_estimators = UniformIntegerHyperparameter("n_estimators", 10, 1000, default_value=100)
+ max_features = UniformFloatHyperparameter(
+ "max_features", 0.0, 1.0, default_value=0.5
+ )
+ n_estimators = UniformIntegerHyperparameter(
+ "n_estimators", 10, 1000, default_value=100
+ )
cs.add_hyperparameters([max_features, n_estimators])
return cs
@@ -114,13 +122,11 @@ def get_hyperparameter_search_space(dataset_properties=None):
time_left_for_this_task=30,
per_run_time_limit=10,
# Here we exclude auto-sklearn's default random forest component
- exclude={
- 'classifier': ['random_forest']
- },
+ exclude={"classifier": ["random_forest"]},
# Bellow two flags are provided to speed up calculations
# Not recommended for a real implementation
initial_configurations_via_metalearning=0,
- smac_scenario_args={'runcount_limit': 1},
+ smac_scenario_args={"runcount_limit": 1},
)
clf.fit(X_train, y_train)
@@ -131,5 +137,5 @@ def get_hyperparameter_search_space(dataset_properties=None):
# Observe that this configuration space only contains our custom random
# forest, but not auto-sklearn's ``random_forest``
cs = clf.get_configuration_space(X_train, y_train)
-assert 'random_forest' not in str(cs)
+assert "random_forest" not in str(cs)
print(cs)
diff --git a/misc/create_hyperparameter_table.py b/misc/create_hyperparameter_table.py
index dd23f8ac29..7495ee686a 100644
--- a/misc/create_hyperparameter_table.py
+++ b/misc/create_hyperparameter_table.py
@@ -15,43 +15,43 @@
CONST = "constant"
UN = "unparameterized"
-template_string = \
-"""
+template_string = r"""
\documentclass{article} %% For LaTeX2
\usepackage[a4paper, left=5mm, right=5mm, top=5mm, bottom=5mm]{geometry}
-%%\\usepackage[landscape]{geometry}
-\\usepackage{multirow} %% import command \multicolmun
-\\usepackage{tabularx} %% Convenient table formatting
-\\usepackage{booktabs} %% provides \\toprule, \midrule and \\bottomrule
+%%\usepackage[landscape]{geometry}
+\usepackage{multirow} %% import command \multicolumn
+\usepackage{tabularx} %% Convenient table formatting
+\usepackage{booktabs} %% provides \toprule, \midrule and \bottomrule
-\\begin{document}
+\begin{document}
%s
-\\end{document}
+\end{document}
"""
-caption_str = "Number of Hyperparameters for each possible %s " \
- "for a dataset with these properties: %s"
-
-table_str = \
-"""
-\\begin{table}[t!]
-\\centering
-\\scriptsize
-\\caption{ %s }
-\\begin{tabularx}{\\textwidth}{ X X X X X X }
-\\toprule
-name & \#$\lambda$ & cat (cond) & cont (cond) & const & un \\\\
-\\toprule
-\\\\
+caption_str = (
+ "Number of Hyperparameters for each possible %s "
+ "for a dataset with these properties: %s"
+)
+
+table_str = r"""
+\begin{table}[t!]
+\centering
+\scriptsize
+\caption{ %s }
+\begin{tabularx}{\textwidth}{ X X X X X X }
+\toprule
+name & \#$\lambda$ & cat (cond) & cont (cond) & const & un \\
+\toprule
+\\
%s
-\\\\
-\\toprule
-\\bottomrule
-\\end{tabularx}
-\\end{table}
+\\
+\toprule
+\bottomrule
+\end{tabularx}
+\end{table}
"""
@@ -59,11 +59,13 @@ def get_dict(task_type="classifier", **kwargs):
assert task_type in ("classifier", "regressor")
if task_type == "classifier":
- cs = autosklearn.pipeline.classification.SimpleClassificationPipeline\
- .get_hyperparameter_search_space(dataset_properties=kwargs)
+ cs = autosklearn.pipeline.classification.SimpleClassificationPipeline.get_hyperparameter_search_space(
+ dataset_properties=kwargs
+ )
elif task_type == "regressor":
- cs = autosklearn.pipeline.regression.SimpleRegressionPipeline\
- .get_hyperparameter_search_space(dataset_properties=kwargs)
+ cs = autosklearn.pipeline.regression.SimpleRegressionPipeline.get_hyperparameter_search_space(
+ dataset_properties=kwargs
+ )
else:
raise ValueError("'task_type' is not in ('classifier', 'regressor')")
@@ -73,7 +75,7 @@ def get_dict(task_type="classifier", **kwargs):
for h in cs.get_hyperparameters():
if h.name == "feature_preprocessor:__choice__":
preprocessor = h
- elif h.name == (task_type + ':__choice__'):
+ elif h.name == (task_type + ":__choice__"):
estimator = h
if estimator is None:
@@ -100,8 +102,9 @@ def get_dict(task_type="classifier", **kwargs):
preprocessor_dict[i][UN] = 0
for h in cs.get_hyperparameters():
- if h.name == "feature_preprocessor:__choice__" or \
- h.name == (task_type + ':__choice__'):
+ if h.name == "feature_preprocessor:__choice__" or h.name == (
+ task_type + ":__choice__"
+ ):
continue
# walk over both dicts
for d in (estimator_dict, preprocessor_dict):
@@ -116,14 +119,18 @@ def get_dict(task_type="classifier", **kwargs):
d[est][CAT] += 1
elif isinstance(h, ConfigSpace.hyperparameters.Constant):
d[est][CONST] += 1
- elif isinstance(h, ConfigSpace.hyperparameters.UnParametrizedHyperparameter):
+ elif isinstance(
+ h, ConfigSpace.hyperparameters.UnParametrizedHyperparameter
+ ):
d[est][UN] += 1
else:
raise ValueError("Don't know that type: %s" % type(h))
for h in cs.get_conditions():
- if h.parent.name == (task_type + ':__choice__') or h.parent.name == \
- "feature_preprocessor:__choice__":
+ if (
+ h.parent.name == (task_type + ":__choice__")
+ or h.parent.name == "feature_preprocessor:__choice__"
+ ):
# ignore this condition
# print "IGNORE", h
continue
@@ -132,22 +139,30 @@ def get_dict(task_type="classifier", **kwargs):
for d in (estimator_dict, preprocessor_dict):
est = h.child.name.split(":")[1]
if est not in d:
- #print "Could not find %s" % est
+ # print "Could not find %s" % est
continue
- #print "####"
- #print vars(h)
- #print h.parent
- #print type(h)
- if isinstance(h.child, ConfigSpace.hyperparameters.UniformIntegerHyperparameter):
+ # print "####"
+ # print vars(h)
+ # print h.parent
+ # print type(h)
+ if isinstance(
+ h.child, ConfigSpace.hyperparameters.UniformIntegerHyperparameter
+ ):
d[est][COND][CONT] += 1
- elif isinstance(h.child, ConfigSpace.hyperparameters.UniformFloatHyperparameter):
+ elif isinstance(
+ h.child, ConfigSpace.hyperparameters.UniformFloatHyperparameter
+ ):
d[est][COND][CONT] += 1
- elif isinstance(h.child, ConfigSpace.hyperparameters.CategoricalHyperparameter):
+ elif isinstance(
+ h.child, ConfigSpace.hyperparameters.CategoricalHyperparameter
+ ):
d[est][COND][CAT] += 1
elif isinstance(h.child, ConfigSpace.hyperparameters.Constant):
d[est][COND][CONST] += 1
- elif isinstance(h.child, ConfigSpace.hyperparameters.UnParametrizedHyperparameter):
+ elif isinstance(
+ h.child, ConfigSpace.hyperparameters.UnParametrizedHyperparameter
+ ):
d[est][COND][UN] += 1
else:
raise ValueError("Don't know that type: %s" % type(h))
@@ -159,7 +174,11 @@ def build_table(d):
lines = list()
for est in d.keys():
sum_ = 0
- t_list = list([est.replace("_", " "), ])
+ t_list = list(
+ [
+ est.replace("_", " "),
+ ]
+ )
for t in (CAT, CONT):
sum_ += d[est][t]
t_list.append("%d (%d)" % (d[est][t], d[est][COND][t]))
@@ -175,33 +194,68 @@ def main():
parser = ArgumentParser()
# General Options
- parser.add_argument("-s", "--save", dest="save", default=None,
- help="Where to save plot instead of showing it?")
- parser.add_argument("-t", "--type", dest="task_type", default="classifier",
- choices=("classifier", ), help="Type of dataset")
- parser.add_argument("--sparse", dest="sparse", default=False,
- action="store_true", help="dataset property")
+ parser.add_argument(
+ "-s",
+ "--save",
+ dest="save",
+ default=None,
+ help="Where to save plot instead of showing it?",
+ )
+ parser.add_argument(
+ "-t",
+ "--type",
+ dest="task_type",
+ default="classifier",
+ choices=("classifier",),
+ help="Type of dataset",
+ )
+ parser.add_argument(
+ "--sparse",
+ dest="sparse",
+ default=False,
+ action="store_true",
+ help="dataset property",
+ )
prop = parser.add_mutually_exclusive_group(required=True)
- prop.add_argument("--multilabel", dest="multilabel", default=False,
- action="store_true", help="dataset property")
- prop.add_argument("--multiclass", dest="multiclass", default=False,
- action="store_true", help="dataset property")
- prop.add_argument("--binary", dest="binary", default=False,
- action="store_true", help="dataset property")
+ prop.add_argument(
+ "--multilabel",
+ dest="multilabel",
+ default=False,
+ action="store_true",
+ help="dataset property",
+ )
+ prop.add_argument(
+ "--multiclass",
+ dest="multiclass",
+ default=False,
+ action="store_true",
+ help="dataset property",
+ )
+ prop.add_argument(
+ "--binary",
+ dest="binary",
+ default=False,
+ action="store_true",
+ help="dataset property",
+ )
args, unknown = parser.parse_known_args()
- props = {"sparse": args.sparse,
- "multilabel": args.multilabel,
- "multiclass": args.multiclass}
+ props = {
+ "sparse": args.sparse,
+ "multilabel": args.multilabel,
+ "multiclass": args.multiclass,
+ }
est_dict, preproc_dict = get_dict(task_type=args.task_type, **props)
est_table = build_table(est_dict)
preproc_table = build_table(preproc_dict)
est_table = table_str % (caption_str % (args.task_type, str(props)), est_table)
- preproc_table = table_str % (caption_str % (
- "feature_preprocessor", str(props)), preproc_table)
+ preproc_table = table_str % (
+ caption_str % ("feature_preprocessor", str(props)),
+ preproc_table,
+ )
tex_doc = template_string % "\n".join([est_table, preproc_table])
if args.save is None:
@@ -210,7 +264,7 @@ def main():
fh = open(args.save, "w")
fh.write(tex_doc)
fh.close()
- proc = subprocess.Popen(shlex.split('pdflatex %s' % args.save))
+ proc = subprocess.Popen(shlex.split("pdflatex %s" % args.save))
proc.communicate()
try:
os.remove(args.save.replace(".tex", ".aux"))
@@ -221,4 +275,4 @@ def main():
if __name__ == "__main__":
- main()
\ No newline at end of file
+ main()
diff --git a/misc/create_list_of_potential_models.py b/misc/create_list_of_potential_models.py
index 8153c639e7..cec7959ab1 100644
--- a/misc/create_list_of_potential_models.py
+++ b/misc/create_list_of_potential_models.py
@@ -5,30 +5,32 @@
import sklearn.base
-files = glob.glob(os.path.join(os.path.dirname(sklearn.__file__), "**/*.py"),
- recursive=True)
+files = glob.glob(
+ os.path.join(os.path.dirname(sklearn.__file__), "**/*.py"), recursive=True
+)
+
def find_all(cls):
found = set()
for file in files:
- parts = file.split('/')
- parts[-1] = parts[-1].replace('.py', '')
- sklearn_dir = parts.index('sklearn')
- name = '.'.join(parts[sklearn_dir:])
+ parts = file.split("/")
+ parts[-1] = parts[-1].replace(".py", "")
+ sklearn_dir = parts.index("sklearn")
+ name = ".".join(parts[sklearn_dir:])
module = importlib.import_module(name)
for member in module.__dict__.values():
if not inspect.isclass(member):
continue
if issubclass(member, cls):
found.add(member)
- print('#####')
+ print("#####")
found = list(found)
found.sort(key=lambda t: str(t))
for f in found:
print(f)
return found
-#classifiers = find_all(sklearn.base.ClassifierMixin)
-#regressors = find_all(sklearn.base.RegressorMixin)
-preprocs = find_all(sklearn.base.TransformerMixin)
+# classifiers = find_all(sklearn.base.ClassifierMixin)
+# regressors = find_all(sklearn.base.RegressorMixin)
+preprocs = find_all(sklearn.base.TransformerMixin)
diff --git a/mypy.ini b/mypy.ini
deleted file mode 100644
index 8c8b6589af..0000000000
--- a/mypy.ini
+++ /dev/null
@@ -1,8 +0,0 @@
-[mypy]
-# Reports any config lines that are not recognized
-warn_unused_configs=True
-ignore_missing_imports=True
-follow_imports=skip
-disallow_untyped_defs=True
-disallow_incomplete_defs=True
-disallow_untyped_decorators=True
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000000..0e48e3fc5f
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,152 @@
+# For TOML reference
+# https://learnxinyminutes.com/docs/toml/
+
+[tool.pytest.ini_options]
+testpaths = ["test"]
+minversion = "3.7"
+#addopts = "--cov=autosklearn"
+
+[tool.coverage.run]
+branch = true
+context = "autosklearn"
+
+[tool.coverage.report]
+show_missing = true
+skip_covered = true
+exclude_lines = [
+ "pragma: no cover",
+ '\.\.\.',
+ "raise NotImplementedError",
+ "if TYPE_CHECKING"
+]
+
+[tool.black]
+target-version = ['py37']
+
+[tool.isort]
+py_version = "37"
+profile = "black" # Play nicely with black
+src_paths = ["autosklearn", "test"]
+known_types = ["typing", "abc"] # We put these in their own section TYPES
+known_first_party = ["autosklearn"] # Say that autosklearn is FIRSTPARTY
+known_test = ["test"] # Say that test.* is TEST
+sections = ["FUTURE", "TYPES", "STDLIB", "THIRDPARTY", "FIRSTPARTY", "TEST", "LOCALFOLDER"] # section ordering
+multi_line_output = 3 # https://pycqa.github.io/isort/docs/configuration/multi_line_output_modes.html
+
+[tool.pydocstyle]
+convention = "numpy"
+add-ignore = [ # http://www.pydocstyle.org/en/stable/error_codes.html
+ "D100", # Missing docstring in public module
+ "D101", # Missing docstring in public class
+ "D104", # Missing docstring in public package
+ "D105", # Missing docstring in magic method
+
+ "D203", # 1 blank line required before class docstring
+ "D205", # 1 blank line required between summary and description
+ "D210", # No whitespaces allowed surrounding docstring text
+ "D212", # Multi-line docstring summary should start at the first line
+ "D213", # Multi-line docstring summary should start at the second line
+
+ "D400", # First line should end with a period
+ "D401", # First line should be in imperative mood
+ "D404", # First word of the docstring should not be "This"
+ "D413", # Missing blank line after last section
+ "D415" # First line should end with a period, question mark, or exclamation point
+]
+
+[tool.mypy]
+python_version = "3.7"
+
+show_error_codes = true
+
+warn_unused_configs = true # warn about unused [tool.mypy] lines
+
+follow_imports = "normal" # Type check top level api code we use from imports
+ignore_missing_imports = false # prefer explicit ignores
+
+disallow_untyped_defs = true # All functions must have types
+disallow_untyped_decorators = true # ... even decorators
+disallow_incomplete_defs = true # ...all types
+
+# This is a problem with the tests of `automl_common` being distributed as a submodule
+# probably indicative that it should be a package.
+exclude = "autosklearn/automl_common/test"
+
+# This is handled by automl_common itself in its own CI
+[[tool.mypy.overrides]]
+module = ["autosklearn.automl_common.common.*"]
+ignore_errors = true
+
+# Submodules that need to be updated with mypy
+[[tool.mypy.overrides]]
+module = [
+ "autosklearn", #__init__
+ "autosklearn.estimators",
+ "autosklearn.automl",
+ "autosklearn.smbo",
+ "autosklearn.experimental.askl2",
+ "autosklearn.ensemble_builder",
+ "autosklearn.ensembles.singlebest_ensemble",
+ "autosklearn.ensembles.ensemble_selection",
+ "autosklearn.evaluation", #__init__
+ "autosklearn.evaluation.abstract_evaluator",
+ "autosklearn.evaluation.test_evaluator",
+ "autosklearn.evaluation.train_evaluator",
+ "autosklearn.metalearning.input.aslib_simple",
+ "autosklearn.metalearning.mismbo",
+ "autosklearn.metalearning.metafeatures.metafeature",
+ "autosklearn.metalearning.metafeatures.metafeatures",
+ "autosklearn.metalearning.metalearning.meta_base",
+ "autosklearn.metalearning.metalearning.metrics.misc",
+ "autosklearn.metalearning.metalearning.create_datasets",
+ "autosklearn.metalearning.metalearning.kNearestDatasets.kND",
+ "autosklearn.metalearning.metalearning.clustering.gmeans",
+ "autosklearn.metalearning.optimizers.optimizer_base",
+ "autosklearn.metalearning.optimizers.metalearn_optimizer.metalearn_optimizer_parser",
+ "autosklearn.metalearning.optimizers.metalearn_optimizer.metalearner",
+ "autosklearn.pipeline.base",
+ "autosklearn.pipeline.classification",
+ "autosklearn.pipeline.regression",
+ "autosklearn.pipeline.components.base",
+ "autosklearn.pipeline.components.data_preprocessing.*",
+ "autosklearn.pipeline.components.regression.*",
+ "autosklearn.pipeline.components.classification.*",
+ "autosklearn.pipeline.components.feature_preprocessing.*",
+ "autosklearn.pipeline.util",
+ "autosklearn.pipeline.logging_",
+ "autosklearn.pipeline.create_searchspace_util",
+ "autosklearn.pipeline.implementations.util",
+ "autosklearn.pipeline.implementations.SparseOneHotEncoder",
+ "autosklearn.pipeline.implementations.MinorityCoalescer",
+ "autosklearn.pipeline.implementations.CategoryShift",
+ "autosklearn.experimental.selector",
+ "autosklearn.data.validation",
+ "autosklearn.data.abstract_data_manager",
+ "autosklearn.data.xy_data_manager",
+ "autosklearn.data.target_validator",
+ "autosklearn.data.feature_validator",
+ "autosklearn.util.single_threaded_client",
+ "autosklearn.util.logging_",
+]
+ignore_errors = true
+
+# Packages without exported types
+[[tool.mypy.overrides]]
+module = [
+ "sklearn.*",
+ "dask.*",
+ "ConfigSpace.*",
+ "arff.*",
+ "scipy.*",
+ "smac.*",
+ "pandas.*",
+ "pynisher.*",
+ "distro.*",
+ "joblib.*",
+ "threadpoolctl.*",
+ "setuptools.*",
+ "pkg_resources.*",
+ "yaml.*",
+]
+ignore_missing_imports = true
+
diff --git a/pytest.ini b/pytest.ini
deleted file mode 100644
index 049e247a21..0000000000
--- a/pytest.ini
+++ /dev/null
@@ -1,3 +0,0 @@
-[pytest]
-testpaths =
- test
diff --git a/scripts/01_create_commands.py b/scripts/01_create_commands.py
index c6e28c606b..72e406d3d7 100644
--- a/scripts/01_create_commands.py
+++ b/scripts/01_create_commands.py
@@ -5,45 +5,49 @@
import openml
-sys.path.append('.')
+sys.path.append(".")
from update_metadata_util import classification_tasks, regression_tasks
parser = argparse.ArgumentParser()
-parser.add_argument('--working-directory', type=str, required=True)
-parser.add_argument('--test', action='store_true')
+parser.add_argument("--working-directory", type=str, required=True)
+parser.add_argument("--test", action="store_true")
args = parser.parse_args()
working_directory = args.working_directory
test = args.test
-command_file_name = os.path.join(working_directory, 'metadata_commands.txt')
+command_file_name = os.path.join(working_directory, "metadata_commands.txt")
this_directory = os.path.dirname(os.path.abspath(__file__))
-script_name = 'run_auto-sklearn_for_metadata_generation.py'
+script_name = "run_auto-sklearn_for_metadata_generation.py"
absolute_script_name = os.path.join(this_directory, script_name)
commands = []
-for task_id in (classification_tasks if not test else (233, 245, 258)):
- for metric in ('accuracy', 'balanced_accuracy', 'roc_auc', 'logloss'):
+for task_id in classification_tasks if not test else (233, 245, 258):
+ for metric in ("accuracy", "balanced_accuracy", "roc_auc", "logloss"):
if (
len(openml.tasks.get_task(task_id, download_data=False).class_labels) > 2
- and metric == 'roc_auc'
+ and metric == "roc_auc"
):
continue
- command = ('python3 %s --working-directory %s --time-limit 86400 '
- '--per-run-time-limit 1800 --task-id %d -s 1 --metric %s' %
- (absolute_script_name, working_directory, task_id, metric))
+ command = (
+ "python3 %s --working-directory %s --time-limit 86400 "
+ "--per-run-time-limit 1800 --task-id %d -s 1 --metric %s"
+ % (absolute_script_name, working_directory, task_id, metric)
+ )
commands.append(command)
-for task_id in (regression_tasks if not test else (360029, 360033)):
- for metric in ('r2', 'root_mean_squared_error', 'mean_absolute_error'):
- command = ('python3 %s --working-directory %s --time-limit 86400 '
- '--per-run-time-limit 1800 --task-id %d -s 1 --metric %s' %
- (absolute_script_name, working_directory, task_id, metric))
+for task_id in regression_tasks if not test else (360029, 360033):
+ for metric in ("r2", "root_mean_squared_error", "mean_absolute_error"):
+ command = (
+ "python3 %s --working-directory %s --time-limit 86400 "
+ "--per-run-time-limit 1800 --task-id %d -s 1 --metric %s"
+ % (absolute_script_name, working_directory, task_id, metric)
+ )
commands.append(command)
-with open(command_file_name, 'w') as fh:
+with open(command_file_name, "w") as fh:
for command in commands:
fh.writelines(command)
- fh.write('\n')
+ fh.write("\n")
diff --git a/scripts/02_retrieve_metadata.py b/scripts/02_retrieve_metadata.py
index 611b190dfa..f87f65ecc4 100644
--- a/scripts/02_retrieve_metadata.py
+++ b/scripts/02_retrieve_metadata.py
@@ -16,8 +16,9 @@
from autosklearn.util import pipeline
-def retrieve_matadata(validation_directory, metric, configuration_space,
- cutoff=0, only_best=True):
+def retrieve_matadata(
+ validation_directory, metric, configuration_space, cutoff=0, only_best=True
+):
if not only_best:
raise NotImplementedError()
if cutoff > 0:
@@ -29,9 +30,9 @@ def retrieve_matadata(validation_directory, metric, configuration_space,
configurations_to_ids = dict()
try:
- validation_trajectory_files = glob.glob(os.path.join(
- validation_directory, '*', '*', 'validation_trajectory_*.json'
- ))
+ validation_trajectory_files = glob.glob(
+ os.path.join(validation_directory, "*", "*", "validation_trajectory_*.json")
+ )
except FileNotFoundError:
return {}, {}
@@ -66,7 +67,8 @@ def retrieve_matadata(validation_directory, metric, configuration_space,
try:
best_configuration = Configuration(
- configuration_space=configuration_space, values=config)
+ configuration_space=configuration_space, values=config
+ )
best_value = score
best_configuration_dir = validation_trajectory_file
except Exception as e:
@@ -74,18 +76,22 @@ def retrieve_matadata(validation_directory, metric, configuration_space,
n_broken += 1
if task_name is None:
- print('Could not find any configuration better than the default configuration!')
+ print(
+ "Could not find any configuration better than the default configuration!"
+ )
continue
if best_configuration is None:
- print('Could not find a valid configuration; total %d, better %d, broken %d'
- % (n_configs, n_better, n_broken))
+ print(
+ "Could not find a valid configuration; total %d, better %d, broken %d"
+ % (n_configs, n_better, n_broken)
+ )
continue
elif best_configuration in configurations_to_ids:
- print('Found configuration in', best_configuration_dir)
+ print("Found configuration in", best_configuration_dir)
config_id = configurations_to_ids[best_configuration]
else:
- print('Found configuration in', best_configuration_dir)
+ print("Found configuration in", best_configuration_dir)
config_id = len(configurations_to_ids)
configurations_to_ids[config_id] = best_configuration
configurations[config_id] = best_configuration
@@ -102,34 +108,33 @@ def retrieve_matadata(validation_directory, metric, configuration_space,
return outputs, configurations
-def write_output(outputs, configurations, output_dir, configuration_space,
- metric):
+def write_output(outputs, configurations, output_dir, configuration_space, metric):
arff_object = dict()
- arff_object['attributes'] = [('instance_id', 'STRING'),
- ('repetition', 'NUMERIC'),
- ('algorithm', 'STRING'),
- (metric, 'NUMERIC'),
- ('runstatus',
- ['ok', 'timeout', 'memout', 'not_applicable',
- 'crash', 'other'])]
- arff_object['relation'] = "ALGORITHM_RUNS"
- arff_object['description'] = ""
+ arff_object["attributes"] = [
+ ("instance_id", "STRING"),
+ ("repetition", "NUMERIC"),
+ ("algorithm", "STRING"),
+ (metric, "NUMERIC"),
+ ("runstatus", ["ok", "timeout", "memout", "not_applicable", "crash", "other"]),
+ ]
+ arff_object["relation"] = "ALGORITHM_RUNS"
+ arff_object["description"] = ""
data = []
keep_configurations = set()
for dataset, (configuration_id, value) in outputs.items():
if not np.isfinite(value):
- runstatus = 'not_applicable'
+ runstatus = "not_applicable"
value = None
else:
- runstatus = 'ok'
+ runstatus = "ok"
line = [dataset, 1, configuration_id + 1, value, runstatus]
data.append(line)
keep_configurations.add(configuration_id)
- arff_object['data'] = data
+ arff_object["data"] = data
with open(os.path.join(output_dir, "algorithm_runs.arff"), "w") as fh:
arff.dump(arff_object, fh)
@@ -139,7 +144,7 @@ def write_output(outputs, configurations, output_dir, configuration_space,
if idx not in keep_configurations:
continue
configuration = configurations[idx]
- line = {'idx': idx + 1}
+ line = {"idx": idx + 1}
for hp_name in configuration:
value = configuration[hp_name]
if value is not None:
@@ -147,7 +152,7 @@ def write_output(outputs, configurations, output_dir, configuration_space,
hyperparameters.append(line)
- fieldnames = ['idx']
+ fieldnames = ["idx"]
for hyperparameter in configuration_space.get_hyperparameters():
fieldnames.append(hyperparameter.name)
fieldnames = [fieldnames[0]] + sorted(fieldnames[1:])
@@ -158,16 +163,17 @@ def write_output(outputs, configurations, output_dir, configuration_space,
csv_writer.writerow(line)
description = dict()
- description['algorithms_deterministic'] = \
- ",".join([str(configuration_id + 1)
- for configuration_id in sorted(configurations.keys())])
- description['algorithms_stochastic'] = \
- ",".join([])
- description['performance_measures'] = metric
- description['performance_type'] = 'solution_quality'
-
- with open(os.path.join(output_dir, "description.results.txt"),
- "w") as fh:
+ description["algorithms_deterministic"] = ",".join(
+ [
+ str(configuration_id + 1)
+ for configuration_id in sorted(configurations.keys())
+ ]
+ )
+ description["algorithms_stochastic"] = ",".join([])
+ description["performance_measures"] = metric
+ description["performance_type"] = "solution_quality"
+
+ with open(os.path.join(output_dir, "description.results.txt"), "w") as fh:
for key in description:
fh.write("%s: %s\n" % (key, description[key]))
@@ -184,44 +190,56 @@ def main():
cutoff = args.cutoff
only_best = args.only_best
- for task_type in ('classification', 'regression'):
- if task_type == 'classification':
+ for task_type in ("classification", "regression"):
+ if task_type == "classification":
metadata_sets = itertools.product(
- [0, 1], [BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION],
- CLASSIFICATION_METRICS)
- input_directory = os.path.join(working_directory, 'configuration',
- 'classification')
- elif task_type == 'regression':
- metadata_sets = itertools.product(
- [0, 1], [REGRESSION], REGRESSION_METRICS)
- input_directory = os.path.join(working_directory, 'configuration',
- 'regression')
+ [0, 1],
+ [BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION],
+ CLASSIFICATION_METRICS,
+ )
+ input_directory = os.path.join(
+ working_directory, "configuration", "classification"
+ )
+ elif task_type == "regression":
+ metadata_sets = itertools.product([0, 1], [REGRESSION], REGRESSION_METRICS)
+ input_directory = os.path.join(
+ working_directory, "configuration", "regression"
+ )
else:
raise ValueError(task_type)
- output_dir = os.path.join(working_directory, 'configuration_results')
+ output_dir = os.path.join(working_directory, "configuration_results")
for sparse, task, metric in metadata_sets:
print(TASK_TYPES_TO_STRING[task], metric, sparse)
- output_dir_ = os.path.join(output_dir, '%s_%s_%s' % (
- metric, TASK_TYPES_TO_STRING[task],
- 'sparse' if sparse else 'dense'))
+ output_dir_ = os.path.join(
+ output_dir,
+ "%s_%s_%s"
+ % (metric, TASK_TYPES_TO_STRING[task], "sparse" if sparse else "dense"),
+ )
configuration_space = pipeline.get_configuration_space(
- {'is_sparse': sparse, 'task': task})
+ {"is_sparse": sparse, "task": task}
+ )
outputs, configurations = retrieve_matadata(
validation_directory=input_directory,
metric=metric,
cutoff=cutoff,
configuration_space=configuration_space,
- only_best=only_best)
+ only_best=only_best,
+ )
if len(outputs) == 0:
- print("No output found for %s, %s, %s" %
- (metric, TASK_TYPES_TO_STRING[task],
- 'sparse' if sparse else 'dense'))
+ print(
+ "No output found for %s, %s, %s"
+ % (
+ metric,
+ TASK_TYPES_TO_STRING[task],
+ "sparse" if sparse else "dense",
+ )
+ )
continue
try:
@@ -229,8 +247,9 @@ def main():
except:
pass
- write_output(outputs, configurations, output_dir_,
- configuration_space, metric)
+ write_output(
+ outputs, configurations, output_dir_, configuration_space, metric
+ )
if __name__ == "__main__":
diff --git a/scripts/03_calculate_metafeatures.py b/scripts/03_calculate_metafeatures.py
index 1d058c5dae..3b32dde8e3 100644
--- a/scripts/03_calculate_metafeatures.py
+++ b/scripts/03_calculate_metafeatures.py
@@ -11,15 +11,22 @@
import numpy as np
import pandas as pd
-from autosklearn.constants import BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION, REGRESSION
+from autosklearn.constants import (
+ BINARY_CLASSIFICATION,
+ MULTICLASS_CLASSIFICATION,
+ REGRESSION,
+)
from autosklearn.metalearning.metafeatures import metafeatures
-from autosklearn.smbo import _calculate_metafeatures, _calculate_metafeatures_encoded, \
- EXCLUDE_META_FEATURES_REGRESSION, EXCLUDE_META_FEATURES_CLASSIFICATION
+from autosklearn.smbo import (
+ _calculate_metafeatures,
+ _calculate_metafeatures_encoded,
+ EXCLUDE_META_FEATURES_REGRESSION,
+ EXCLUDE_META_FEATURES_CLASSIFICATION,
+)
from autosklearn.util.stopwatch import StopWatch
-sys.path.append('.')
-from update_metadata_util import load_task, classification_tasks, \
- regression_tasks
+sys.path.append(".")
+from update_metadata_util import load_task, classification_tasks, regression_tasks
logger = logging.getLogger("03_calculate_metafeatures")
@@ -28,7 +35,7 @@ def calculate_metafeatures(task_id):
X_train, y_train, X_test, y_test, cat, task_type, dataset_name = load_task(task_id)
watch = StopWatch()
- if task_type == 'classification':
+ if task_type == "classification":
if len(np.unique(y_train)) == 2:
task_type = BINARY_CLASSIFICATION
else:
@@ -37,20 +44,27 @@ def calculate_metafeatures(task_id):
task_type = REGRESSION
_metafeatures_labels = _calculate_metafeatures(
- x_train=X_train, y_train=y_train, data_feat_type=cat,
- data_info_task=task_type, basename=dataset_name, logger_=logger,
+ x_train=X_train,
+ y_train=y_train,
+ data_feat_type=cat,
+ data_info_task=task_type,
+ basename=dataset_name,
+ logger_=logger,
watcher=watch,
)
_metafeatures_encoded_labels = _calculate_metafeatures_encoded(
- x_train=X_train, y_train=y_train, data_feat_type=cat,
- task=task_type, basename=dataset_name, logger_=logger,
+ x_train=X_train,
+ y_train=y_train,
+ data_feat_type=cat,
+ task=task_type,
+ basename=dataset_name,
+ logger_=logger,
watcher=watch,
)
mf = _metafeatures_labels
- mf.metafeature_values.update(
- _metafeatures_encoded_labels.metafeature_values)
+ mf.metafeature_values.update(_metafeatures_encoded_labels.metafeature_values)
return mf
@@ -59,15 +73,15 @@ def calculate_metafeatures(task_id):
parser = ArgumentParser()
parser.add_argument("--working-directory", type=str, required=True)
parser.add_argument("--memory-limit", type=int, default=3072)
- parser.add_argument("--test-mode", action='store_true')
+ parser.add_argument("--test-mode", action="store_true")
args = parser.parse_args()
working_directory = args.working_directory
memory_limit = args.memory_limit
test_mode = args.test_mode
- for task_type in ('classification', 'regression'):
- output_directory = os.path.join(working_directory, 'metafeatures', task_type)
+ for task_type in ("classification", "regression"):
+ output_directory = os.path.join(working_directory, "metafeatures", task_type)
try:
os.makedirs(output_directory)
except:
@@ -75,7 +89,7 @@ def calculate_metafeatures(task_id):
all_metafeatures = {}
- if task_type == 'classification':
+ if task_type == "classification":
tasks = classification_tasks
else:
tasks = regression_tasks
@@ -90,12 +104,9 @@ def producer():
for task_id in tasks:
yield task_id
- memory = joblib.Memory(location='/tmp/joblib', verbose=10)
+ memory = joblib.Memory(location="/tmp/joblib", verbose=10)
cached_calculate_metafeatures = memory.cache(calculate_metafeatures)
- mfs = [
- cached_calculate_metafeatures(task_id)
- for task_id in producer()
- ]
+ mfs = [cached_calculate_metafeatures(task_id) for task_id in producer()]
for mf in mfs:
if mf is not None:
@@ -110,45 +121,50 @@ def producer():
for i, task_id in enumerate(all_metafeatures):
calculation_times[task_id] = dict()
for metafeature_name in sorted(
- all_metafeatures[task_id].metafeature_values):
+ all_metafeatures[task_id].metafeature_values
+ ):
metafeature_value = all_metafeatures[task_id].metafeature_values[
- metafeature_name]
- calculation_times[task_id][metafeature_name] = \
- metafeature_value.time
+ metafeature_name
+ ]
+ calculation_times[task_id][metafeature_name] = metafeature_value.time
if metafeature_value.type_ == "HELPERFUNCTION":
- helperfunction_values[task_id][metafeature_name] = \
- metafeature_value.value
+ helperfunction_values[task_id][
+ metafeature_name
+ ] = metafeature_value.value
else:
- metafeature_values[task_id][metafeature_name] = \
- metafeature_value.value
+ metafeature_values[task_id][
+ metafeature_name
+ ] = metafeature_value.value
calculation_times = pd.DataFrame(calculation_times).transpose()
calculation_times = calculation_times.sort_index()
- with open(os.path.join(output_directory, "calculation_times.csv"),
- "w") as fh:
+ with open(os.path.join(output_directory, "calculation_times.csv"), "w") as fh:
fh.write(calculation_times.to_csv())
# Write all metafeatures in the aslib1.0 format
- metafeature_values = metafeature_values = pd.DataFrame(metafeature_values).transpose()
+    metafeature_values = pd.DataFrame(
+ metafeature_values
+ ).transpose()
metafeature_values = metafeature_values.sort_index()
arff_object = dict()
- arff_object['attributes'] = [('instance_id', 'STRING'),
- ('repetition', 'NUMERIC')] + \
- [('%s' % name, 'NUMERIC') for name in
- metafeature_values.columns]
- arff_object['relation'] = "FEATURE_VALUES"
- arff_object['description'] = ""
+ arff_object["attributes"] = [
+ ("instance_id", "STRING"),
+ ("repetition", "NUMERIC"),
+ ] + [("%s" % name, "NUMERIC") for name in metafeature_values.columns]
+ arff_object["relation"] = "FEATURE_VALUES"
+ arff_object["description"] = ""
data = []
for idx in metafeature_values.index:
line = [idx, 1]
- line += [value if np.isfinite(value) else None
- for value in metafeature_values.loc[idx, :].values]
+ line += [
+ value if np.isfinite(value) else None
+ for value in metafeature_values.loc[idx, :].values
+ ]
data.append(line)
- arff_object['data'] = data
+ arff_object["data"] = data
- with open(os.path.join(output_directory, "feature_values.arff"),
- "w") as fh:
+ with open(os.path.join(output_directory, "feature_values.arff"), "w") as fh:
arff.dump(arff_object, fh)
# Feature steps and runtimes according to the aslib1.0 format
@@ -157,7 +173,8 @@ def producer():
exclude_metafeatures = (
EXCLUDE_META_FEATURES_CLASSIFICATION
- if task_type == 'classification' else EXCLUDE_META_FEATURES_REGRESSION
+ if task_type == "classification"
+ else EXCLUDE_META_FEATURES_REGRESSION
)
for metafeature_name in metafeatures.metafeatures.functions:
@@ -174,42 +191,48 @@ def producer():
# Write the feature runstatus in the aslib1.0 format
arff_object = dict()
- arff_object['attributes'] = [('instance_id', 'STRING'),
- ('repetition', 'NUMERIC')] + \
- [('%s' % name,
- ['ok', 'timeout', 'memout', 'presolved',
- 'crash', 'other'])
- for name in feature_steps]
- arff_object['relation'] = "FEATURE_RUNSTATUS"
- arff_object['description'] = ""
+ arff_object["attributes"] = [
+ ("instance_id", "STRING"),
+ ("repetition", "NUMERIC"),
+ ] + [
+ ("%s" % name, ["ok", "timeout", "memout", "presolved", "crash", "other"])
+ for name in feature_steps
+ ]
+ arff_object["relation"] = "FEATURE_RUNSTATUS"
+ arff_object["description"] = ""
data = []
for idx in metafeature_values.index:
line = [idx, 1]
for feature_step in feature_steps:
if feature_step in helperfunction_values[idx]:
- line.append('ok' if helperfunction_values[feature_step] is not \
- None else 'other')
+ line.append(
+ "ok"
+                        if helperfunction_values[idx][feature_step] is not None
+ else "other"
+ )
elif feature_step in metafeature_values.loc[idx]:
- line.append('ok' if np.isfinite(metafeature_values.loc[idx][
- feature_step]) else 'other')
+ line.append(
+ "ok"
+ if np.isfinite(metafeature_values.loc[idx][feature_step])
+ else "other"
+ )
else:
- line.append('other')
+ line.append("other")
data.append(line)
- arff_object['data'] = data
+ arff_object["data"] = data
- with open(os.path.join(output_directory, "feature_runstatus.arff"),
- "w") as fh:
+ with open(os.path.join(output_directory, "feature_runstatus.arff"), "w") as fh:
arff.dump(arff_object, fh)
arff_object = dict()
- arff_object['attributes'] = [('instance_id', 'STRING'),
- ('repetition', 'NUMERIC')] + \
- [('%s' % feature_step, 'NUMERIC') for
- feature_step in feature_steps]
- arff_object['relation'] = "FEATURE_COSTS"
- arff_object['description'] = ""
+ arff_object["attributes"] = [
+ ("instance_id", "STRING"),
+ ("repetition", "NUMERIC"),
+ ] + [("%s" % feature_step, "NUMERIC") for feature_step in feature_steps]
+ arff_object["relation"] = "FEATURE_COSTS"
+ arff_object["description"] = ""
data = []
for instance_id in calculation_times.index:
@@ -220,33 +243,35 @@ def producer():
for feature in feature_steps[feature_step]:
time_ += calculation_times[feature][instance_id]
if not np.isfinite(time_):
- raise ValueError("Feature cost %s for instance %s and feature "
- "step %s not finite" % (time_, instance_id, feature))
+ raise ValueError(
+ "Feature cost %s for instance %s and feature "
+ "step %s not finite" % (time_, instance_id, feature)
+ )
line.append(time_)
data.append(line)
- arff_object['data'] = data
+ arff_object["data"] = data
- with open(os.path.join(output_directory, "feature_costs.arff"),
- "w") as fh:
+ with open(os.path.join(output_directory, "feature_costs.arff"), "w") as fh:
arff.dump(arff_object, fh)
# Write the features part of the description.txt to a file
description = OrderedDict()
- description['features_cutoff_time'] = '3600'
- description['features_cutoff_memory'] = args.memory_limit
- description['number_of_feature_steps'] = str(len(feature_steps))
+ description["features_cutoff_time"] = "3600"
+ description["features_cutoff_memory"] = args.memory_limit
+ description["number_of_feature_steps"] = str(len(feature_steps))
for feature_step in feature_steps:
- description['feature_step %s' % feature_step] = \
- ", ".join(feature_steps[feature_step])
- description['features_deterministic'] = ", ".join([
- metafeature_name for
- metafeature_name in
- metafeature_names])
- description['features_stochastic'] = ''
- description['default_steps'] = ", ".join(feature_steps)
-
- with open(os.path.join(output_directory,
- "description.features.txt"), "w") as fh:
+ description["feature_step %s" % feature_step] = ", ".join(
+ feature_steps[feature_step]
+ )
+ description["features_deterministic"] = ", ".join(
+ [metafeature_name for metafeature_name in metafeature_names]
+ )
+ description["features_stochastic"] = ""
+ description["default_steps"] = ", ".join(feature_steps)
+
+ with open(
+ os.path.join(output_directory, "description.features.txt"), "w"
+ ) as fh:
for entry in description:
fh.write("%s: %s\n" % (entry, description[entry]))
diff --git a/scripts/04_create_aslib_files.py b/scripts/04_create_aslib_files.py
index d5e10a9c15..8c83dc1648 100644
--- a/scripts/04_create_aslib_files.py
+++ b/scripts/04_create_aslib_files.py
@@ -10,16 +10,16 @@
if __name__ == "__main__":
parser = ArgumentParser()
parser.add_argument("--working-directory", type=str, required=True)
- parser.add_argument("--scenario_id", type=str, default='auto-sklearn')
+ parser.add_argument("--scenario_id", type=str, default="auto-sklearn")
parser.add_argument("--algorithm_cutoff_time", type=int, default=1800)
parser.add_argument("--algorithm_cutoff_memory", type=int, default=3072)
args = parser.parse_args()
working_directory = args.working_directory
- output_dir = os.path.join(working_directory, 'metadata')
- results_dir = os.path.join(working_directory, 'configuration_results')
- metafeatures_dir = os.path.join(working_directory, 'metafeatures')
+ output_dir = os.path.join(working_directory, "metadata")
+ results_dir = os.path.join(working_directory, "configuration_results")
+ metafeatures_dir = os.path.join(working_directory, "metafeatures")
scenario_id = args.scenario_id
algorithm_cutoff_time = args.algorithm_cutoff_time
@@ -31,25 +31,29 @@
except (OSError, IOError):
pass
- for task_type in ('classification', 'regression'):
- if task_type == 'classification':
+ for task_type in ("classification", "regression"):
+ if task_type == "classification":
metadata_sets = itertools.product(
- [0, 1], [BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION],
- CLASSIFICATION_METRICS)
- elif task_type == 'regression':
- metadata_sets = itertools.product(
- [0, 1], [REGRESSION], REGRESSION_METRICS)
+ [0, 1],
+ [BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION],
+ CLASSIFICATION_METRICS,
+ )
+ elif task_type == "regression":
+ metadata_sets = itertools.product([0, 1], [REGRESSION], REGRESSION_METRICS)
else:
raise ValueError(task_type)
- input_directory = os.path.join(working_directory, 'configuration', task_type)
+ input_directory = os.path.join(working_directory, "configuration", task_type)
metafeatures_dir_for_task = os.path.join(metafeatures_dir, task_type)
for sparse, task, metric in metadata_sets:
print(TASK_TYPES_TO_STRING[task], metric, sparse)
- dir_name = '%s_%s_%s' % (metric, TASK_TYPES_TO_STRING[task],
- 'sparse' if sparse else 'dense')
+ dir_name = "%s_%s_%s" % (
+ metric,
+ TASK_TYPES_TO_STRING[task],
+ "sparse" if sparse else "dense",
+ )
output_dir_ = os.path.join(output_dir, dir_name)
results_dir_ = os.path.join(results_dir, dir_name)
@@ -67,21 +71,19 @@
pass
# Create description.txt
- with open(os.path.join(metafeatures_dir_for_task,
- "description.features.txt")) as fh:
+ with open(
+ os.path.join(metafeatures_dir_for_task, "description.features.txt")
+ ) as fh:
description_metafeatures = fh.read()
- with open(os.path.join(results_dir_,
- "description.results.txt")) as fh:
+ with open(os.path.join(results_dir_, "description.results.txt")) as fh:
description_results = fh.read()
description = [description_metafeatures, description_results]
description.append("scenario_id: %s" % scenario_id)
description.append("maximize: false")
- description.append(
- "algorithm_cutoff_time: %d" % algorithm_cutoff_time)
- description.append(
- "algorithm_cutoff_memory: %d" % algorithm_cutoff_memory)
+ description.append("algorithm_cutoff_time: %d" % algorithm_cutoff_time)
+ description.append("algorithm_cutoff_memory: %d" % algorithm_cutoff_memory)
with open(os.path.join(output_dir_, "description.txt"), "w") as fh:
for line in description:
@@ -89,59 +91,54 @@
fh.write("\n")
# Copy feature values and add instance id
- with open(os.path.join(metafeatures_dir_for_task,
- "feature_values.arff")) as fh:
+ with open(
+ os.path.join(metafeatures_dir_for_task, "feature_values.arff")
+ ) as fh:
feature_values = arff.load(fh)
- feature_values['relation'] = scenario_id + "_" + feature_values[
- 'relation']
+ feature_values["relation"] = scenario_id + "_" + feature_values["relation"]
- with open(os.path.join(output_dir_, "feature_values.arff"),
- "w") as fh:
+ with open(os.path.join(output_dir_, "feature_values.arff"), "w") as fh:
arff.dump(feature_values, fh)
# Copy feature runstatus and add instance id
- with open(os.path.join(metafeatures_dir_for_task,
- "feature_runstatus.arff")) as fh:
+ with open(
+ os.path.join(metafeatures_dir_for_task, "feature_runstatus.arff")
+ ) as fh:
feature_runstatus = arff.load(fh)
- feature_runstatus['relation'] = scenario_id + "_" + \
- feature_runstatus['relation']
+ feature_runstatus["relation"] = (
+ scenario_id + "_" + feature_runstatus["relation"]
+ )
- with open(os.path.join(output_dir_, "feature_runstatus.arff"), "w") \
- as fh:
+ with open(os.path.join(output_dir_, "feature_runstatus.arff"), "w") as fh:
arff.dump(feature_runstatus, fh)
            # Copy feature costs and add instance id
with open(
- os.path.join(metafeatures_dir_for_task, "feature_costs.arff")) as fh:
+ os.path.join(metafeatures_dir_for_task, "feature_costs.arff")
+ ) as fh:
feature_costs = arff.load(fh)
- feature_costs['relation'] = scenario_id + "_" + feature_costs[
- 'relation']
- for i in range(len(feature_costs['data'])):
- for j in range(2, len(feature_costs['data'][i])):
- feature_costs['data'][i][j] = \
- round(feature_costs['data'][i][j], 5)
+ feature_costs["relation"] = scenario_id + "_" + feature_costs["relation"]
+ for i in range(len(feature_costs["data"])):
+ for j in range(2, len(feature_costs["data"][i])):
+ feature_costs["data"][i][j] = round(feature_costs["data"][i][j], 5)
- with open(os.path.join(output_dir_, "feature_costs.arff"), "w") \
- as fh:
+ with open(os.path.join(output_dir_, "feature_costs.arff"), "w") as fh:
arff.dump(feature_costs, fh)
# Copy algorithm runs and add instance id
with open(os.path.join(results_dir_, "algorithm_runs.arff")) as fh:
algorithm_runs = arff.load(fh)
- algorithm_runs['relation'] = scenario_id + "_" + algorithm_runs[
- 'relation']
+ algorithm_runs["relation"] = scenario_id + "_" + algorithm_runs["relation"]
- with open(os.path.join(output_dir_, "algorithm_runs.arff"), "w") \
- as fh:
+ with open(os.path.join(output_dir_, "algorithm_runs.arff"), "w") as fh:
arff.dump(algorithm_runs, fh)
# Copy configurations file
with open(os.path.join(results_dir_, "configurations.csv")) as fh:
algorithm_runs = fh.read()
- with open(os.path.join(output_dir_, "configurations.csv"), "w") \
- as fh:
+ with open(os.path.join(output_dir_, "configurations.csv"), "w") as fh:
fh.write(algorithm_runs)
diff --git a/scripts/2015_nips_paper/plot/plot_ranks.py b/scripts/2015_nips_paper/plot/plot_ranks.py
index 5be095389c..b2e85248b7 100644
--- a/scripts/2015_nips_paper/plot/plot_ranks.py
+++ b/scripts/2015_nips_paper/plot/plot_ranks.py
@@ -17,8 +17,8 @@ def read_csv(fn, has_header=True, data_type=str):
"""
data = list()
header = None
- with open(fn, 'r') as csvfile:
- csv_reader = csv.reader(csvfile, delimiter=',', quotechar='|')
+ with open(fn, "r") as csvfile:
+ csv_reader = csv.reader(csvfile, delimiter=",", quotechar="|")
for row in csv_reader:
if header is None and has_header:
header = row
@@ -37,7 +37,7 @@ def fill_trajectory(performance_list, time_list):
series = pd.concat(series_list, axis=1)
# Fill missing performance values (NaNs) with last non-NaN value.
- series = series.fillna(method='ffill')
+ series = series.fillna(method="ffill")
# return the trajectories over seeds (series object)
return series
@@ -52,10 +52,10 @@ def main():
working_directory = "../log_output"
# list of models
- model_list = ['vanilla', 'ensemble', 'metalearning', 'meta_ensemble']
+ model_list = ["vanilla", "ensemble", "metalearning", "meta_ensemble"]
# list of seeds
- seed_dir = os.path.join(working_directory, 'vanilla')
+ seed_dir = os.path.join(working_directory, "vanilla")
seed_list = [seed for seed in os.listdir(seed_dir)]
# list of tasks
@@ -74,21 +74,23 @@ def main():
for seed in seed_list:
# collect all csv files of different seeds for current model and
# current task.
- if model in ['vanilla', 'ensemble']:
- csv_file = os.path.join(working_directory,
- 'vanilla',
- seed,
- task_id,
- "score_{}.csv".format(model)
- )
-
- elif model in ['metalearning', 'meta_ensemble']:
- csv_file = os.path.join(working_directory,
- 'metalearning',
- seed,
- task_id,
- "score_{}.csv".format(model),
- )
+ if model in ["vanilla", "ensemble"]:
+ csv_file = os.path.join(
+ working_directory,
+ "vanilla",
+ seed,
+ task_id,
+ "score_{}.csv".format(model),
+ )
+
+ elif model in ["metalearning", "meta_ensemble"]:
+ csv_file = os.path.join(
+ working_directory,
+ "metalearning",
+ seed,
+ task_id,
+ "score_{}.csv".format(model),
+ )
csv_files.append(csv_file)
performance_list = []
@@ -99,8 +101,9 @@ def main():
_, csv_data = read_csv(fl, has_header=True)
csv_data = np.array(csv_data)
# Replace too high values with args.maxsize
- data = [min([sys.maxsize, float(i.strip())]) for i in
- csv_data[:, 2]] # test trajectories are stored in third column
+ data = [
+ min([sys.maxsize, float(i.strip())]) for i in csv_data[:, 2]
+ ] # test trajectories are stored in third column
time_steps = [float(i.strip()) for i in csv_data[:, 0]]
assert time_steps[0] == 0
@@ -123,15 +126,16 @@ def main():
n_tasks = len(task_list)
for i in range(n_iter):
- pick = np.random.choice(all_trajectories[0][0].shape[1],
- size=(len(model_list)))
+ pick = np.random.choice(all_trajectories[0][0].shape[1], size=(len(model_list)))
for j in range(n_tasks):
all_trajectories_tmp = pd.DataFrame(
- {model_list[k]: at[j].iloc[:, pick[k]] for
- k, at in enumerate(all_trajectories)}
+ {
+ model_list[k]: at[j].iloc[:, pick[k]]
+ for k, at in enumerate(all_trajectories)
+ }
)
- all_trajectories_tmp = all_trajectories_tmp.fillna(method='ffill', axis=0)
+ all_trajectories_tmp = all_trajectories_tmp.fillna(method="ffill", axis=0)
r_tmp = all_trajectories_tmp.rank(axis=1)
all_rankings.append(r_tmp)
@@ -141,7 +145,7 @@ def main():
for ranking in all_rankings:
ranks_for_model.append(ranking.loc[:, model])
ranks_for_model = pd.DataFrame(ranks_for_model)
- ranks_for_model = ranks_for_model.fillna(method='ffill', axis=1)
+ ranks_for_model = ranks_for_model.fillna(method="ffill", axis=1)
final_ranks.append(ranks_for_model.mean(skipna=True))
# Step 3. Plot the average ranks over time.
@@ -155,8 +159,8 @@ def main():
X_data.append(max_runtime)
y_data.append(y)
plt.plot(X_data, y_data, label=model)
- plt.xlabel('time [sec]')
- plt.ylabel('average rank')
+ plt.xlabel("time [sec]")
+ plt.ylabel("average rank")
plt.legend()
plt.savefig(saveto)
diff --git a/scripts/2015_nips_paper/run/remove_dataset_from_metadata.py b/scripts/2015_nips_paper/run/remove_dataset_from_metadata.py
index f31e16e65f..d16e67e23c 100644
--- a/scripts/2015_nips_paper/run/remove_dataset_from_metadata.py
+++ b/scripts/2015_nips_paper/run/remove_dataset_from_metadata.py
@@ -4,25 +4,27 @@
from shutil import copyfile
-def remove_dataset_from_aslib_arff(input_file,
- output_file,
- id,
- ):
+def remove_dataset_from_aslib_arff(
+ input_file,
+ output_file,
+ id,
+):
with open(input_file) as fh:
arff_object = arff.load(fh)
- for i in range(len(arff_object['data']) - 1, -1, -1):
- if str(arff_object['data'][i][0]) == str(id):
- del arff_object['data'][i]
+ for i in range(len(arff_object["data"]) - 1, -1, -1):
+ if str(arff_object["data"][i][0]) == str(id):
+ del arff_object["data"][i]
with open(output_file, "w") as fh:
arff.dump(arff_object, fh)
del arff_object
-def remove_dataset(metadata_directory,
- output_directory,
- id,
- ):
+def remove_dataset(
+ metadata_directory,
+ output_directory,
+ id,
+):
metadata_sub_directories = os.listdir(metadata_directory)
for metadata_sub_directory in metadata_sub_directories:
diff --git a/scripts/2015_nips_paper/run/run_auto_sklearn.py b/scripts/2015_nips_paper/run/run_auto_sklearn.py
index 366280692e..960ab7be80 100644
--- a/scripts/2015_nips_paper/run/run_auto_sklearn.py
+++ b/scripts/2015_nips_paper/run/run_auto_sklearn.py
@@ -21,11 +21,12 @@ def load_task(task_id):
X_test = X[test_indices]
y_test = y[test_indices]
dataset = openml.datasets.get_dataset(task.dataset_id)
- _, _, cat = dataset.get_data(return_categorical_indicator=True,
- target=task.target_name)
+ _, _, cat = dataset.get_data(
+ return_categorical_indicator=True, target=task.target_name
+ )
del _
del dataset
- cat = ['categorical' if c else 'numerical' for c in cat]
+ cat = ["categorical" if c else "numerical" for c in cat]
unique = np.unique(y_train)
mapping = {unique_value: i for i, unique_value in enumerate(unique)}
@@ -35,13 +36,14 @@ def load_task(task_id):
return X_train, y_train, X_test, y_test, cat
-def run_experiment(working_directory,
- time_limit,
- per_run_time_limit,
- task_id,
- seed,
- use_metalearning,
- ):
+def run_experiment(
+ working_directory,
+ time_limit,
+ per_run_time_limit,
+ task_id,
+ seed,
+ use_metalearning,
+):
# set this to local dataset cache
# openml.config.cache_directory = os.path.join(working_directory, "../cache")
@@ -57,12 +59,14 @@ def run_experiment(working_directory,
if use_metalearning is True:
# path to the original metadata directory.
metadata_directory = os.path.abspath(os.path.dirname(__file__))
- metadata_directory = os.path.join(metadata_directory,
- "../../../autosklearn/metalearning/files/")
+ metadata_directory = os.path.join(
+ metadata_directory, "../../../autosklearn/metalearning/files/"
+ )
# Create new metadata directory not containing task_id.
- new_metadata_directory = os.path.abspath(os.path.join(working_directory,
- "metadata_%i" % task_id))
+ new_metadata_directory = os.path.abspath(
+ os.path.join(working_directory, "metadata_%i" % task_id)
+ )
try:
os.makedirs(new_metadata_directory)
@@ -73,100 +77,105 @@ def run_experiment(working_directory,
remove_dataset(metadata_directory, new_metadata_directory, task_id)
automl_arguments = {
- 'time_left_for_this_task': time_limit,
- 'per_run_time_limit': per_run_time_limit,
- 'initial_configurations_via_metalearning': 25,
- 'ensemble_size': 0,
- 'seed': seed,
- 'memory_limit': 3072,
- 'resampling_strategy': 'holdout',
- 'resampling_strategy_arguments': {'train_size': 0.67},
- 'tmp_folder': tmp_dir,
- 'delete_tmp_folder_after_terminate': False,
- 'disable_evaluator_output': False,
- 'metadata_directory': new_metadata_directory
+ "time_left_for_this_task": time_limit,
+ "per_run_time_limit": per_run_time_limit,
+ "initial_configurations_via_metalearning": 25,
+ "ensemble_size": 0,
+ "seed": seed,
+ "memory_limit": 3072,
+ "resampling_strategy": "holdout",
+ "resampling_strategy_arguments": {"train_size": 0.67},
+ "tmp_folder": tmp_dir,
+ "delete_tmp_folder_after_terminate": False,
+ "disable_evaluator_output": False,
+ "metadata_directory": new_metadata_directory,
}
# Without metalearning
else:
automl_arguments = {
- 'time_left_for_this_task': time_limit,
- 'per_run_time_limit': per_run_time_limit,
- 'initial_configurations_via_metalearning': 0,
- 'ensemble_size': 0,
- 'seed': seed,
- 'memory_limit': 3072,
- 'resampling_strategy': 'holdout',
- 'resampling_strategy_arguments': {'train_size': 0.67},
- 'tmp_folder': tmp_dir,
- 'delete_tmp_folder_after_terminate': False,
- 'disable_evaluator_output': False,
+ "time_left_for_this_task": time_limit,
+ "per_run_time_limit": per_run_time_limit,
+ "initial_configurations_via_metalearning": 0,
+ "ensemble_size": 0,
+ "seed": seed,
+ "memory_limit": 3072,
+ "resampling_strategy": "holdout",
+ "resampling_strategy_arguments": {"train_size": 0.67},
+ "tmp_folder": tmp_dir,
+ "delete_tmp_folder_after_terminate": False,
+ "disable_evaluator_output": False,
}
automl = AutoSklearnClassifier(**automl_arguments)
X_train, y_train, X_test, y_test, cat = load_task(task_id)
- automl.fit(X_train, y_train,
- dataset_name=str(task_id),
- X_test=X_test, y_test=y_test,
- metric=balanced_accuracy)
+ automl.fit(
+ X_train,
+ y_train,
+ dataset_name=str(task_id),
+ X_test=X_test,
+ y_test=y_test,
+ metric=balanced_accuracy,
+ )
-def main(working_directory,
- output_file,
- task_id,
- seed,
- model,
- time_limit,
- per_run_time_limit):
+def main(
+ working_directory, output_file, task_id, seed, model, time_limit, per_run_time_limit
+):
# vanilla and metalearning must be called first before ensemble and
# meta_ensemble can be called, respectively.
if model == "vanilla":
- run_experiment(working_directory,
- time_limit,
- per_run_time_limit,
- task_id,
- seed,
- use_metalearning=False,
- )
- score_ensemble.main(working_directory,
- output_file,
- task_id,
- seed,
- ensemble_size=1,
- )
+ run_experiment(
+ working_directory,
+ time_limit,
+ per_run_time_limit,
+ task_id,
+ seed,
+ use_metalearning=False,
+ )
+ score_ensemble.main(
+ working_directory,
+ output_file,
+ task_id,
+ seed,
+ ensemble_size=1,
+ )
elif model == "metalearning":
- run_experiment(working_directory,
- time_limit,
- per_run_time_limit,
- task_id,
- seed,
- use_metalearning=True,
- )
- score_ensemble.main(working_directory,
- output_file,
- task_id,
- seed,
- ensemble_size=1,
- )
+ run_experiment(
+ working_directory,
+ time_limit,
+ per_run_time_limit,
+ task_id,
+ seed,
+ use_metalearning=True,
+ )
+ score_ensemble.main(
+ working_directory,
+ output_file,
+ task_id,
+ seed,
+ ensemble_size=1,
+ )
else:
- score_ensemble.main(working_directory,
- output_file,
- task_id,
- seed,
- ensemble_size=50,
- )
+ score_ensemble.main(
+ working_directory,
+ output_file,
+ task_id,
+ seed,
+ ensemble_size=50,
+ )
if __name__ == "__main__":
parser = argparse.ArgumentParser()
- parser.add_argument('--working-directory', type=str, required=True)
+ parser.add_argument("--working-directory", type=str, required=True)
parser.add_argument("--output-file", type=str, required=True)
parser.add_argument("--time-limit", type=int, required=True)
parser.add_argument("--per-runtime-limit", type=int, required=True)
- parser.add_argument('--task-id', type=int, required=True)
- parser.add_argument('-s', '--seed', type=int)
+ parser.add_argument("--task-id", type=int, required=True)
+ parser.add_argument("-s", "--seed", type=int)
parser.add_argument("--model", type=str, required=True)
args = parser.parse_args()
@@ -178,11 +187,12 @@ def main(working_directory,
time_limit = args.time_limit
per_run_time_limit = args.per_runtime_limit
- main(working_directory,
- output_file,
- task_id,
- seed,
- model,
- time_limit,
- per_run_time_limit,
- )
+ main(
+ working_directory,
+ output_file,
+ task_id,
+ seed,
+ model,
+ time_limit,
+ per_run_time_limit,
+ )
diff --git a/scripts/2015_nips_paper/run/score_ensemble.py b/scripts/2015_nips_paper/run/score_ensemble.py
index 3d10954d94..1e873f01fd 100644
--- a/scripts/2015_nips_paper/run/score_ensemble.py
+++ b/scripts/2015_nips_paper/run/score_ensemble.py
@@ -14,21 +14,21 @@
def _load_file(f):
- split = f.split('_')
+ split = f.split("_")
as_seed = int(split[-2])
- ta_seed = int(split[-1].split('.')[0])
+ ta_seed = int(split[-1].split(".")[0])
np_array = np.load(f)
return np_array, (as_seed, ta_seed), os.path.getmtime(f)
def read_files(directory, seed=None, n_jobs=1):
- seed_pattern = '*' if seed is None else str(seed)
- glob_pattern = os.path.join(directory, "predictions_*_%s_*.npy" %
- seed_pattern)
+ seed_pattern = "*" if seed is None else str(seed)
+ glob_pattern = os.path.join(directory, "predictions_*_%s_*.npy" % seed_pattern)
files = sorted(glob.glob(glob_pattern))
files = joblib.Parallel(n_jobs=n_jobs, verbose=10)(
- joblib.delayed(_load_file)(f=f) for f in files)
+ joblib.delayed(_load_file)(f=f) for f in files
+ )
return files
@@ -38,13 +38,13 @@ def main(input_directories, output_file, task_id, seed, ensemble_size, n_jobs=1)
if isinstance(input_directories, str):
# add seed and task id directories
- input_directories += '/%i/%i' % (seed, task_id)
+ input_directories += "/%i/%i" % (seed, task_id)
input_directories = [input_directories]
else:
new_directories = []
for dir in input_directories:
- dir += '/%i/%i' % (seed, task_id)
+ dir += "/%i/%i" % (seed, task_id)
new_directories.append(dir)
input_directories = new_directories
@@ -54,28 +54,28 @@ def main(input_directories, output_file, task_id, seed, ensemble_size, n_jobs=1)
# Get the prediction files.
for input_directory in input_directories:
- print('Loading files from input directory:', input_directory)
+ print("Loading files from input directory:", input_directory)
validation_files_ = read_files(
- os.path.join(input_directory,
- '.auto-sklearn/predictions_ensemble'),
- n_jobs=n_jobs)
+ os.path.join(input_directory, ".auto-sklearn/predictions_ensemble"),
+ n_jobs=n_jobs,
+ )
validation_files.extend(validation_files_)
test_files_ = read_files(
- os.path.join(input_directory,
- '.auto-sklearn/predictions_test'),
- n_jobs=n_jobs)
+ os.path.join(input_directory, ".auto-sklearn/predictions_test"),
+ n_jobs=n_jobs,
+ )
test_files.extend(test_files_)
assert len(validation_files_) > 0
assert len(validation_files_) == len(test_files_)
- print('Loaded %d files!' % len(validation_files_))
+ print("Loaded %d files!" % len(validation_files_))
# if not specified, we get all files.
- seed_pattern = '*' if seed is None else str(seed)
- glob_pattern = os.path.join(input_directory,
- ".auto-sklearn",
- "start_time_%s" % seed_pattern)
+ seed_pattern = "*" if seed is None else str(seed)
+ glob_pattern = os.path.join(
+ input_directory, ".auto-sklearn", "start_time_%s" % seed_pattern
+ )
start_time_files = glob.glob(glob_pattern)
        # find the earliest start time.
@@ -90,14 +90,15 @@ def main(input_directories, output_file, task_id, seed, ensemble_size, n_jobs=1)
validation_files.sort(key=lambda t: t[-1])
- keys_to_test_files = {test_file[1]: test_file
- for test_file in test_files}
+ keys_to_test_files = {test_file[1]: test_file for test_file in test_files}
# Resort such that both files have the same order
- test_files = [keys_to_test_files[validation_file[1]]
- for validation_file in validation_files]
+ test_files = [
+ keys_to_test_files[validation_file[1]] for validation_file in validation_files
+ ]
assert [validation_file[1] for validation_file in validation_files] == [
- test_file[1] for test_file in test_files]
+ test_file[1] for test_file in test_files
+ ]
losses = []
top_models_at_step = dict()
@@ -106,7 +107,7 @@ def main(input_directories, output_file, task_id, seed, ensemble_size, n_jobs=1)
temporary_directory=input_directory,
output_directory=input_directory + "_output",
delete_tmp_folder_after_terminate=False,
- prefix="auto-sklearn"
+ prefix="auto-sklearn",
)
valid_labels = backend.load_targets_ensemble()
score = balanced_accuracy
@@ -124,46 +125,63 @@ def main(input_directories, output_file, task_id, seed, ensemble_size, n_jobs=1)
if top_model in models_to_remove:
models_to_remove.remove(top_model)
- print("Removing the following %d models from the library: %s"
- % (len(models_to_remove), models_to_remove))
+ print(
+ "Removing the following %d models from the library: %s"
+ % (len(models_to_remove), models_to_remove)
+ )
for model_id in models_to_remove:
validation_files[model_id] = None
test_files[model_id] = None
- print('Starting ensemble building!')
+ print("Starting ensemble building!")
output = joblib.Parallel(n_jobs=n_jobs, verbose=20)(
- joblib.delayed(
- evaluate)(input_directory=input_directories[0],
- validation_files=[validation_files[j] for
- j in range(len(validation_files))
- if j in top_models_at_step[i]],
- test_files=[test_files[j] for
- j in range(len(test_files))
- if j in top_models_at_step[i]],
- ensemble_size=ensemble_size)
- for i in range(len(test_files)))
+ joblib.delayed(evaluate)(
+ input_directory=input_directories[0],
+ validation_files=[
+ validation_files[j]
+ for j in range(len(validation_files))
+ if j in top_models_at_step[i]
+ ],
+ test_files=[
+ test_files[j]
+ for j in range(len(test_files))
+ if j in top_models_at_step[i]
+ ],
+ ensemble_size=ensemble_size,
+ )
+ for i in range(len(test_files))
+ )
# Create output csv file
file_path = os.path.abspath("%s/%s" % (input_directory, output_file))
with open(file_path, "w") as csv_file:
- fieldnames = ['Time', 'Training (Empirical) Performance',
- 'Test Set Performance']
+ fieldnames = [
+ "Time",
+ "Training (Empirical) Performance",
+ "Test Set Performance",
+ ]
csv_writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
csv_writer.writeheader()
# First time step
- csv_writer.writerow({'Time': 0,
- 'Training (Empirical) Performance': 1.0,
- 'Test Set Performance': 1.0})
+ csv_writer.writerow(
+ {
+ "Time": 0,
+ "Training (Empirical) Performance": 1.0,
+ "Test Set Performance": 1.0,
+ }
+ )
for i, o in enumerate(output):
- csv_writer.writerow({'Time': o['ensemble_time']
- + o['time_function_evaluation']
- - starttime,
- 'Training (Empirical) Performance':
- o['ensemble_error'],
- 'Test Set Performance':
- o['ensemble_test_error']})
+ csv_writer.writerow(
+ {
+ "Time": o["ensemble_time"]
+ + o["time_function_evaluation"]
+ - starttime,
+ "Training (Empirical) Performance": o["ensemble_error"],
+ "Test Set Performance": o["ensemble_test_error"],
+ }
+ )
def evaluate(input_directory, validation_files, test_files, ensemble_size=50):
@@ -187,18 +205,18 @@ def evaluate(input_directory, validation_files, test_files, ensemble_size=50):
# Build the ensemble
start = time.time()
- ensemble_selection = EnsembleSelection(ensemble_size=ensemble_size,
- task_type=D.info['task'],
- metric=score,
- random_state=np.random.RandomState())
+ ensemble_selection = EnsembleSelection(
+ ensemble_size=ensemble_size,
+ task_type=D.info["task"],
+ metric=score,
+ random_state=np.random.RandomState(),
+ )
validation_predictions = np.array([v[0] for v in validation_files])
test_predictions = np.array([t[0] for t in test_files])
- ensemble_selection.fit(validation_predictions, valid_labels,
- identifiers=None)
- y_hat_ensemble = ensemble_selection.predict(np.array(
- validation_predictions))
+ ensemble_selection.fit(validation_predictions, valid_labels, identifiers=None)
+ y_hat_ensemble = ensemble_selection.predict(np.array(validation_predictions))
y_hat_test = ensemble_selection.predict(np.array(test_predictions))
# Compute validation error
@@ -209,21 +227,22 @@ def evaluate(input_directory, validation_files, test_files, ensemble_size=50):
ensemble_time = time.time() - start
- rval = {'ensemble_time': ensemble_time,
- 'time_function_evaluation': time_function_evaluation,
- 'ensemble_error': ensemble_error,
- 'ensemble_test_error': ensemble_test_error}
+ rval = {
+ "ensemble_time": ensemble_time,
+ "time_function_evaluation": time_function_evaluation,
+ "ensemble_error": ensemble_error,
+ "ensemble_test_error": ensemble_test_error,
+ }
return rval
-if __name__ == '__main__':
+if __name__ == "__main__":
parser = ArgumentParser()
- parser.add_argument('--input-directory', type=str,
- required=True, nargs='+')
- parser.add_argument('--task-id', type=int, required=True)
- parser.add_argument('-s', '--seed', type=int)
- parser.add_argument("--output-file", type=str, default='score_ensemble.csv')
+ parser.add_argument("--input-directory", type=str, required=True, nargs="+")
+ parser.add_argument("--task-id", type=int, required=True)
+ parser.add_argument("-s", "--seed", type=int)
+ parser.add_argument("--output-file", type=str, default="score_ensemble.csv")
parser.add_argument("--ensemble-size", type=int, default=50)
parser.add_argument("--n-jobs", type=int, default=1)
args = parser.parse_args()
diff --git a/scripts/2015_nips_paper/setup/get_tasks.py b/scripts/2015_nips_paper/setup/get_tasks.py
index 09f06a0a64..98c4ee085e 100644
--- a/scripts/2015_nips_paper/setup/get_tasks.py
+++ b/scripts/2015_nips_paper/setup/get_tasks.py
@@ -4,30 +4,162 @@
# List of dataset IDs used for the NIPS experiment.
-dataset_ids = [1000, 1002, 1018, 1019, 1020, 1021, 1036, 1040, 1041, 1049, 1050, 1053,
- 1056, 1067, 1068, 1069, 1111, 1112, 1114, 1116, 1119, 1120, 1128, 1130,
- 1134, 1138, 1139, 1142, 1146, 1161, 1166, 12, 14, 16, 179, 180, 181, 182,
- 184, 185, 18, 21, 22, 23, 24, 26, 273, 28, 293, 300, 30, 31, 32, 351, 354,
- 357, 36, 389, 38, 390, 391, 392, 393, 395, 396, 398, 399, 3, 401, 44, 46,
- 554, 57, 60, 679, 6, 715, 718, 720, 722, 723, 727, 728, 734, 735, 737,
- 740, 741, 743, 751, 752, 761, 772, 797, 799, 803, 806, 807, 813, 816, 819,
- 821, 822, 823, 833, 837, 843, 845, 846, 847, 849, 866, 871, 881, 897, 901,
- 903, 904, 910, 912, 913, 914, 917, 923, 930, 934, 953, 958, 959, 962, 966,
- 971, 976, 977, 978, 979, 980, 991, 993, 995]
+dataset_ids = [
+ 1000,
+ 1002,
+ 1018,
+ 1019,
+ 1020,
+ 1021,
+ 1036,
+ 1040,
+ 1041,
+ 1049,
+ 1050,
+ 1053,
+ 1056,
+ 1067,
+ 1068,
+ 1069,
+ 1111,
+ 1112,
+ 1114,
+ 1116,
+ 1119,
+ 1120,
+ 1128,
+ 1130,
+ 1134,
+ 1138,
+ 1139,
+ 1142,
+ 1146,
+ 1161,
+ 1166,
+ 12,
+ 14,
+ 16,
+ 179,
+ 180,
+ 181,
+ 182,
+ 184,
+ 185,
+ 18,
+ 21,
+ 22,
+ 23,
+ 24,
+ 26,
+ 273,
+ 28,
+ 293,
+ 300,
+ 30,
+ 31,
+ 32,
+ 351,
+ 354,
+ 357,
+ 36,
+ 389,
+ 38,
+ 390,
+ 391,
+ 392,
+ 393,
+ 395,
+ 396,
+ 398,
+ 399,
+ 3,
+ 401,
+ 44,
+ 46,
+ 554,
+ 57,
+ 60,
+ 679,
+ 6,
+ 715,
+ 718,
+ 720,
+ 722,
+ 723,
+ 727,
+ 728,
+ 734,
+ 735,
+ 737,
+ 740,
+ 741,
+ 743,
+ 751,
+ 752,
+ 761,
+ 772,
+ 797,
+ 799,
+ 803,
+ 806,
+ 807,
+ 813,
+ 816,
+ 819,
+ 821,
+ 822,
+ 823,
+ 833,
+ 837,
+ 843,
+ 845,
+ 846,
+ 847,
+ 849,
+ 866,
+ 871,
+ 881,
+ 897,
+ 901,
+ 903,
+ 904,
+ 910,
+ 912,
+ 913,
+ 914,
+ 917,
+ 923,
+ 930,
+ 934,
+ 953,
+ 958,
+ 959,
+ 962,
+ 966,
+ 971,
+ 976,
+ 977,
+ 978,
+ 979,
+ 980,
+ 991,
+ 993,
+ 995,
+]
def get_task_ids(dataset_ids):
    # return task ids of the corresponding dataset ids.
# active tasks
- tasks_a = openml.tasks.list_tasks(task_type_id=1, status='active')
+ tasks_a = openml.tasks.list_tasks(task_type_id=1, status="active")
tasks_a = pd.DataFrame.from_dict(tasks_a, orient="index")
    # query only those with holdout as the resampling strategy.
tasks_a = tasks_a[(tasks_a.estimation_procedure == "33% Holdout set")]
# deactivated tasks
- tasks_d = openml.tasks.list_tasks(task_type_id=1, status='deactivated')
+ tasks_d = openml.tasks.list_tasks(task_type_id=1, status="deactivated")
tasks_d = pd.DataFrame.from_dict(tasks_d, orient="index")
tasks_d = tasks_d[(tasks_d.estimation_procedure == "33% Holdout set")]
@@ -47,9 +179,9 @@ def get_task_ids(dataset_ids):
def main():
task_ids = sorted(get_task_ids(dataset_ids))
- string_to_print = ''
+ string_to_print = ""
for tid in task_ids:
- string_to_print += str(tid) + ' '
+ string_to_print += str(tid) + " "
print(string_to_print) # print the task ids for bash script.
diff --git a/scripts/run_auto-sklearn_for_metadata_generation.py b/scripts/run_auto-sklearn_for_metadata_generation.py
index e1fc71a135..6b82b233c7 100644
--- a/scripts/run_auto-sklearn_for_metadata_generation.py
+++ b/scripts/run_auto-sklearn_for_metadata_generation.py
@@ -1,4 +1,4 @@
-if __name__ == '__main__':
+if __name__ == "__main__":
import argparse
import json
@@ -11,27 +11,35 @@
from autosklearn.classification import AutoSklearnClassifier
from autosklearn.regression import AutoSklearnRegressor
from autosklearn.evaluation import ExecuteTaFuncWithQueue, get_cost_of_crash
- from autosklearn.metrics import accuracy, balanced_accuracy, roc_auc, log_loss, r2, \
- mean_squared_error, mean_absolute_error, root_mean_squared_error, CLASSIFICATION_METRICS, \
- REGRESSION_METRICS
+ from autosklearn.metrics import (
+ accuracy,
+ balanced_accuracy,
+ roc_auc,
+ log_loss,
+ r2,
+ mean_squared_error,
+ mean_absolute_error,
+ root_mean_squared_error,
+ CLASSIFICATION_METRICS,
+ REGRESSION_METRICS,
+ )
from smac.runhistory.runhistory import RunInfo
from smac.scenario.scenario import Scenario
from smac.stats.stats import Stats
from smac.tae import StatusType
- sys.path.append('.')
+ sys.path.append(".")
from update_metadata_util import load_task
-
parser = argparse.ArgumentParser()
- parser.add_argument('--working-directory', type=str, required=True)
- parser.add_argument('--time-limit', type=int, required=True)
- parser.add_argument('--per-run-time-limit', type=int, required=True)
- parser.add_argument('--task-id', type=int, required=True)
- parser.add_argument('--metric', type=str, required=True)
- parser.add_argument('-s', '--seed', type=int, required=True)
- parser.add_argument('--unittest', action='store_true')
+ parser.add_argument("--working-directory", type=str, required=True)
+ parser.add_argument("--time-limit", type=int, required=True)
+ parser.add_argument("--per-run-time-limit", type=int, required=True)
+ parser.add_argument("--task-id", type=int, required=True)
+ parser.add_argument("--metric", type=str, required=True)
+ parser.add_argument("-s", "--seed", type=int, required=True)
+ parser.add_argument("--unittest", action="store_true")
args = parser.parse_args()
working_directory = args.working_directory
@@ -44,8 +52,9 @@
X_train, y_train, X_test, y_test, cat, task_type, dataset_name = load_task(task_id)
- configuration_output_dir = os.path.join(working_directory, 'configuration',
- task_type)
+ configuration_output_dir = os.path.join(
+ working_directory, "configuration", task_type
+ )
os.makedirs(configuration_output_dir, exist_ok=True)
tmp_dir = os.path.join(configuration_output_dir, str(task_id), metric)
os.makedirs(tmp_dir, exist_ok=True)
@@ -54,49 +63,55 @@
autosklearn_directory = os.path.join(tempdir, "dir")
automl_arguments = {
- 'time_left_for_this_task': time_limit,
- 'per_run_time_limit': per_run_time_limit,
- 'initial_configurations_via_metalearning': 0,
- 'ensemble_size': 0,
- 'ensemble_nbest': 0,
- 'seed': seed,
- 'memory_limit': 3072,
- 'resampling_strategy': 'partial-cv',
- 'delete_tmp_folder_after_terminate': False,
- 'tmp_folder': autosklearn_directory,
- 'disable_evaluator_output': True,
+ "time_left_for_this_task": time_limit,
+ "per_run_time_limit": per_run_time_limit,
+ "initial_configurations_via_metalearning": 0,
+ "ensemble_size": 0,
+ "ensemble_nbest": 0,
+ "seed": seed,
+ "memory_limit": 3072,
+ "resampling_strategy": "partial-cv",
+ "delete_tmp_folder_after_terminate": False,
+ "tmp_folder": autosklearn_directory,
+ "disable_evaluator_output": True,
}
if is_test:
- automl_arguments['resampling_strategy_arguments'] = {'folds': 2}
- if task_type == 'classification':
- include = {'classifier': ['libsvm_svc'], 'feature_preprocessor': ['no_preprocessing']}
- automl_arguments['include'] = include
- elif task_type == 'regression':
- include = {'regressor': ['extra_trees'], 'feature_preprocessor': ['no_preprocessing']}
- automl_arguments['include'] = include
+ automl_arguments["resampling_strategy_arguments"] = {"folds": 2}
+ if task_type == "classification":
+ include = {
+ "classifier": ["libsvm_svc"],
+ "feature_preprocessor": ["no_preprocessing"],
+ }
+ automl_arguments["include"] = include
+ elif task_type == "regression":
+ include = {
+ "regressor": ["extra_trees"],
+ "feature_preprocessor": ["no_preprocessing"],
+ }
+ automl_arguments["include"] = include
else:
- raise ValueError('Unsupported task type: %s' % str(task_type))
+ raise ValueError("Unsupported task type: %s" % str(task_type))
else:
- automl_arguments['resampling_strategy_arguments'] = {'folds': 10}
+ automl_arguments["resampling_strategy_arguments"] = {"folds": 10}
include = None
metric = {
- 'accuracy': accuracy,
- 'balanced_accuracy': balanced_accuracy,
- 'roc_auc': roc_auc,
- 'logloss': log_loss,
- 'r2': r2,
- 'mean_squared_error': mean_squared_error,
- 'root_mean_squared_error': root_mean_squared_error,
- 'mean_absolute_error': mean_absolute_error,
+ "accuracy": accuracy,
+ "balanced_accuracy": balanced_accuracy,
+ "roc_auc": roc_auc,
+ "logloss": log_loss,
+ "r2": r2,
+ "mean_squared_error": mean_squared_error,
+ "root_mean_squared_error": root_mean_squared_error,
+ "mean_absolute_error": mean_absolute_error,
}[metric]
- automl_arguments['metric'] = metric
+ automl_arguments["metric"] = metric
- if task_type == 'classification':
+ if task_type == "classification":
automl = AutoSklearnClassifier(**automl_arguments)
scorer_list = CLASSIFICATION_METRICS
- elif task_type == 'regression':
+ elif task_type == "regression":
automl = AutoSklearnRegressor(**automl_arguments)
scorer_list = REGRESSION_METRICS
else:
@@ -104,8 +119,14 @@
scoring_functions = [scorer for name, scorer in scorer_list.items()]
- automl.fit(X_train, y_train, dataset_name=dataset_name,
- feat_type=cat, X_test=X_test, y_test=y_test)
+ automl.fit(
+ X_train,
+ y_train,
+ dataset_name=dataset_name,
+ feat_type=cat,
+ X_test=X_test,
+ y_test=y_test,
+ )
trajectory = automl.trajectory_
incumbent_id_to_model = {}
@@ -117,40 +138,44 @@
else:
memory_limit_factor = 2
- print('Starting to validate configurations')
+ print("Starting to validate configurations")
for i, entry in enumerate(trajectory):
- print('Starting to validate configuration %d/%d' % (i + 1, len(trajectory)))
+ print("Starting to validate configuration %d/%d" % (i + 1, len(trajectory)))
incumbent_id = entry.incumbent_id
train_performance = entry.train_perf
if incumbent_id not in incumbent_id_to_model:
config = entry.incumbent
- logger = logging.getLogger('Testing:)')
+ logger = logging.getLogger("Testing:)")
stats = Stats(
- Scenario({
- 'cutoff_time': per_run_time_limit * 2,
- 'run_obj': 'quality',
- })
+ Scenario(
+ {
+ "cutoff_time": per_run_time_limit * 2,
+ "run_obj": "quality",
+ }
+ )
)
stats.start_timing()
# To avoid the output "first run crashed"...
stats.submitted_ta_runs += 1
stats.finished_ta_runs += 1
- memory_lim = memory_limit_factor * automl_arguments['memory_limit']
+ memory_lim = memory_limit_factor * automl_arguments["memory_limit"]
pipeline, run_info, run_value = automl.fit_pipeline(
- X=X_train, y=y_train,
- X_test=X_test, y_test=y_test,
- resampling_strategy='test',
+ X=X_train,
+ y=y_train,
+ X_test=X_test,
+ y_test=y_test,
+ resampling_strategy="test",
memory_limit=memory_lim,
disable_file_output=True,
logger=logger,
stats=stats,
scoring_functions=scoring_functions,
include=include,
- metric=automl_arguments['metric'],
- pynisher_context='spawn',
- cutoff=per_run_time_limit*3,
+ metric=automl_arguments["metric"],
+ pynisher_context="spawn",
+ cutoff=per_run_time_limit * 3,
config=config,
)
@@ -159,58 +184,65 @@
# print(additional_run_info)
- validated_trajectory.append(list(entry) + [task_id] +
- [run_value.additional_info])
- print('Finished validating configuration %d/%d' % (i + 1, len(trajectory)))
- print('Finished to validate configurations')
-
- print('Starting to copy data to configuration directory', flush=True)
- validated_trajectory = [entry[:2] + [entry[2].get_dictionary()] + entry[3:]
- for entry in validated_trajectory]
- validated_trajectory_file = os.path.join(tmp_dir, 'validation_trajectory_%d.json' % seed)
- with open(validated_trajectory_file, 'w') as fh:
+ validated_trajectory.append(
+ list(entry) + [task_id] + [run_value.additional_info]
+ )
+ print("Finished validating configuration %d/%d" % (i + 1, len(trajectory)))
+ print("Finished to validate configurations")
+
+ print("Starting to copy data to configuration directory", flush=True)
+ validated_trajectory = [
+ entry[:2] + [entry[2].get_dictionary()] + entry[3:]
+ for entry in validated_trajectory
+ ]
+ validated_trajectory_file = os.path.join(
+ tmp_dir, "validation_trajectory_%d.json" % seed
+ )
+ with open(validated_trajectory_file, "w") as fh:
json.dump(validated_trajectory, fh, indent=4)
-
for dirpath, dirnames, filenames in os.walk(autosklearn_directory, topdown=False):
print(dirpath, dirnames, filenames)
for filename in filenames:
- if filename == 'datamanager.pkl':
+ if filename == "datamanager.pkl":
os.remove(os.path.join(dirpath, filename))
- elif filename == 'configspace.pcs':
+ elif filename == "configspace.pcs":
os.remove(os.path.join(dirpath, filename))
for dirname in dirnames:
- if dirname in ('models', 'cv_models'):
+ if dirname in ("models", "cv_models"):
os.rmdir(os.path.join(dirpath, dirname))
- print('*' * 80)
- print('Going to copy the configuration directory')
- script = 'cp -r %s %s' % (autosklearn_directory, os.path.join(tmp_dir, 'auto-sklearn-output'))
+ print("*" * 80)
+ print("Going to copy the configuration directory")
+ script = "cp -r %s %s" % (
+ autosklearn_directory,
+ os.path.join(tmp_dir, "auto-sklearn-output"),
+ )
proc = subprocess.run(
script,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
shell=True,
- executable='/bin/bash',
+ executable="/bin/bash",
)
- print('*' * 80)
+ print("*" * 80)
print(script)
print(proc.stdout)
print(proc.stderr)
- print('Finished copying the configuration directory')
+ print("Finished copying the configuration directory")
- if not tempdir.startswith('/tmp'):
- raise ValueError('%s must not start with /tmp' % tempdir)
- script = 'rm -rf %s' % tempdir
- print('*' * 80)
+ if not tempdir.startswith("/tmp"):
+ raise ValueError("%s must not start with /tmp" % tempdir)
+ script = "rm -rf %s" % tempdir
+ print("*" * 80)
print(script)
proc = subprocess.run(
script,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
shell=True,
- executable='/bin/bash',
+ executable="/bin/bash",
)
print(proc.stdout)
print(proc.stderr)
- print('Finished configuring')
+ print("Finished configuring")
diff --git a/scripts/update_metadata_util.py b/scripts/update_metadata_util.py
index 153e63c6cf..8ed99d9bd0 100644
--- a/scripts/update_metadata_util.py
+++ b/scripts/update_metadata_util.py
@@ -3,37 +3,327 @@
classification_tasks = [
- 232, 236, 241, 245, 253, 254, 256, 258, 260, 262, 267, 271, 273, 275, 279, 288, 336,
- 340, 2119, 2120, 2121, 2122, 2123, 2125, 2356, 3044, 3047, 3048, 3049, 3053, 3054,
- 3055, 75089, 75092, 75093, 75098, 75100, 75108, 75109, 75112, 75114, 75115, 75116,
- 75118, 75120, 75121, 75125, 75126, 75129, 75131, 75133, 75134, 75136, 75139, 75141,
- 75142, 75143, 75146, 75147, 75148, 75149, 75153, 75154, 75156, 75157, 75159, 75161,
- 75163, 75166, 75169, 75171, 75173, 75174, 75176, 75178, 75179, 75180, 75184, 75185,
- 75187, 75192, 75195, 75196, 75199, 75210, 75212, 75213, 75215, 75217, 75219, 75221,
- 75223, 75225, 75232, 75233, 75234, 75235, 75236, 75237, 75239, 75250, 126021, 126024,
- 126028, 126030, 126031, 146574, 146575, 146576, 146577, 146578, 146583, 146586,
- 146592, 146593, 146594, 146596, 146597, 146600, 146601, 146602, 146603, 146679,
- 166859, 166866, 166872, 166875, 166882, 166897, 166905, 166906, 166913, 166915,
- 166931, 166932, 166944, 166950, 166951, 166953, 166956, 166957, 166958, 166959,
- 166970, 166996, 167085, 167086, 167087, 167088, 167089, 167090, 167094, 167096,
- 167097, 167099, 167100, 167101, 167103, 167105, 167106, 167202, 167203, 167204,
- 167205, 168785, 168791, 189779, 189786, 189828, 189829, 189836, 189840, 189841,
- 189843, 189844, 189845, 189846, 189857, 189858, 189859, 189863, 189864, 189869,
- 189870, 189875, 189878, 189880, 189881, 189882, 189883, 189884, 189887, 189890,
- 189893, 189894, 189899, 189900, 189902, 190154, 190155, 190156, 190157, 190158,
- 190159, 211720, 211721, 211722, 211723, 211724
+ 232,
+ 236,
+ 241,
+ 245,
+ 253,
+ 254,
+ 256,
+ 258,
+ 260,
+ 262,
+ 267,
+ 271,
+ 273,
+ 275,
+ 279,
+ 288,
+ 336,
+ 340,
+ 2119,
+ 2120,
+ 2121,
+ 2122,
+ 2123,
+ 2125,
+ 2356,
+ 3044,
+ 3047,
+ 3048,
+ 3049,
+ 3053,
+ 3054,
+ 3055,
+ 75089,
+ 75092,
+ 75093,
+ 75098,
+ 75100,
+ 75108,
+ 75109,
+ 75112,
+ 75114,
+ 75115,
+ 75116,
+ 75118,
+ 75120,
+ 75121,
+ 75125,
+ 75126,
+ 75129,
+ 75131,
+ 75133,
+ 75134,
+ 75136,
+ 75139,
+ 75141,
+ 75142,
+ 75143,
+ 75146,
+ 75147,
+ 75148,
+ 75149,
+ 75153,
+ 75154,
+ 75156,
+ 75157,
+ 75159,
+ 75161,
+ 75163,
+ 75166,
+ 75169,
+ 75171,
+ 75173,
+ 75174,
+ 75176,
+ 75178,
+ 75179,
+ 75180,
+ 75184,
+ 75185,
+ 75187,
+ 75192,
+ 75195,
+ 75196,
+ 75199,
+ 75210,
+ 75212,
+ 75213,
+ 75215,
+ 75217,
+ 75219,
+ 75221,
+ 75223,
+ 75225,
+ 75232,
+ 75233,
+ 75234,
+ 75235,
+ 75236,
+ 75237,
+ 75239,
+ 75250,
+ 126021,
+ 126024,
+ 126028,
+ 126030,
+ 126031,
+ 146574,
+ 146575,
+ 146576,
+ 146577,
+ 146578,
+ 146583,
+ 146586,
+ 146592,
+ 146593,
+ 146594,
+ 146596,
+ 146597,
+ 146600,
+ 146601,
+ 146602,
+ 146603,
+ 146679,
+ 166859,
+ 166866,
+ 166872,
+ 166875,
+ 166882,
+ 166897,
+ 166905,
+ 166906,
+ 166913,
+ 166915,
+ 166931,
+ 166932,
+ 166944,
+ 166950,
+ 166951,
+ 166953,
+ 166956,
+ 166957,
+ 166958,
+ 166959,
+ 166970,
+ 166996,
+ 167085,
+ 167086,
+ 167087,
+ 167088,
+ 167089,
+ 167090,
+ 167094,
+ 167096,
+ 167097,
+ 167099,
+ 167100,
+ 167101,
+ 167103,
+ 167105,
+ 167106,
+ 167202,
+ 167203,
+ 167204,
+ 167205,
+ 168785,
+ 168791,
+ 189779,
+ 189786,
+ 189828,
+ 189829,
+ 189836,
+ 189840,
+ 189841,
+ 189843,
+ 189844,
+ 189845,
+ 189846,
+ 189857,
+ 189858,
+ 189859,
+ 189863,
+ 189864,
+ 189869,
+ 189870,
+ 189875,
+ 189878,
+ 189880,
+ 189881,
+ 189882,
+ 189883,
+ 189884,
+ 189887,
+ 189890,
+ 189893,
+ 189894,
+ 189899,
+ 189900,
+ 189902,
+ 190154,
+ 190155,
+ 190156,
+ 190157,
+ 190158,
+ 190159,
+ 211720,
+ 211721,
+ 211722,
+ 211723,
+ 211724,
]
regression_tasks = [
- 359997, 359998, 359999, 360000, 360001, 360002, 360003, 167146, 360004, 360005, 360006,
- 360007, 211696, 360009, 360010, 360011, 360012, 360013, 360014, 360015, 360016, 360017,
- 360018, 360019, 360020, 360021, 360022, 360023, 360024, 360025, 360026, 360027, 360028,
- 360029, 360030, 360031, 360032, 360033, 360034, 360035, 360036, 360037, 360038, 360039,
- 360040, 360041, 360042, 360043, 360044, 360045, 360046, 360047, 360048, 360049, 360050,
- 360051, 360052, 360053, 360054, 360055, 360056, 360057, 360058, 360059, 360060, 360061,
- 360062, 360063, 360064, 360066, 360067, 360068, 360069, 360070, 360071, 360072, 360073,
- 360074, 360075, 360076, 360077, 360078, 360079, 360080, 360081, 360082, 360083, 360084,
- 360085, 360086, 360087, 360088, 360089, 360090, 360091, 360092, 360093, 360094, 360095,
- 360096, 360097, 360098, 360100, 360101, 360102, 360103, 360104, 360105, 360106, 360107,
+ 359997,
+ 359998,
+ 359999,
+ 360000,
+ 360001,
+ 360002,
+ 360003,
+ 167146,
+ 360004,
+ 360005,
+ 360006,
+ 360007,
+ 211696,
+ 360009,
+ 360010,
+ 360011,
+ 360012,
+ 360013,
+ 360014,
+ 360015,
+ 360016,
+ 360017,
+ 360018,
+ 360019,
+ 360020,
+ 360021,
+ 360022,
+ 360023,
+ 360024,
+ 360025,
+ 360026,
+ 360027,
+ 360028,
+ 360029,
+ 360030,
+ 360031,
+ 360032,
+ 360033,
+ 360034,
+ 360035,
+ 360036,
+ 360037,
+ 360038,
+ 360039,
+ 360040,
+ 360041,
+ 360042,
+ 360043,
+ 360044,
+ 360045,
+ 360046,
+ 360047,
+ 360048,
+ 360049,
+ 360050,
+ 360051,
+ 360052,
+ 360053,
+ 360054,
+ 360055,
+ 360056,
+ 360057,
+ 360058,
+ 360059,
+ 360060,
+ 360061,
+ 360062,
+ 360063,
+ 360064,
+ 360066,
+ 360067,
+ 360068,
+ 360069,
+ 360070,
+ 360071,
+ 360072,
+ 360073,
+ 360074,
+ 360075,
+ 360076,
+ 360077,
+ 360078,
+ 360079,
+ 360080,
+ 360081,
+ 360082,
+ 360083,
+ 360084,
+ 360085,
+ 360086,
+ 360087,
+ 360088,
+ 360089,
+ 360090,
+ 360091,
+ 360092,
+ 360093,
+ 360094,
+ 360095,
+ 360096,
+ 360097,
+ 360098,
+ 360100,
+ 360101,
+ 360102,
+ 360103,
+ 360104,
+ 360105,
+ 360106,
+ 360107,
360108,
]
@@ -51,13 +341,13 @@ def load_task(task_id):
name = dataset.name.lower()
del _
del dataset
- cat = {i: 'categorical' if c else 'numerical' for i, c in enumerate(cat)}
+ cat = {i: "categorical" if c else "numerical" for i, c in enumerate(cat)}
if isinstance(task, openml.tasks.OpenMLClassificationTask):
- task_type = 'classification'
+ task_type = "classification"
elif isinstance(task, openml.tasks.OpenMLRegressionTask):
- task_type = 'regression'
+ task_type = "regression"
else:
- raise ValueError('Unknown task type')
+ raise ValueError("Unknown task type")
return X_train, y_train, X_test, y_test, cat, task_type, name
diff --git a/setup.py b/setup.py
index 6107e60321..003b573bd4 100644
--- a/setup.py
+++ b/setup.py
@@ -1,38 +1,46 @@
# -*- encoding: utf-8 -*-
import os
import sys
-from setuptools import setup, find_packages
+from setuptools import find_packages, setup
+
+HERE = os.path.abspath(os.path.dirname(__file__))
# Check if Auto-sklearn *could* run on the given system
-if os.name != 'posix':
+if os.name != "posix":
raise ValueError(
- 'Detected unsupported operating system: %s. Please check '
- 'the compability information of auto-sklearn: https://automl.github.io'
- '/auto-sklearn/master/installation.html#windows-osx-compatibility' %
- sys.platform
+ "Detected unsupported operating system: %s. Please check "
+ "the compability information of auto-sklearn: https://automl.github.io"
+ "/auto-sklearn/master/installation.html#windows-osx-compatibility"
+ % sys.platform
)
if sys.version_info < (3, 7):
raise ValueError(
- 'Unsupported Python version %d.%d.%d found. Auto-sklearn requires Python '
- '3.7 or higher.' % (sys.version_info.major, sys.version_info.minor, sys.version_info.micro)
+ "Unsupported Python version %d.%d.%d found. Auto-sklearn requires Python "
+ "3.7 or higher."
+ % (sys.version_info.major, sys.version_info.minor, sys.version_info.micro)
)
-HERE = os.path.abspath(os.path.dirname(__file__))
-with open(os.path.join(HERE, 'requirements.txt')) as fp:
- install_reqs = [r.rstrip() for r in fp.readlines()
- if not r.startswith('#') and not r.startswith('git+')]
+with open(os.path.join(HERE, "requirements.txt")) as fp:
+ install_reqs = [
+ r.rstrip()
+ for r in fp.readlines()
+ if not r.startswith("#") and not r.startswith("git+")
+ ]
-extras_reqs={
+extras_reqs = {
"test": [
"pytest>=4.6",
- "mypy",
+ "pytest-cov",
"pytest-xdist",
"pytest-timeout",
+ "mypy",
+ "isort",
+ "black",
+ "pydocstyle",
"openml",
"pre-commit",
- "pytest-cov",
],
"examples": [
"matplotlib",
@@ -46,32 +54,32 @@
"sphinx_bootstrap_theme",
"numpydoc",
"sphinx_toolbox",
- "docutils==0.16"
+ "docutils==0.16",
],
}
-with open(os.path.join(HERE, 'autosklearn', '__version__.py')) as fh:
+with open(os.path.join(HERE, "autosklearn", "__version__.py")) as fh:
version = fh.readlines()[-1].split()[-1].strip("\"'")
-with open(os.path.join(HERE, 'README.md')) as fh:
+with open(os.path.join(HERE, "README.md")) as fh:
long_description = fh.read()
setup(
- name='auto-sklearn',
- author='Matthias Feurer',
- author_email='feurerm@informatik.uni-freiburg.de',
- description='Automated machine learning.',
+ name="auto-sklearn",
+ author="Matthias Feurer",
+ author_email="feurerm@informatik.uni-freiburg.de",
+ description="Automated machine learning.",
long_description=long_description,
- long_description_content_type='text/markdown',
+ long_description_content_type="text/markdown",
version=version,
- packages=find_packages(exclude=['test', 'scripts', 'examples']),
+ packages=find_packages(exclude=["test", "scripts", "examples"]),
extras_require=extras_reqs,
install_requires=install_reqs,
include_package_data=True,
- license='BSD3',
- platforms=['Linux'],
+ license="BSD3",
+ platforms=["Linux"],
classifiers=[
"Environment :: Console",
"Intended Audience :: Developers",
@@ -83,10 +91,10 @@
"Operating System :: OS Independent",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Topic :: Scientific/Engineering :: Information Analysis",
- 'Programming Language :: Python :: 3.7',
- 'Programming Language :: Python :: 3.8',
- 'Programming Language :: Python :: 3.9',
+ "Programming Language :: Python :: 3.7",
+ "Programming Language :: Python :: 3.8",
+ "Programming Language :: Python :: 3.9",
],
- python_requires='>=3.7',
- url='https://automl.github.io/auto-sklearn',
+ python_requires=">=3.7",
+ url="https://automl.github.io/auto-sklearn",
)
diff --git a/test/conftest.py b/test/conftest.py
index d3df7508cd..16a285b9df 100644
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -3,12 +3,12 @@
import time
import unittest.mock
-from dask.distributed import Client, get_client
import psutil
import pytest
+from dask.distributed import Client, get_client
-from autosklearn.automl_common.common.utils.backend import create, Backend
from autosklearn.automl import AutoML
+from autosklearn.automl_common.common.utils.backend import Backend, create
class AutoMLStub(AutoML):
@@ -36,9 +36,11 @@ def automl_stub(request):
def backend(request):
test_dir = os.path.dirname(__file__)
- tmp = os.path.join(test_dir, '.tmp__%s__%s' % (request.module.__name__, request.node.name))
+ tmp = os.path.join(
+ test_dir, ".tmp__%s__%s" % (request.module.__name__, request.node.name)
+ )
- for dir in (tmp, ):
+ for dir in (tmp,):
for i in range(10):
if os.path.exists(dir):
try:
@@ -49,14 +51,12 @@ def backend(request):
# Make sure the folders we wanna create do not already exist.
backend = create(
- temporary_directory=tmp,
- output_directory=None,
- prefix="auto-sklearn"
+ temporary_directory=tmp, output_directory=None, prefix="auto-sklearn"
)
def get_finalizer(tmp_dir):
def session_run_at_end():
- for dir in (tmp_dir, ):
+ for dir in (tmp_dir,):
for i in range(10):
if os.path.exists(dir):
try:
@@ -64,7 +64,9 @@ def session_run_at_end():
break
except OSError:
time.sleep(1)
+
return session_run_at_end
+
request.addfinalizer(get_finalizer(tmp))
return backend
@@ -72,7 +74,7 @@ def session_run_at_end():
@pytest.fixture(scope="function")
def tmp_dir(request):
- return _dir_fixture('tmp', request)
+ return _dir_fixture("tmp", request)
def _dir_fixture(dir_type, request):
@@ -124,8 +126,10 @@ def session_run_at_end():
client.shutdown()
client.close()
del client
+
return session_run_at_end
- request.addfinalizer(get_finalizer(client.scheduler_info()['address']))
+
+ request.addfinalizer(get_finalizer(client.scheduler_info()["address"]))
return client
@@ -149,8 +153,10 @@ def session_run_at_end():
client.shutdown()
client.close()
del client
+
return session_run_at_end
- request.addfinalizer(get_finalizer(client.scheduler_info()['address']))
+
+ request.addfinalizer(get_finalizer(client.scheduler_info()["address"]))
return client
diff --git a/test/test_automl/automl_utils.py b/test/test_automl/automl_utils.py
index 768f94ff8d..577ea97359 100644
--- a/test/test_automl/automl_utils.py
+++ b/test/test_automl/automl_utils.py
@@ -1,17 +1,17 @@
# -*- encoding: utf-8 -*-
-import re
-import os
-import glob
import typing
-import numpy as np
+import glob
+import os
+import re
+import numpy as np
scores_dict = {
- 'train_single': ["single_best_train_score", "single_best_optimization_score"],
- 'test_single': ["single_best_test_score"],
- 'train_ensamble': ["ensemble_optimization_score"],
- 'test_ensamble': ["ensemble_test_score"]
+ "train_single": ["single_best_train_score", "single_best_optimization_score"],
+ "test_single": ["single_best_test_score"],
+ "train_ensamble": ["ensemble_optimization_score"],
+ "test_ensamble": ["ensemble_test_score"],
}
@@ -19,15 +19,15 @@ def print_debug_information(automl):
# In case it is called with estimator,
# Get the automl object
- if hasattr(automl, 'automl_'):
+ if hasattr(automl, "automl_"):
automl = automl.automl_
# Log file path
- log_file = glob.glob(os.path.join(
- automl._backend.temporary_directory, 'AutoML*.log'))[0]
+ log_file = glob.glob(
+ os.path.join(automl._backend.temporary_directory, "AutoML*.log")
+ )[0]
- include_messages = ['INFO', 'DEBUG', 'WARN',
- 'CRITICAL', 'ERROR', 'FATAL']
+ include_messages = ["INFO", "DEBUG", "WARN", "CRITICAL", "ERROR", "FATAL"]
# There is a lot of content in the log files. Only
# parsing the main message and ignore the metalearning
@@ -37,53 +37,69 @@ def print_debug_information(automl):
content = logfile.readlines()
# Get the messages to debug easier!
- content = [line for line in content if any(
- msg in line for msg in include_messages
- ) and 'metalearning' not in line]
+ content = [
+ line
+ for line in content
+ if any(msg in line for msg in include_messages)
+ and "metalearning" not in line
+ ]
except Exception as e:
return str(e)
# Also add the run history if any
- if hasattr(automl, 'runhistory_') and hasattr(automl.runhistory_, 'data'):
+ if hasattr(automl, "runhistory_") and hasattr(automl.runhistory_, "data"):
for k, v in automl.runhistory_.data.items():
content += ["{}->{}".format(k, v)]
else:
- content += ['No RunHistory']
+ content += ["No RunHistory"]
# Also add the ensemble history if any
if len(automl.ensemble_performance_history) > 0:
content += [str(h) for h in automl.ensemble_performance_history]
else:
- content += ['No Ensemble History']
+ content += ["No Ensemble History"]
return os.linesep.join(content)
def _includes(scores, all_scores):
- return all(score in all_scores for score in scores) and len(scores) == len(all_scores)
+ return all(score in all_scores for score in scores) and len(scores) == len(
+ all_scores
+ )
def count_succeses(cv_results):
return np.sum(
- [status in ['Success', 'Success (but do not advance to higher budget)']
- for status in cv_results['status']]
+ [
+ status in ["Success", "Success (but do not advance to higher budget)"]
+ for status in cv_results["status"]
+ ]
)
def includes_all_scores(scores):
- all_scores = scores_dict["train_single"] + scores_dict["test_single"] + \
- scores_dict["train_ensamble"] + scores_dict["test_ensamble"] + ["Timestamp"]
+ all_scores = (
+ scores_dict["train_single"]
+ + scores_dict["test_single"]
+ + scores_dict["train_ensamble"]
+ + scores_dict["test_ensamble"]
+ + ["Timestamp"]
+ )
return _includes(scores, all_scores)
def include_single_scores(scores):
- all_scores = scores_dict["train_single"] + scores_dict["test_single"] + ["Timestamp"]
+ all_scores = (
+ scores_dict["train_single"] + scores_dict["test_single"] + ["Timestamp"]
+ )
return _includes(scores, all_scores)
def includes_train_scores(scores):
- all_scores = scores_dict["train_single"] + scores_dict["train_ensamble"] + ["Timestamp"]
+ all_scores = (
+ scores_dict["train_single"] + scores_dict["train_ensamble"] + ["Timestamp"]
+ )
return _includes(scores, all_scores)
@@ -113,7 +129,7 @@ def parse_logfile(self) -> typing.List[str]:
assert os.path.exists(self.logfile), "{} not found".format(self.logfile)
with open(self.logfile) as fh:
- content = [line.strip() for line in fh if re.search(r'[\w+]', line)]
+ content = [line.strip() for line in fh if re.search(r"[\w+]", line)]
return content
def count_ensembler_iterations(self) -> int:
@@ -129,11 +145,12 @@ def count_ensembler_iterations(self) -> int:
# We expect the start msg to be something like:
# [DEBUG] [2020-11-26 19:22:42,160:EnsembleBuilder] \
# Function called with argument: (61....
- # [DEBUG] [2020-11-30 11:53:47,069:EnsembleBuilder] Function called with argument:
- # (28.246965646743774, 1, False), {}
+ # [DEBUG] [2020-11-30 11:53:47,069:EnsembleBuilder] \
+ # Function called with argument: (28.246965646743774, 1, False), {}
match = re.search(
- r'EnsembleBuilder]\s+Function called with argument:\s+\(\d+\.\d+, (\d+), \w+',
- line)
+ r"EnsembleBuilder]\s+Function called with argument:\s+\(\d+\.\d+, (\d+), \w+", # noqa: E501
+ line,
+ )
if match:
iterations.append(int(match.group(1)))
@@ -143,19 +160,15 @@ def count_ensembler_iterations(self) -> int:
# time left: 61.266255
# [DEBUG] [2020-11-27 20:27:28,044:EnsembleBuilder] Starting iteration 2,
# time left: 10.603252
- match = re.search(
- r'EnsembleBuilder]\s+Starting iteration (\d+)',
- line)
+ match = re.search(r"EnsembleBuilder]\s+Starting iteration (\d+)", line)
if match:
iterations_from_inside_ensemble_builder.append(int(match.group(1)))
# The ensemble builder might not be called if there is no time.
# Here we expect the msg:
- # [DEBUG] [2020-11-27 20:27:28,044:EnsembleBuilder] Not starting iteration 2,
- # as time left: 1.59324
- match = re.search(
- r'EnsembleBuilder]\s+Not starting iteration (\d+)',
- line)
+ # [DEBUG] [2020-11-27 20:27:28,044:EnsembleBuilder] \
+ # Not starting iteration 2, as time left: 1.59324
+ match = re.search(r"EnsembleBuilder]\s+Not starting iteration (\d+)", line)
if match:
iterations_from_inside_ensemble_builder.append(int(match.group(1)))
@@ -174,49 +187,80 @@ def count_ensembler_success_pynisher_calls(self) -> int:
# [DEBUG] [2020-11-30 11:54:05,984:EnsembleBuilder] return value:
# (([{'Timestamp': Timestamp('2020-11- 30 11:54:05.983837'),
# 'ensemble_optimization_score': 0.9787234042553191}], 50, None, None, None), 0)
- return_msgs = len([line for line in self.lines if re.search(
- r'EnsembleBuilder]\s+return value:.*Timestamp', line)])
+ return_msgs = len(
+ [
+ line
+ for line in self.lines
+ if re.search(r"EnsembleBuilder]\s+return value:.*Timestamp", line)
+ ]
+ )
return return_msgs
def count_tae_pynisher_calls(self) -> int:
# We expect the return msg to be something like:
- # [DEBUG] [2020-12-16 11:57:08,987:Client-pynisher] Function called with argument: ()
- # , {'queue': , 'config': 1
- # [DEBUG] [2020-12-16 11:57:10,537:Client-pynisher] Function called with argument: ()
- # , {'queue': ,
- # 'config': Configuration:
+ """
+ [DEBUG] [2020-12-16 11:57:08,987:Client-pynisher] Function called with argument: (),
+ {'queue': , 'config': 1
+ [DEBUG] [2020-12-16 11:57:10,537:Client-pynisher] Function called with argument: (),
+ {'queue': , 'config': Configuration:
+ """ # noqa: E501
# Only the parenthesis below need to be escaped, ] and { do not.
- call_msgs = len([line for line in self.lines if re.search(
- r'pynisher]\s+Function called with argument: \(\), {', line)])
+ call_msgs = len(
+ [
+ line
+ for line in self.lines
+ if re.search(
+ r"pynisher]\s+Function called with argument: \(\), {", line
+ )
+ ]
+ )
return call_msgs
def count_tae_pynisher_returns(self) -> int:
# We expect the return msg to be something like:
# [DEBUG] [2020-11-30 11:53:11,264:pynisher] return value: (None, 0)
# [DEBUG] [2020-11-30 11:53:13,768:pynisher] return value: (None, 0)
- return_msgs = len([line for line in self.lines if re.search(
- r'pynisher]\s+return value:\s+', line)])
+ return_msgs = len(
+ [
+ line
+ for line in self.lines
+ if re.search(r"pynisher]\s+return value:\s+", line)
+ ]
+ )
# When the pynisher pipe is prematurely closed, we also expect:
# Your function call closed the pipe prematurely
# -> Subprocess probably got an uncatchable signal
# We expect the return msg to be something like:
# OR
# Something else went wrong, sorry.
- premature_msgs = len([line for line in self.lines if re.search(
- r'pynisher]\s+Your function call closed the pipe prematurely', line)])
- failure_msgs = len([line for line in self.lines if re.search(
- r'pynisher]\s+Something else went wrong, sorry.', line)])
+ premature_msgs = len(
+ [
+ line
+ for line in self.lines
+ if re.search(
+ r"pynisher]\s+Your function call closed the pipe prematurely", line
+ )
+ ]
+ )
+ failure_msgs = len(
+ [
+ line
+ for line in self.lines
+ if re.search(r"pynisher]\s+Something else went wrong, sorry.", line)
+ ]
+ )
return return_msgs + premature_msgs + failure_msgs
def get_automl_setting_from_log(self, dataset_name: str, setting: str) -> str:
for line in self.lines:
# We expect messages of the form
- # [DEBUG] [2020-11-30 11:53:10,457:AutoML(5):breast_cancer] ensemble_size: 50
- # [DEBUG] [2020-11-30 11:53:10,457:AutoML(5):breast_cancer] ensemble_nbest: 50
- match = re.search(
- f"{dataset_name}]\\s*{setting}\\s*:\\s*(\\w+)",
- line)
+ """
+ [DEBUG] [2020-11-30 11:53:10,457:AutoML(5):breast_cancer] ensemble_size: 50
+ [DEBUG] [2020-11-30 11:53:10,457:AutoML(5):breast_cancer] ensemble_nbest: 50
+ """ # noqa: E501
+ match = re.search(f"{dataset_name}]\\s*{setting}\\s*:\\s*(\\w+)", line)
if match:
return match.group(1)
+
return None
diff --git a/test/test_automl/test_automl.py b/test/test_automl/test_automl.py
index 4e509d8755..37040f0560 100644
--- a/test/test_automl/test_automl.py
+++ b/test/test_automl/test_automl.py
@@ -1,12 +1,11 @@
# -*- encoding: utf-8 -*-
from typing import Dict, List, Union
+import glob
import itertools
import os
import pickle
-import sys
import time
-import glob
import unittest
import unittest.mock
import warnings
@@ -14,35 +13,46 @@
import numpy as np
import pandas as pd
import pytest
-from scipy.sparse import csr_matrix, spmatrix
import sklearn.datasets
-from sklearn.ensemble import VotingRegressor, VotingClassifier
-from smac.scenario.scenario import Scenario
+from scipy.sparse import csr_matrix, spmatrix
+from sklearn.ensemble import VotingClassifier, VotingRegressor
from smac.facade.roar_facade import ROAR
+from smac.scenario.scenario import Scenario
+from smac.tae import StatusType
-from autosklearn.automl import AutoML, AutoMLClassifier, AutoMLRegressor, _model_predict
-from autosklearn.data.validation import InputValidator
import autosklearn.automl
-from autosklearn.data.xy_data_manager import XYDataManager
-from autosklearn.metrics import (
- accuracy, log_loss, balanced_accuracy, default_metric_for_task
-)
-from autosklearn.evaluation.abstract_evaluator import MyDummyClassifier, MyDummyRegressor
-from autosklearn.util.data import default_dataset_compression_arg
-from autosklearn.util.logging_ import PickableLoggerAdapter
import autosklearn.pipeline.util as putil
+from autosklearn.automl import AutoML, AutoMLClassifier, AutoMLRegressor, _model_predict
from autosklearn.constants import (
- MULTICLASS_CLASSIFICATION,
BINARY_CLASSIFICATION,
+ CLASSIFICATION_TASKS,
+ MULTICLASS_CLASSIFICATION,
MULTILABEL_CLASSIFICATION,
- REGRESSION,
MULTIOUTPUT_REGRESSION,
- CLASSIFICATION_TASKS,
+ REGRESSION,
)
-from smac.tae import StatusType
+from autosklearn.data.validation import InputValidator
+from autosklearn.data.xy_data_manager import XYDataManager
+from autosklearn.evaluation.abstract_evaluator import (
+ MyDummyClassifier,
+ MyDummyRegressor,
+)
+from autosklearn.metrics import (
+ accuracy,
+ balanced_accuracy,
+ default_metric_for_task,
+ log_loss,
+)
+from autosklearn.util.data import default_dataset_compression_arg
+from autosklearn.util.logging_ import PickableLoggerAdapter
-sys.path.append(os.path.dirname(__file__))
-from automl_utils import print_debug_information, count_succeses, AutoMLLogParser, includes_all_scores, includes_train_scores, performance_over_time_is_plausible # noqa (E402: module level import not at top of file)
+from test.test_automl.automl_utils import (
+ AutoMLLogParser,
+ count_succeses,
+ includes_train_scores,
+ performance_over_time_is_plausible,
+ print_debug_information,
+)
class AutoMLStub(AutoML):
@@ -57,7 +67,7 @@ def __del__(self):
def test_fit(dask_client):
- X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
+ X_train, Y_train, X_test, Y_test = putil.get_dataset("iris")
automl = autosklearn.automl.AutoML(
seed=0,
time_left_for_this_task=30,
@@ -80,13 +90,7 @@ def test_fit(dask_client):
def test_fit_roar(dask_client_single_worker):
def get_roar_object_callback(
- scenario_dict,
- seed,
- ta,
- ta_kwargs,
- dask_client,
- n_jobs,
- **kwargs
+ scenario_dict, seed, ta, ta_kwargs, dask_client, n_jobs, **kwargs
):
"""Random online adaptive racing.
@@ -101,7 +105,7 @@ def get_roar_object_callback(
n_jobs=n_jobs,
)
- X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
+ X_train, Y_train, X_test, Y_test = putil.get_dataset("iris")
automl = autosklearn.automl.AutoML(
time_left_for_this_task=30,
per_run_time_limit=5,
@@ -126,8 +130,7 @@ def test_refit_shuffle_on_fail(dask_client):
failing_model = unittest.mock.Mock()
failing_model.fit.side_effect = [ValueError(), ValueError(), None]
- failing_model.fit_transformer.side_effect = [
- ValueError(), ValueError(), (None, {})]
+ failing_model.fit_transformer.side_effect = [ValueError(), ValueError(), (None, {})]
failing_model.get_max_iter.return_value = 100
auto = AutoML(30, 5, dask_client=dask_client)
@@ -135,7 +138,7 @@ def test_refit_shuffle_on_fail(dask_client):
ensemble_mock.get_selected_model_identifiers.return_value = [(1, 1, 50.0)]
auto.ensemble_ = ensemble_mock
auto.InputValidator = InputValidator()
- for budget_type in [None, 'iterations']:
+ for budget_type in [None, "iterations"]:
auto._budget_type = budget_type
auto.models_ = {(1, 1, 50.0): failing_model}
@@ -153,12 +156,11 @@ def test_refit_shuffle_on_fail(dask_client):
def test_only_loads_ensemble_models(automl_stub):
-
def side_effect(ids, *args, **kwargs):
return models if ids is identifiers else {}
# Add a resampling strategy as this is required by load_models
- automl_stub._resampling_strategy = 'holdout'
+ automl_stub._resampling_strategy = "holdout"
identifiers = [(1, 2), (3, 4)]
models = [42]
@@ -171,7 +173,7 @@ def side_effect(ids, *args, **kwargs):
assert models == automl_stub.models_
assert automl_stub.cv_models_ is None
- automl_stub._resampling_strategy = 'cv'
+ automl_stub._resampling_strategy = "cv"
models = [42]
automl_stub._backend.load_cv_models_by_identifiers.side_effect = side_effect
@@ -192,7 +194,7 @@ def test_check_for_models_if_no_ensemble(automl_stub):
def test_raises_if_no_models(automl_stub):
automl_stub._backend.load_ensemble.return_value = None
automl_stub._backend.list_all_models.return_value = []
- automl_stub._resampling_strategy = 'holdout'
+ automl_stub._resampling_strategy = "holdout"
automl_stub._disable_evaluator_output = False
with pytest.raises(ValueError):
@@ -205,7 +207,7 @@ def test_raises_if_no_models(automl_stub):
def test_delete_non_candidate_models(dask_client):
seed = 555
- X, Y, _, _ = putil.get_dataset('iris')
+ X, Y, _, _ = putil.get_dataset("iris")
automl = autosklearn.automl.AutoML(
delete_tmp_folder_after_terminate=False,
time_left_for_this_task=60,
@@ -213,11 +215,8 @@ def test_delete_non_candidate_models(dask_client):
ensemble_nbest=3,
seed=seed,
initial_configurations_via_metalearning=0,
- resampling_strategy='holdout',
- include={
- 'classifier': ['sgd'],
- 'feature_preprocessor': ['no_preprocessing']
- },
+ resampling_strategy="holdout",
+ include={"classifier": ["sgd"], "feature_preprocessor": ["no_preprocessing"]},
metric=accuracy,
dask_client=dask_client,
# Force model to be deleted. That is, from 50 which is the
@@ -229,23 +228,31 @@ def test_delete_non_candidate_models(dask_client):
# Assert at least one model file has been deleted and that there were no
# deletion errors
- log_file_path = glob.glob(os.path.join(
- automl._backend.temporary_directory, 'AutoML(' + str(seed) + '):*.log'))
+ log_file_path = glob.glob(
+ os.path.join(
+ automl._backend.temporary_directory, "AutoML(" + str(seed) + "):*.log"
+ )
+ )
with open(log_file_path[0]) as log_file:
log_content = log_file.read()
- assert 'Deleted files of non-candidate model' in log_content, log_content
- assert 'Failed to delete files of non-candidate model' not in log_content, log_content
- assert 'Failed to lock model' not in log_content, log_content
+ assert "Deleted files of non-candidate model" in log_content, log_content
+ assert (
+ "Failed to delete files of non-candidate model" not in log_content
+ ), log_content
+ assert "Failed to lock model" not in log_content, log_content
# Assert that the files of the models used by the ensemble weren't deleted
model_files = automl._backend.list_all_models(seed=seed)
model_files_idx = set()
for m_file in model_files:
# Extract the model identifiers from the filename
- m_file = os.path.split(m_file)[1].replace('.model', '').split('.', 2)
+ m_file = os.path.split(m_file)[1].replace(".model", "").split(".", 2)
model_files_idx.add((int(m_file[0]), int(m_file[1]), float(m_file[2])))
ensemble_members_idx = set(automl.ensemble_.identifiers_)
- assert ensemble_members_idx.issubset(model_files_idx), (ensemble_members_idx, model_files_idx)
+ assert ensemble_members_idx.issubset(model_files_idx), (
+ ensemble_members_idx,
+ model_files_idx,
+ )
del automl
@@ -257,17 +264,23 @@ def test_binary_score_and_include(dask_client):
"""
data = sklearn.datasets.make_classification(
- n_samples=400, n_features=10, n_redundant=1, n_informative=3,
- n_repeated=1, n_clusters_per_class=2, random_state=1)
+ n_samples=400,
+ n_features=10,
+ n_redundant=1,
+ n_informative=3,
+ n_repeated=1,
+ n_clusters_per_class=2,
+ random_state=1,
+ )
X_train = data[0][:200]
Y_train = data[1][:200]
X_test = data[0][200:]
Y_test = data[1][200:]
automl = autosklearn.automl.AutoML(
- 20, 5,
- include={'classifier': ['sgd'],
- 'feature_preprocessor': ['no_preprocessing']},
+ 20,
+ 5,
+ include={"classifier": ["sgd"], "feature_preprocessor": ["no_preprocessing"]},
metric=accuracy,
dask_client=dask_client,
)
@@ -286,10 +299,11 @@ def test_binary_score_and_include(dask_client):
def test_automl_outputs(dask_client):
- X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
- name = 'iris'
+ X_train, Y_train, X_test, Y_test = putil.get_dataset("iris")
+ name = "iris"
auto = autosklearn.automl.AutoML(
- 30, 5,
+ 30,
+ 5,
initial_configurations_via_metalearning=0,
seed=100,
metric=accuracy,
@@ -307,59 +321,70 @@ def test_automl_outputs(dask_client):
)
data_manager_file = os.path.join(
- auto._backend.temporary_directory,
- '.auto-sklearn',
- 'datamanager.pkl'
+ auto._backend.temporary_directory, ".auto-sklearn", "datamanager.pkl"
)
# pickled data manager (without one hot encoding!)
- with open(data_manager_file, 'rb') as fh:
+ with open(data_manager_file, "rb") as fh:
D = pickle.load(fh)
- assert np.allclose(D.data['X_train'], X_train)
+ assert np.allclose(D.data["X_train"], X_train)
# Check that all directories are there
fixture = [
- 'true_targets_ensemble.npy',
- 'start_time_100',
- 'datamanager.pkl',
- 'ensemble_read_preds.pkl',
- 'ensemble_read_losses.pkl',
- 'runs',
- 'ensembles',
- 'ensemble_history.json',
+ "true_targets_ensemble.npy",
+ "start_time_100",
+ "datamanager.pkl",
+ "ensemble_read_preds.pkl",
+ "ensemble_read_losses.pkl",
+ "runs",
+ "ensembles",
+ "ensemble_history.json",
]
- assert (
- sorted(os.listdir(os.path.join(auto._backend.temporary_directory,
- '.auto-sklearn')))
- == sorted(fixture)
- )
+ assert sorted(
+ os.listdir(os.path.join(auto._backend.temporary_directory, ".auto-sklearn"))
+ ) == sorted(fixture)
# At least one ensemble, one validation, one test prediction and one
# model and one ensemble
- fixture = glob.glob(os.path.join(
- auto._backend.temporary_directory,
- '.auto-sklearn', 'runs', '*', 'predictions_ensemble*npy',
- ))
+ fixture = glob.glob(
+ os.path.join(
+ auto._backend.temporary_directory,
+ ".auto-sklearn",
+ "runs",
+ "*",
+ "predictions_ensemble*npy",
+ )
+ )
assert len(fixture) > 0
- fixture = glob.glob(os.path.join(auto._backend.temporary_directory, '.auto-sklearn',
- 'runs', '*', '100.*.model'))
+ fixture = glob.glob(
+ os.path.join(
+ auto._backend.temporary_directory,
+ ".auto-sklearn",
+ "runs",
+ "*",
+ "100.*.model",
+ )
+ )
assert len(fixture) > 0
- fixture = os.listdir(os.path.join(auto._backend.temporary_directory,
- '.auto-sklearn', 'ensembles'))
- assert '100.0000000000.ensemble' in fixture
+ fixture = os.listdir(
+ os.path.join(auto._backend.temporary_directory, ".auto-sklearn", "ensembles")
+ )
+ assert "100.0000000000.ensemble" in fixture
# Start time
- start_time_file_path = os.path.join(auto._backend.temporary_directory,
- '.auto-sklearn', "start_time_100")
- with open(start_time_file_path, 'r') as fh:
+ start_time_file_path = os.path.join(
+ auto._backend.temporary_directory, ".auto-sklearn", "start_time_100"
+ )
+ with open(start_time_file_path, "r") as fh:
start_time = float(fh.read())
assert time.time() - start_time >= 10, print_debug_information(auto)
# Then check that the logger matches the run expectation
- logfile = glob.glob(os.path.join(
- auto._backend.temporary_directory, 'AutoML*.log'))[0]
+ logfile = glob.glob(os.path.join(auto._backend.temporary_directory, "AutoML*.log"))[
+ 0
+ ]
parser = AutoMLLogParser(logfile)
# The number of ensemble trajectories properly in log file
@@ -381,42 +406,61 @@ def test_automl_outputs(dask_client):
# Dummy not in run history
total_calls_to_pynisher_log = parser.count_tae_pynisher_calls() - 1
total_returns_from_pynisher_log = parser.count_tae_pynisher_returns() - 1
- total_elements_rh = len([run_value for run_value in auto.runhistory_.data.values(
- ) if run_value.status == StatusType.RUNNING])
+ total_elements_rh = len(
+ [
+ run_value
+ for run_value in auto.runhistory_.data.values()
+ if run_value.status == StatusType.RUNNING
+ ]
+ )
# Make sure we register all calls to pynisher
# The less than or equal here is added as a WA as
# https://github.com/automl/SMAC3/pull/712 is not yet integrated
- assert total_elements_rh <= total_calls_to_pynisher_log, print_debug_information(auto)
+ assert total_elements_rh <= total_calls_to_pynisher_log, print_debug_information(
+ auto
+ )
# Make sure we register all returns from pynisher
- assert total_elements_rh <= total_returns_from_pynisher_log, print_debug_information(auto)
+ assert (
+ total_elements_rh <= total_returns_from_pynisher_log
+ ), print_debug_information(auto)
# Lastly check that settings are printed to logfile
- ensemble_size = parser.get_automl_setting_from_log(auto._dataset_name, 'ensemble_size')
+ ensemble_size = parser.get_automl_setting_from_log(
+ auto._dataset_name, "ensemble_size"
+ )
assert auto._ensemble_size == int(ensemble_size)
del auto
-@pytest.mark.parametrize("datasets", [('breast_cancer', BINARY_CLASSIFICATION),
- ('wine', MULTICLASS_CLASSIFICATION),
- ('diabetes', REGRESSION)])
+@pytest.mark.parametrize(
+ "datasets",
+ [
+ ("breast_cancer", BINARY_CLASSIFICATION),
+ ("wine", MULTICLASS_CLASSIFICATION),
+ ("diabetes", REGRESSION),
+ ],
+)
def test_do_dummy_prediction(dask_client, datasets):
name, task = datasets
X_train, Y_train, X_test, Y_test = putil.get_dataset(name)
datamanager = XYDataManager(
- X_train, Y_train,
- X_test, Y_test,
+ X_train,
+ Y_train,
+ X_test,
+ Y_test,
task=task,
dataset_name=name,
- feat_type={i: 'numerical' for i in range(X_train.shape[1])},
+ feat_type={i: "numerical" for i in range(X_train.shape[1])},
)
auto = autosklearn.automl.AutoML(
- 20, 5,
+ 20,
+ 5,
initial_configurations_via_metalearning=25,
metric=accuracy,
dask_client=dask_client,
@@ -433,18 +477,18 @@ def test_do_dummy_prediction(dask_client, datasets):
D = auto._backend.load_datamanager()
# Check if data manager is correctly loaded
- assert D.info['task'] == datamanager.info['task']
+ assert D.info["task"] == datamanager.info["task"]
auto._do_dummy_prediction(D, 1)
# Ensure that the dummy predictions are not in the current working
# directory, but in the temporary directory.
- unexpected_directory = os.path.join(os.getcwd(), '.auto-sklearn')
+ unexpected_directory = os.path.join(os.getcwd(), ".auto-sklearn")
expected_directory = os.path.join(
auto._backend.temporary_directory,
- '.auto-sklearn',
- 'runs',
- '1_1_0.0',
- 'predictions_ensemble_1_1_0.0.npy'
+ ".auto-sklearn",
+ "runs",
+ "1_1_0.0",
+ "predictions_ensemble_1_1_0.0.npy",
)
assert not os.path.exists(unexpected_directory)
assert os.path.exists(expected_directory)
@@ -454,27 +498,30 @@ def test_do_dummy_prediction(dask_client, datasets):
del auto
-@unittest.mock.patch('autosklearn.evaluation.ExecuteTaFuncWithQueue.run')
+@unittest.mock.patch("autosklearn.evaluation.ExecuteTaFuncWithQueue.run")
def test_fail_if_dummy_prediction_fails(ta_run_mock, dask_client):
- X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
+ X_train, Y_train, X_test, Y_test = putil.get_dataset("iris")
datamanager = XYDataManager(
- X_train, Y_train,
- X_test, Y_test,
+ X_train,
+ Y_train,
+ X_test,
+ Y_test,
task=2,
- feat_type={i: 'Numerical' for i in range(X_train.shape[1])},
- dataset_name='iris',
+ feat_type={i: "Numerical" for i in range(X_train.shape[1])},
+ dataset_name="iris",
)
time_for_this_task = 30
per_run_time = 10
- auto = autosklearn.automl.AutoML(time_for_this_task,
- per_run_time,
- initial_configurations_via_metalearning=25,
- metric=accuracy,
- dask_client=dask_client,
- delete_tmp_folder_after_terminate=False,
- )
+ auto = autosklearn.automl.AutoML(
+ time_for_this_task,
+ per_run_time,
+ initial_configurations_via_metalearning=25,
+ metric=accuracy,
+ dask_client=dask_client,
+ delete_tmp_folder_after_terminate=False,
+ )
auto._backend = auto._create_backend()
auto._backend._make_internals_directory()
auto._backend.save_datamanager(datamanager)
@@ -497,55 +544,55 @@ def test_fail_if_dummy_prediction_fails(ta_run_mock, dask_client):
auto._do_dummy_prediction(datamanager, 1)
except ValueError:
raised = True
- assert not raised, 'Exception raised'
+ assert not raised, "Exception raised"
# Case 2. Check that if statustype returned by ta.run() != success,
# the function raises error.
ta_run_mock.return_value = StatusType.CRASHED, None, None, {}
with pytest.raises(
ValueError,
- match='Dummy prediction failed with run state StatusType.CRASHED and additional output: {}.' # noqa
+ match="Dummy prediction failed with run state StatusType.CRASHED and additional output: {}.", # noqa
):
auto._do_dummy_prediction(datamanager, 1)
ta_run_mock.return_value = StatusType.ABORT, None, None, {}
with pytest.raises(
ValueError,
- match='Dummy prediction failed with run state StatusType.ABORT '
- 'and additional output: {}.',
+ match="Dummy prediction failed with run state StatusType.ABORT "
+ "and additional output: {}.",
):
auto._do_dummy_prediction(datamanager, 1)
ta_run_mock.return_value = StatusType.TIMEOUT, None, None, {}
with pytest.raises(
ValueError,
- match='Dummy prediction failed with run state StatusType.TIMEOUT '
- 'and additional output: {}.'
+ match="Dummy prediction failed with run state StatusType.TIMEOUT "
+ "and additional output: {}.",
):
auto._do_dummy_prediction(datamanager, 1)
ta_run_mock.return_value = StatusType.MEMOUT, None, None, {}
with pytest.raises(
ValueError,
- match='Dummy prediction failed with run state StatusType.MEMOUT '
- 'and additional output: {}.',
+ match="Dummy prediction failed with run state StatusType.MEMOUT "
+ "and additional output: {}.",
):
auto._do_dummy_prediction(datamanager, 1)
ta_run_mock.return_value = StatusType.CAPPED, None, None, {}
with pytest.raises(
ValueError,
- match='Dummy prediction failed with run state StatusType.CAPPED '
- 'and additional output: {}.'
+ match="Dummy prediction failed with run state StatusType.CAPPED "
+ "and additional output: {}.",
):
auto._do_dummy_prediction(datamanager, 1)
- ta_run_mock.return_value = StatusType.CRASHED, None, None, {'exitcode': -6}
+ ta_run_mock.return_value = StatusType.CRASHED, None, None, {"exitcode": -6}
with pytest.raises(
ValueError,
- match='The error suggests that the provided memory limits were too tight.',
+ match="The error suggests that the provided memory limits are too tight.",
):
auto._do_dummy_prediction(datamanager, 1)
-@unittest.mock.patch('autosklearn.smbo.AutoMLSMBO.run_smbo')
+@unittest.mock.patch("autosklearn.smbo.AutoMLSMBO.run_smbo")
def test_exceptions_inside_log_in_smbo(smbo_run_mock, dask_client):
# Below importing and shutdown is a workaround, to make sure
@@ -553,6 +600,7 @@ def test_exceptions_inside_log_in_smbo(smbo_run_mock, dask_client):
# this test with multiple other test at the same time causes this
# test to fail. This resets the singletons of the logging class
import logging
+
logging.shutdown()
automl = autosklearn.automl.AutoML(
@@ -563,15 +611,15 @@ def test_exceptions_inside_log_in_smbo(smbo_run_mock, dask_client):
delete_tmp_folder_after_terminate=False,
)
- dataset_name = 'test_exceptions_inside_log'
+ dataset_name = "test_exceptions_inside_log"
# Create a custom exception to prevent other errors from slipping in
class MyException(Exception):
pass
- X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
+ X_train, Y_train, X_test, Y_test = putil.get_dataset("iris")
# The first call is on dummy predictor failure
- message = str(np.random.randint(100)) + '_run_smbo'
+ message = str(np.random.randint(100)) + "_run_smbo"
smbo_run_mock.side_effect = MyException(message)
with pytest.raises(MyException):
@@ -583,10 +631,12 @@ class MyException(Exception):
)
# make sure that the logfile was created
- logger_name = 'AutoML(%d):%s' % (1, dataset_name)
+ logger_name = "AutoML(%d):%s" % (1, dataset_name)
logger = logging.getLogger(logger_name)
- logfile = os.path.join(automl._backend.temporary_directory, logger_name + '.log')
- assert os.path.exists(logfile), print_debug_information(automl) + str(automl._clean_logger())
+ logfile = os.path.join(automl._backend.temporary_directory, logger_name + ".log")
+ assert os.path.exists(logfile), print_debug_information(automl) + str(
+ automl._clean_logger()
+ )
# Give some time for the error message to be printed in the
# log file
@@ -604,19 +654,21 @@ class MyException(Exception):
automl._clean_logger()
if not found_message:
- pytest.fail("Did not find {} in the log file {} for logger {}/{}/{}".format(
- message,
- print_debug_information(automl),
- vars(automl._logger.logger),
- vars(logger),
- vars(logging.getLogger())
- ))
+ pytest.fail(
+ "Did not find {} in the log file {} for logger {}/{}/{}".format(
+ message,
+ print_debug_information(automl),
+ vars(automl._logger.logger),
+ vars(logger),
+ vars(logging.getLogger()),
+ )
+ )
@pytest.mark.parametrize("metric", [log_loss, balanced_accuracy])
def test_load_best_individual_model(metric, dask_client):
- X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
+ X_train, Y_train, X_test, Y_test = putil.get_dataset("iris")
automl = autosklearn.automl.AutoML(
time_left_for_this_task=30,
per_run_time_limit=5,
@@ -645,9 +697,9 @@ def test_load_best_individual_model(metric, dask_client):
assert get_models_with_weights[0][0] == 1.0
# Match a toy dataset
- if metric.name == 'balanced_accuracy':
+ if metric.name == "balanced_accuracy":
assert automl.score(X_test, Y_test) > 0.9
- elif metric.name == 'log_loss':
+ elif metric.name == "log_loss":
# Seen values in github actions of 0.6978304740364537
assert automl.score(X_test, Y_test) < 0.7
else:
@@ -667,17 +719,18 @@ def test_fail_if_feat_type_on_pandas_input(dask_client):
dask_client=dask_client,
)
- X_train = pd.DataFrame({'a': [1, 1], 'c': [1, 2]})
+ X_train = pd.DataFrame({"a": [1, 1], "c": [1, 2]})
y_train = [1, 0]
- with pytest.raises(
- ValueError,
- match=""
- "providing the option feat_type to the fit method is not supported when using a Dataframe"
- ):
+ msg = (
+ "providing the option feat_type to the fit method is not supported"
+ " when using a Dataframe."
+ )
+ with pytest.raises(ValueError, match=msg):
automl.fit(
- X_train, y_train,
+ X_train,
+ y_train,
task=BINARY_CLASSIFICATION,
- feat_type={1: 'Categorical', 2: 'Numerical'},
+ feat_type={1: "Categorical", 2: "Numerical"},
)
@@ -686,7 +739,7 @@ def data_input_and_target_types():
# Create valid inputs
X_ndarray = np.random.random(size=(n_rows, 5))
- X_ndarray[X_ndarray < .9] = 0
+ X_ndarray[X_ndarray < 0.9] = 0
# Binary Classification
y_binary_ndarray = np.random.random(size=n_rows)
@@ -696,7 +749,9 @@ def data_input_and_target_types():
# Multiclass classification
y_multiclass_ndarray = np.random.random(size=n_rows)
y_multiclass_ndarray[y_multiclass_ndarray > 0.66] = 2
- y_multiclass_ndarray[(y_multiclass_ndarray <= 0.66) & (y_multiclass_ndarray >= 0.33)] = 1
+ y_multiclass_ndarray[
+ (y_multiclass_ndarray <= 0.66) & (y_multiclass_ndarray >= 0.33)
+ ] = 1
y_multiclass_ndarray[y_multiclass_ndarray < 0.33] = 0
# Multilabel classification
@@ -789,11 +844,7 @@ def test_input_and_target_types(dask_client, X, y, X_test, y_test, task):
# To save time fitting and only validate the inputs we only return
# the configuration space
automl.fit(
- X=X,
- y=y,
- X_test=X_test,
- y_test=y_test,
- only_return_configuration_space=True
+ X=X, y=y, X_test=X_test, y_test=y_test, only_return_configuration_space=True
)
assert automl._task == task
assert automl._metric.name == default_metric_for_task[task].name
@@ -801,21 +852,15 @@ def test_input_and_target_types(dask_client, X, y, X_test, y_test, task):
def data_test_model_predict_outsputs_correct_shapes():
datasets = sklearn.datasets
- binary = datasets.make_classification(
- n_samples=5, n_classes=2, random_state=0
- )
+ binary = datasets.make_classification(n_samples=5, n_classes=2, random_state=0)
multiclass = datasets.make_classification(
n_samples=5, n_informative=3, n_classes=3, random_state=0
)
multilabel = datasets.make_multilabel_classification(
n_samples=5, n_classes=3, random_state=0
)
- regression = datasets.make_regression(
- n_samples=5, random_state=0
- )
- multioutput = datasets.make_regression(
- n_samples=5, n_targets=3, random_state=0
- )
+ regression = datasets.make_regression(n_samples=5, random_state=0)
+ multioutput = datasets.make_regression(n_samples=5, n_targets=3, random_state=0)
# TODO issue 1169
# While testing output shapes, realised all models are wrapped to provide
@@ -841,17 +886,15 @@ def regressor(X, y):
# How cross validation models are currently grouped together
def voting_classifier(X, y):
classifiers = [
- MyDummyClassifier(config=1, random_state=0).fit(X, y)
- for _ in range(5)
+ MyDummyClassifier(config=1, random_state=0).fit(X, y) for _ in range(5)
]
- vc = VotingClassifier(estimators=None, voting='soft')
+ vc = VotingClassifier(estimators=None, voting="soft")
vc.estimators_ = classifiers
return vc
def voting_regressor(X, y):
regressors = [
- MyDummyRegressor(config=1, random_state=0).fit(X, y)
- for _ in range(5)
+ MyDummyRegressor(config=1, random_state=0).fit(X, y) for _ in range(5)
]
vr = VotingRegressor(estimators=None)
vr.estimators_ = regressors
@@ -859,41 +902,41 @@ def voting_regressor(X, y):
test_data = {
BINARY_CLASSIFICATION: {
- 'models': [classifier(*binary), voting_classifier(*binary)],
- 'data': binary,
+ "models": [classifier(*binary), voting_classifier(*binary)],
+ "data": binary,
# prob of false/true for the one class
- 'expected_output_shape': (len(binary[0]), 2)
+ "expected_output_shape": (len(binary[0]), 2),
},
MULTICLASS_CLASSIFICATION: {
- 'models': [classifier(*multiclass), voting_classifier(*multiclass)],
- 'data': multiclass,
+ "models": [classifier(*multiclass), voting_classifier(*multiclass)],
+ "data": multiclass,
# prob of true for each possible class
- 'expected_output_shape': (len(multiclass[0]), 3)
+ "expected_output_shape": (len(multiclass[0]), 3),
},
MULTILABEL_CLASSIFICATION: {
- 'models': [classifier(*multilabel), voting_classifier(*multilabel)],
- 'data': multilabel,
+ "models": [classifier(*multilabel), voting_classifier(*multilabel)],
+ "data": multilabel,
# probability of true for each binary label
- 'expected_output_shape': (len(multilabel[0]), 3) # type: ignore
+ "expected_output_shape": (len(multilabel[0]), 3), # type: ignore
},
REGRESSION: {
- 'models': [regressor(*regression), voting_regressor(*regression)],
- 'data': regression,
+ "models": [regressor(*regression), voting_regressor(*regression)],
+ "data": regression,
# array of single outputs
- 'expected_output_shape': (len(regression[0]), )
+ "expected_output_shape": (len(regression[0]),),
},
MULTIOUTPUT_REGRESSION: {
- 'models': [regressor(*multioutput), voting_regressor(*multioutput)],
- 'data': multioutput,
+ "models": [regressor(*multioutput), voting_regressor(*multioutput)],
+ "data": multioutput,
# array of vector outputs
- 'expected_output_shape': (len(multioutput[0]), 3)
- }
+ "expected_output_shape": (len(multioutput[0]), 3),
+ },
}
return itertools.chain.from_iterable(
[
- (model, cfg['data'], task, cfg['expected_output_shape'])
- for model in cfg['models']
+ (model, cfg["data"], task, cfg["expected_output_shape"])
+ for model in cfg["models"]
]
for task, cfg in test_data.items()
)
@@ -901,7 +944,7 @@ def voting_regressor(X, y):
@pytest.mark.parametrize(
"model, data, task, expected_output_shape",
- data_test_model_predict_outsputs_correct_shapes()
+ data_test_model_predict_outsputs_correct_shapes(),
)
def test_model_predict_outputs_correct_shapes(model, data, task, expected_output_shape):
X, y = data
@@ -912,12 +955,12 @@ def test_model_predict_outputs_correct_shapes(model, data, task, expected_output
def test_model_predict_outputs_warnings_to_logs():
X = list(range(20))
task = REGRESSION
- logger = PickableLoggerAdapter('test_model_predict_correctly_outputs_warnings')
+ logger = PickableLoggerAdapter("test_model_predict_correctly_outputs_warnings")
logger.warning = unittest.mock.Mock()
class DummyModel:
def predict(self, x):
- warnings.warn('test warning', Warning)
+ warnings.warn("test warning", Warning)
return x
model = DummyModel()
@@ -933,7 +976,7 @@ def test_model_predict_outputs_to_stdout_if_no_logger():
class DummyModel:
def predict(self, x):
- warnings.warn('test warning', Warning)
+ warnings.warn("test warning", Warning)
return x
model = DummyModel()
@@ -959,7 +1002,7 @@ def test_param_dataset_compression_false(dataset_compression: bool) -> None:
auto = AutoMLRegressor(
time_left_for_this_task=30,
per_run_time_limit=5,
- dataset_compression=dataset_compression
+ dataset_compression=dataset_compression,
)
assert auto._dataset_compression is None
@@ -980,14 +1023,16 @@ def test_construction_param_dataset_compression_true(dataset_compression: bool)
auto = AutoMLRegressor(
time_left_for_this_task=30,
per_run_time_limit=5,
- dataset_compression=dataset_compression
+ dataset_compression=dataset_compression,
)
assert auto._dataset_compression == default_dataset_compression_arg
@pytest.mark.parametrize("dataset_compression", [{"memory_allocation": 0.2}])
-def test_construction_param_dataset_compression_valid_dict(dataset_compression: Dict) -> None:
+def test_construction_param_dataset_compression_valid_dict(
+ dataset_compression: Dict,
+) -> None:
"""
Parameters
----------
@@ -1001,7 +1046,7 @@ def test_construction_param_dataset_compression_valid_dict(dataset_compression:
auto = AutoMLRegressor(
time_left_for_this_task=30,
per_run_time_limit=5,
- dataset_compression=dataset_compression
+ dataset_compression=dataset_compression,
)
expected_memory_allocation = dataset_compression["memory_allocation"]
@@ -1012,7 +1057,9 @@ def test_construction_param_dataset_compression_valid_dict(dataset_compression:
assert auto._dataset_compression["methods"] == expected_methods
-@pytest.mark.parametrize("dataset_compression", [{"methods": ["precision", "subsample"]}])
+@pytest.mark.parametrize(
+ "dataset_compression", [{"methods": ["precision", "subsample"]}]
+)
@pytest.mark.parametrize("X", [np.ones((100, 10), dtype=int)])
@pytest.mark.parametrize("y", [np.random.random((100,))])
@unittest.mock.patch("autosklearn.automl.reduce_dataset_size_if_too_large")
@@ -1020,7 +1067,7 @@ def test_fit_performs_dataset_compression_without_precision_with_int(
mock_reduce_dataset: unittest.mock.MagicMock,
dataset_compression: Dict,
X: np.ndarray,
- y: np.ndarray
+ y: np.ndarray,
) -> None:
"""We can't reduce the precision of ints as we do with floats. Suppose someone
was to pass a column with `max_int64` and `min_int64`, any reduction of bits will
@@ -1053,7 +1100,7 @@ def test_fit_performs_dataset_compression_without_precision_with_int(
auto = AutoMLRegressor(
time_left_for_this_task=30, # not used but required
per_run_time_limit=5, # not used but required
- dataset_compression=dataset_compression
+ dataset_compression=dataset_compression,
)
# To prevent fitting anything we use `only_return_configuration_space`
@@ -1066,36 +1113,48 @@ def test_fit_performs_dataset_compression_without_precision_with_int(
@pytest.mark.parametrize("dataset_compression", [True])
-@pytest.mark.parametrize("X", [
- np.empty((10, 10)),
- csr_matrix(np.identity(10)),
- pytest.param(
- np.empty((10, 10)).tolist(),
- marks=pytest.mark.xfail(reason="Converted to dataframe by InputValidator")
- ),
- pytest.param(
- pd.DataFrame(np.empty((10, 10))),
- marks=pytest.mark.xfail(reason="No pandas support yet for dataset compression")
- )
-])
-@pytest.mark.parametrize("y", [
- np.random.random((10, 1)),
- np.random.random((10, 1)).tolist(),
- pytest.param(
- pd.Series(np.random.random((10,))),
- marks=pytest.mark.xfail(reason="No pandas support yet for dataset compression")
- ),
- pytest.param(
- pd.DataFrame(np.random.random((10, 10))),
- marks=pytest.mark.xfail(reason="No pandas support yet for dataset compression")
- )
-])
+@pytest.mark.parametrize(
+ "X",
+ [
+ np.empty((10, 10)),
+ csr_matrix(np.identity(10)),
+ pytest.param(
+ np.empty((10, 10)).tolist(),
+ marks=pytest.mark.xfail(reason="Converted to dataframe by InputValidator"),
+ ),
+ pytest.param(
+ pd.DataFrame(np.empty((10, 10))),
+ marks=pytest.mark.xfail(
+ reason="No pandas support yet for dataset compression"
+ ),
+ ),
+ ],
+)
+@pytest.mark.parametrize(
+ "y",
+ [
+ np.random.random((10, 1)),
+ np.random.random((10, 1)).tolist(),
+ pytest.param(
+ pd.Series(np.random.random((10,))),
+ marks=pytest.mark.xfail(
+ reason="No pandas support yet for dataset compression"
+ ),
+ ),
+ pytest.param(
+ pd.DataFrame(np.random.random((10, 10))),
+ marks=pytest.mark.xfail(
+ reason="No pandas support yet for dataset compression"
+ ),
+ ),
+ ],
+)
@unittest.mock.patch("autosklearn.automl.reduce_dataset_size_if_too_large")
def test_fit_performs_dataset_compression(
mock_reduce_dataset: unittest.mock.MagicMock,
dataset_compression: bool,
X: Union[np.ndarray, spmatrix, List, pd.DataFrame],
- y: Union[np.ndarray, List, pd.Series, pd.DataFrame]
+ y: Union[np.ndarray, List, pd.Series, pd.DataFrame],
) -> None:
"""
Parameters
@@ -1122,7 +1181,7 @@ def test_fit_performs_dataset_compression(
auto = AutoMLRegressor(
time_left_for_this_task=30, # not used but required
per_run_time_limit=5, # not used but required
- dataset_compression=dataset_compression
+ dataset_compression=dataset_compression,
)
# To prevent fitting anything we use `only_return_configuration_space`
diff --git a/test/test_automl/test_estimators.py b/test/test_automl/test_estimators.py
index b32d1d0026..ac60e51472 100644
--- a/test/test_automl/test_estimators.py
+++ b/test/test_automl/test_estimators.py
@@ -3,67 +3,74 @@
import copy
import glob
import importlib
-import os
import inspect
import itertools
+import os
import pickle
import re
import sys
import tempfile
import unittest
import unittest.mock
-import pytest
-from ConfigSpace.configuration_space import Configuration
import joblib
-from joblib import cpu_count
import numpy as np
import numpy.ma as npma
import pandas as pd
+import pytest
import sklearn
-import sklearn.model_selection as model_selection
-import sklearn.dummy
import sklearn.datasets
-from sklearn.base import clone
-from sklearn.base import ClassifierMixin, RegressorMixin
-from sklearn.base import is_classifier
-from smac.tae import StatusType
+import sklearn.dummy
+import sklearn.model_selection as model_selection
+from ConfigSpace.configuration_space import Configuration
from dask.distributed import Client
+from joblib import cpu_count
+from sklearn.base import ClassifierMixin, RegressorMixin, clone, is_classifier
+from smac.tae import StatusType
-from autosklearn.data.validation import InputValidator
+import autosklearn.estimators # noqa F401
import autosklearn.pipeline.util as putil
+from autosklearn.automl import AutoMLClassifier
+from autosklearn.data.validation import InputValidator
from autosklearn.ensemble_builder import MODEL_FN_RE
-import autosklearn.estimators # noqa F401
from autosklearn.estimators import (
- AutoSklearnEstimator, AutoSklearnRegressor, AutoSklearnClassifier
+ AutoSklearnClassifier,
+ AutoSklearnEstimator,
+ AutoSklearnRegressor,
)
-from autosklearn.metrics import accuracy, f1_macro, mean_squared_error, r2
-from autosklearn.automl import AutoMLClassifier
from autosklearn.experimental.askl2 import AutoSklearn2Classifier
+from autosklearn.metrics import accuracy, f1_macro, mean_squared_error, r2
from autosklearn.smbo import get_smac_object
sys.path.append(os.path.dirname(__file__))
-from automl_utils import print_debug_information, count_succeses, includes_train_scores, includes_all_scores, include_single_scores, performance_over_time_is_plausible # noqa (E402: module level import not at top of file)
+from automl_utils import ( # noqa (E402: module level import not at top of file)
+ count_succeses,
+ include_single_scores,
+ includes_all_scores,
+ includes_train_scores,
+ performance_over_time_is_plausible,
+ print_debug_information,
+)
def test_fit_n_jobs(tmp_dir):
- X_train, Y_train, X_test, Y_test = putil.get_dataset('breast_cancer')
+ X_train, Y_train, X_test, Y_test = putil.get_dataset("breast_cancer")
# test parallel Classifier to predict classes, not only indices
Y_train += 1
Y_test += 1
class get_smac_object_wrapper:
-
def __call__(self, *args, **kwargs):
- self.n_jobs = kwargs['n_jobs']
+ self.n_jobs = kwargs["n_jobs"]
smac = get_smac_object(*args, **kwargs)
self.dask_n_jobs = smac.solver.tae_runner.n_workers
self.dask_client_n_jobs = len(
- smac.solver.tae_runner.client.scheduler_info()['workers']
+ smac.solver.tae_runner.client.scheduler_info()["workers"]
)
return smac
+
get_smac_object_wrapper_instance = get_smac_object_wrapper()
automl = AutoSklearnClassifier(
@@ -75,8 +82,7 @@ def __call__(self, *args, **kwargs):
initial_configurations_via_metalearning=0,
ensemble_size=5,
n_jobs=2,
- include={'classifier': ['sgd'],
- 'feature_preprocessor': ['no_preprocessing']},
+ include={"classifier": ["sgd"], "feature_preprocessor": ["no_preprocessing"]},
get_smac_object_callback=get_smac_object_wrapper_instance,
max_models_on_disc=None,
)
@@ -84,17 +90,24 @@ def __call__(self, *args, **kwargs):
automl.fit(X_train, Y_train)
# Test that the argument is correctly passed to SMAC
- assert getattr(get_smac_object_wrapper_instance, 'n_jobs') == 2
- assert getattr(get_smac_object_wrapper_instance, 'dask_n_jobs') == 2
- assert getattr(get_smac_object_wrapper_instance, 'dask_client_n_jobs') == 2
+ assert getattr(get_smac_object_wrapper_instance, "n_jobs") == 2
+ assert getattr(get_smac_object_wrapper_instance, "dask_n_jobs") == 2
+ assert getattr(get_smac_object_wrapper_instance, "dask_client_n_jobs") == 2
available_num_runs = set()
for run_key, run_value in automl.automl_.runhistory_.data.items():
- if run_value.additional_info is not None and 'num_run' in run_value.additional_info:
- available_num_runs.add(run_value.additional_info['num_run'])
+ if (
+ run_value.additional_info is not None
+ and "num_run" in run_value.additional_info
+ ):
+ available_num_runs.add(run_value.additional_info["num_run"])
available_predictions = set()
predictions = glob.glob(
- os.path.join(automl.automl_._backend.get_runs_directory(), '*', 'predictions_ensemble*.npy')
+ os.path.join(
+ automl.automl_._backend.get_runs_directory(),
+ "*",
+ "predictions_ensemble*.npy",
+ )
)
seeds = set()
for prediction in predictions:
@@ -117,7 +130,7 @@ def __call__(self, *args, **kwargs):
seeds = set()
for ensemble_file in ensembles:
- seeds.add(int(ensemble_file.split('.')[0].split('_')[0]))
+ seeds.add(int(ensemble_file.split(".")[0].split("_")[0]))
assert len(seeds) == 1
assert count_succeses(automl.cv_results_) > 0
@@ -132,7 +145,7 @@ def test_feat_type_wrong_arguments():
# Every Auto-Sklearn estimator has a backend, that allows a single
# call to fit
X = np.zeros((100, 100))
- y = np.zeros((100, ))
+ y = np.zeros((100,))
cls = AutoSklearnClassifier(ensemble_size=0)
expected_msg = r".*feat_type does not have same number of "
@@ -143,43 +156,55 @@ def test_feat_type_wrong_arguments():
cls = AutoSklearnClassifier(ensemble_size=0)
expected_msg = r".*feat_type must only contain strings.*"
with pytest.raises(ValueError, match=expected_msg):
- cls.fit(X=X, y=y, feat_type=[True]*100)
+ cls.fit(X=X, y=y, feat_type=[True] * 100)
cls = AutoSklearnClassifier(ensemble_size=0)
expected_msg = r".*Only `Categorical`, `Numerical` and `String` are"
"valid feature types, you passed `Car`.*"
with pytest.raises(ValueError, match=expected_msg):
- cls.fit(X=X, y=y, feat_type=['Car']*100)
+ cls.fit(X=X, y=y, feat_type=["Car"] * 100)
# Mock AutoSklearnEstimator.fit so the test doesn't actually run fit().
-@unittest.mock.patch('autosklearn.estimators.AutoSklearnEstimator.fit')
+@unittest.mock.patch("autosklearn.estimators.AutoSklearnEstimator.fit")
def test_type_of_target(mock_estimator):
# Test that classifier raises error for illegal target types.
- X = np.array([[1, 2],
- [2, 3],
- [3, 4],
- [4, 5],
- ])
+ X = np.array(
+ [
+ [1, 2],
+ [2, 3],
+ [3, 4],
+ [4, 5],
+ ]
+ )
# Possible target types
y_binary = np.array([0, 0, 1, 1])
y_continuous = np.array([0.1, 1.3, 2.1, 4.0])
y_multiclass = np.array([0, 1, 2, 0])
- y_multilabel = np.array([[0, 1],
- [1, 1],
- [1, 0],
- [0, 0],
- ])
- y_multiclass_multioutput = np.array([[0, 1],
- [1, 3],
- [2, 2],
- [5, 3],
- ])
- y_continuous_multioutput = np.array([[0.1, 1.5],
- [1.2, 3.5],
- [2.7, 2.7],
- [5.5, 3.9],
- ])
+ y_multilabel = np.array(
+ [
+ [0, 1],
+ [1, 1],
+ [1, 0],
+ [0, 0],
+ ]
+ )
+ y_multiclass_multioutput = np.array(
+ [
+ [0, 1],
+ [1, 3],
+ [2, 2],
+ [5, 3],
+ ]
+ )
+ y_continuous_multioutput = np.array(
+ [
+ [0.1, 1.5],
+ [1.2, 3.5],
+ [2.7, 2.7],
+ [5.5, 3.9],
+ ]
+ )
cls = AutoSklearnClassifier(ensemble_size=0)
cls.automl_ = unittest.mock.Mock()
@@ -208,20 +233,19 @@ def test_type_of_target(mock_estimator):
try:
cls.fit(X, y_binary)
except ValueError:
- pytest.fail("cls.fit() raised ValueError while fitting "
- "binary targets")
+ pytest.fail("cls.fit() raised ValueError while fitting " "binary targets")
try:
cls.fit(X, y_multiclass)
except ValueError:
- pytest.fail("cls.fit() raised ValueError while fitting "
- "multiclass targets")
+ pytest.fail("cls.fit() raised ValueError while fitting " "multiclass targets")
try:
cls.fit(X, y_multilabel)
except ValueError:
- pytest.fail("cls.fit() raised ValueError while fitting "
- "multilabel-indicator targets")
+ pytest.fail(
+ "cls.fit() raised ValueError while fitting " "multilabel-indicator targets"
+ )
# Test that regressor raises error for illegal target types.
reg = AutoSklearnRegressor(ensemble_size=0)
@@ -230,12 +254,18 @@ def test_type_of_target(mock_estimator):
expected_msg = r".*Regression with data of type"
" multilabel-indicator is not supported.*"
with pytest.raises(ValueError, match=expected_msg):
- reg.fit(X=X, y=y_multilabel,)
+ reg.fit(
+ X=X,
+ y=y_multilabel,
+ )
expected_msg = r".*Regression with data of type"
" multiclass-multioutput is not supported.*"
with pytest.raises(ValueError, match=expected_msg):
- reg.fit(X=X, y=y_multiclass_multioutput,)
+ reg.fit(
+ X=X,
+ y=y_multiclass_multioutput,
+ )
# Legal target types: continuous, multiclass,
# continuous-multioutput,
@@ -243,37 +273,38 @@ def test_type_of_target(mock_estimator):
try:
reg.fit(X, y_continuous)
except ValueError:
- pytest.fail("reg.fit() raised ValueError while fitting "
- "continuous targets")
+ pytest.fail("reg.fit() raised ValueError while fitting " "continuous targets")
try:
reg.fit(X, y_multiclass)
except ValueError:
- pytest.fail("reg.fit() raised ValueError while fitting "
- "multiclass targets")
+ pytest.fail("reg.fit() raised ValueError while fitting " "multiclass targets")
try:
reg.fit(X, y_continuous_multioutput)
except ValueError:
- pytest.fail("reg.fit() raised ValueError while fitting "
- "continuous_multioutput targets")
+ pytest.fail(
+ "reg.fit() raised ValueError while fitting "
+ "continuous_multioutput targets"
+ )
try:
reg.fit(X, y_binary)
except ValueError:
- pytest.fail("reg.fit() raised ValueError while fitting "
- "binary targets")
+ pytest.fail("reg.fit() raised ValueError while fitting " "binary targets")
def test_performance_over_time_no_ensemble(tmp_dir):
- X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
+ X_train, Y_train, X_test, Y_test = putil.get_dataset("iris")
- cls = AutoSklearnClassifier(time_left_for_this_task=30,
- per_run_time_limit=5,
- tmp_folder=tmp_dir,
- seed=1,
- initial_configurations_via_metalearning=0,
- ensemble_size=0,)
+ cls = AutoSklearnClassifier(
+ time_left_for_this_task=30,
+ per_run_time_limit=5,
+ tmp_folder=tmp_dir,
+ seed=1,
+ initial_configurations_via_metalearning=0,
+ ensemble_size=0,
+ )
cls.fit(X_train, Y_train, X_test, Y_test)
@@ -285,16 +316,17 @@ def test_performance_over_time_no_ensemble(tmp_dir):
def test_cv_results(tmp_dir):
# TODO restructure and actually use real SMAC output from a long run
# to do this unittest!
- X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
+ X_train, Y_train, X_test, Y_test = putil.get_dataset("iris")
- cls = AutoSklearnClassifier(time_left_for_this_task=30,
- per_run_time_limit=5,
- tmp_folder=tmp_dir,
- seed=1,
- initial_configurations_via_metalearning=0,
- ensemble_size=0,
- scoring_functions=[autosklearn.metrics.precision,
- autosklearn.metrics.roc_auc])
+ cls = AutoSklearnClassifier(
+ time_left_for_this_task=30,
+ per_run_time_limit=5,
+ tmp_folder=tmp_dir,
+ seed=1,
+ initial_configurations_via_metalearning=0,
+ ensemble_size=0,
+ scoring_functions=[autosklearn.metrics.precision, autosklearn.metrics.roc_auc],
+ )
params = cls.get_params()
original_params = copy.deepcopy(params)
@@ -303,23 +335,27 @@ def test_cv_results(tmp_dir):
cv_results = cls.cv_results_
assert isinstance(cv_results, dict), type(cv_results)
- assert isinstance(cv_results['mean_test_score'], np.ndarray), type(
- cv_results['mean_test_score'])
- assert isinstance(cv_results['mean_fit_time'], np.ndarray), type(
- cv_results['mean_fit_time']
+ assert isinstance(cv_results["mean_test_score"], np.ndarray), type(
+ cv_results["mean_test_score"]
)
- assert isinstance(cv_results['params'], list), type(cv_results['params'])
- assert isinstance(cv_results['rank_test_scores'], np.ndarray), type(
- cv_results['rank_test_scores']
+ assert isinstance(cv_results["mean_fit_time"], np.ndarray), type(
+ cv_results["mean_fit_time"]
)
- assert isinstance(cv_results['metric_precision'], npma.MaskedArray), type(
- cv_results['metric_precision']
+ assert isinstance(cv_results["params"], list), type(cv_results["params"])
+ assert isinstance(cv_results["rank_test_scores"], np.ndarray), type(
+ cv_results["rank_test_scores"]
)
- assert isinstance(cv_results['metric_roc_auc'], npma.MaskedArray), type(
- cv_results['metric_roc_auc']
+ assert isinstance(cv_results["metric_precision"], npma.MaskedArray), type(
+ cv_results["metric_precision"]
)
- cv_result_items = [isinstance(val, npma.MaskedArray) for key, val in
- cv_results.items() if key.startswith('param_')]
+ assert isinstance(cv_results["metric_roc_auc"], npma.MaskedArray), type(
+ cv_results["metric_roc_auc"]
+ )
+ cv_result_items = [
+ isinstance(val, npma.MaskedArray)
+ for key, val in cv_results.items()
+ if key.startswith("param_")
+ ]
assert all(cv_result_items), cv_results.items()
# Compare the state of the model parameters with the original parameters
@@ -337,21 +373,20 @@ def test_cv_results(tmp_dir):
assert joblib.hash(new_value) == joblib.hash(original_value), (
"Estimator %s should not change or mutate "
" the parameter %s from %s to %s during fit."
- % (cls, param_name, original_value, new_value))
+ % (cls, param_name, original_value, new_value)
+ )
# Comply with https://scikit-learn.org/dev/glossary.html#term-classes
is_classifier(cls)
- assert hasattr(cls, 'classes_')
+ assert hasattr(cls, "classes_")
-@pytest.mark.parametrize('estimator_type,dataset_name', [
- (AutoSklearnClassifier, 'iris'),
- (AutoSklearnRegressor, 'boston')
-])
+@pytest.mark.parametrize(
+ "estimator_type,dataset_name",
+ [(AutoSklearnClassifier, "iris"), (AutoSklearnRegressor, "boston")],
+)
def test_leaderboard(
- tmp_dir: str,
- estimator_type: Type[AutoSklearnEstimator],
- dataset_name: str
+ tmp_dir: str, estimator_type: Type[AutoSklearnEstimator], dataset_name: str
):
# Comprehensive test takes a substantial amount of time, manually set if
# required.
@@ -361,16 +396,16 @@ def test_leaderboard(
# Create a dict of all possible param values for each param
# with some invalid one's of the incorrect type
include_combinations = itertools.chain(
- itertools.combinations(column_types['all'], item_count)
+ itertools.combinations(column_types["all"], item_count)
for item_count in range(1, MAX_COMBO_SIZE_FOR_INCLUDE_PARAM)
)
valid_params = {
- 'detailed': [True, False],
- 'ensemble_only': [True, False],
- 'top_k': [-10, 0, 1, 10, 'all'],
- 'sort_by': [*column_types['all'], 'invalid'],
- 'sort_order': ['ascending', 'descending', 'auto', 'invalid', None],
- 'include': itertools.chain([None, 'invalid', 'type'], include_combinations),
+ "detailed": [True, False],
+ "ensemble_only": [True, False],
+ "top_k": [-10, 0, 1, 10, "all"],
+ "sort_by": [*column_types["all"], "invalid"],
+ "sort_order": ["ascending", "descending", "auto", "invalid", None],
+ "include": itertools.chain([None, "invalid", "type"], include_combinations),
}
# Create a generator of all possible combinations of valid_params
@@ -381,55 +416,49 @@ def test_leaderboard(
X_train, Y_train, _, _ = putil.get_dataset(dataset_name)
model = estimator_type(
- time_left_for_this_task=30,
- per_run_time_limit=5,
- tmp_folder=tmp_dir,
- seed=1
+ time_left_for_this_task=30, per_run_time_limit=5, tmp_folder=tmp_dir, seed=1
)
model.fit(X_train, Y_train)
for params in params_generator:
# Convert from iterator to solid list
- if params['include'] is not None and not isinstance(params['include'], str):
- params['include'] = list(params['include'])
+ if params["include"] is not None and not isinstance(params["include"], str):
+ params["include"] = list(params["include"])
# Invalid top_k should raise an error; valid values are a positive int or 'all'
- if not (params['top_k'] == 'all' or params['top_k'] > 0):
+ if not (params["top_k"] == "all" or params["top_k"] > 0):
with pytest.raises(ValueError):
model.leaderboard(**params)
# Invalid sort_by column
- elif params['sort_by'] not in column_types['all']:
+ elif params["sort_by"] not in column_types["all"]:
with pytest.raises(ValueError):
model.leaderboard(**params)
# Shouldn't accept an invalid sort order
- elif params['sort_order'] not in ['ascending', 'descending', 'auto']:
+ elif params["sort_order"] not in ["ascending", "descending", "auto"]:
with pytest.raises(ValueError):
model.leaderboard(**params)
# include is single str but not valid
elif (
- isinstance(params['include'], str)
- and params['include'] not in column_types['all']
+ isinstance(params["include"], str)
+ and params["include"] not in column_types["all"]
):
with pytest.raises(ValueError):
model.leaderboard(**params)
# Crash if include is list but contains invalid column
elif (
- isinstance(params['include'], list)
- and len(set(params['include']) - set(column_types['all'])) != 0
+ isinstance(params["include"], list)
+ and len(set(params["include"]) - set(column_types["all"])) != 0
):
with pytest.raises(ValueError):
model.leaderboard(**params)
# Can't have just model_id, in both single str and list case
- elif (
- params['include'] == 'model_id'
- or params['include'] == ['model_id']
- ):
+ elif params["include"] == "model_id" or params["include"] == ["model_id"]:
with pytest.raises(ValueError):
model.leaderboard(**params)
@@ -439,8 +468,8 @@ def test_leaderboard(
# top_k should never be less than the rows given back
# It can however be larger
- if isinstance(params['top_k'], int):
- assert params['top_k'] >= len(leaderboard)
+ if isinstance(params["top_k"], int):
+ assert params["top_k"] >= len(leaderboard)
# Check the right columns are present and in the right order
# The model_id is set as the index, not included in pandas columns
@@ -449,43 +478,47 @@ def test_leaderboard(
def exclude(lst, s):
return [x for x in lst if x != s]
- if params['include'] is not None:
+ if params["include"] is not None:
# Include with only single str should be the only column
- if isinstance(params['include'], str):
- assert params['include'] in columns and len(columns) == 1
+ if isinstance(params["include"], str):
+ assert params["include"] in columns and len(columns) == 1
# Include as a list should have all the columns without model_id
else:
- assert columns == exclude(params['include'], 'model_id')
- elif params['detailed']:
- assert columns == exclude(column_types['detailed'], 'model_id')
+ assert columns == exclude(params["include"], "model_id")
+ elif params["detailed"]:
+ assert columns == exclude(column_types["detailed"], "model_id")
else:
- assert columns == exclude(column_types['simple'], 'model_id')
+ assert columns == exclude(column_types["simple"], "model_id")
            # Ensure that if ensemble_only is set, all weights are positive
            # We can only check this if 'ensemble_weight' is present
- if (
- params['ensemble_only']
- and 'ensemble_weight' in columns
- ):
- assert all(leaderboard['ensemble_weight'] > 0)
-
-
-@pytest.mark.parametrize('estimator', [AutoSklearnRegressor])
-@pytest.mark.parametrize('resampling_strategy', ['holdout'])
-@pytest.mark.parametrize('X', [
- np.asarray([[1.0, 1.0, 1.0]] * 25 + [[2.0, 2.0, 2.0]] * 25 +
- [[3.0, 3.0, 3.0]] * 25 + [[4.0, 4.0, 4.0]] * 25)
-])
-@pytest.mark.parametrize('y', [
- np.asarray([1.0] * 25 + [2.0] * 25 + [3.0] * 25 + [4.0] * 25)
-])
+ if params["ensemble_only"] and "ensemble_weight" in columns:
+ assert all(leaderboard["ensemble_weight"] > 0)
+
+
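
(Not part of the diff: a hedged sketch of the leaderboard() call that the parameter grid above exercises. It assumes `model` is an already fitted estimator; the column name "cost" is taken from the test's column_types.)

board = model.leaderboard(
    detailed=False,        # simple vs. detailed column set
    ensemble_only=True,    # only models with a positive ensemble_weight
    top_k=10,              # must be a positive int or "all"
    sort_by="cost",        # must be one of the known columns
    sort_order="auto",     # "ascending", "descending" or "auto"
    include=None,          # a column name or list of columns; "model_id" alone is rejected
)
print(board.head())        # a pandas DataFrame indexed by model_id
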
+@pytest.mark.parametrize("estimator", [AutoSklearnRegressor])
+@pytest.mark.parametrize("resampling_strategy", ["holdout"])
+@pytest.mark.parametrize(
+ "X",
+ [
+ np.asarray(
+ [[1.0, 1.0, 1.0]] * 25
+ + [[2.0, 2.0, 2.0]] * 25
+ + [[3.0, 3.0, 3.0]] * 25
+ + [[4.0, 4.0, 4.0]] * 25
+ )
+ ],
+)
+@pytest.mark.parametrize(
+ "y", [np.asarray([1.0] * 25 + [2.0] * 25 + [3.0] * 25 + [4.0] * 25)]
+)
def test_show_models_with_holdout(
tmp_dir: str,
dask_client: Client,
estimator: AutoSklearnEstimator,
resampling_strategy: str,
X: np.ndarray,
- y: np.ndarray
+ y: np.ndarray,
) -> None:
"""
Parameters
@@ -521,39 +554,44 @@ def test_show_models_with_holdout(
per_run_time_limit=5,
tmp_folder=tmp_dir,
resampling_strategy=resampling_strategy,
- dask_client=dask_client
+ dask_client=dask_client,
)
automl.fit(X, y)
models = automl.show_models().values()
- model_keys = set([
- 'model_id', 'rank', 'cost', 'ensemble_weight',
- 'data_preprocessor', 'feature_preprocessor',
- 'regressor', 'sklearn_regressor'
- ])
+ model_keys = set(
+ [
+ "model_id",
+ "rank",
+ "cost",
+ "ensemble_weight",
+ "data_preprocessor",
+ "feature_preprocessor",
+ "regressor",
+ "sklearn_regressor",
+ ]
+ )
assert all([model_keys == set(model.keys()) for model in models])
- assert all([model['regressor'] for model in models])
- assert all([model['sklearn_regressor'] for model in models])
+ assert all([model["regressor"] for model in models])
+ assert all([model["sklearn_regressor"] for model in models])
assert not any([None in model.values() for model in models])
-@pytest.mark.parametrize('estimator', [AutoSklearnClassifier])
-@pytest.mark.parametrize('resampling_strategy', ['cv'])
-@pytest.mark.parametrize('X', [
- np.asarray([[1.0, 1.0, 1.0]] * 50 + [[2.0, 2.0, 2.0]] * 50)
-])
-@pytest.mark.parametrize('y', [
- np.asarray([1] * 50 + [2] * 50)
-])
+@pytest.mark.parametrize("estimator", [AutoSklearnClassifier])
+@pytest.mark.parametrize("resampling_strategy", ["cv"])
+@pytest.mark.parametrize(
+ "X", [np.asarray([[1.0, 1.0, 1.0]] * 50 + [[2.0, 2.0, 2.0]] * 50)]
+)
+@pytest.mark.parametrize("y", [np.asarray([1] * 50 + [2] * 50)])
def test_show_models_with_cv(
tmp_dir: str,
dask_client: Client,
estimator: AutoSklearnEstimator,
resampling_strategy: str,
X: np.ndarray,
- y: np.ndarray
+ y: np.ndarray,
) -> None:
"""
Parameters
@@ -578,12 +616,12 @@ def test_show_models_with_cv(
Expects
-------
- * Expects all the model dictionaries to have ``model_keys``
- * Expects no model to have any ``None`` value
- * Expects all the estimators in a model to have ``estimator_keys``
- * Expects all model estimators to have an auto-sklearn wrapped model ``classifier``
- * Expects all model estimators to have a sklearn wrapped model ``sklearn_classifier``
- * Expects no estimator to have ``None`` value
+ * Expects all the model dictionaries to have `model_keys`
+ * Expects no model to have any `None` value
+ * Expects all the estimators in a model to have `estimator_keys`
+ * Expects all model estimators to have an auto-sklearn wrapped model `classifier`
+ * Expects all model estimators to have a sklearn wrapped model `sklearn_classifier`
+    * Expects no estimator to have any `None` value
"""
automl = estimator(
@@ -591,37 +629,59 @@ def test_show_models_with_cv(
per_run_time_limit=5,
tmp_folder=tmp_dir,
resampling_strategy=resampling_strategy,
- dask_client=dask_client
+ dask_client=dask_client,
)
automl.fit(X, y)
models = automl.show_models().values()
- model_keys = set([
- 'model_id', 'rank',
- 'cost', 'ensemble_weight',
- 'voting_model', 'estimators'
- ])
+ model_keys = set(
+ ["model_id", "rank", "cost", "ensemble_weight", "voting_model", "estimators"]
+ )
- estimator_keys = set([
- 'data_preprocessor', 'balancing',
- 'feature_preprocessor', 'classifier',
- 'sklearn_classifier'
- ])
+ estimator_keys = set(
+ [
+ "data_preprocessor",
+ "balancing",
+ "feature_preprocessor",
+ "classifier",
+ "sklearn_classifier",
+ ]
+ )
assert all([model_keys == set(model.keys()) for model in models])
assert not any([None in model.values() for model in models])
- assert all([estimator_keys == set(estimator.keys())
- for model in models for estimator in model['estimators']])
- assert all([estimator['classifier']
- for model in models for estimator in model['estimators']])
- assert all([estimator['sklearn_classifier']
- for model in models for estimator in model['estimators']])
- assert not any([None in estimator.values()
- for model in models for estimator in model['estimators']])
+ assert all(
+ [
+ estimator_keys == set(estimator.keys())
+ for model in models
+ for estimator in model["estimators"]
+ ]
+ )
+ assert all(
+ [
+ estimator["classifier"]
+ for model in models
+ for estimator in model["estimators"]
+ ]
+ )
+ assert all(
+ [
+ estimator["sklearn_classifier"]
+ for model in models
+ for estimator in model["estimators"]
+ ]
+ )
+ assert not any(
+ [
+ None in estimator.values()
+ for model in models
+ for estimator in model["estimators"]
+ ]
+ )
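
(Not part of the diff: an illustrative walk over the nested structure that show_models() returns under resampling_strategy="cv", matching the keys asserted above; `automl` is the fitted classifier from the test.)

for model_id, model in automl.show_models().items():
    print(model_id, model["rank"], model["cost"], model["ensemble_weight"])
    for fold_estimator in model["estimators"]:  # one entry per CV fold
        print(type(fold_estimator["sklearn_classifier"]).__name__)
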
-@unittest.mock.patch('autosklearn.estimators.AutoSklearnEstimator.build_automl')
+@unittest.mock.patch("autosklearn.estimators.AutoSklearnEstimator.build_automl")
def test_fit_n_jobs_negative(build_automl_patch):
n_cores = cpu_count()
cls = AutoSklearnEstimator(n_jobs=-1, ensemble_size=0)
@@ -634,12 +694,17 @@ def test_get_number_of_available_cores():
assert n_cores >= 1, n_cores
-@unittest.mock.patch('autosklearn.automl.AutoML.predict')
+@unittest.mock.patch("autosklearn.automl.AutoML.predict")
def test_multiclass_prediction(predict_mock, dask_client):
- predicted_probabilities = [[0, 0, 0.99], [0, 0.99, 0], [0.99, 0, 0],
- [0, 0.99, 0], [0, 0, 0.99]]
+ predicted_probabilities = [
+ [0, 0, 0.99],
+ [0, 0.99, 0],
+ [0.99, 0, 0],
+ [0, 0.99, 0],
+ [0, 0, 0.99],
+ ]
predicted_indexes = [2, 1, 0, 1, 2]
- expected_result = ['c', 'b', 'a', 'b', 'c']
+ expected_result = ["c", "b", "a", "b", "c"]
predict_mock.return_value = np.array(predicted_probabilities)
@@ -650,7 +715,7 @@ def test_multiclass_prediction(predict_mock, dask_client):
)
classifier.InputValidator = InputValidator(is_classification=True)
classifier.InputValidator.target_validator.fit(
- pd.DataFrame(expected_result, dtype='category'),
+ pd.DataFrame(expected_result, dtype="category"),
)
classifier.InputValidator._is_fitted = True
@@ -659,13 +724,15 @@ def test_multiclass_prediction(predict_mock, dask_client):
np.testing.assert_array_equal(expected_result, actual_result)
-@unittest.mock.patch('autosklearn.automl.AutoML.predict')
+@unittest.mock.patch("autosklearn.automl.AutoML.predict")
def test_multilabel_prediction(predict_mock, dask_client):
- predicted_probabilities = [[0.99, 0],
- [0.99, 0],
- [0, 0.99],
- [0.99, 0.99],
- [0.99, 0.99]]
+ predicted_probabilities = [
+ [0.99, 0],
+ [0.99, 0],
+ [0, 0.99],
+ [0.99, 0.99],
+ [0.99, 0.99],
+ ]
predicted_indexes = np.array([[1, 0], [1, 0], [0, 1], [1, 1], [1, 1]])
predict_mock.return_value = np.array(predicted_probabilities)
@@ -677,11 +744,14 @@ def test_multilabel_prediction(predict_mock, dask_client):
)
classifier.InputValidator = InputValidator(is_classification=True)
classifier.InputValidator.target_validator.fit(
- pd.DataFrame(predicted_indexes, dtype='int64'),
+ pd.DataFrame(predicted_indexes, dtype="int64"),
)
classifier.InputValidator._is_fitted = True
- assert classifier.InputValidator.target_validator.type_of_target == 'multilabel-indicator'
+ assert (
+ classifier.InputValidator.target_validator.type_of_target
+ == "multilabel-indicator"
+ )
actual_result = classifier.predict([None] * len(predicted_indexes))
@@ -689,68 +759,66 @@ def test_multilabel_prediction(predict_mock, dask_client):
def test_can_pickle_classifier(tmp_dir, dask_client):
- X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
- automl = AutoSklearnClassifier(time_left_for_this_task=30,
- delete_tmp_folder_after_terminate=False,
- per_run_time_limit=5,
- tmp_folder=tmp_dir,
- dask_client=dask_client,
- )
+ X_train, Y_train, X_test, Y_test = putil.get_dataset("iris")
+ automl = AutoSklearnClassifier(
+ time_left_for_this_task=30,
+ delete_tmp_folder_after_terminate=False,
+ per_run_time_limit=5,
+ tmp_folder=tmp_dir,
+ dask_client=dask_client,
+ )
automl.fit(X_train, Y_train)
initial_predictions = automl.predict(X_test)
- initial_accuracy = sklearn.metrics.accuracy_score(Y_test,
- initial_predictions)
+ initial_accuracy = sklearn.metrics.accuracy_score(Y_test, initial_predictions)
assert initial_accuracy >= 0.75
assert count_succeses(automl.cv_results_) > 0
assert includes_train_scores(automl.performance_over_time_.columns) is True
assert performance_over_time_is_plausible(automl.performance_over_time_) is True
# Test pickle
- dump_file = os.path.join(tmp_dir, 'automl.dump.pkl')
+ dump_file = os.path.join(tmp_dir, "automl.dump.pkl")
- with open(dump_file, 'wb') as f:
+ with open(dump_file, "wb") as f:
pickle.dump(automl, f)
- with open(dump_file, 'rb') as f:
+ with open(dump_file, "rb") as f:
restored_automl = pickle.load(f)
restored_predictions = restored_automl.predict(X_test)
- restored_accuracy = sklearn.metrics.accuracy_score(Y_test,
- restored_predictions)
+ restored_accuracy = sklearn.metrics.accuracy_score(Y_test, restored_predictions)
assert restored_accuracy >= 0.75
assert initial_accuracy == restored_accuracy
# Test joblib
- dump_file = os.path.join(tmp_dir, 'automl.dump.joblib')
+ dump_file = os.path.join(tmp_dir, "automl.dump.joblib")
joblib.dump(automl, dump_file)
restored_automl = joblib.load(dump_file)
restored_predictions = restored_automl.predict(X_test)
- restored_accuracy = sklearn.metrics.accuracy_score(Y_test,
- restored_predictions)
+ restored_accuracy = sklearn.metrics.accuracy_score(Y_test, restored_predictions)
assert restored_accuracy >= 0.75
assert initial_accuracy == restored_accuracy
def test_multilabel(tmp_dir, dask_client):
- X_train, Y_train, X_test, Y_test = putil.get_dataset(
- 'iris', make_multilabel=True)
- automl = AutoSklearnClassifier(time_left_for_this_task=30,
- per_run_time_limit=5,
- tmp_folder=tmp_dir,
- dask_client=dask_client,
- )
+ X_train, Y_train, X_test, Y_test = putil.get_dataset("iris", make_multilabel=True)
+ automl = AutoSklearnClassifier(
+ time_left_for_this_task=30,
+ per_run_time_limit=5,
+ tmp_folder=tmp_dir,
+ dask_client=dask_client,
+ )
automl.fit(X_train, Y_train)
predictions = automl.predict(X_test)
assert predictions.shape == (50, 3), print_debug_information(automl)
- assert count_succeses(automl.cv_results_) > 0, print_debug_information(automl)
+ assert count_succeses(automl.cv_results_) > 0, print_debug_information(automl)
assert includes_train_scores(automl.performance_over_time_.columns) is True
assert performance_over_time_is_plausible(automl.performance_over_time_) is True
@@ -763,20 +831,25 @@ def test_multilabel(tmp_dir, dask_client):
def test_binary(tmp_dir, dask_client):
- X_train, Y_train, X_test, Y_test = putil.get_dataset(
- 'iris', make_binary=True)
- automl = AutoSklearnClassifier(time_left_for_this_task=40,
- delete_tmp_folder_after_terminate=False,
- per_run_time_limit=10,
- tmp_folder=tmp_dir,
- dask_client=dask_client,
- )
+ X_train, Y_train, X_test, Y_test = putil.get_dataset("iris", make_binary=True)
+ automl = AutoSklearnClassifier(
+ time_left_for_this_task=40,
+ delete_tmp_folder_after_terminate=False,
+ per_run_time_limit=10,
+ tmp_folder=tmp_dir,
+ dask_client=dask_client,
+ )
- automl.fit(X_train, Y_train, X_test=X_test, y_test=Y_test,
- dataset_name='binary_test_dataset')
+ automl.fit(
+ X_train,
+ Y_train,
+ X_test=X_test,
+ y_test=Y_test,
+ dataset_name="binary_test_dataset",
+ )
predictions = automl.predict(X_test)
- assert predictions.shape == (50, ), print_debug_information(automl)
+ assert predictions.shape == (50,), print_debug_information(automl)
score = accuracy(Y_test, predictions)
assert score > 0.9, print_debug_information(automl)
@@ -794,7 +867,7 @@ def test_classification_pandas_support(tmp_dir, dask_client):
)
# Drop NAN!!
- X = X.dropna(axis='columns')
+ X = X.dropna(axis="columns")
    # This test only makes sense if the input is a dataframe
assert isinstance(X, pd.DataFrame)
@@ -802,7 +875,7 @@ def test_classification_pandas_support(tmp_dir, dask_client):
automl = AutoSklearnClassifier(
time_left_for_this_task=30,
per_run_time_limit=5,
- exclude={'classifier': ['libsvm_svc']},
+ exclude={"classifier": ["libsvm_svc"]},
dask_client=dask_client,
seed=5,
tmp_folder=tmp_dir,
@@ -828,12 +901,13 @@ def test_classification_pandas_support(tmp_dir, dask_client):
def test_regression(tmp_dir, dask_client):
- X_train, Y_train, X_test, Y_test = putil.get_dataset('boston')
- automl = AutoSklearnRegressor(time_left_for_this_task=30,
- per_run_time_limit=5,
- tmp_folder=tmp_dir,
- dask_client=dask_client,
- )
+ X_train, Y_train, X_test, Y_test = putil.get_dataset("boston")
+ automl = AutoSklearnRegressor(
+ time_left_for_this_task=30,
+ per_run_time_limit=5,
+ tmp_folder=tmp_dir,
+ dask_client=dask_client,
+ )
automl.fit(X_train, Y_train)
@@ -842,8 +916,9 @@ def test_regression(tmp_dir, dask_client):
score = mean_squared_error(Y_test, predictions)
    # On average np.sqrt(30) away from the target -> ~5.5
- # Results with select rates drops avg score to a range of -32.40 to -37, on 30 seconds
- # constraint. With more time_left_for_this_task this is no longer an issue
+    # Results with select rates drop the avg score to a range of -32.40 to -37
+    # under a 30-second constraint.
+ # With more time_left_for_this_task this is no longer an issue
assert score >= -37, print_debug_information(automl)
assert count_succeses(automl.cv_results_) > 0
assert includes_train_scores(automl.performance_over_time_.columns) is True
@@ -856,13 +931,16 @@ def test_cv_regression(tmp_dir, dask_client):
a regressor
"""
- X_train, Y_train, X_test, Y_test = putil.get_dataset('boston', train_size_maximum=300)
- automl = AutoSklearnRegressor(time_left_for_this_task=60,
- per_run_time_limit=10,
- resampling_strategy='cv',
- tmp_folder=tmp_dir,
- dask_client=dask_client,
- )
+ X_train, Y_train, X_test, Y_test = putil.get_dataset(
+ "boston", train_size_maximum=300
+ )
+ automl = AutoSklearnRegressor(
+ time_left_for_this_task=60,
+ per_run_time_limit=10,
+ resampling_strategy="cv",
+ tmp_folder=tmp_dir,
+ dask_client=dask_client,
+ )
automl.fit(X_train, Y_train)
@@ -913,13 +991,15 @@ def test_autosklearn_classification_methods_returns_self(dask_client):
Currently this method only tests that the methods of AutoSklearnClassifier
is able to fit using fit(), fit_ensemble() and refit()
"""
- X_train, y_train, X_test, y_test = putil.get_dataset('iris')
- automl = AutoSklearnClassifier(time_left_for_this_task=60,
- delete_tmp_folder_after_terminate=False,
- per_run_time_limit=10,
- ensemble_size=0,
- dask_client=dask_client,
- exclude={'feature_preprocessor': ['fast_ica']})
+ X_train, y_train, X_test, y_test = putil.get_dataset("iris")
+ automl = AutoSklearnClassifier(
+ time_left_for_this_task=60,
+ delete_tmp_folder_after_terminate=False,
+ per_run_time_limit=10,
+ ensemble_size=0,
+ dask_client=dask_client,
+ exclude={"feature_preprocessor": ["fast_ica"]},
+ )
automl_fitted = automl.fit(X_train, y_train)
@@ -936,12 +1016,14 @@ def test_autosklearn_classification_methods_returns_self(dask_client):
# Currently this class only tests that the methods of AutoSklearnRegressor
# that should return self actually return self.
def test_autosklearn_regression_methods_returns_self(dask_client):
- X_train, y_train, X_test, y_test = putil.get_dataset('boston')
- automl = AutoSklearnRegressor(time_left_for_this_task=30,
- delete_tmp_folder_after_terminate=False,
- per_run_time_limit=5,
- dask_client=dask_client,
- ensemble_size=0)
+ X_train, y_train, X_test, y_test = putil.get_dataset("boston")
+ automl = AutoSklearnRegressor(
+ time_left_for_this_task=30,
+ delete_tmp_folder_after_terminate=False,
+ per_run_time_limit=5,
+ dask_client=dask_client,
+ ensemble_size=0,
+ )
automl_fitted = automl.fit(X_train, y_train)
assert automl is automl_fitted
@@ -954,10 +1036,13 @@ def test_autosklearn_regression_methods_returns_self(dask_client):
def test_autosklearn2_classification_methods_returns_self(dask_client):
- X_train, y_train, X_test, y_test = putil.get_dataset('iris')
- automl = AutoSklearn2Classifier(time_left_for_this_task=60, ensemble_size=0,
- delete_tmp_folder_after_terminate=False,
- dask_client=dask_client)
+ X_train, y_train, X_test, y_test = putil.get_dataset("iris")
+ automl = AutoSklearn2Classifier(
+ time_left_for_this_task=60,
+ ensemble_size=0,
+ delete_tmp_folder_after_terminate=False,
+ dask_client=dask_client,
+ )
automl_fitted = automl.fit(X_train, y_train)
@@ -971,18 +1056,23 @@ def test_autosklearn2_classification_methods_returns_self(dask_client):
assert automl is automl_refitted
predictions = automl_fitted.predict(X_test)
- assert sklearn.metrics.accuracy_score(
- y_test, predictions
- ) >= 2 / 3, print_debug_information(automl)
+ assert (
+ sklearn.metrics.accuracy_score(y_test, predictions) >= 2 / 3
+ ), print_debug_information(automl)
pickle.dumps(automl_fitted)
def test_autosklearn2_classification_methods_returns_self_sparse(dask_client):
- X_train, y_train, X_test, y_test = putil.get_dataset('breast_cancer', make_sparse=True)
- automl = AutoSklearn2Classifier(time_left_for_this_task=60, ensemble_size=0,
- delete_tmp_folder_after_terminate=False,
- dask_client=dask_client)
+ X_train, y_train, X_test, y_test = putil.get_dataset(
+ "breast_cancer", make_sparse=True
+ )
+ automl = AutoSklearn2Classifier(
+ time_left_for_this_task=60,
+ ensemble_size=0,
+ delete_tmp_folder_after_terminate=False,
+ dask_client=dask_client,
+ )
automl_fitted = automl.fit(X_train, y_train)
@@ -996,32 +1086,39 @@ def test_autosklearn2_classification_methods_returns_self_sparse(dask_client):
assert automl is automl_refitted
predictions = automl_fitted.predict(X_test)
- assert sklearn.metrics.accuracy_score(
- y_test, predictions
- ) >= 2 / 3, print_debug_information(automl)
+ assert (
+ sklearn.metrics.accuracy_score(y_test, predictions) >= 2 / 3
+ ), print_debug_information(automl)
assert "boosting" not in str(automl.get_configuration_space(X=X_train, y=y_train))
pickle.dumps(automl_fitted)
-@pytest.mark.parametrize("class_", [AutoSklearnClassifier, AutoSklearnRegressor,
- AutoSklearn2Classifier])
+@pytest.mark.parametrize(
+ "class_", [AutoSklearnClassifier, AutoSklearnRegressor, AutoSklearn2Classifier]
+)
def test_check_estimator_signature(class_):
    # Make sure the signature is stored in self
- expected_subclass = ClassifierMixin if 'Classifier' in str(class_) else RegressorMixin
+ expected_subclass = (
+ ClassifierMixin if "Classifier" in str(class_) else RegressorMixin
+ )
assert issubclass(class_, expected_subclass)
estimator = class_()
for expected in list(inspect.signature(class_).parameters):
assert hasattr(estimator, expected)
-@pytest.mark.parametrize("selector_path", [None, # No XDG_CACHE_HOME provided
- '/', # XDG_CACHE_HOME has no permission
- tempfile.gettempdir(), # in the user cache
- ])
+@pytest.mark.parametrize(
+ "selector_path",
+ [
+ None, # No XDG_CACHE_HOME provided
+ "/", # XDG_CACHE_HOME has no permission
+ tempfile.gettempdir(), # in the user cache
+ ],
+)
def test_selector_file_askl2_can_be_created(selector_path):
- with unittest.mock.patch('os.environ.get') as mock_foo:
+ with unittest.mock.patch("os.environ.get") as mock_foo:
mock_foo.return_value = selector_path
if selector_path is not None and not os.access(selector_path, os.W_OK):
with pytest.raises(PermissionError):
@@ -1029,7 +1126,9 @@ def test_selector_file_askl2_can_be_created(selector_path):
else:
importlib.reload(autosklearn.experimental.askl2)
for metric in autosklearn.experimental.askl2.metrics:
- assert os.path.exists(autosklearn.experimental.askl2.selector_files[metric.name])
+ assert os.path.exists(
+ autosklearn.experimental.askl2.selector_files[metric.name]
+ )
if selector_path is None or not os.access(selector_path, os.W_OK):
# We default to home in worst case
assert os.path.expanduser("~") in str(
@@ -1047,34 +1146,38 @@ def test_selector_file_askl2_can_be_created(selector_path):
def test_check_askl2_same_arguments_as_askl() -> None:
"""Check the asklearn2 has the same args as asklearn1
- This test is useful for when adding args to asklearn1 to make sure we update asklearn2
-
Expects
-------
- * The set of arguments for AutoSklearnClassifier is the same as AutoSklearn2Classifier
- except for a few expected arugments
+ * The set of arguments for AutoSklearnClassifier is the same as
+      AutoSklearn2Classifier except for a few expected arguments.
"""
- autosklearn1_classifier_args = set(inspect.getfullargspec(AutoSklearnEstimator.__init__).args)
- autosklearn2_classifier_args = set(inspect.getfullargspec(AutoSklearn2Classifier.__init__).args)
+ autosklearn1_classifier_args = set(
+ inspect.getfullargspec(AutoSklearnEstimator.__init__).args
+ )
+ autosklearn2_classifier_args = set(
+ inspect.getfullargspec(AutoSklearn2Classifier.__init__).args
+ )
extra_arguments = autosklearn1_classifier_args - autosklearn2_classifier_args
- expected_extra_args = set([
- 'exclude',
- 'include',
- 'resampling_strategy_arguments',
- 'get_smac_object_callback',
- 'initial_configurations_via_metalearning',
- 'resampling_strategy',
- 'metadata_directory',
- 'get_trials_callback',
- ])
+ expected_extra_args = set(
+ [
+ "exclude",
+ "include",
+ "resampling_strategy_arguments",
+ "get_smac_object_callback",
+ "initial_configurations_via_metalearning",
+ "resampling_strategy",
+ "metadata_directory",
+ "get_trials_callback",
+ ]
+ )
unexpected_args = extra_arguments - expected_extra_args
assert len(unexpected_args) == 0, unexpected_args
-@pytest.mark.parametrize("task_type", ['classification', 'regression'])
-@pytest.mark.parametrize("resampling_strategy", ['test', 'cv', 'holdout'])
+@pytest.mark.parametrize("task_type", ["classification", "regression"])
+@pytest.mark.parametrize("resampling_strategy", ["test", "cv", "holdout"])
@pytest.mark.parametrize("disable_file_output", [True, False])
def test_fit_pipeline(dask_client, task_type, resampling_strategy, disable_file_output):
"""
@@ -1082,14 +1185,16 @@ def test_fit_pipeline(dask_client, task_type, resampling_strategy, disable_file_
space, fit a classification pipeline with an acceptable score
"""
X_train, y_train, X_test, y_test = putil.get_dataset(
- 'iris' if task_type == 'classification' else 'boston'
+ "iris" if task_type == "classification" else "boston"
+ )
+ estimator = (
+ AutoSklearnClassifier if task_type == "classification" else AutoSklearnRegressor
)
- estimator = AutoSklearnClassifier if task_type == 'classification' else AutoSklearnRegressor
seed = 3
if task_type == "classification":
- include = {'classifier': ['random_forest']}
+ include = {"classifier": ["random_forest"]}
else:
- include = {'regressor': ['random_forest']}
+ include = {"regressor": ["random_forest"]}
automl = estimator(
delete_tmp_folder_after_terminate=False,
time_left_for_this_task=120,
@@ -1101,11 +1206,16 @@ def test_fit_pipeline(dask_client, task_type, resampling_strategy, disable_file_
include=include,
seed=seed,
        # We cannot get the configuration space with 'test', nor fit with it
- resampling_strategy=resampling_strategy if resampling_strategy != 'test' else 'holdout',
+ resampling_strategy=resampling_strategy
+ if resampling_strategy != "test"
+ else "holdout",
)
- config = automl.get_configuration_space(X_train, y_train,
- X_test=X_test, y_test=y_test,
- ).get_default_configuration()
+ config = automl.get_configuration_space(
+ X_train,
+ y_train,
+ X_test=X_test,
+ y_test=y_test,
+ ).get_default_configuration()
pipeline, run_info, run_value = automl.fit_pipeline(
X=X_train,
@@ -1114,7 +1224,7 @@ def test_fit_pipeline(dask_client, task_type, resampling_strategy, disable_file_
X_test=X_test,
y_test=y_test,
disable_file_output=disable_file_output,
- resampling_strategy=resampling_strategy
+ resampling_strategy=resampling_strategy,
)
assert isinstance(run_info.config, Configuration)
@@ -1124,20 +1234,20 @@ def test_fit_pipeline(dask_client, task_type, resampling_strategy, disable_file_
assert run_value.cost < 0.2
# Make sure that the pipeline can be pickled
- dump_file = os.path.join(tempfile.gettempdir(), 'automl.dump.pkl')
- with open(dump_file, 'wb') as f:
+ dump_file = os.path.join(tempfile.gettempdir(), "automl.dump.pkl")
+ with open(dump_file, "wb") as f:
pickle.dump(pipeline, f)
- if resampling_strategy == 'test' or disable_file_output:
+ if resampling_strategy == "test" or disable_file_output:
# We do not produce a pipeline in 'test'
assert pipeline is None
- elif resampling_strategy == 'cv':
+ elif resampling_strategy == "cv":
# We should have fitted a Voting estimator
- assert hasattr(pipeline, 'estimators_')
+ assert hasattr(pipeline, "estimators_")
else:
# We should have fitted a pipeline with named_steps
- assert hasattr(pipeline, 'named_steps')
- assert 'RandomForest' in pipeline.steps[-1][-1].choice.__class__.__name__
+ assert hasattr(pipeline, "named_steps")
+ assert "RandomForest" in pipeline.steps[-1][-1].choice.__class__.__name__
    # Num run should be 2, as 1 is for the dummy classifier and we have not launched
    # another pipeline
@@ -1145,25 +1255,30 @@ def test_fit_pipeline(dask_client, task_type, resampling_strategy, disable_file_
# Check the re-sampling strategy
num_run_dir = automl.automl_._backend.get_numrun_directory(
- seed, num_run, budget=0.0)
- cv_model_path = os.path.join(num_run_dir, automl.automl_._backend.get_cv_model_filename(
- seed, num_run, budget=0.0))
- model_path = os.path.join(num_run_dir, automl.automl_._backend.get_model_filename(
- seed, num_run, budget=0.0))
- if resampling_strategy == 'test' or disable_file_output:
+ seed, num_run, budget=0.0
+ )
+ cv_model_path = os.path.join(
+ num_run_dir,
+ automl.automl_._backend.get_cv_model_filename(seed, num_run, budget=0.0),
+ )
+ model_path = os.path.join(
+ num_run_dir,
+ automl.automl_._backend.get_model_filename(seed, num_run, budget=0.0),
+ )
+ if resampling_strategy == "test" or disable_file_output:
# No file output is expected
assert not os.path.exists(num_run_dir)
else:
# We expect the model path always
# And the cv model only on 'cv'
assert os.path.exists(model_path)
- if resampling_strategy == 'cv':
+ if resampling_strategy == "cv":
assert os.path.exists(cv_model_path)
- elif resampling_strategy == 'holdout':
+ elif resampling_strategy == "holdout":
assert not os.path.exists(cv_model_path)
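
(Not part of the diff: a condensed, hedged recap of the fit_pipeline API exercised above. It returns the fitted pipeline, which is None when file output is disabled or resampling_strategy is "test", together with SMAC's RunInfo and RunValue.)

pipeline, run_info, run_value = automl.fit_pipeline(
    X=X_train,
    y=y_train,
    config=config,              # a ConfigSpace Configuration
    X_test=X_test,
    y_test=y_test,
    resampling_strategy="holdout",
)
assert run_value.cost < 0.2     # illustrative threshold, mirroring the test
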
-@pytest.mark.parametrize("data_type", ['pandas', 'numpy'])
+@pytest.mark.parametrize("data_type", ["pandas", "numpy"])
@pytest.mark.parametrize("include_categorical", [True, False])
def test_pass_categorical_and_numeric_columns_to_pipeline(
dask_client, data_type, include_categorical
@@ -1179,17 +1294,17 @@ def test_pass_categorical_and_numeric_columns_to_pipeline(
if include_categorical:
X = np.insert(X, n_features, values=0, axis=1)
- if data_type == 'pandas':
+ if data_type == "pandas":
X = pd.DataFrame(X)
y = pd.DataFrame(y, dtype="category")
# Set the last column to categorical
if include_categorical:
- X.loc[:, n_features] = X.loc[:, n_features].astype('category') # type: ignore
+ X.loc[:, n_features] = X.loc[:, n_features].astype("category")
# Specify the feature_types
- if data_type == 'numpy' and include_categorical:
- feat_type = ['numerical'] * n_features + ['categorical']
+ if data_type == "numpy" and include_categorical:
+ feat_type = ["numerical"] * n_features + ["categorical"]
else:
feat_type = None
@@ -1207,17 +1322,25 @@ def test_pass_categorical_and_numeric_columns_to_pipeline(
ensemble_size=0,
seed=0,
dask_client=dask_client,
- include={'classifier': ['random_forest']},
+ include={"classifier": ["random_forest"]},
)
config_space = automl.get_configuration_space(
- X_train, y_train, X_test=X_test, y_test=y_test, feat_type=feat_type,
+ X_train,
+ y_train,
+ X_test=X_test,
+ y_test=y_test,
+ feat_type=feat_type,
)
config = config_space.get_default_configuration()
pipeline, _, run_value = automl.fit_pipeline(
- X=X_train, y=y_train, X_test=X_test, y_test=y_test,
- config=config, feat_type=feat_type,
+ X=X_train,
+ y=y_train,
+ X_test=X_test,
+ y_test=y_test,
+ config=config,
+ feat_type=feat_type,
)
assert pipeline is not None, "Expected a pipeline from automl.fit_pipeline"
@@ -1237,18 +1360,17 @@ def test_pass_categorical_and_numeric_columns_to_pipeline(
if include_categorical:
expected_feat_types = {
i: feature_type
- for i, feature_type
- in enumerate(['numerical'] * (n_columns-1) + ['categorical'])
+ for i, feature_type in enumerate(
+ ["numerical"] * (n_columns - 1) + ["categorical"]
+ )
}
else:
expected_feat_types = {
- i: feature_type
- for i, feature_type
- in enumerate(['numerical'] * n_columns)
+ i: feature_type for i, feature_type in enumerate(["numerical"] * n_columns)
}
- pipeline_feat_types = pipeline.named_steps['data_preprocessor'].choice.feat_type
+ pipeline_feat_types = pipeline.named_steps["data_preprocessor"].choice.feat_type
assert expected_feat_types == pipeline_feat_types
@@ -1260,20 +1382,27 @@ def test_autosklearn_anneal(as_frame):
    so is a good test case for unit-testing
"""
X, y = sklearn.datasets.fetch_openml(data_id=2, return_X_y=True, as_frame=as_frame)
- automl = AutoSklearnClassifier(time_left_for_this_task=60, ensemble_size=0,
- delete_tmp_folder_after_terminate=False,
- initial_configurations_via_metalearning=0,
- smac_scenario_args={'runcount_limit': 6},
- resampling_strategy='holdout-iterative-fit')
+ automl = AutoSklearnClassifier(
+ time_left_for_this_task=60,
+ ensemble_size=0,
+ delete_tmp_folder_after_terminate=False,
+ initial_configurations_via_metalearning=0,
+ smac_scenario_args={"runcount_limit": 6},
+ resampling_strategy="holdout-iterative-fit",
+ )
if as_frame:
# Let autosklearn calculate the feat types
automl_fitted = automl.fit(X, y)
else:
- X_, y_ = sklearn.datasets.fetch_openml(data_id=2, return_X_y=True, as_frame=True)
- feat_type = ['categorical' if X_[col].dtype.name == 'category' else 'numerical'
- for col in X_.columns]
+ X_, y_ = sklearn.datasets.fetch_openml(
+ data_id=2, return_X_y=True, as_frame=True
+ )
+ feat_type = [
+ "categorical" if X_[col].dtype.name == "category" else "numerical"
+ for col in X_.columns
+ ]
automl_fitted = automl.fit(X, y, feat_type=feat_type)
@@ -1289,9 +1418,9 @@ def test_autosklearn_anneal(as_frame):
assert automl_fitted.score(X, y) > 0.75
-@pytest.mark.parametrize("dataset_compression", [
- False, True, {"memory_allocation": 0.2}
-])
+@pytest.mark.parametrize(
+ "dataset_compression", [False, True, {"memory_allocation": 0.2}]
+)
def test_param_dataset_compression(dataset_compression: Union[bool, Dict[str, Any]]):
"""We expect this does not get parsed and modified until it gets to the AutoML class,
In the meantime, it's value remains whatever was passed in.
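
(Not part of the diff: a minimal sketch of what this parametrised test covers. The estimator class and the stored attribute name are assumptions based on the test signature and docstring.)

est = AutoSklearnClassifier(dataset_compression={"memory_allocation": 0.2})
assert est.dataset_compression == {"memory_allocation": 0.2}  # kept as passed until AutoML parses it
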
diff --git a/test/test_data/__init__.py b/test/test_data/__init__.py
index cc3cd7becd..e298f0f075 100644
--- a/test/test_data/__init__.py
+++ b/test/test_data/__init__.py
@@ -1,2 +1,2 @@
# -*- encoding: utf-8 -*-
-__author__ = 'feurerm'
+__author__ = "feurerm"
diff --git a/test/test_data/test_feature_validator.py b/test/test_data/test_feature_validator.py
index 012ef1a179..0414cd31b4 100644
--- a/test/test_data/test_feature_validator.py
+++ b/test/test_data/test_feature_validator.py
@@ -1,14 +1,10 @@
import numpy as np
-
import pandas as pd
-from pandas.api.types import is_numeric_dtype, is_categorical_dtype, is_string_dtype
-
import pytest
-
-from scipy import sparse
-
import sklearn.datasets
import sklearn.model_selection
+from pandas.api.types import is_categorical_dtype, is_numeric_dtype, is_string_dtype
+from scipy import sparse
from autosklearn.data.feature_validator import FeatureValidator
@@ -16,118 +12,135 @@
# Fixtures to be used in this class. By default all elements have 100 datapoints
@pytest.fixture
def input_data_featuretest(request):
- if request.param == 'numpy_categoricalonly_nonan':
+ if request.param == "numpy_categoricalonly_nonan":
return np.random.randint(10, size=(100, 10))
- elif request.param == 'numpy_numericalonly_nonan':
+ elif request.param == "numpy_numericalonly_nonan":
return np.random.uniform(10, size=(100, 10))
- elif request.param == 'numpy_mixed_nonan':
- return np.column_stack([
- np.random.uniform(10, size=(100, 3)),
- np.random.randint(10, size=(100, 3)),
- np.random.uniform(10, size=(100, 3)),
- np.random.randint(10, size=(100, 1)),
- ])
- elif request.param == 'numpy_string_nonan':
- return np.array([
- ['a', 'b', 'c', 'a', 'b', 'c'],
- ['a', 'b', 'd', 'r', 'b', 'c'],
- ])
- elif request.param == 'numpy_categoricalonly_nan':
- array = np.random.randint(10, size=(100, 10)).astype('float')
+ elif request.param == "numpy_mixed_nonan":
+ return np.column_stack(
+ [
+ np.random.uniform(10, size=(100, 3)),
+ np.random.randint(10, size=(100, 3)),
+ np.random.uniform(10, size=(100, 3)),
+ np.random.randint(10, size=(100, 1)),
+ ]
+ )
+ elif request.param == "numpy_string_nonan":
+ return np.array(
+ [
+ ["a", "b", "c", "a", "b", "c"],
+ ["a", "b", "d", "r", "b", "c"],
+ ]
+ )
+ elif request.param == "numpy_categoricalonly_nan":
+ array = np.random.randint(10, size=(100, 10)).astype("float")
array[50, 0:5] = np.nan
return array
- elif request.param == 'numpy_numericalonly_nan':
- array = np.random.uniform(10, size=(100, 10)).astype('float')
+ elif request.param == "numpy_numericalonly_nan":
+ array = np.random.uniform(10, size=(100, 10)).astype("float")
array[50, 0:5] = np.nan
# Somehow array is changed to dtype object after np.nan
- return array.astype('float')
- elif request.param == 'numpy_mixed_nan':
- array = np.column_stack([
- np.random.uniform(10, size=(100, 3)),
- np.random.randint(10, size=(100, 3)),
- np.random.uniform(10, size=(100, 3)),
- np.random.randint(10, size=(100, 1)),
- ])
+ return array.astype("float")
+ elif request.param == "numpy_mixed_nan":
+ array = np.column_stack(
+ [
+ np.random.uniform(10, size=(100, 3)),
+ np.random.randint(10, size=(100, 3)),
+ np.random.uniform(10, size=(100, 3)),
+ np.random.randint(10, size=(100, 1)),
+ ]
+ )
array[50, 0:5] = np.nan
return array
- elif request.param == 'numpy_string_nan':
- return np.array([
- ['a', 'b', 'c', 'a', 'b', 'c'],
- [np.nan, 'b', 'd', 'r', 'b', 'c'],
- ])
- elif request.param == 'pandas_categoricalonly_nonan':
- return pd.DataFrame([
- {'A': 1, 'B': 2},
- {'A': 3, 'B': 4},
- ], dtype='category')
- elif request.param == 'pandas_numericalonly_nonan':
- return pd.DataFrame([
- {'A': 1, 'B': 2},
- {'A': 3, 'B': 4},
- ], dtype='float')
- elif request.param == 'pandas_mixed_nonan':
- frame = pd.DataFrame([
- {'A': 1, 'B': 2},
- {'A': 3, 'B': 4},
- ], dtype='category')
- frame['B'] = pd.to_numeric(frame['B'])
+ elif request.param == "numpy_string_nan":
+ return np.array(
+ [
+ ["a", "b", "c", "a", "b", "c"],
+ [np.nan, "b", "d", "r", "b", "c"],
+ ]
+ )
+ elif request.param == "pandas_categoricalonly_nonan":
+ return pd.DataFrame(
+ [
+ {"A": 1, "B": 2},
+ {"A": 3, "B": 4},
+ ],
+ dtype="category",
+ )
+ elif request.param == "pandas_numericalonly_nonan":
+ return pd.DataFrame(
+ [
+ {"A": 1, "B": 2},
+ {"A": 3, "B": 4},
+ ],
+ dtype="float",
+ )
+ elif request.param == "pandas_mixed_nonan":
+ frame = pd.DataFrame(
+ [
+ {"A": 1, "B": 2},
+ {"A": 3, "B": 4},
+ ],
+ dtype="category",
+ )
+ frame["B"] = pd.to_numeric(frame["B"])
return frame
- elif request.param == 'pandas_categoricalonly_nan':
- return pd.DataFrame([
- {'A': 1, 'B': 2, 'C': np.nan},
- {'A': 3, 'C': np.nan},
- ], dtype='category')
- elif request.param == 'pandas_numericalonly_nan':
- return pd.DataFrame([
- {'A': 1, 'B': 2, 'C': np.nan},
- {'A': 3, 'C': np.nan},
- ], dtype='float')
- elif request.param == 'pandas_mixed_nan':
- frame = pd.DataFrame([
- {'A': 1, 'B': 2, 'C': 8},
- {'A': 3, 'B': 4},
- ], dtype='category')
- frame['B'] = pd.to_numeric(frame['B'])
+ elif request.param == "pandas_categoricalonly_nan":
+ return pd.DataFrame(
+ [
+ {"A": 1, "B": 2, "C": np.nan},
+ {"A": 3, "C": np.nan},
+ ],
+ dtype="category",
+ )
+ elif request.param == "pandas_numericalonly_nan":
+ return pd.DataFrame(
+ [
+ {"A": 1, "B": 2, "C": np.nan},
+ {"A": 3, "C": np.nan},
+ ],
+ dtype="float",
+ )
+ elif request.param == "pandas_mixed_nan":
+ frame = pd.DataFrame(
+ [
+ {"A": 1, "B": 2, "C": 8},
+ {"A": 3, "B": 4},
+ ],
+ dtype="category",
+ )
+ frame["B"] = pd.to_numeric(frame["B"])
return frame
- elif request.param == 'pandas_string_nonan':
- return pd.DataFrame([
- {'A': 1, 'B': 2},
- {'A': 3, 'B': 4},
- ], dtype='string')
- elif request.param == 'list_categoricalonly_nonan':
- return [
- ['a', 'b', 'c', 'd'],
- ['e', 'f', 'c', 'd'],
- ]
- elif request.param == 'list_numericalonly_nonan':
- return [
- [1, 2, 3, 4],
- [5, 6, 7, 8]
- ]
- elif request.param == 'list_mixed_nonan':
- return [
- ['a', 2, 3, 4],
- ['b', 6, 7, 8]
- ]
- elif request.param == 'list_categoricalonly_nan':
- return [
- ['a', 'b', 'c', np.nan],
- ['e', 'f', 'c', 'd'],
- ]
- elif request.param == 'list_numericalonly_nan':
+ elif request.param == "pandas_string_nonan":
+ return pd.DataFrame(
+ [
+ {"A": 1, "B": 2},
+ {"A": 3, "B": 4},
+ ],
+ dtype="string",
+ )
+ elif request.param == "list_categoricalonly_nonan":
return [
- [1, 2, 3, np.nan],
- [5, 6, 7, 8]
+ ["a", "b", "c", "d"],
+ ["e", "f", "c", "d"],
]
- elif request.param == 'list_mixed_nan':
+ elif request.param == "list_numericalonly_nonan":
+ return [[1, 2, 3, 4], [5, 6, 7, 8]]
+ elif request.param == "list_mixed_nonan":
+ return [["a", 2, 3, 4], ["b", 6, 7, 8]]
+ elif request.param == "list_categoricalonly_nan":
return [
- ['a', np.nan, 3, 4],
- ['b', 6, 7, 8]
+ ["a", "b", "c", np.nan],
+ ["e", "f", "c", "d"],
]
- elif 'sparse' in request.param:
+ elif request.param == "list_numericalonly_nan":
+ return [[1, 2, 3, np.nan], [5, 6, 7, 8]]
+ elif request.param == "list_mixed_nan":
+ return [["a", np.nan, 3, 4], ["b", 6, 7, 8]]
+ elif "sparse" in request.param:
# We expect the names to be of the type sparse_csc_nonan
- sparse_, type_, nan_ = request.param.split('_')
- if 'nonan' in nan_:
+ sparse_, type_, nan_ = request.param.split("_")
+ if "nonan" in nan_:
data = np.ones(3)
else:
data = np.array([1, 2, np.nan])
@@ -135,26 +148,27 @@ def input_data_featuretest(request):
# Then the type of sparse
row_ind = np.array([0, 1, 2])
col_ind = np.array([1, 2, 1])
- if 'csc' in type_:
+ if "csc" in type_:
return sparse.csc_matrix((data, (row_ind, col_ind)))
- elif 'csr' in type_:
+ elif "csr" in type_:
return sparse.csr_matrix((data, (row_ind, col_ind)))
- elif 'coo' in type_:
+ elif "coo" in type_:
return sparse.coo_matrix((data, (row_ind, col_ind)))
- elif 'bsr' in type_:
+ elif "bsr" in type_:
return sparse.bsr_matrix((data, (row_ind, col_ind)))
- elif 'lil' in type_:
+ elif "lil" in type_:
return sparse.lil_matrix((data))
- elif 'dok' in type_:
+ elif "dok" in type_:
return sparse.dok_matrix(np.vstack((data, data, data)))
- elif 'dia' in type_:
+ elif "dia" in type_:
return sparse.dia_matrix(np.vstack((data, data, data)))
else:
ValueError("Unsupported indirect fixture {}".format(request.param))
- elif 'openml' in request.param:
- _, openml_id = request.param.split('_')
- X, y = sklearn.datasets.fetch_openml(data_id=int(openml_id),
- return_X_y=True, as_frame=True)
+ elif "openml" in request.param:
+ _, openml_id = request.param.split("_")
+ X, y = sklearn.datasets.fetch_openml(
+ data_id=int(openml_id), return_X_y=True, as_frame=True
+ )
return X
else:
ValueError("Unsupported indirect fixture {}".format(request.param))
@@ -162,37 +176,37 @@ def input_data_featuretest(request):
# Actual checks for the features
@pytest.mark.parametrize(
- 'input_data_featuretest',
+ "input_data_featuretest",
(
- 'numpy_categoricalonly_nonan',
- 'numpy_numericalonly_nonan',
- 'numpy_mixed_nonan',
- 'numpy_categoricalonly_nan',
- 'numpy_numericalonly_nan',
- 'numpy_mixed_nan',
- 'pandas_categoricalonly_nonan',
- 'pandas_numericalonly_nonan',
- 'pandas_mixed_nonan',
- 'pandas_numericalonly_nan',
- 'list_numericalonly_nonan',
- 'list_numericalonly_nan',
- 'sparse_bsr_nonan',
- 'sparse_bsr_nan',
- 'sparse_coo_nonan',
- 'sparse_coo_nan',
- 'sparse_csc_nonan',
- 'sparse_csc_nan',
- 'sparse_csr_nonan',
- 'sparse_csr_nan',
- 'sparse_dia_nonan',
- 'sparse_dia_nan',
- 'sparse_dok_nonan',
- 'sparse_dok_nan',
- 'sparse_lil_nonan',
- 'sparse_lil_nan',
- 'openml_40981', # Australian
+ "numpy_categoricalonly_nonan",
+ "numpy_numericalonly_nonan",
+ "numpy_mixed_nonan",
+ "numpy_categoricalonly_nan",
+ "numpy_numericalonly_nan",
+ "numpy_mixed_nan",
+ "pandas_categoricalonly_nonan",
+ "pandas_numericalonly_nonan",
+ "pandas_mixed_nonan",
+ "pandas_numericalonly_nan",
+ "list_numericalonly_nonan",
+ "list_numericalonly_nan",
+ "sparse_bsr_nonan",
+ "sparse_bsr_nan",
+ "sparse_coo_nonan",
+ "sparse_coo_nan",
+ "sparse_csc_nonan",
+ "sparse_csc_nan",
+ "sparse_csr_nonan",
+ "sparse_csr_nan",
+ "sparse_dia_nonan",
+ "sparse_dia_nan",
+ "sparse_dok_nonan",
+ "sparse_dok_nan",
+ "sparse_lil_nonan",
+ "sparse_lil_nan",
+ "openml_40981", # Australian
),
- indirect=True
+ indirect=True,
)
def test_featurevalidator_supported_types(input_data_featuretest):
validator = FeatureValidator()
@@ -209,43 +223,45 @@ def test_featurevalidator_supported_types(input_data_featuretest):
@pytest.mark.parametrize(
- 'input_data_featuretest',
+ "input_data_featuretest",
(
- 'numpy_string_nonan',
- 'numpy_string_nan',
+ "numpy_string_nonan",
+ "numpy_string_nan",
),
- indirect=True
+ indirect=True,
)
def test_featurevalidator_unsupported_numpy(input_data_featuretest):
validator = FeatureValidator()
- with pytest.raises(ValueError, match=r".*When providing a numpy array.*not supported."):
+ with pytest.raises(
+ ValueError, match=r".*When providing a numpy array.*not supported."
+ ):
validator.fit(input_data_featuretest)
@pytest.mark.parametrize(
- 'input_data_featuretest',
+ "input_data_featuretest",
(
- 'numpy_categoricalonly_nonan',
- 'numpy_mixed_nonan',
- 'numpy_categoricalonly_nan',
- 'numpy_mixed_nan',
- 'pandas_categoricalonly_nonan',
- 'pandas_mixed_nonan',
- 'sparse_bsr_nonan',
- 'sparse_bsr_nan',
- 'sparse_coo_nonan',
- 'sparse_coo_nan',
- 'sparse_csc_nonan',
- 'sparse_csc_nan',
- 'sparse_csr_nonan',
- 'sparse_csr_nan',
- 'sparse_dia_nonan',
- 'sparse_dia_nan',
- 'sparse_dok_nonan',
- 'sparse_dok_nan',
- 'sparse_lil_nonan',
+ "numpy_categoricalonly_nonan",
+ "numpy_mixed_nonan",
+ "numpy_categoricalonly_nan",
+ "numpy_mixed_nan",
+ "pandas_categoricalonly_nonan",
+ "pandas_mixed_nonan",
+ "sparse_bsr_nonan",
+ "sparse_bsr_nan",
+ "sparse_coo_nonan",
+ "sparse_coo_nan",
+ "sparse_csc_nonan",
+ "sparse_csc_nan",
+ "sparse_csr_nonan",
+ "sparse_csr_nan",
+ "sparse_dia_nonan",
+ "sparse_dia_nan",
+ "sparse_dok_nonan",
+ "sparse_dok_nan",
+ "sparse_lil_nonan",
),
- indirect=True
+ indirect=True,
)
def test_featurevalidator_fitontypeA_transformtypeB(input_data_featuretest):
"""
@@ -276,20 +292,24 @@ def test_featurevalidatorget_feat_type_from_columns():
"""
validator = FeatureValidator()
- df = pd.DataFrame([
- {'int': 1, 'float': 1.0, 'category': 'one', 'bool': True},
- {'int': 2, 'float': 2.0, 'category': 'two', 'bool': False},
- ])
+ df = pd.DataFrame(
+ [
+ {"int": 1, "float": 1.0, "category": "one", "bool": True},
+ {"int": 2, "float": 2.0, "category": "two", "bool": False},
+ ]
+ )
for col in df.columns:
df[col] = df[col].astype(col)
feature_types = validator.get_feat_type_from_columns(df)
- assert feature_types == {'int': 'numerical',
- 'float': 'numerical',
- 'category': 'categorical',
- 'bool': 'categorical'}
+ assert feature_types == {
+ "int": "numerical",
+ "float": "numerical",
+ "category": "categorical",
+ "bool": "categorical",
+ }
def test_features_unsupported_calls_are_raised():
@@ -300,28 +320,37 @@ def test_features_unsupported_calls_are_raised():
"""
validator = FeatureValidator()
with pytest.raises(ValueError, match=r"Auto-sklearn does not support time"):
+ validator.fit(pd.DataFrame({"datetime": [pd.Timestamp("20180310")]}))
+ with pytest.raises(
+ ValueError, match=r"Auto-sklearn only supports.*yet, the provided input"
+ ):
+ validator.fit({"input1": 1, "input2": 2})
+ validator = FeatureValidator()
+ with pytest.raises(
+ ValueError, match=r"The feature dimensionality of the train and test"
+ ):
validator.fit(
- pd.DataFrame({'datetime': [pd.Timestamp('20180310')]})
+ X_train=np.array([[1, 2, 3], [4, 5, 6]]),
+ X_test=np.array([[1, 2, 3, 4], [4, 5, 6, 7]]),
)
- with pytest.raises(ValueError, match=r"Auto-sklearn only supports.*yet, the provided input"):
- validator.fit({'input1': 1, 'input2': 2})
- validator = FeatureValidator()
- with pytest.raises(ValueError, match=r"The feature dimensionality of the train and test"):
- validator.fit(X_train=np.array([[1, 2, 3], [4, 5, 6]]),
- X_test=np.array([[1, 2, 3, 4], [4, 5, 6, 7]]),
- )
- with pytest.raises(ValueError, match=r"Cannot call transform on a validator that is not fit"):
+ with pytest.raises(
+ ValueError, match=r"Cannot call transform on a validator that is not fit"
+ ):
validator.transform(np.array([[1, 2, 3], [4, 5, 6]]))
- validator = FeatureValidator(feat_type=['Numerical'])
- with pytest.raises(ValueError, match=r"providing the option feat_type to the fit method is.*"):
+ validator = FeatureValidator(feat_type=["Numerical"])
+ with pytest.raises(
+ ValueError, match=r"providing the option feat_type to the fit method is.*"
+ ):
validator.fit(pd.DataFrame([[1, 2, 3], [4, 5, 6]]))
with pytest.raises(ValueError, match=r"feat_type does not have same number of.*"):
validator.fit(np.array([[1, 2, 3], [4, 5, 6]]))
validator = FeatureValidator(feat_type=[1, 2, 3])
with pytest.raises(ValueError, match=r"feat_type must only contain strings.*"):
validator.fit(np.array([[1, 2, 3], [4, 5, 6]]))
- validator = FeatureValidator(feat_type=['1', '2', '3'])
- with pytest.raises(ValueError, match=r"Only `Categorical`, `Numerical` and `String` are.*"):
+ validator = FeatureValidator(feat_type=["1", "2", "3"])
+ with pytest.raises(
+ ValueError, match=r"Only `Categorical`, `Numerical` and `String` are.*"
+ ):
validator.fit(np.array([[1, 2, 3], [4, 5, 6]]))
@@ -331,16 +360,16 @@ def test_no_new_category_after_fit():
without throwing an error
"""
# Then make sure we catch categorical extra categories
- x = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [5, 6, 7, 8]}, dtype='category')
+ x = pd.DataFrame({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8]}, dtype="category")
validator = FeatureValidator()
validator.fit(x)
- x['A'] = x['A'].apply(lambda x: x*x)
+ x["A"] = x["A"].apply(lambda x: x * x)
validator.transform(x)
# Actual checks for the features
@pytest.mark.parametrize(
- 'openml_id',
+ "openml_id",
(
40981, # Australian
3, # kr-vs-kp
@@ -349,32 +378,37 @@ def test_no_new_category_after_fit():
40984, # Segment
),
)
-@pytest.mark.parametrize('train_data_type', ('numpy', 'pandas', 'list'))
-@pytest.mark.parametrize('test_data_type', ('numpy', 'pandas', 'list'))
-def test_featurevalidator_new_data_after_fit(openml_id,
- train_data_type, test_data_type):
+@pytest.mark.parametrize("train_data_type", ("numpy", "pandas", "list"))
+@pytest.mark.parametrize("test_data_type", ("numpy", "pandas", "list"))
+def test_featurevalidator_new_data_after_fit(
+ openml_id, train_data_type, test_data_type
+):
    # Lists are currently not supported, as infer_objects
    # casts list elements to dtype object
- if train_data_type == 'list' or test_data_type == 'list':
+ if train_data_type == "list" or test_data_type == "list":
pytest.skip()
validator = FeatureValidator()
- if train_data_type == 'numpy':
- X, y = sklearn.datasets.fetch_openml(data_id=openml_id,
- return_X_y=True, as_frame=False)
- elif train_data_type == 'pandas':
- X, y = sklearn.datasets.fetch_openml(data_id=openml_id,
- return_X_y=True, as_frame=True)
+ if train_data_type == "numpy":
+ X, y = sklearn.datasets.fetch_openml(
+ data_id=openml_id, return_X_y=True, as_frame=False
+ )
+ elif train_data_type == "pandas":
+ X, y = sklearn.datasets.fetch_openml(
+ data_id=openml_id, return_X_y=True, as_frame=True
+ )
else:
- X, y = sklearn.datasets.fetch_openml(data_id=openml_id,
- return_X_y=True, as_frame=True)
+ X, y = sklearn.datasets.fetch_openml(
+ data_id=openml_id, return_X_y=True, as_frame=True
+ )
X = X.values.tolist()
y = y.values.tolist()
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
- X, y, random_state=1)
+ X, y, random_state=1
+ )
validator.fit(X_train)
@@ -391,7 +425,7 @@ def test_featurevalidator_new_data_after_fit(openml_id,
@pytest.mark.parametrize(
- 'openml_id',
+ "openml_id",
(
40981, # Australian
3, # kr-vs-kp
@@ -403,10 +437,12 @@ def test_featurevalidator_new_data_after_fit(openml_id,
)
def test_list_to_dataframe(openml_id):
- X_pandas, y_pandas = sklearn.datasets.fetch_openml(data_id=openml_id,
- return_X_y=True, as_frame=True)
+ X_pandas, y_pandas = sklearn.datasets.fetch_openml(
+ data_id=openml_id, return_X_y=True, as_frame=True
+ )
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
- X_pandas, y_pandas, random_state=1)
+ X_pandas, y_pandas, random_state=1
+ )
X_list = X_train.values.tolist()
validator = FeatureValidator()
@@ -439,24 +475,24 @@ def test_list_to_dataframe(openml_id):
@pytest.mark.parametrize(
- 'input_data_featuretest',
+ "input_data_featuretest",
(
- 'sparse_bsr_nonan',
- 'sparse_bsr_nan',
- 'sparse_coo_nonan',
- 'sparse_coo_nan',
- 'sparse_csc_nonan',
- 'sparse_csc_nan',
- 'sparse_csr_nonan',
- 'sparse_csr_nan',
- 'sparse_dia_nonan',
- 'sparse_dia_nan',
- 'sparse_dok_nonan',
- 'sparse_dok_nan',
- 'sparse_lil_nonan',
- 'sparse_lil_nan',
+ "sparse_bsr_nonan",
+ "sparse_bsr_nan",
+ "sparse_coo_nonan",
+ "sparse_coo_nan",
+ "sparse_csc_nonan",
+ "sparse_csc_nan",
+ "sparse_csr_nonan",
+ "sparse_csr_nan",
+ "sparse_dia_nonan",
+ "sparse_dia_nan",
+ "sparse_dok_nonan",
+ "sparse_dok_nan",
+ "sparse_lil_nonan",
+ "sparse_lil_nan",
),
- indirect=True
+ indirect=True,
)
def test_sparse_output_is_csr(input_data_featuretest):
validator = FeatureValidator()
@@ -467,7 +503,9 @@ def test_sparse_output_is_csr(input_data_featuretest):
def test_unsupported_dataframe_sparse():
- df = pd.DataFrame({'A': pd.Series(pd.arrays.SparseArray(np.random.randn(10)))})
+ df = pd.DataFrame({"A": pd.Series(pd.arrays.SparseArray(np.random.randn(10)))})
validator = FeatureValidator()
- with pytest.raises(ValueError, match=r"Auto-sklearn does not yet support sparse pandas"):
+ with pytest.raises(
+ ValueError, match=r"Auto-sklearn does not yet support sparse pandas"
+ ):
validator.fit(df)
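
(Not part of the diff: a hedged sketch of the FeatureValidator workflow that the tests above exercise; the example data are made up.)

import pandas as pd

from autosklearn.data.feature_validator import FeatureValidator

X = pd.DataFrame(
    {
        "num": [1.0, 2.0, 3.0],
        "cat": pd.Series(["a", "b", "a"], dtype="category"),
    }
)
validator = FeatureValidator()
validator.fit(X)                                # infers feat_type from the dtypes
print(validator.get_feat_type_from_columns(X))  # {"num": "numerical", "cat": "categorical"}
X_transformed = validator.transform(X)          # sparse inputs come back as CSR
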
diff --git a/test/test_data/test_target_validator.py b/test/test_data/test_target_validator.py
index 09e075b85f..e57f464c72 100644
--- a/test/test_data/test_target_validator.py
+++ b/test/test_data/test_target_validator.py
@@ -1,14 +1,10 @@
import numpy as np
-
import pandas as pd
-
import pytest
-from pandas.api.types import is_numeric_dtype, is_bool_dtype
-
-from scipy import sparse
-
import sklearn.datasets
import sklearn.model_selection
+from pandas.api.types import is_bool_dtype, is_numeric_dtype
+from scipy import sparse
from sklearn.utils.multiclass import type_of_target
from autosklearn.data.target_validator import TargetValidator
@@ -17,80 +13,85 @@
# Fixtures to be used in this class. By default all elements have 100 datapoints
@pytest.fixture
def input_data_targettest(request):
- if request.param == 'series_binary':
+ if request.param == "series_binary":
return pd.Series([1, -1, -1, 1])
- elif request.param == 'series_multiclass':
+ elif request.param == "series_multiclass":
return pd.Series([1, 0, 2])
- elif request.param == 'series_multilabel':
+ elif request.param == "series_multilabel":
return pd.Series([[1, 0], [0, 1]])
- elif request.param == 'series_continuous':
+ elif request.param == "series_continuous":
return pd.Series([0.1, 0.6, 0.7])
- elif request.param == 'series_continuous-multioutput':
+ elif request.param == "series_continuous-multioutput":
return pd.Series([[1.5, 2.0], [3.0, 1.6]])
- elif request.param == 'pandas_binary':
+ elif request.param == "pandas_binary":
return pd.DataFrame([1, -1, -1, 1])
- elif request.param == 'pandas_multiclass':
+ elif request.param == "pandas_multiclass":
return pd.DataFrame([1, 0, 2])
- elif request.param == 'pandas_multilabel':
+ elif request.param == "pandas_multilabel":
return pd.DataFrame([[1, 0], [0, 1]])
- elif request.param == 'pandas_continuous':
+ elif request.param == "pandas_continuous":
return pd.DataFrame([0.1, 0.6, 0.7])
- elif request.param == 'pandas_continuous-multioutput':
+ elif request.param == "pandas_continuous-multioutput":
return pd.DataFrame([[1.5, 2.0], [3.0, 1.6]])
- elif request.param == 'numpy_binary':
+ elif request.param == "numpy_binary":
return np.array([1, -1, -1, 1])
- elif request.param == 'numpy_multiclass':
+ elif request.param == "numpy_multiclass":
return np.array([1, 0, 2])
- elif request.param == 'numpy_multilabel':
+ elif request.param == "numpy_multilabel":
return np.array([[1, 0], [0, 1]])
- elif request.param == 'numpy_continuous':
+ elif request.param == "numpy_continuous":
return np.array([0.1, 0.6, 0.7])
- elif request.param == 'numpy_continuous-multioutput':
+ elif request.param == "numpy_continuous-multioutput":
return np.array([[1.5, 2.0], [3.0, 1.6]])
- elif request.param == 'list_binary':
+ elif request.param == "list_binary":
return [1, -1, -1, 1]
- elif request.param == 'list_multiclass':
+ elif request.param == "list_multiclass":
return [1, 0, 2]
- elif request.param == 'list_multilabel':
+ elif request.param == "list_multilabel":
return [[0, 1], [1, 0]]
- elif request.param == 'list_continuous':
+ elif request.param == "list_continuous":
return [0.1, 0.6, 0.7]
- elif request.param == 'list_continuous-multioutput':
+ elif request.param == "list_continuous-multioutput":
return [[1.5, 2.0], [3.0, 1.6]]
- elif 'openml' in request.param:
- _, openml_id = request.param.split('_')
- X, y = sklearn.datasets.fetch_openml(data_id=int(openml_id),
- return_X_y=True, as_frame=True)
- if len(y.shape) > 1 and y.shape[1] > 1 and np.any(y.eq('TRUE').any(1).to_numpy()):
+ elif "openml" in request.param:
+ _, openml_id = request.param.split("_")
+ X, y = sklearn.datasets.fetch_openml(
+ data_id=int(openml_id), return_X_y=True, as_frame=True
+ )
+ if (
+ len(y.shape) > 1
+ and y.shape[1] > 1
+ and np.any(y.eq("TRUE").any(1).to_numpy())
+ ):
        # This branch is only entered for multi-label data
# Force the downloaded data to be interpreted as multilabel
y = y.dropna()
- y.replace('FALSE', 0, inplace=True)
- y.replace('TRUE', 1, inplace=True)
+ y.replace("FALSE", 0, inplace=True)
+ y.replace("TRUE", 1, inplace=True)
y = y.astype(int)
return y
- elif 'sparse' in request.param:
+ elif "sparse" in request.param:
# We expect the names to be of the type sparse_csc_nonan
- sparse_, type_, nan_ = request.param.split('_')
- if 'nonan' in nan_:
+ sparse_, type_, nan_ = request.param.split("_")
+ if "nonan" in nan_:
data = np.ones(3)
else:
data = np.array([1, 2, np.nan])
# Then the type of sparse
- if 'csc' in type_:
+ if "csc" in type_:
return sparse.csc_matrix(data)
- elif 'csr' in type_:
+ elif "csr" in type_:
return sparse.csr_matrix(data)
- elif 'coo' in type_:
+ elif "coo" in type_:
return sparse.coo_matrix(data)
- elif 'bsr' in type_:
+ elif "bsr" in type_:
return sparse.bsr_matrix(data)
- elif 'lil' in type_:
+ elif "lil" in type_:
return sparse.lil_matrix(data)
- elif 'dok' in type_:
+ elif "dok" in type_:
return sparse.dok_matrix(np.vstack((data, data, data)))
- elif 'dia' in type_:
+ elif "dia" in type_:
return sparse.dia_matrix(np.vstack((data, data, data)))
else:
ValueError("Unsupported indirect fixture {}".format(request.param))
@@ -100,29 +101,29 @@ def input_data_targettest(request):
# Actual checks for the targets
@pytest.mark.parametrize(
- 'input_data_targettest',
+ "input_data_targettest",
(
- 'series_binary',
- 'series_multiclass',
- 'series_continuous',
- 'pandas_binary',
- 'pandas_multiclass',
- 'pandas_multilabel',
- 'pandas_continuous',
- 'pandas_continuous-multioutput',
- 'numpy_binary',
- 'numpy_multiclass',
- 'numpy_multilabel',
- 'numpy_continuous',
- 'numpy_continuous-multioutput',
- 'list_binary',
- 'list_multiclass',
- 'list_multilabel',
- 'list_continuous',
- 'list_continuous-multioutput',
- 'openml_204',
+ "series_binary",
+ "series_multiclass",
+ "series_continuous",
+ "pandas_binary",
+ "pandas_multiclass",
+ "pandas_multilabel",
+ "pandas_continuous",
+ "pandas_continuous-multioutput",
+ "numpy_binary",
+ "numpy_multiclass",
+ "numpy_multilabel",
+ "numpy_continuous",
+ "numpy_continuous-multioutput",
+ "list_binary",
+ "list_multiclass",
+ "list_multilabel",
+ "list_continuous",
+ "list_continuous-multioutput",
+ "openml_204",
),
- indirect=True
+ indirect=True,
)
def test_targetvalidator_supported_types_noclassification(input_data_targettest):
y = input_data_targettest
@@ -146,19 +147,19 @@ def test_targetvalidator_supported_types_noclassification(input_data_targettest)
@pytest.mark.parametrize(
- 'input_data_targettest',
+ "input_data_targettest",
(
- 'series_binary',
- 'series_multiclass',
- 'pandas_binary',
- 'pandas_multiclass',
- 'numpy_binary',
- 'numpy_multiclass',
- 'list_binary',
- 'list_multiclass',
- 'openml_2',
+ "series_binary",
+ "series_multiclass",
+ "pandas_binary",
+ "pandas_multiclass",
+ "numpy_binary",
+ "numpy_multiclass",
+ "list_binary",
+ "list_multiclass",
+ "openml_2",
),
- indirect=True
+ indirect=True,
)
def test_targetvalidator_supported_types_classification(input_data_targettest):
y = input_data_targettest # Just to remove visual clutter
@@ -177,10 +178,7 @@ def test_targetvalidator_supported_types_classification(input_data_targettest):
assert isinstance(y_inverse, np.ndarray)
# Assert that y_encoded is numeric and not boolean
- assert (
- is_numeric_dtype(y_encoded.dtype)
- and not is_bool_dtype(y_encoded.dtype)
- )
+ assert is_numeric_dtype(y_encoded.dtype) and not is_bool_dtype(y_encoded.dtype)
    # Assert dtype is preserved with y -> y_encoded -> y_inverse
def dtype(arr):
@@ -205,7 +203,7 @@ def dtype(arr):
if len(shape) == 2 and shape[1] == 1:
# For cases where y = [[1], [2], [3]],
            # we expect y_inverse, y_encoded to have been flattened to [1, 2, 3]
- expected_shape = (shape[0], )
+ expected_shape = (shape[0],)
else:
expected_shape = shape
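
As a quick sanity check of the shape expectation above: a single-column 2-D target collapses to a 1-D vector, so only the first dimension survives.

import numpy as np

y = np.array([[1], [2], [3]])     # shape (3, 1)
flattened = y.ravel()             # the flattened form the test expects
assert flattened.shape == (y.shape[0],) == (3,)
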
@@ -221,7 +219,7 @@ def dtype(arr):
#
# As a result of this, we don't encode 'multilabel-indicator' labels and
# there is nothing else to check here
- if validator.type_of_target == 'multilabel-indicator':
+ if validator.type_of_target == "multilabel-indicator":
assert validator.encoder is None
else:
@@ -242,112 +240,112 @@ def dtype(arr):
@pytest.mark.parametrize(
- 'input_data_targettest',
+ "input_data_targettest",
(
- 'series_binary',
- 'pandas_binary',
- 'numpy_binary',
- 'list_binary',
- 'openml_1066',
+ "series_binary",
+ "pandas_binary",
+ "numpy_binary",
+ "list_binary",
+ "openml_1066",
),
- indirect=True
+ indirect=True,
)
def test_targetvalidator_binary(input_data_targettest):
- assert type_of_target(input_data_targettest) == 'binary'
+ assert type_of_target(input_data_targettest) == "binary"
validator = TargetValidator(is_classification=True)
# Test the X_test also!
validator.fit(input_data_targettest, input_data_targettest)
transformed_y = validator.transform(input_data_targettest)
- assert type_of_target(transformed_y) == 'binary'
+ assert type_of_target(transformed_y) == "binary"
@pytest.mark.parametrize(
- 'input_data_targettest',
+ "input_data_targettest",
(
- 'series_multiclass',
- 'pandas_multiclass',
- 'numpy_multiclass',
- 'list_multiclass',
- 'openml_54',
+ "series_multiclass",
+ "pandas_multiclass",
+ "numpy_multiclass",
+ "list_multiclass",
+ "openml_54",
),
- indirect=True
+ indirect=True,
)
def test_targetvalidator_multiclass(input_data_targettest):
- assert type_of_target(input_data_targettest) == 'multiclass'
+ assert type_of_target(input_data_targettest) == "multiclass"
validator = TargetValidator(is_classification=True)
# Test the X_test also!
validator.fit(input_data_targettest, input_data_targettest)
transformed_y = validator.transform(input_data_targettest)
- assert type_of_target(transformed_y) == 'multiclass'
+ assert type_of_target(transformed_y) == "multiclass"
@pytest.mark.parametrize(
- 'input_data_targettest',
+ "input_data_targettest",
(
- 'pandas_multilabel',
- 'numpy_multilabel',
- 'list_multilabel',
- 'openml_40594',
+ "pandas_multilabel",
+ "numpy_multilabel",
+ "list_multilabel",
+ "openml_40594",
),
- indirect=True
+ indirect=True,
)
def test_targetvalidator_multilabel(input_data_targettest):
- assert type_of_target(input_data_targettest) == 'multilabel-indicator'
+ assert type_of_target(input_data_targettest) == "multilabel-indicator"
validator = TargetValidator(is_classification=True)
# Test the X_test also!
validator.fit(input_data_targettest, input_data_targettest)
transformed_y = validator.transform(input_data_targettest)
- assert type_of_target(transformed_y) == 'multilabel-indicator'
+ assert type_of_target(transformed_y) == "multilabel-indicator"
@pytest.mark.parametrize(
- 'input_data_targettest',
+ "input_data_targettest",
(
- 'series_continuous',
- 'pandas_continuous',
- 'numpy_continuous',
- 'list_continuous',
- 'openml_531',
+ "series_continuous",
+ "pandas_continuous",
+ "numpy_continuous",
+ "list_continuous",
+ "openml_531",
),
- indirect=True
+ indirect=True,
)
def test_targetvalidator_continuous(input_data_targettest):
- assert type_of_target(input_data_targettest) == 'continuous'
+ assert type_of_target(input_data_targettest) == "continuous"
validator = TargetValidator(is_classification=False)
# Test the X_test also!
validator.fit(input_data_targettest, input_data_targettest)
transformed_y = validator.transform(input_data_targettest)
- assert type_of_target(transformed_y) == 'continuous'
+ assert type_of_target(transformed_y) == "continuous"
@pytest.mark.parametrize(
- 'input_data_targettest',
+ "input_data_targettest",
(
- 'pandas_continuous-multioutput',
- 'numpy_continuous-multioutput',
- 'list_continuous-multioutput',
- 'openml_41483',
+ "pandas_continuous-multioutput",
+ "numpy_continuous-multioutput",
+ "list_continuous-multioutput",
+ "openml_41483",
),
- indirect=True
+ indirect=True,
)
def test_targetvalidator_continuous_multioutput(input_data_targettest):
- assert type_of_target(input_data_targettest) == 'continuous-multioutput'
+ assert type_of_target(input_data_targettest) == "continuous-multioutput"
validator = TargetValidator(is_classification=False)
# Test the X_test also!
validator.fit(input_data_targettest, input_data_targettest)
transformed_y = validator.transform(input_data_targettest)
- assert type_of_target(transformed_y) == 'continuous-multioutput'
+ assert type_of_target(transformed_y) == "continuous-multioutput"
@pytest.mark.parametrize(
- 'input_data_targettest',
+ "input_data_targettest",
(
- 'series_binary',
- 'pandas_binary',
- 'numpy_binary',
- 'list_binary',
+ "series_binary",
+ "pandas_binary",
+ "numpy_binary",
+ "list_binary",
),
- indirect=True
+ indirect=True,
)
def test_targetvalidator_fitontypeA_transformtypeB(input_data_targettest):
"""
@@ -370,12 +368,12 @@ def test_targetvalidator_fitontypeA_transformtypeB(input_data_targettest):
@pytest.mark.parametrize(
- 'input_data_targettest',
+ "input_data_targettest",
(
- 'series_multilabel',
- 'series_continuous-multioutput',
+ "series_multilabel",
+ "series_continuous-multioutput",
),
- indirect=True
+ indirect=True,
)
def test_type_of_target_unsupported(input_data_targettest):
"""
@@ -393,43 +391,63 @@ def test_target_unsupported():
when providing not supported data input
"""
validator = TargetValidator(is_classification=True)
- with pytest.raises(ValueError, match=r"The dimensionality of the train and test targets"):
+ with pytest.raises(
+ ValueError, match=r"The dimensionality of the train and test targets"
+ ):
validator.fit(
np.array([[0, 1, 0], [0, 1, 1]]),
np.array([[0, 1, 0, 0], [0, 1, 1, 1]]),
)
- with pytest.raises(ValueError, match=r"Train and test targets must both have the same dtypes"):
+ with pytest.raises(
+ ValueError, match=r"Train and test targets must both have the same dtypes"
+ ):
validator.fit(
- pd.DataFrame({'a': [1, 2, 3]}),
- pd.DataFrame({'a': [True, False, False]}),
+ pd.DataFrame({"a": [1, 2, 3]}),
+ pd.DataFrame({"a": [True, False, False]}),
)
with pytest.raises(ValueError, match=r"Provided targets are not supported.*"):
validator.fit(
np.array([[0, 1, 2], [0, 3, 4]]),
np.array([[0, 1, 2, 5], [0, 3, 4, 6]]),
)
- with pytest.raises(ValueError, match="Train and test targets must both have the same"):
+ with pytest.raises(
+ ValueError, match="Train and test targets must both have the same"
+ ):
validator.fit(
- pd.DataFrame({'string': ['foo']}),
- pd.DataFrame({'int': [1]}),
+ pd.DataFrame({"string": ["foo"]}),
+ pd.DataFrame({"int": [1]}),
)
- with pytest.raises(ValueError, match=r"Auto-sklearn only supports Numpy arrays, .*"):
- validator.fit({'input1': 1, 'input2': 2})
- with pytest.raises(ValueError, match=r"arget values cannot contain missing/NaN values"):
+ with pytest.raises(
+ ValueError, match=r"Auto-sklearn only supports Numpy arrays, .*"
+ ):
+ validator.fit({"input1": 1, "input2": 2})
+ with pytest.raises(
+ ValueError, match=r"arget values cannot contain missing/NaN values"
+ ):
validator.fit(np.array([np.nan, 1, 2]))
- with pytest.raises(ValueError, match=r"arget values cannot contain missing/NaN values"):
+ with pytest.raises(
+ ValueError, match=r"arget values cannot contain missing/NaN values"
+ ):
validator.fit(sparse.csr_matrix(np.array([1, 2, np.nan])))
- with pytest.raises(ValueError, match=r"TargetValidator must have fit\(\) called first"):
+ with pytest.raises(
+ ValueError, match=r"TargetValidator must have fit\(\) called first"
+ ):
validator.transform(np.array([1, 2, 3]))
- with pytest.raises(ValueError, match=r"TargetValidator must have fit\(\) called first"):
+ with pytest.raises(
+ ValueError, match=r"TargetValidator must have fit\(\) called first"
+ ):
validator.inverse_transform(np.array([1, 2, 3]))
- with pytest.raises(ValueError, match=r"Multi-dimensional classification is not yet supported"):
+ with pytest.raises(
+ ValueError, match=r"Multi-dimensional classification is not yet supported"
+ ):
validator._fit(np.array([[1, 2, 3], [1, 5, 6]]))
    # DIA/DOK are not supported because type_of_target calls len() on the array,
    # which raises TypeError: len() of unsized object. Basically, sparse data as
    # multi-label is the only format that makes sense here.
- with pytest.raises(ValueError, match=r"The provided data could not be interpreted by Sklearn"):
+ with pytest.raises(
+ ValueError, match=r"The provided data could not be interpreted by Sklearn"
+ ):
validator.fit(sparse.dia_matrix(np.array([1, 2, 3])))
validator.fit(np.array([[0, 1, 0], [0, 1, 1]]))
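
A note on the match arguments above: pytest applies them with re.search against the string form of the raised exception, which is why the deliberately truncated pattern "arget values..." matches whether the message starts with "Target" or "target". A minimal illustration:

import re
import pytest

def raise_msg(msg):
    raise ValueError(msg)

for msg in (
    "Target values cannot contain missing/NaN values",
    "target values cannot contain missing/NaN values",
):
    assert re.search(r"arget values cannot contain missing/NaN values", msg)
    with pytest.raises(ValueError, match=r"arget values cannot contain"):
        raise_msg(msg)
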
@@ -443,22 +461,21 @@ def test_targetvalidator_inversetransform():
"""
validator = TargetValidator(is_classification=True)
validator.fit(
- pd.DataFrame(data=['a', 'a', 'b', 'c', 'a'], dtype='category'),
+ pd.DataFrame(data=["a", "a", "b", "c", "a"], dtype="category"),
)
y = validator.transform(
- pd.DataFrame(data=['a', 'a', 'b', 'c', 'a'], dtype='category'),
+ pd.DataFrame(data=["a", "a", "b", "c", "a"], dtype="category"),
)
np.testing.assert_array_almost_equal(np.array([0, 0, 1, 2, 0]), y)
y_decoded = validator.inverse_transform(y)
- assert ['a', 'a', 'b', 'c', 'a'] == y_decoded.tolist()
+ assert ["a", "a", "b", "c", "a"] == y_decoded.tolist()
- assert validator.classes_.tolist() == ['a', 'b', 'c']
+ assert validator.classes_.tolist() == ["a", "b", "c"]
validator = TargetValidator(is_classification=True)
multi_label = pd.DataFrame(
- np.array([[1, 0, 0, 1], [0, 0, 1, 1], [0, 0, 0, 0]]),
- dtype=bool
+ np.array([[1, 0, 0, 1], [0, 0, 1, 1], [0, 0, 0, 0]]), dtype=bool
)
validator.fit(multi_label)
y = validator.transform(multi_label)
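
The categorical round trip checked above is conceptually plain label encoding: classes are sorted, mapped to consecutive integers, and mapped back on inverse_transform. A minimal sketch of the same behaviour with scikit-learn's LabelEncoder (an analogy only; TargetValidator adds the shape and dtype handling exercised elsewhere in these tests):

import numpy as np
from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder().fit(["a", "a", "b", "c", "a"])
y = enc.transform(["a", "a", "b", "c", "a"])
np.testing.assert_array_equal(y, [0, 0, 1, 2, 0])
assert enc.classes_.tolist() == ["a", "b", "c"]
assert enc.inverse_transform(y).tolist() == ["a", "a", "b", "c", "a"]
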
@@ -473,18 +490,18 @@ def test_targetvalidator_inversetransform():
# Actual checks for the targets
@pytest.mark.parametrize(
- 'input_data_targettest',
+ "input_data_targettest",
(
- 'series_binary',
- 'series_multiclass',
- 'pandas_binary',
- 'pandas_multiclass',
- 'numpy_binary',
- 'numpy_multiclass',
- 'list_binary',
- 'list_multiclass',
+ "series_binary",
+ "series_multiclass",
+ "pandas_binary",
+ "pandas_multiclass",
+ "numpy_binary",
+ "numpy_multiclass",
+ "list_binary",
+ "list_multiclass",
),
- indirect=True
+ indirect=True,
)
def test_unknown_categories_in_targets(input_data_targettest):
validator = TargetValidator(is_classification=True)
diff --git a/test/test_data/test_validation.py b/test/test_data/test_validation.py
index 7bc2cb3dc5..4d09c65075 100644
--- a/test/test_data/test_validation.py
+++ b/test/test_data/test_validation.py
@@ -1,34 +1,33 @@
import numpy as np
-
import pandas as pd
-
import pytest
-
-from scipy import sparse
-
import sklearn.datasets
import sklearn.model_selection
+from scipy import sparse
from autosklearn.data.validation import InputValidator
-@pytest.mark.parametrize('openmlid', [2, 40975, 40984])
-@pytest.mark.parametrize('as_frame', [True, False])
+@pytest.mark.parametrize("openmlid", [2, 40975, 40984])
+@pytest.mark.parametrize("as_frame", [True, False])
def test_data_validation_for_classification(openmlid, as_frame):
- x, y = sklearn.datasets.fetch_openml(data_id=openmlid, return_X_y=True, as_frame=as_frame)
+ x, y = sklearn.datasets.fetch_openml(
+ data_id=openmlid, return_X_y=True, as_frame=as_frame
+ )
validator = InputValidator(is_classification=True)
if as_frame:
# NaN is not supported in categories, so
# drop columns with them.
nan_cols = [i for i in x.columns if x[i].isnull().any()]
- cat_cols = [i for i in x.columns if x[i].dtype.name in ['category', 'bool']]
+ cat_cols = [i for i in x.columns if x[i].dtype.name in ["category", "bool"]]
unsupported_columns = list(set(nan_cols) & set(cat_cols))
if len(unsupported_columns) > 0:
x.drop(unsupported_columns, axis=1, inplace=True)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
- x, y, test_size=0.33, random_state=0)
+ x, y, test_size=0.33, random_state=0
+ )
validator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
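
The filtering above follows the constraint stated in the comment: categorical/bool columns that contain NaN are dropped before validation, while numeric columns with NaN are kept. The same filter, isolated on a small hypothetical frame:

import numpy as np
import pandas as pd

x = pd.DataFrame({
    "cat_ok": pd.Series(["a", "b", "a"], dtype="category"),
    "cat_nan": pd.Series(["a", None, "b"], dtype="category"),
    "num_nan": [1.0, np.nan, 3.0],
})
nan_cols = [c for c in x.columns if x[c].isnull().any()]
cat_cols = [c for c in x.columns if x[c].dtype.name in ["category", "bool"]]
unsupported_columns = list(set(nan_cols) & set(cat_cols))
assert unsupported_columns == ["cat_nan"]   # the numeric NaN column stays
x.drop(unsupported_columns, axis=1, inplace=True)
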
@@ -49,23 +48,26 @@ def test_data_validation_for_classification(openmlid, as_frame):
    assert validator.feature_validator.feat_type is not None
-@pytest.mark.parametrize('openmlid', [505, 546, 531])
-@pytest.mark.parametrize('as_frame', [True, False])
+@pytest.mark.parametrize("openmlid", [505, 546, 531])
+@pytest.mark.parametrize("as_frame", [True, False])
def test_data_validation_for_regression(openmlid, as_frame):
- x, y = sklearn.datasets.fetch_openml(data_id=openmlid, return_X_y=True, as_frame=as_frame)
+ x, y = sklearn.datasets.fetch_openml(
+ data_id=openmlid, return_X_y=True, as_frame=as_frame
+ )
validator = InputValidator(is_classification=False)
if as_frame:
# NaN is not supported in categories, so
# drop columns with them.
nan_cols = [i for i in x.columns if x[i].isnull().any()]
- cat_cols = [i for i in x.columns if x[i].dtype.name in ['category', 'bool']]
+ cat_cols = [i for i in x.columns if x[i].dtype.name in ["category", "bool"]]
unsupported_columns = list(set(nan_cols) & set(cat_cols))
if len(unsupported_columns) > 0:
x.drop(unsupported_columns, axis=1, inplace=True)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
- x, y, test_size=0.33, random_state=0)
+ x, y, test_size=0.33, random_state=0
+ )
validator.fit(X_train=X_train, y_train=y_train)
@@ -83,7 +85,9 @@ def test_data_validation_for_regression(openmlid, as_frame):
def test_sparse_data_validation_for_regression():
- X, y = sklearn.datasets.make_regression(n_samples=100, n_features=50, random_state=0)
+ X, y = sklearn.datasets.make_regression(
+ n_samples=100, n_features=50, random_state=0
+ )
X_sp = sparse.coo_matrix(X)
validator = InputValidator(is_classification=False)
@@ -118,7 +122,9 @@ def test_validation_unsupported():
X_test=np.array([[0, 1, 0], [0, 1, 1]]),
y_test=np.array([0, 1, 0, 0, 0, 0]),
)
- with pytest.raises(ValueError, match=r"Cannot call transform on a validator .*fitted"):
+ with pytest.raises(
+ ValueError, match=r"Cannot call transform on a validator .*fitted"
+ ):
validator.transform(
X=np.array([[0, 1, 0], [0, 1, 1]]),
y=np.array([0, 1]),
diff --git a/test/test_ensemble_builder/__init__.py b/test/test_ensemble_builder/__init__.py
index 51b8efdf22..b74c2a5ccb 100644
--- a/test/test_ensemble_builder/__init__.py
+++ b/test/test_ensemble_builder/__init__.py
@@ -1,2 +1,2 @@
# -*- encoding: utf-8 -*-
-__author__ = 'mlindauer'
+__author__ = "mlindauer"
diff --git a/test/test_ensemble_builder/ensemble_utils.py b/test/test_ensemble_builder/ensemble_utils.py
index b98021c7bd..fa0f22e9e7 100644
--- a/test/test_ensemble_builder/ensemble_utils.py
+++ b/test/test_ensemble_builder/ensemble_utils.py
@@ -5,47 +5,55 @@
import numpy as np
-from autosklearn.automl_common.common.ensemble_building.abstract_ensemble import AbstractEnsemble
-
-from autosklearn.metrics import make_scorer
+from autosklearn.automl_common.common.ensemble_building.abstract_ensemble import (
+ AbstractEnsemble,
+)
from autosklearn.ensemble_builder import EnsembleBuilder
+from autosklearn.metrics import make_scorer
def scorer_function(a, b):
return 0.9
-MockMetric = make_scorer('mock', scorer_function)
+MockMetric = make_scorer("mock", scorer_function)
class BackendMock(object):
-
def __init__(self, target_directory):
- this_directory = os.path.abspath(
- os.path.dirname(__file__)
+ this_directory = os.path.abspath(os.path.dirname(__file__))
+ shutil.copytree(
+ os.path.join(this_directory, "data"), os.path.join(target_directory)
)
- shutil.copytree(os.path.join(this_directory, 'data'), os.path.join(target_directory))
self.temporary_directory = target_directory
- self.internals_directory = os.path.join(self.temporary_directory, '.auto-sklearn')
+ self.internals_directory = os.path.join(
+ self.temporary_directory, ".auto-sklearn"
+ )
def load_datamanager(self):
manager = unittest.mock.Mock()
manager.__reduce__ = lambda self: (unittest.mock.MagicMock, ())
- array = np.load(os.path.join(
- self.temporary_directory,
- '.auto-sklearn',
- 'runs', '0_3_100.0',
- 'predictions_test_0_3_100.0.npy'
- ))
+ array = np.load(
+ os.path.join(
+ self.temporary_directory,
+ ".auto-sklearn",
+ "runs",
+ "0_3_100.0",
+ "predictions_test_0_3_100.0.npy",
+ )
+ )
manager.data.get.return_value = array
return manager
def load_targets_ensemble(self):
- with open(os.path.join(
- self.temporary_directory,
- ".auto-sklearn",
- "predictions_ensemble_true.npy"
- ), "rb") as fp:
+ with open(
+ os.path.join(
+ self.temporary_directory,
+ ".auto-sklearn",
+ "predictions_ensemble_true.npy",
+ ),
+ "rb",
+ ) as fp:
y = np.load(fp, allow_pickle=True)
return y
@@ -56,13 +64,15 @@ def save_predictions_as_txt(self, predictions, subset, idx, prefix, precision):
return
def get_runs_directory(self) -> str:
- return os.path.join(self.temporary_directory, '.auto-sklearn', 'runs')
+ return os.path.join(self.temporary_directory, ".auto-sklearn", "runs")
def get_numrun_directory(self, seed: int, num_run: int, budget: float) -> str:
- return os.path.join(self.get_runs_directory(), '%d_%d_%s' % (seed, num_run, budget))
+ return os.path.join(
+ self.get_runs_directory(), "%d_%d_%s" % (seed, num_run, budget)
+ )
def get_model_filename(self, seed: int, idx: int, budget: float) -> str:
- return '%s.%s.%s.model' % (seed, idx, budget)
+ return "%s.%s.%s.model" % (seed, idx, budget)
def compare_read_preds(read_preds1, read_preds2):
@@ -91,13 +101,15 @@ def compare_read_preds(read_preds1, read_preds2):
class EnsembleBuilderMemMock(EnsembleBuilder):
-
def fit_ensemble(self, selected_keys):
return True
- def predict(self, set_: str,
- ensemble: AbstractEnsemble,
- selected_keys: list,
- n_preds: int,
- index_run: int):
+ def predict(
+ self,
+ set_: str,
+ ensemble: AbstractEnsemble,
+ selected_keys: list,
+ n_preds: int,
+ index_run: int,
+ ):
np.ones([10000000, 1000000])
diff --git a/test/test_ensemble_builder/test_ensemble.py b/test/test_ensemble_builder/test_ensemble.py
index 335c07eca2..3533da37cd 100644
--- a/test/test_ensemble_builder/test_ensemble.py
+++ b/test/test_ensemble_builder/test_ensemble.py
@@ -1,35 +1,40 @@
import os
+import pickle
+import shutil
import sys
import time
import unittest.mock
-import pickle
-import pytest
-import shutil
import dask.distributed
import numpy as np
import pandas as pd
-from smac.runhistory.runhistory import RunValue, RunKey, RunHistory
+import pytest
+from smac.runhistory.runhistory import RunHistory, RunKey, RunValue
-from autosklearn.constants import MULTILABEL_CLASSIFICATION, BINARY_CLASSIFICATION
-from autosklearn.metrics import roc_auc, accuracy, log_loss
+from autosklearn.constants import BINARY_CLASSIFICATION, MULTILABEL_CLASSIFICATION
from autosklearn.ensemble_builder import (
- EnsembleBuilder,
- EnsembleBuilderManager,
Y_ENSEMBLE,
- Y_VALID,
Y_TEST,
+ Y_VALID,
+ EnsembleBuilder,
+ EnsembleBuilderManager,
)
from autosklearn.ensembles.singlebest_ensemble import SingleBest
+from autosklearn.metrics import accuracy, log_loss, roc_auc
this_directory = os.path.dirname(__file__)
sys.path.append(this_directory)
-from ensemble_utils import BackendMock, compare_read_preds, EnsembleBuilderMemMock, MockMetric # noqa (E402: module level import not at top of file)
+from ensemble_utils import ( # noqa (E402: module level import not at top of file)
+ BackendMock,
+ EnsembleBuilderMemMock,
+ MockMetric,
+ compare_read_preds,
+)
@pytest.fixture(scope="function")
def ensemble_backend(request):
- test_id = '%s_%s' % (request.module.__name__, request.node.name)
+ test_id = "%s_%s" % (request.module.__name__, request.node.name)
test_dir = os.path.join(this_directory, test_id)
try:
@@ -46,7 +51,9 @@ def session_run_at_end():
shutil.rmtree(test_dir)
except: # noqa E722
pass
+
return session_run_at_end
+
request.addfinalizer(get_finalizer(backend))
return backend
@@ -58,10 +65,7 @@ def ensemble_run_history(request):
run_history = RunHistory()
run_history._add(
RunKey(
- config_id=3,
- instance_id='{"task_id": "breast_cancer"}',
- seed=1,
- budget=3.0
+ config_id=3, instance_id='{"task_id": "breast_cancer"}', seed=1, budget=3.0
),
RunValue(
cost=0.11347517730496459,
@@ -70,30 +74,29 @@ def ensemble_run_history(request):
starttime=time.time(),
endtime=time.time(),
additional_info={
- 'duration': 0.20323538780212402,
- 'num_run': 3,
- 'configuration_origin': 'Random Search'}
+ "duration": 0.20323538780212402,
+ "num_run": 3,
+ "configuration_origin": "Random Search",
+ },
),
status=None,
origin=None,
)
run_history._add(
RunKey(
- config_id=6,
- instance_id='{"task_id": "breast_cancer"}',
- seed=1,
- budget=6.0
+ config_id=6, instance_id='{"task_id": "breast_cancer"}', seed=1, budget=6.0
),
RunValue(
- cost=2*0.11347517730496459,
- time=2*0.21858787536621094,
+ cost=2 * 0.11347517730496459,
+ time=2 * 0.21858787536621094,
status=None,
starttime=time.time(),
endtime=time.time(),
additional_info={
- 'duration': 0.20323538780212402,
- 'num_run': 6,
- 'configuration_origin': 'Random Search'}
+ "duration": 0.20323538780212402,
+ "num_run": 6,
+ "configuration_origin": "Random Search",
+ },
),
status=None,
origin=None,
@@ -118,13 +121,13 @@ def testRead(ensemble_backend):
filename = os.path.join(
ensemble_backend.temporary_directory,
- ".auto-sklearn/runs/0_1_0.0/predictions_ensemble_0_1_0.0.npy"
+ ".auto-sklearn/runs/0_1_0.0/predictions_ensemble_0_1_0.0.npy",
)
assert ensbuilder.read_losses[filename]["ens_loss"] == 0.5
filename = os.path.join(
ensemble_backend.temporary_directory,
- ".auto-sklearn/runs/0_2_0.0/predictions_ensemble_0_2_0.0.npy"
+ ".auto-sklearn/runs/0_2_0.0/predictions_ensemble_0_2_0.0.npy",
)
assert ensbuilder.read_losses[filename]["ens_loss"] == 0.0
@@ -132,13 +135,13 @@ def testRead(ensemble_backend):
@pytest.mark.parametrize(
"ensemble_nbest,max_models_on_disc,exp",
(
- (1, None, 1),
- (1.0, None, 2),
- (0.1, None, 1),
- (0.9, None, 1),
- (1, 2, 1),
- (2, 1, 1),
- )
+ (1, None, 1),
+ (1.0, None, 2),
+ (0.1, None, 1),
+ (0.9, None, 1),
+ (1, 2, 1),
+ (2, 1, 1),
+ ),
)
def testNBest(ensemble_backend, ensemble_nbest, max_models_on_disc, exp):
ensbuilder = EnsembleBuilder(
@@ -158,26 +161,29 @@ def testNBest(ensemble_backend, ensemble_nbest, max_models_on_disc, exp):
fixture = os.path.join(
ensemble_backend.temporary_directory,
- ".auto-sklearn/runs/0_2_0.0/predictions_ensemble_0_2_0.0.npy"
+ ".auto-sklearn/runs/0_2_0.0/predictions_ensemble_0_2_0.0.npy",
)
assert sel_keys[0] == fixture
-@pytest.mark.parametrize("test_case,exp", [
- # If None, no reduction
- (None, 2),
- # If Int, limit only on exceed
- (4, 2),
- (1, 1),
- # If Float, translate float to # models.
- # below, mock of each file is 100 Mb and 4 files .model and .npy (test/val/pred) exist
- # per run (except for run3, there they are 5). Now, it takes 500MB for run 3 and
- # another 500 MB of slack because we keep as much space as the largest model
- # available as slack
- (1499.0, 1),
- (1500.0, 2),
- (9999.0, 2),
-])
+@pytest.mark.parametrize(
+ "test_case,exp",
+ [
+ # If None, no reduction
+ (None, 2),
+ # If Int, limit only on exceed
+ (4, 2),
+ (1, 1),
+        # If Float, translate the float to a number of models.
+        # Below, each mocked file is 100 MB, and 4 files (.model and test/val/pred
+        # .npy) exist per run (except run 3, which has 5). Keeping run 3 therefore
+        # takes 500 MB, plus another 500 MB of slack, because we reserve as much
+        # space as the largest available model.
+ (1499.0, 1),
+ (1500.0, 2),
+ (9999.0, 2),
+ ],
+)
def testMaxModelsOnDisc(ensemble_backend, test_case, exp):
ensemble_nbest = 4
ensbuilder = EnsembleBuilder(
@@ -190,8 +196,8 @@ def testMaxModelsOnDisc(ensemble_backend, test_case, exp):
max_models_on_disc=test_case,
)
- with unittest.mock.patch('os.path.getsize') as mock:
- mock.return_value = 100*1024*1024
+ with unittest.mock.patch("os.path.getsize") as mock:
+ mock.return_value = 100 * 1024 * 1024
ensbuilder.compute_loss_per_model()
sel_keys = ensbuilder.get_n_best_preds()
assert len(sel_keys) == exp, test_case
@@ -211,26 +217,26 @@ def testMaxModelsOnDisc2(ensemble_backend):
)
ensbuilder.read_preds = {}
for i in range(50):
- ensbuilder.read_losses['pred'+str(i)] = {
- 'ens_loss': -i*10,
- 'num_run': i,
- 'loaded': 1,
+ ensbuilder.read_losses["pred" + str(i)] = {
+ "ens_loss": -i * 10,
+ "num_run": i,
+ "loaded": 1,
"seed": 1,
- "disc_space_cost_mb": 50*i,
+ "disc_space_cost_mb": 50 * i,
}
- ensbuilder.read_preds['pred'+str(i)] = {Y_ENSEMBLE: True}
+ ensbuilder.read_preds["pred" + str(i)] = {Y_ENSEMBLE: True}
sel_keys = ensbuilder.get_n_best_preds()
- assert ['pred49', 'pred48', 'pred47'] == sel_keys
+ assert ["pred49", "pred48", "pred47"] == sel_keys
# Make sure at least one model is kept alive
ensbuilder.max_models_on_disc = 0.0
sel_keys = ensbuilder.get_n_best_preds()
- assert ['pred49'] == sel_keys
+ assert ["pred49"] == sel_keys
@pytest.mark.parametrize(
"performance_range_threshold,exp",
- ((0.0, 4), (0.1, 4), (0.3, 3), (0.5, 2), (0.6, 2), (0.8, 1), (1.0, 1), (1, 1))
+ ((0.0, 4), (0.1, 4), (0.3, 3), (0.5, 2), (0.6, 2), (0.8, 1), (1.0, 1), (1, 1)),
)
def testPerformanceRangeThreshold(ensemble_backend, performance_range_threshold, exp):
ensbuilder = EnsembleBuilder(
@@ -240,14 +246,14 @@ def testPerformanceRangeThreshold(ensemble_backend, performance_range_threshold,
metric=roc_auc,
seed=0, # important to find the test files
ensemble_nbest=100,
- performance_range_threshold=performance_range_threshold
+ performance_range_threshold=performance_range_threshold,
)
ensbuilder.read_losses = {
- 'A': {'ens_loss': -1, 'num_run': 1, 'loaded': -1, "seed": 1},
- 'B': {'ens_loss': -2, 'num_run': 2, 'loaded': -1, "seed": 1},
- 'C': {'ens_loss': -3, 'num_run': 3, 'loaded': -1, "seed": 1},
- 'D': {'ens_loss': -4, 'num_run': 4, 'loaded': -1, "seed": 1},
- 'E': {'ens_loss': -5, 'num_run': 5, 'loaded': -1, "seed": 1},
+ "A": {"ens_loss": -1, "num_run": 1, "loaded": -1, "seed": 1},
+ "B": {"ens_loss": -2, "num_run": 2, "loaded": -1, "seed": 1},
+ "C": {"ens_loss": -3, "num_run": 3, "loaded": -1, "seed": 1},
+ "D": {"ens_loss": -4, "num_run": 4, "loaded": -1, "seed": 1},
+ "E": {"ens_loss": -5, "num_run": 5, "loaded": -1, "seed": 1},
}
ensbuilder.read_preds = {
key: {key_2: True for key_2 in (Y_ENSEMBLE, Y_VALID, Y_TEST)}
@@ -261,12 +267,19 @@ def testPerformanceRangeThreshold(ensemble_backend, performance_range_threshold,
@pytest.mark.parametrize(
"performance_range_threshold,ensemble_nbest,exp",
(
- (0.0, 1, 1), (0.0, 1.0, 4), (0.1, 2, 2), (0.3, 4, 3),
- (0.5, 1, 1), (0.6, 10, 2), (0.8, 0.5, 1), (1, 1.0, 1)
- )
+ (0.0, 1, 1),
+ (0.0, 1.0, 4),
+ (0.1, 2, 2),
+ (0.3, 4, 3),
+ (0.5, 1, 1),
+ (0.6, 10, 2),
+ (0.8, 0.5, 1),
+ (1, 1.0, 1),
+ ),
)
-def testPerformanceRangeThresholdMaxBest(ensemble_backend, performance_range_threshold,
- ensemble_nbest, exp):
+def testPerformanceRangeThresholdMaxBest(
+ ensemble_backend, performance_range_threshold, ensemble_nbest, exp
+):
ensbuilder = EnsembleBuilder(
backend=ensemble_backend,
dataset_name="TEST",
@@ -278,11 +291,11 @@ def testPerformanceRangeThresholdMaxBest(ensemble_backend, performance_range_thr
max_models_on_disc=None,
)
ensbuilder.read_losses = {
- 'A': {'ens_loss': -1, 'num_run': 1, 'loaded': -1, "seed": 1},
- 'B': {'ens_loss': -2, 'num_run': 2, 'loaded': -1, "seed": 1},
- 'C': {'ens_loss': -3, 'num_run': 3, 'loaded': -1, "seed": 1},
- 'D': {'ens_loss': -4, 'num_run': 4, 'loaded': -1, "seed": 1},
- 'E': {'ens_loss': -5, 'num_run': 5, 'loaded': -1, "seed": 1},
+ "A": {"ens_loss": -1, "num_run": 1, "loaded": -1, "seed": 1},
+ "B": {"ens_loss": -2, "num_run": 2, "loaded": -1, "seed": 1},
+ "C": {"ens_loss": -3, "num_run": 3, "loaded": -1, "seed": 1},
+ "D": {"ens_loss": -4, "num_run": 4, "loaded": -1, "seed": 1},
+ "E": {"ens_loss": -5, "num_run": 5, "loaded": -1, "seed": 1},
}
ensbuilder.read_preds = {
key: {key_2: True for key_2 in (Y_ENSEMBLE, Y_VALID, Y_TEST)}
@@ -295,13 +308,14 @@ def testPerformanceRangeThresholdMaxBest(ensemble_backend, performance_range_thr
def testFallBackNBest(ensemble_backend):
- ensbuilder = EnsembleBuilder(backend=ensemble_backend,
- dataset_name="TEST",
- task_type=BINARY_CLASSIFICATION,
- metric=roc_auc,
- seed=0, # important to find the test files
- ensemble_nbest=1
- )
+ ensbuilder = EnsembleBuilder(
+ backend=ensemble_backend,
+ dataset_name="TEST",
+ task_type=BINARY_CLASSIFICATION,
+ metric=roc_auc,
+ seed=0, # important to find the test files
+ ensemble_nbest=1,
+ )
ensbuilder.compute_loss_per_model()
print()
@@ -311,19 +325,19 @@ def testFallBackNBest(ensemble_backend):
filename = os.path.join(
ensemble_backend.temporary_directory,
- ".auto-sklearn/runs/0_2_0.0/predictions_ensemble_0_2_0.0.npy"
+ ".auto-sklearn/runs/0_2_0.0/predictions_ensemble_0_2_0.0.npy",
)
ensbuilder.read_losses[filename]["ens_loss"] = -1
filename = os.path.join(
ensemble_backend.temporary_directory,
- ".auto-sklearn/runs/0_3_100.0/predictions_ensemble_0_3_100.0.npy"
+ ".auto-sklearn/runs/0_3_100.0/predictions_ensemble_0_3_100.0.npy",
)
ensbuilder.read_losses[filename]["ens_loss"] = -1
filename = os.path.join(
ensemble_backend.temporary_directory,
- ".auto-sklearn/runs/0_1_0.0/predictions_ensemble_0_1_0.0.npy"
+ ".auto-sklearn/runs/0_1_0.0/predictions_ensemble_0_1_0.0.npy",
)
ensbuilder.read_losses[filename]["ens_loss"] = -1
@@ -331,7 +345,7 @@ def testFallBackNBest(ensemble_backend):
fixture = os.path.join(
ensemble_backend.temporary_directory,
- ".auto-sklearn/runs/0_1_0.0/predictions_ensemble_0_1_0.0.npy"
+ ".auto-sklearn/runs/0_1_0.0/predictions_ensemble_0_1_0.0.npy",
)
assert len(sel_keys) == 1
assert sel_keys[0] == fixture
@@ -339,13 +353,14 @@ def testFallBackNBest(ensemble_backend):
def testGetValidTestPreds(ensemble_backend):
- ensbuilder = EnsembleBuilder(backend=ensemble_backend,
- dataset_name="TEST",
- task_type=BINARY_CLASSIFICATION,
- metric=roc_auc,
- seed=0, # important to find the test files
- ensemble_nbest=1
- )
+ ensbuilder = EnsembleBuilder(
+ backend=ensemble_backend,
+ dataset_name="TEST",
+ task_type=BINARY_CLASSIFICATION,
+ metric=roc_auc,
+ seed=0, # important to find the test files
+ ensemble_nbest=1,
+ )
ensbuilder.compute_loss_per_model()
@@ -353,15 +368,15 @@ def testGetValidTestPreds(ensemble_backend):
# different name. num_run=2 is selected when doing sorted()
d1 = os.path.join(
ensemble_backend.temporary_directory,
- ".auto-sklearn/runs/0_1_0.0/predictions_ensemble_0_1_0.0.npy"
+ ".auto-sklearn/runs/0_1_0.0/predictions_ensemble_0_1_0.0.npy",
)
d2 = os.path.join(
ensemble_backend.temporary_directory,
- ".auto-sklearn/runs/0_2_0.0/predictions_ensemble_0_2_0.0.npy"
+ ".auto-sklearn/runs/0_2_0.0/predictions_ensemble_0_2_0.0.npy",
)
d3 = os.path.join(
ensemble_backend.temporary_directory,
- ".auto-sklearn/runs/0_3_100.0/predictions_ensemble_0_3_100.0.npy"
+ ".auto-sklearn/runs/0_3_100.0/predictions_ensemble_0_3_100.0.npy",
)
sel_keys = ensbuilder.get_n_best_preds()
@@ -371,10 +386,13 @@ def testGetValidTestPreds(ensemble_backend):
# Number of read files should be three and
# predictions_ensemble_0_4_0.0.npy must not be in there
assert len(ensbuilder.read_preds) == 3
- assert os.path.join(
+ assert (
+ os.path.join(
ensemble_backend.temporary_directory,
- ".auto-sklearn/runs/0_4_0.0/predictions_ensemble_0_4_0.0.npy"
- ) not in ensbuilder.read_preds
+ ".auto-sklearn/runs/0_4_0.0/predictions_ensemble_0_4_0.0.npy",
+ )
+ not in ensbuilder.read_preds
+ )
# not selected --> should still be None
assert ensbuilder.read_preds[d1][Y_VALID] is None
@@ -403,7 +421,7 @@ def testEntireEnsembleBuilder(ensemble_backend):
d2 = os.path.join(
ensemble_backend.temporary_directory,
- ".auto-sklearn/runs/0_2_0.0/predictions_ensemble_0_2_0.0.npy"
+ ".auto-sklearn/runs/0_2_0.0/predictions_ensemble_0_2_0.0.npy",
)
sel_keys = ensbuilder.get_n_best_preds()
@@ -454,11 +472,13 @@ def test_main(ensemble_backend):
seed=0, # important to find the test files
ensemble_nbest=2,
max_models_on_disc=None,
- )
+ )
ensbuilder.SAVE2DISC = False
run_history, ensemble_nbest, _, _, _ = ensbuilder.main(
- time_left=np.inf, iteration=1, return_predictions=False,
+ time_left=np.inf,
+ iteration=1,
+ return_predictions=False,
)
assert len(ensbuilder.read_preds) == 3
@@ -473,26 +493,26 @@ def test_main(ensemble_backend):
# As the data loader loads the same val/train/test
# we expect 1.0 as score and all keys available
expected_performance = {
- 'ensemble_val_score': 1.0,
- 'ensemble_test_score': 1.0,
- 'ensemble_optimization_score': 1.0,
+ "ensemble_val_score": 1.0,
+ "ensemble_test_score": 1.0,
+ "ensemble_optimization_score": 1.0,
}
# Make sure that expected performance is a subset of the run history
assert all(item in run_history[0].items() for item in expected_performance.items())
- assert 'Timestamp' in run_history[0]
- assert isinstance(run_history[0]['Timestamp'], pd.Timestamp)
+ assert "Timestamp" in run_history[0]
+ assert isinstance(run_history[0]["Timestamp"], pd.Timestamp)
assert os.path.exists(
- os.path.join(ensemble_backend.internals_directory, 'ensemble_read_preds.pkl')
+ os.path.join(ensemble_backend.internals_directory, "ensemble_read_preds.pkl")
), os.listdir(ensemble_backend.internals_directory)
assert os.path.exists(
- os.path.join(ensemble_backend.internals_directory, 'ensemble_read_losses.pkl')
+ os.path.join(ensemble_backend.internals_directory, "ensemble_read_losses.pkl")
), os.listdir(ensemble_backend.internals_directory)
def test_run_end_at(ensemble_backend):
- with unittest.mock.patch('pynisher.enforce_limits') as pynisher_mock:
+ with unittest.mock.patch("pynisher.enforce_limits") as pynisher_mock:
ensbuilder = EnsembleBuilder(
backend=ensemble_backend,
dataset_name="TEST",
@@ -501,69 +521,74 @@ def test_run_end_at(ensemble_backend):
seed=0, # important to find the test files
ensemble_nbest=2,
max_models_on_disc=None,
- )
+ )
ensbuilder.SAVE2DISC = False
current_time = time.time()
- ensbuilder.run(end_at=current_time + 10, iteration=1, pynisher_context='forkserver')
- # 4 seconds left because: 10 seconds - 5 seconds overhead - very little overhead,
+ ensbuilder.run(
+ end_at=current_time + 10, iteration=1, pynisher_context="forkserver"
+ )
+ # 4 seconds left because: 10 seconds - 5 seconds overhead - little overhead
# but then rounded to an integer
        assert pynisher_mock.call_args_list[0][1]["wall_time_in_s"] == 4
def testLimit(ensemble_backend):
- ensbuilder = EnsembleBuilderMemMock(backend=ensemble_backend,
- dataset_name="TEST",
- task_type=BINARY_CLASSIFICATION,
- metric=roc_auc,
- seed=0, # important to find the test files
- ensemble_nbest=10,
- # small to trigger MemoryException
- memory_limit=100,
- )
+ ensbuilder = EnsembleBuilderMemMock(
+ backend=ensemble_backend,
+ dataset_name="TEST",
+ task_type=BINARY_CLASSIFICATION,
+ metric=roc_auc,
+ seed=0, # important to find the test files
+ ensemble_nbest=10,
+ # small to trigger MemoryException
+ memory_limit=100,
+ )
ensbuilder.SAVE2DISC = False
read_losses_file = os.path.join(
- ensemble_backend.internals_directory,
- 'ensemble_read_losses.pkl'
+ ensemble_backend.internals_directory, "ensemble_read_losses.pkl"
)
read_preds_file = os.path.join(
- ensemble_backend.internals_directory,
- 'ensemble_read_preds.pkl'
+ ensemble_backend.internals_directory, "ensemble_read_preds.pkl"
)
def mtime_mock(filename):
mtimes = {
- 'predictions_ensemble_0_1_0.0.npy': 0,
- 'predictions_valid_0_1_0.0.npy': 0.1,
- 'predictions_test_0_1_0.0.npy': 0.2,
- 'predictions_ensemble_0_2_0.0.npy': 1,
- 'predictions_valid_0_2_0.0.npy': 1.1,
- 'predictions_test_0_2_0.0.npy': 1.2,
- 'predictions_ensemble_0_3_100.0.npy': 2,
- 'predictions_valid_0_3_100.0.npy': 2.1,
- 'predictions_test_0_3_100.0.npy': 2.2,
+ "predictions_ensemble_0_1_0.0.npy": 0,
+ "predictions_valid_0_1_0.0.npy": 0.1,
+ "predictions_test_0_1_0.0.npy": 0.2,
+ "predictions_ensemble_0_2_0.0.npy": 1,
+ "predictions_valid_0_2_0.0.npy": 1.1,
+ "predictions_test_0_2_0.0.npy": 1.2,
+ "predictions_ensemble_0_3_100.0.npy": 2,
+ "predictions_valid_0_3_100.0.npy": 2.1,
+ "predictions_test_0_3_100.0.npy": 2.2,
}
return mtimes[os.path.split(filename)[1]]
- with unittest.mock.patch('logging.getLogger') as get_logger_mock, \
- unittest.mock.patch('logging.config.dictConfig') as _, \
- unittest.mock.patch('os.path.getmtime') as mtime:
+ with unittest.mock.patch(
+ "logging.getLogger"
+ ) as get_logger_mock, unittest.mock.patch(
+ "logging.config.dictConfig"
+ ) as _, unittest.mock.patch(
+ "os.path.getmtime"
+ ) as mtime:
logger_mock = unittest.mock.Mock()
logger_mock.handlers = []
get_logger_mock.return_value = logger_mock
mtime.side_effect = mtime_mock
- ensbuilder.run(time_left=1000, iteration=0, pynisher_context='fork')
+ ensbuilder.run(time_left=1000, iteration=0, pynisher_context="fork")
assert os.path.exists(read_losses_file)
assert not os.path.exists(read_preds_file)
assert logger_mock.warning.call_count == 1
- ensbuilder.run(time_left=1000, iteration=0, pynisher_context='fork')
+ ensbuilder.run(time_left=1000, iteration=0, pynisher_context="fork")
assert os.path.exists(read_losses_file)
assert not os.path.exists(read_preds_file)
assert logger_mock.warning.call_count == 2
- ensbuilder.run(time_left=1000, iteration=0, pynisher_context='fork')
+ ensbuilder.run(time_left=1000, iteration=0, pynisher_context="fork")
assert os.path.exists(read_losses_file)
assert not os.path.exists(read_preds_file)
assert logger_mock.warning.call_count == 3
@@ -571,7 +596,7 @@ def mtime_mock(filename):
# it should try to reduce ensemble_nbest until it also failed at 2
assert ensbuilder.ensemble_nbest == 1
- ensbuilder.run(time_left=1000, iteration=0, pynisher_context='fork')
+ ensbuilder.run(time_left=1000, iteration=0, pynisher_context="fork")
assert os.path.exists(read_losses_file)
assert not os.path.exists(read_preds_file)
assert logger_mock.warning.call_count == 4
@@ -579,9 +604,9 @@ def mtime_mock(filename):
# it should next reduce the number of models to read at most
assert ensbuilder.read_at_most == 1
- # And then it still runs, but basically won't do anything any more except for raising error
- # messages via the logger
- ensbuilder.run(time_left=1000, iteration=0, pynisher_context='fork')
+ # And then it still runs, but basically won't do anything any more except for
+ # raising error messages via the logger
+ ensbuilder.run(time_left=1000, iteration=0, pynisher_context="fork")
assert os.path.exists(read_losses_file)
assert not os.path.exists(read_preds_file)
assert logger_mock.warning.call_count == 4
@@ -592,8 +617,9 @@ def mtime_mock(filename):
logger_mock.error.call_args_list
)
for i in range(len(logger_mock.error.call_args_list)):
- assert 'Memory Exception -- Unable to further reduce' in str(
- logger_mock.error.call_args_list[i])
+ assert "Memory Exception -- Unable to further reduce" in str(
+ logger_mock.error.call_args_list[i]
+ )
def test_read_pickle_read_preds(ensemble_backend):
@@ -610,15 +636,14 @@ def test_read_pickle_read_preds(ensemble_backend):
seed=0, # important to find the test files
ensemble_nbest=2,
max_models_on_disc=None,
- )
+ )
ensbuilder.SAVE2DISC = False
ensbuilder.main(time_left=np.inf, iteration=1, return_predictions=False)
# Check that the memory was created
ensemble_memory_file = os.path.join(
- ensemble_backend.internals_directory,
- 'ensemble_read_preds.pkl'
+ ensemble_backend.internals_directory, "ensemble_read_preds.pkl"
)
assert os.path.exists(ensemble_memory_file)
@@ -630,8 +655,7 @@ def test_read_pickle_read_preds(ensemble_backend):
assert last_hash == ensbuilder.last_hash
ensemble_memory_file = os.path.join(
- ensemble_backend.internals_directory,
- 'ensemble_read_losses.pkl'
+ ensemble_backend.internals_directory, "ensemble_read_losses.pkl"
)
assert os.path.exists(ensemble_memory_file)
@@ -650,21 +674,23 @@ def test_read_pickle_read_preds(ensemble_backend):
seed=0, # important to find the test files
ensemble_nbest=2,
max_models_on_disc=None,
- )
+ )
compare_read_preds(ensbuilder2.read_preds, ensbuilder.read_preds)
compare_read_preds(ensbuilder2.read_losses, ensbuilder.read_losses)
assert ensbuilder2.last_hash == ensbuilder.last_hash
@pytest.mark.parametrize("metric", [log_loss, accuracy])
-@unittest.mock.patch('os.path.exists')
-def test_get_identifiers_from_run_history(exists, metric, ensemble_run_history, ensemble_backend):
+@unittest.mock.patch("os.path.exists")
+def test_get_identifiers_from_run_history(
+ exists, metric, ensemble_run_history, ensemble_backend
+):
exists.return_value = True
ensemble = SingleBest(
- metric=log_loss,
- seed=1,
- run_history=ensemble_run_history,
- backend=ensemble_backend,
+ metric=log_loss,
+ seed=1,
+ run_history=ensemble_run_history,
+ backend=ensemble_backend,
)
# Just one model
@@ -682,7 +708,7 @@ def test_ensemble_builder_process_realrun(dask_client_single_worker, ensemble_ba
start_time=time.time(),
time_left_for_ensembles=1000,
backend=ensemble_backend,
- dataset_name='Test',
+ dataset_name="Test",
task=BINARY_CLASSIFICATION,
metric=MockMetric,
ensemble_size=50,
@@ -701,12 +727,12 @@ def test_ensemble_builder_process_realrun(dask_client_single_worker, ensemble_ba
result = future.result()
history, _, _, _, _ = result
- assert 'ensemble_optimization_score' in history[0]
- assert history[0]['ensemble_optimization_score'] == 0.9
- assert 'ensemble_val_score' in history[0]
- assert history[0]['ensemble_val_score'] == 0.9
- assert 'ensemble_test_score' in history[0]
- assert history[0]['ensemble_test_score'] == 0.9
+ assert "ensemble_optimization_score" in history[0]
+ assert history[0]["ensemble_optimization_score"] == 0.9
+ assert "ensemble_val_score" in history[0]
+ assert history[0]["ensemble_val_score"] == 0.9
+ assert "ensemble_test_score" in history[0]
+ assert history[0]["ensemble_test_score"] == 0.9
def test_ensemble_builder_nbest_remembered(
@@ -722,7 +748,7 @@ def test_ensemble_builder_nbest_remembered(
start_time=time.time(),
time_left_for_ensembles=1000,
backend=ensemble_backend,
- dataset_name='Test',
+ dataset_name="Test",
task=MULTILABEL_CLASSIFICATION,
metric=roc_auc,
ensemble_size=50,
@@ -740,7 +766,9 @@ def test_ensemble_builder_nbest_remembered(
future = manager.futures[0]
dask.distributed.wait([future]) # wait for the ensemble process to finish
assert future.result() == ([], 5, None, None, None)
- file_path = os.path.join(ensemble_backend.internals_directory, 'ensemble_read_preds.pkl')
+ file_path = os.path.join(
+ ensemble_backend.internals_directory, "ensemble_read_preds.pkl"
+ )
assert not os.path.exists(file_path)
manager.build_ensemble(dask_client_single_worker, unit_test=True)
diff --git a/test/test_ensemble_builder/test_ensemble_selection.py b/test/test_ensemble_builder/test_ensemble_selection.py
index c03060c037..44e00229fb 100644
--- a/test/test_ensemble_builder/test_ensemble_selection.py
+++ b/test/test_ensemble_builder/test_ensemble_selection.py
@@ -1,5 +1,4 @@
import numpy as np
-
import pytest
from autosklearn.constants import BINARY_CLASSIFICATION, REGRESSION
@@ -12,10 +11,12 @@ def testEnsembleSelection():
Makes sure ensemble selection fit method creates an ensemble correctly
"""
- ensemble = EnsembleSelection(ensemble_size=10,
- task_type=REGRESSION,
- random_state=0,
- metric=root_mean_squared_error)
+ ensemble = EnsembleSelection(
+ ensemble_size=10,
+ task_type=REGRESSION,
+ random_state=0,
+ metric=root_mean_squared_error,
+ )
# We create a problem such that we encourage the addition of members to the ensemble
    # Fundamentally, the average of 10 sequential numbers is 5.5
@@ -23,24 +24,57 @@ def testEnsembleSelection():
predictions = []
for i in range(1, 20):
pred = np.full((100), i, dtype=np.float32)
- pred[i*5:5*(i+1)] = 5.5 * i
+ pred[i * 5 : 5 * (i + 1)] = 5.5 * i
predictions.append(pred)
ensemble.fit(predictions, y_true, identifiers=[(i, i, i) for i in range(20)])
- np.testing.assert_array_equal(ensemble.weights_,
- np.array([0.1, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1,
- 0., 0., 0., 0., 0., 0., 0., 0.,
- 0., 0., 0.]))
+ np.testing.assert_array_equal(
+ ensemble.weights_,
+ np.array(
+ [
+ 0.1,
+ 0.2,
+ 0.2,
+ 0.1,
+ 0.1,
+ 0.1,
+ 0.1,
+ 0.1,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ ]
+ ),
+ )
assert ensemble.identifiers_ == [(i, i, i) for i in range(20)]
- np.testing.assert_array_almost_equal(np.array(ensemble.trajectory_),
- np.array([3.462296925452813, 2.679202306657711,
- 2.2748626436960375, 2.065717187806695,
- 1.7874562615598728, 1.6983448128441783,
- 1.559451106330085, 1.5316326052614575,
- 1.3801950121782542, 1.3554980575295374]))
+ np.testing.assert_array_almost_equal(
+ np.array(ensemble.trajectory_),
+ np.array(
+ [
+ 3.462296925452813,
+ 2.679202306657711,
+ 2.2748626436960375,
+ 2.065717187806695,
+ 1.7874562615598728,
+ 1.6983448128441783,
+ 1.559451106330085,
+ 1.5316326052614575,
+ 1.3801950121782542,
+ 1.3554980575295374,
+ ]
+ ),
+ )
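
The comment in testEnsembleSelection invokes a simple fact: the mean of the ten consecutive integers 1..10 is 5.5. Spelled out:

import numpy as np

values = np.arange(1, 11)            # the ten "sequential numbers" 1..10
assert values.mean() == (1 + 10) / 2 == 5.5
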
def testPredict():
@@ -54,52 +88,38 @@ def testPredict():
# we first exclude all occurrences of zero in self.weights_, and then
# apply the weights.
# If none of the above is the case, predict() raises Error.
- ensemble = EnsembleSelection(ensemble_size=3,
- task_type=BINARY_CLASSIFICATION,
- random_state=0,
- metric=accuracy,
- )
+ ensemble = EnsembleSelection(
+ ensemble_size=3,
+ task_type=BINARY_CLASSIFICATION,
+ random_state=0,
+ metric=accuracy,
+ )
# Test for case 1. Create (3, 2, 2) predictions.
- per_model_pred = np.array([
- [[0.9, 0.1],
- [0.4, 0.6]],
- [[0.8, 0.2],
- [0.3, 0.7]],
- [[1.0, 0.0],
- [0.1, 0.9]]
- ])
+ per_model_pred = np.array(
+ [[[0.9, 0.1], [0.4, 0.6]], [[0.8, 0.2], [0.3, 0.7]], [[1.0, 0.0], [0.1, 0.9]]]
+ )
# Weights of 3 hypothetical models
ensemble.weights_ = [0.7, 0.2, 0.1]
pred = ensemble.predict(per_model_pred)
- truth = np.array([[0.89, 0.11], # This should be the true prediction.
- [0.35, 0.65]])
+ truth = np.array(
+ [[0.89, 0.11], [0.35, 0.65]] # This should be the true prediction.
+ )
assert np.allclose(pred, truth)
# Test for case 2.
- per_model_pred = np.array([
- [[0.9, 0.1],
- [0.4, 0.6]],
- [[0.8, 0.2],
- [0.3, 0.7]],
- [[1.0, 0.0],
- [0.1, 0.9]]
- ])
+ per_model_pred = np.array(
+ [[[0.9, 0.1], [0.4, 0.6]], [[0.8, 0.2], [0.3, 0.7]], [[1.0, 0.0], [0.1, 0.9]]]
+ )
# The third model now has weight of zero.
ensemble.weights_ = [0.7, 0.2, 0.0, 0.1]
pred = ensemble.predict(per_model_pred)
- truth = np.array([[0.89, 0.11],
- [0.35, 0.65]])
+ truth = np.array([[0.89, 0.11], [0.35, 0.65]])
assert np.allclose(pred, truth)
# Test for error case.
- per_model_pred = np.array([
- [[0.9, 0.1],
- [0.4, 0.6]],
- [[0.8, 0.2],
- [0.3, 0.7]],
- [[1.0, 0.0],
- [0.1, 0.9]]
- ])
+ per_model_pred = np.array(
+ [[[0.9, 0.1], [0.4, 0.6]], [[0.8, 0.2], [0.3, 0.7]], [[1.0, 0.0], [0.1, 0.9]]]
+ )
# Now the weights have 2 zero weights and 2 non-zero weights,
# which is incompatible.
ensemble.weights_ = [0.6, 0.0, 0.0, 0.4]
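
The expected arrays in testPredict are simply the weighted average of the per-model class probabilities, e.g. 0.7*0.9 + 0.2*0.8 + 0.1*1.0 = 0.89 for the first cell. The same numbers, computed directly over the model axis:

import numpy as np

per_model_pred = np.array(
    [[[0.9, 0.1], [0.4, 0.6]], [[0.8, 0.2], [0.3, 0.7]], [[1.0, 0.0], [0.1, 0.9]]]
)
weights = np.array([0.7, 0.2, 0.1])

# Contract the model axis: (n_models,) x (n_models, n_samples, n_classes)
pred = np.tensordot(weights, per_model_pred, axes=([0], [0]))
assert np.allclose(pred, [[0.89, 0.11], [0.35, 0.65]])
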
diff --git a/test/test_evaluation/__init__.py b/test/test_evaluation/__init__.py
index cc3cd7becd..e298f0f075 100644
--- a/test/test_evaluation/__init__.py
+++ b/test/test_evaluation/__init__.py
@@ -1,2 +1,2 @@
# -*- encoding: utf-8 -*-
-__author__ = 'feurerm'
+__author__ = "feurerm"
diff --git a/test/test_evaluation/evaluation_util.py b/test/test_evaluation/evaluation_util.py
index e8ba4edf07..d8bf017c35 100644
--- a/test/test_evaluation/evaluation_util.py
+++ b/test/test_evaluation/evaluation_util.py
@@ -1,28 +1,53 @@
import functools
-import traceback
import tempfile
+import traceback
import unittest
import numpy as np
-from numpy.linalg import LinAlgError
import sklearn.datasets
-from sklearn import preprocessing
import sklearn.model_selection
+from numpy.linalg import LinAlgError
+from sklearn import preprocessing
from autosklearn.automl_common.common.utils.backend import Backend
-
-from autosklearn.constants import \
- MULTICLASS_CLASSIFICATION, MULTILABEL_CLASSIFICATION, BINARY_CLASSIFICATION, REGRESSION
-from autosklearn.util.data import convert_to_bin
+from autosklearn.constants import (
+ BINARY_CLASSIFICATION,
+ MULTICLASS_CLASSIFICATION,
+ MULTILABEL_CLASSIFICATION,
+ REGRESSION,
+)
from autosklearn.data.xy_data_manager import XYDataManager
+from autosklearn.metrics import (
+ accuracy,
+ balanced_accuracy,
+ f1_macro,
+ f1_micro,
+ f1_weighted,
+ log_loss,
+ precision_macro,
+ precision_micro,
+ precision_weighted,
+ recall_macro,
+ recall_micro,
+ recall_weighted,
+)
from autosklearn.pipeline.util import get_dataset
-from autosklearn.metrics import accuracy, balanced_accuracy, f1_macro, f1_micro, f1_weighted, \
- log_loss, precision_macro, precision_micro, precision_weighted, recall_macro, \
- recall_micro, recall_weighted
+from autosklearn.util.data import convert_to_bin
-SCORER_LIST = [accuracy, balanced_accuracy, f1_macro, f1_micro, f1_weighted, log_loss,
- precision_macro, precision_micro, precision_weighted, recall_macro,
- recall_micro, recall_weighted]
+SCORER_LIST = [
+ accuracy,
+ balanced_accuracy,
+ f1_macro,
+ f1_micro,
+ f1_weighted,
+ log_loss,
+ precision_macro,
+ precision_micro,
+ precision_weighted,
+ recall_macro,
+ recall_micro,
+ recall_weighted,
+]
N_TEST_RUNS = 5
@@ -32,14 +57,16 @@ def get_evaluation_backend():
backend_mock.temporary_directory = tempfile.gettempdir()
# Assign a default data
- backend_mock.load_datamanager.return_value = get_multiclass_classification_datamanager()
+ backend_mock.load_datamanager.return_value = (
+ get_multiclass_classification_datamanager()
+ )
return backend_mock
class Dummy(object):
def __init__(self):
- self.name = 'Dummy'
+ self.name = "Dummy"
class BaseEvaluatorTest(unittest.TestCase):
@@ -61,82 +88,85 @@ def __fit(self, function_handle):
function_handle()
return True
except KeyError as e:
- if 'Floating-point under-/overflow occurred at epoch' in \
- e.args[0] or \
- 'removed all features' in e.args[0] or \
- 'failed to create intent' in e.args[0]:
+ if (
+ "Floating-point under-/overflow occurred at epoch" in e.args[0]
+ or "removed all features" in e.args[0]
+ or "failed to create intent" in e.args[0]
+ ):
pass
else:
traceback.print_exc()
raise e
except ValueError as e:
- if 'Floating-point under-/overflow occurred at epoch' in e.args[
- 0] or \
- 'removed all features' in e.args[0] or \
- 'failed to create intent' in e.args[0]:
+ if (
+ "Floating-point under-/overflow occurred at epoch" in e.args[0]
+ or "removed all features" in e.args[0]
+ or "failed to create intent" in e.args[0]
+ ):
pass
else:
raise e
except LinAlgError as e:
- if 'not positive definite, even with jitter' in e.args[0]:
+ if "not positive definite, even with jitter" in e.args[0]:
pass
else:
raise e
except RuntimeWarning as e:
- if 'invalid value encountered in sqrt' in e.args[0]:
+ if "invalid value encountered in sqrt" in e.args[0]:
pass
- elif 'divide by zero encountered in divide' in e.args[0]:
+ elif "divide by zero encountered in divide" in e.args[0]:
pass
else:
raise e
except UserWarning as e:
- if 'FastICA did not converge' in e.args[0]:
+ if "FastICA did not converge" in e.args[0]:
pass
else:
raise e
def get_multiclass_classification_datamanager():
- X_train, Y_train, X_test, Y_test = get_dataset('iris')
+ X_train, Y_train, X_test, Y_test = get_dataset("iris")
indices = list(range(X_train.shape[0]))
np.random.seed(1)
np.random.shuffle(indices)
X_train = X_train[indices]
Y_train = Y_train[indices]
- X_valid = X_test[:25, ]
- Y_valid = Y_test[:25, ]
- X_test = X_test[25:, ]
- Y_test = Y_test[25:, ]
+ X_valid = X_test[
+ :25,
+ ]
+ Y_valid = Y_test[
+ :25,
+ ]
+ X_test = X_test[
+ 25:,
+ ]
+ Y_test = Y_test[
+ 25:,
+ ]
D = Dummy()
- D.info = {
- 'task': MULTICLASS_CLASSIFICATION,
- 'is_sparse': False,
- 'label_num': 3
- }
+ D.info = {"task": MULTICLASS_CLASSIFICATION, "is_sparse": False, "label_num": 3}
D.data = {
- 'X_train': X_train,
- 'Y_train': Y_train,
- 'X_valid': X_valid,
- 'Y_valid': Y_valid,
- 'X_test': X_test,
- 'Y_test': Y_test
+ "X_train": X_train,
+ "Y_train": Y_train,
+ "X_valid": X_valid,
+ "Y_valid": Y_valid,
+ "X_test": X_test,
+ "Y_test": Y_test,
}
- D.feat_type = {0: 'numerical',
- 1: 'Numerical',
- 2: 'numerical',
- 3: 'numerical'}
+ D.feat_type = {0: "numerical", 1: "Numerical", 2: "numerical", 3: "numerical"}
return D
def get_abalone_datamanager():
# https://www.openml.org/d/183
- dataset_name = 'abalone'
+ dataset_name = "abalone"
data = sklearn.datasets.fetch_openml(data_id=183, as_frame=True)
feat_type = {
- i: 'Categorical' if x.name == 'category' else 'Numerical'
- for i, x in enumerate(data['data'].dtypes)
+ i: "Categorical" if x.name == "category" else "Numerical"
+ for i, x in enumerate(data["data"].dtypes)
}
X, y = sklearn.datasets.fetch_openml(data_id=183, return_X_y=True, as_frame=False)
y = preprocessing.LabelEncoder().fit_transform(y)
@@ -145,17 +175,19 @@ def get_abalone_datamanager():
)
D = XYDataManager(
- X_train, y_train,
- X_test, y_test,
+ X_train,
+ y_train,
+ X_test,
+ y_test,
MULTICLASS_CLASSIFICATION,
feat_type,
- dataset_name
+ dataset_name,
)
return D
def get_multilabel_classification_datamanager():
- X_train, Y_train, X_test, Y_test = get_dataset('iris')
+ X_train, Y_train, X_test, Y_test = get_dataset("iris")
indices = list(range(X_train.shape[0]))
np.random.seed(1)
np.random.shuffle(indices)
@@ -171,34 +203,35 @@ def get_multilabel_classification_datamanager():
# Y_test_[:, Y_test[i]] = 1
# Y_test = Y_test_
- X_valid = X_test[:25, ]
- Y_valid = Y_test[:25, ]
- X_test = X_test[25:, ]
- Y_test = Y_test[25:, ]
+ X_valid = X_test[
+ :25,
+ ]
+ Y_valid = Y_test[
+ :25,
+ ]
+ X_test = X_test[
+ 25:,
+ ]
+ Y_test = Y_test[
+ 25:,
+ ]
D = Dummy()
- D.info = {
- 'task': MULTILABEL_CLASSIFICATION,
- 'is_sparse': False,
- 'label_num': 3
- }
+ D.info = {"task": MULTILABEL_CLASSIFICATION, "is_sparse": False, "label_num": 3}
D.data = {
- 'X_train': X_train,
- 'Y_train': Y_train,
- 'X_valid': X_valid,
- 'Y_valid': Y_valid,
- 'X_test': X_test,
- 'Y_test': Y_test
+ "X_train": X_train,
+ "Y_train": Y_train,
+ "X_valid": X_valid,
+ "Y_valid": Y_valid,
+ "X_test": X_test,
+ "Y_test": Y_test,
}
- D.feat_type = {0: 'numerical',
- 1: 'Numerical',
- 2: 'numerical',
- 3: 'numerical'}
+ D.feat_type = {0: "numerical", 1: "Numerical", 2: "numerical", 3: "numerical"}
return D
def get_binary_classification_datamanager():
- X_train, Y_train, X_test, Y_test = get_dataset('iris')
+ X_train, Y_train, X_test, Y_test = get_dataset("iris")
indices = list(range(X_train.shape[0]))
np.random.seed(1)
np.random.shuffle(indices)
@@ -213,99 +246,108 @@ def get_binary_classification_datamanager():
X_test = X_test[eliminate_class_two]
Y_test = Y_test[eliminate_class_two]
- X_valid = X_test[:25, ]
- Y_valid = Y_test[:25, ]
- X_test = X_test[25:, ]
- Y_test = Y_test[25:, ]
+ X_valid = X_test[
+ :25,
+ ]
+ Y_valid = Y_test[
+ :25,
+ ]
+ X_test = X_test[
+ 25:,
+ ]
+ Y_test = Y_test[
+ 25:,
+ ]
D = Dummy()
- D.info = {
- 'task': BINARY_CLASSIFICATION,
- 'is_sparse': False,
- 'label_num': 2
- }
+ D.info = {"task": BINARY_CLASSIFICATION, "is_sparse": False, "label_num": 2}
D.data = {
- 'X_train': X_train,
- 'Y_train': Y_train.reshape((-1, 1)),
- 'X_valid': X_valid,
- 'Y_valid': Y_valid.reshape((-1, 1)),
- 'X_test': X_test,
- 'Y_test': Y_test.reshape((-1, 1))
+ "X_train": X_train,
+ "Y_train": Y_train.reshape((-1, 1)),
+ "X_valid": X_valid,
+ "Y_valid": Y_valid.reshape((-1, 1)),
+ "X_test": X_test,
+ "Y_test": Y_test.reshape((-1, 1)),
}
- D.feat_type = {0: 'numerical',
- 1: 'Numerical',
- 2: 'numerical',
- 3: 'numerical'}
+ D.feat_type = {0: "numerical", 1: "Numerical", 2: "numerical", 3: "numerical"}
return D
def get_regression_datamanager():
- X_train, Y_train, X_test, Y_test = get_dataset('boston')
+ X_train, Y_train, X_test, Y_test = get_dataset("boston")
indices = list(range(X_train.shape[0]))
np.random.seed(1)
np.random.shuffle(indices)
X_train = X_train[indices]
Y_train = Y_train[indices]
- X_valid = X_test[:200, ]
- Y_valid = Y_test[:200, ]
- X_test = X_test[200:, ]
- Y_test = Y_test[200:, ]
+ X_valid = X_test[
+ :200,
+ ]
+ Y_valid = Y_test[
+ :200,
+ ]
+ X_test = X_test[
+ 200:,
+ ]
+ Y_test = Y_test[
+ 200:,
+ ]
D = Dummy()
- D.info = {
- 'task': REGRESSION,
- 'is_sparse': False,
- 'label_num': 1
- }
+ D.info = {"task": REGRESSION, "is_sparse": False, "label_num": 1}
D.data = {
- 'X_train': X_train,
- 'Y_train': Y_train.reshape((-1, 1)),
- 'X_valid': X_valid,
- 'Y_valid': Y_valid.reshape((-1, 1)),
- 'X_test': X_test,
- 'Y_test': Y_test.reshape((-1, 1))
+ "X_train": X_train,
+ "Y_train": Y_train.reshape((-1, 1)),
+ "X_valid": X_valid,
+ "Y_valid": Y_valid.reshape((-1, 1)),
+ "X_test": X_test,
+ "Y_test": Y_test.reshape((-1, 1)),
}
- D.feat_type = {i: 'numerical' for i in range(X_train.shape[1])}
+ D.feat_type = {i: "numerical" for i in range(X_train.shape[1])}
return D
def get_500_classes_datamanager():
weights = ([0.002] * 475) + ([0.001] * 25)
- X, Y = sklearn.datasets.make_classification(n_samples=1000,
- n_features=20,
- n_classes=500,
- n_clusters_per_class=1,
- n_informative=15,
- n_redundant=5,
- n_repeated=0,
- weights=weights,
- flip_y=0,
- class_sep=1.0,
- hypercube=True,
- shift=None,
- scale=1.0,
- shuffle=True,
- random_state=1)
+ X, Y = sklearn.datasets.make_classification(
+ n_samples=1000,
+ n_features=20,
+ n_classes=500,
+ n_clusters_per_class=1,
+ n_informative=15,
+ n_redundant=5,
+ n_repeated=0,
+ weights=weights,
+ flip_y=0,
+ class_sep=1.0,
+ hypercube=True,
+ shift=None,
+ scale=1.0,
+ shuffle=True,
+ random_state=1,
+ )
D = Dummy()
- D.info = {
- 'task': MULTICLASS_CLASSIFICATION,
- 'is_sparse': False,
- 'label_num': 500
+ D.info = {"task": MULTICLASS_CLASSIFICATION, "is_sparse": False, "label_num": 500}
+ D.data = {
+ "X_train": X[:700],
+ "Y_train": Y[:700],
+ "X_valid": X[700:710],
+ "Y_valid": Y[700:710],
+ "X_test": X[710:],
+ "Y_test": Y[710:],
}
- D.data = {'X_train': X[:700], 'Y_train': Y[:700],
- 'X_valid': X[700:710], 'Y_valid': Y[700:710],
- 'X_test': X[710:], 'Y_test': Y[710:]
- }
- D.feat_type = {i: 'numerical' for i in range(20)}
+ D.feat_type = {i: "numerical" for i in range(20)}
return D
def get_dataset_getters():
- return [get_binary_classification_datamanager,
- get_multiclass_classification_datamanager,
- get_multilabel_classification_datamanager,
- get_500_classes_datamanager,
- get_abalone_datamanager,
- get_regression_datamanager]
+ return [
+ get_binary_classification_datamanager,
+ get_multiclass_classification_datamanager,
+ get_multilabel_classification_datamanager,
+ get_500_classes_datamanager,
+ get_abalone_datamanager,
+ get_regression_datamanager,
+ ]
diff --git a/test/test_evaluation/test_abstract_evaluator.py b/test/test_evaluation/test_abstract_evaluator.py
index f51820221b..c668a82ffd 100644
--- a/test/test_evaluation/test_abstract_evaluator.py
+++ b/test/test_evaluation/test_abstract_evaluator.py
@@ -3,19 +3,18 @@
import os
import shutil
import sys
+import tempfile
import unittest
import unittest.mock
-import tempfile
import numpy as np
import sklearn.dummy
+from smac.tae import StatusType
from autosklearn.automl_common.common.utils.backend import Backend, BackendContext
-
from autosklearn.evaluation.abstract_evaluator import AbstractEvaluator
-from autosklearn.pipeline.components.base import _addons
from autosklearn.metrics import accuracy
-from smac.tae import StatusType
+from autosklearn.pipeline.components.base import _addons
this_directory = os.path.dirname(__file__)
sys.path.append(this_directory)
@@ -29,7 +28,7 @@ def setUp(self):
"""
Creates a backend mock
"""
- self.ev_path = os.path.join(this_directory, '.tmp_evaluations')
+ self.ev_path = os.path.join(this_directory, ".tmp_evaluations")
if not os.path.exists(self.ev_path):
os.mkdir(self.ev_path)
dummy_model_files = [os.path.join(self.ev_path, str(n)) for n in range(100)]
@@ -46,7 +45,7 @@ def setUp(self):
self.port = logging.handlers.DEFAULT_TCP_LOGGING_PORT
- self.working_directory = os.path.join(this_directory, '.tmp_%s' % self.id())
+ self.working_directory = os.path.join(this_directory, ".tmp_%s" % self.id())
def tearDown(self):
if os.path.exists(self.ev_path):
@@ -56,16 +55,18 @@ def tearDown(self):
pass
def test_finish_up_model_predicts_NaN(self):
- '''Tests by handing in predictions which contain NaNs'''
+ """Tests by handing in predictions which contain NaNs"""
rs = np.random.RandomState(1)
queue_mock = unittest.mock.Mock()
- ae = AbstractEvaluator(backend=self.backend_mock,
- port=self.port,
- output_y_hat_optimization=False,
- queue=queue_mock, metric=accuracy,
- additional_components=dict(),
- )
+ ae = AbstractEvaluator(
+ backend=self.backend_mock,
+ port=self.port,
+ output_y_hat_optimization=False,
+ queue=queue_mock,
+ metric=accuracy,
+ additional_components=dict(),
+ )
ae.Y_optimization = rs.rand(33, 3)
predictions_ensemble = rs.rand(33, 3)
predictions_test = rs.rand(25, 3)
@@ -85,9 +86,10 @@ def test_finish_up_model_predicts_NaN(self):
status=StatusType.SUCCESS,
)
self.assertEqual(loss, 1.0)
- self.assertEqual(additional_run_info,
- {'error': 'Model predictions for optimization set '
- 'contains NaNs.'})
+ self.assertEqual(
+ additional_run_info,
+ {"error": "Model predictions for optimization set " "contains NaNs."},
+ )
# NaNs in prediction validation
predictions_ensemble[5, 2] = 0.5
@@ -104,9 +106,10 @@ def test_finish_up_model_predicts_NaN(self):
status=StatusType.SUCCESS,
)
self.assertEqual(loss, 1.0)
- self.assertEqual(additional_run_info,
- {'error': 'Model predictions for validation set '
- 'contains NaNs.'})
+ self.assertEqual(
+ additional_run_info,
+ {"error": "Model predictions for validation set " "contains NaNs."},
+ )
# NaNs in prediction test
predictions_valid[5, 2] = 0.5
@@ -123,9 +126,10 @@ def test_finish_up_model_predicts_NaN(self):
status=StatusType.SUCCESS,
)
self.assertEqual(loss, 1.0)
- self.assertEqual(additional_run_info,
- {'error': 'Model predictions for test set contains '
- 'NaNs.'})
+ self.assertEqual(
+ additional_run_info,
+ {"error": "Model predictions for test set contains " "NaNs."},
+ )
self.assertEqual(self.backend_mock.save_predictions_as_npy.call_count, 0)
@@ -147,12 +151,10 @@ def test_disable_file_output(self):
predictions_test = rs.rand(25, 3)
predictions_valid = rs.rand(25, 3)
- loss_, additional_run_info_ = (
- ae.file_output(
- predictions_ensemble,
- predictions_valid,
- predictions_test,
- )
+ loss_, additional_run_info_ = ae.file_output(
+ predictions_ensemble,
+ predictions_valid,
+ predictions_test,
)
self.assertIsNone(loss_)
@@ -160,7 +162,7 @@ def test_disable_file_output(self):
# This function is never called as there is a return before
self.assertEqual(self.backend_mock.save_numrun_to_dir.call_count, 0)
- for call_count, disable in enumerate(['model', 'cv_model'], start=1):
+ for call_count, disable in enumerate(["model", "cv_model"], start=1):
ae = AbstractEvaluator(
backend=self.backend_mock,
output_y_hat_optimization=False,
@@ -174,38 +176,49 @@ def test_disable_file_output(self):
ae.model = unittest.mock.Mock()
ae.models = [unittest.mock.Mock()]
- loss_, additional_run_info_ = (
- ae.file_output(
- predictions_ensemble,
- predictions_valid,
- predictions_test,
- )
+ loss_, additional_run_info_ = ae.file_output(
+ predictions_ensemble,
+ predictions_valid,
+ predictions_test,
)
self.assertIsNone(loss_)
self.assertEqual(additional_run_info_, {})
- self.assertEqual(self.backend_mock.save_numrun_to_dir.call_count, call_count)
- if disable == 'model':
+ self.assertEqual(
+ self.backend_mock.save_numrun_to_dir.call_count, call_count
+ )
+ if disable == "model":
self.assertIsNone(
- self.backend_mock.save_numrun_to_dir.call_args_list[-1][1]['model'])
+ self.backend_mock.save_numrun_to_dir.call_args_list[-1][1]["model"]
+ )
self.assertIsNotNone(
- self.backend_mock.save_numrun_to_dir.call_args_list[-1][1]['cv_model'])
+ self.backend_mock.save_numrun_to_dir.call_args_list[-1][1][
+ "cv_model"
+ ]
+ )
else:
self.assertIsNotNone(
- self.backend_mock.save_numrun_to_dir.call_args_list[-1][1]['model'])
+ self.backend_mock.save_numrun_to_dir.call_args_list[-1][1]["model"]
+ )
self.assertIsNone(
- self.backend_mock.save_numrun_to_dir.call_args_list[-1][1]['cv_model'])
+ self.backend_mock.save_numrun_to_dir.call_args_list[-1][1][
+ "cv_model"
+ ]
+ )
self.assertIsNotNone(
self.backend_mock.save_numrun_to_dir.call_args_list[-1][1][
- 'ensemble_predictions']
+ "ensemble_predictions"
+ ]
)
self.assertIsNotNone(
self.backend_mock.save_numrun_to_dir.call_args_list[-1][1][
- 'valid_predictions']
+ "valid_predictions"
+ ]
)
self.assertIsNotNone(
self.backend_mock.save_numrun_to_dir.call_args_list[-1][1][
- 'test_predictions']
+ "test_predictions"
+ ]
)
ae = AbstractEvaluator(
@@ -213,20 +226,18 @@ def test_disable_file_output(self):
output_y_hat_optimization=False,
queue=queue_mock,
metric=accuracy,
- disable_file_output=['y_optimization'],
+ disable_file_output=["y_optimization"],
port=self.port,
additional_components=dict(),
)
ae.Y_optimization = predictions_ensemble
- ae.model = 'model'
+ ae.model = "model"
ae.models = [unittest.mock.Mock()]
- loss_, additional_run_info_ = (
- ae.file_output(
- predictions_ensemble,
- predictions_valid,
- predictions_test,
- )
+ loss_, additional_run_info_ = ae.file_output(
+ predictions_ensemble,
+ predictions_valid,
+ predictions_test,
)
self.assertIsNone(loss_)
@@ -234,15 +245,18 @@ def test_disable_file_output(self):
self.assertIsNone(
self.backend_mock.save_numrun_to_dir.call_args_list[-1][1][
- 'ensemble_predictions']
+ "ensemble_predictions"
+ ]
)
self.assertIsNotNone(
self.backend_mock.save_numrun_to_dir.call_args_list[-1][1][
- 'valid_predictions']
+ "valid_predictions"
+ ]
)
self.assertIsNotNone(
self.backend_mock.save_numrun_to_dir.call_args_list[-1][1][
- 'test_predictions']
+ "test_predictions"
+ ]
)
def test_file_output(self):
@@ -252,14 +266,18 @@ def test_file_output(self):
queue_mock = unittest.mock.Mock()
context = BackendContext(
- temporary_directory=os.path.join(self.working_directory, 'tmp'),
- output_directory=os.path.join(self.working_directory, 'tmp_output'),
+ temporary_directory=os.path.join(self.working_directory, "tmp"),
+ output_directory=os.path.join(self.working_directory, "tmp_output"),
delete_tmp_folder_after_terminate=True,
delete_output_folder_after_terminate=True,
- prefix="auto-sklearn"
+ prefix="auto-sklearn",
)
- with unittest.mock.patch.object(Backend, 'load_datamanager') as load_datamanager_mock:
- load_datamanager_mock.return_value = get_multiclass_classification_datamanager()
+ with unittest.mock.patch.object(
+ Backend, "load_datamanager"
+ ) as load_datamanager_mock:
+ load_datamanager_mock.return_value = (
+ get_multiclass_classification_datamanager()
+ )
backend = Backend(context, prefix="auto-sklearn")
@@ -285,8 +303,17 @@ def test_file_output(self):
Y_test_pred=predictions_test,
)
- self.assertTrue(os.path.exists(os.path.join(self.working_directory, 'tmp',
- '.auto-sklearn', 'runs', '1_0_None')))
+ self.assertTrue(
+ os.path.exists(
+ os.path.join(
+ self.working_directory,
+ "tmp",
+ ".auto-sklearn",
+ "runs",
+ "1_0_None",
+ )
+ )
+ )
shutil.rmtree(self.working_directory, ignore_errors=True)
@@ -297,26 +324,34 @@ def test_add_additional_components(self):
queue_mock = unittest.mock.Mock()
context = BackendContext(
- temporary_directory=os.path.join(self.working_directory, 'tmp'),
- output_directory=os.path.join(self.working_directory, 'tmp_output'),
+ temporary_directory=os.path.join(self.working_directory, "tmp"),
+ output_directory=os.path.join(self.working_directory, "tmp_output"),
delete_tmp_folder_after_terminate=True,
delete_output_folder_after_terminate=True,
- prefix="auto-sklearn"
+ prefix="auto-sklearn",
)
- with unittest.mock.patch.object(Backend, 'load_datamanager') as load_datamanager_mock:
- load_datamanager_mock.return_value = get_multiclass_classification_datamanager()
+ with unittest.mock.patch.object(
+ Backend, "load_datamanager"
+ ) as load_datamanager_mock:
+ load_datamanager_mock.return_value = (
+ get_multiclass_classification_datamanager()
+ )
backend = Backend(context, prefix="auto-sklearn")
- with unittest.mock.patch.object(_addons['classification'], 'add_component') as _:
+ with unittest.mock.patch.object(
+ _addons["classification"], "add_component"
+ ) as _:
- # If the components in the argument `additional_components` are an empty dict
- # there is no call to `add_component`, if there's something in it, `add_component
- # is called (2nd case)
- for fixture, case in ((0, dict()), (1, dict(abc='def'))):
+ # If the components in the argument `additional_components` are an
+ # empty dict, there is no call to `add_component`;
+ # if there's something in it, `add_component` is called (2nd case)
+ for fixture, case in ((0, dict()), (1, dict(abc="def"))):
thirdparty_components_patch = unittest.mock.Mock()
thirdparty_components_patch.components = case
- additional_components = dict(classification=thirdparty_components_patch)
+ additional_components = dict(
+ classification=thirdparty_components_patch
+ )
AbstractEvaluator(
backend=backend,
output_y_hat_optimization=False,
@@ -325,4 +360,6 @@ def test_add_additional_components(self):
port=self.port,
additional_components=additional_components,
)
- self.assertEqual(_addons['classification'].add_component.call_count, fixture)
+ self.assertEqual(
+ _addons["classification"].add_component.call_count, fixture
+ )
diff --git a/test/test_evaluation/test_custom_splitters.py b/test/test_evaluation/test_custom_splitters.py
index 4922442228..64f9dc2f18 100644
--- a/test/test_evaluation/test_custom_splitters.py
+++ b/test/test_evaluation/test_custom_splitters.py
@@ -1,37 +1,44 @@
-import pytest
-
import numpy as np
+import pytest
-from autosklearn.evaluation.splitter import CustomStratifiedShuffleSplit
from autosklearn.constants import (
- BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION, MULTILABEL_CLASSIFICATION
+ BINARY_CLASSIFICATION,
+ MULTICLASS_CLASSIFICATION,
+ MULTILABEL_CLASSIFICATION,
)
+from autosklearn.evaluation.splitter import CustomStratifiedShuffleSplit
-@pytest.mark.parametrize("task, X, y", [
- (
- BINARY_CLASSIFICATION,
- np.asarray(10000 * [[1, 1, 1, 1, 1]]),
- np.asarray(9999 * [0] + 1 * [1])
- ),
- (
- MULTICLASS_CLASSIFICATION,
- np.asarray(10000 * [[1, 1, 1, 1, 1]]),
- np.asarray(4999 * [1] + 4999 * [2] + 1 * [3] + 1 * [4])),
- (
- MULTILABEL_CLASSIFICATION,
- np.asarray(10000 * [[1, 1, 1, 1, 1]]),
- np.asarray(4999 * [[0, 1, 1]] + 4999 * [[1, 1, 0]] + 1 * [[1, 0, 1]] + 1 * [[0, 0, 0]])
- )
-])
-@pytest.mark.parametrize('train_size', [100, 0.5, 200, 0.75])
+@pytest.mark.parametrize(
+ "task, X, y",
+ [
+ (
+ BINARY_CLASSIFICATION,
+ np.asarray(10000 * [[1, 1, 1, 1, 1]]),
+ np.asarray(9999 * [0] + 1 * [1]),
+ ),
+ (
+ MULTICLASS_CLASSIFICATION,
+ np.asarray(10000 * [[1, 1, 1, 1, 1]]),
+ np.asarray(4999 * [1] + 4999 * [2] + 1 * [3] + 1 * [4]),
+ ),
+ (
+ MULTILABEL_CLASSIFICATION,
+ np.asarray(10000 * [[1, 1, 1, 1, 1]]),
+ np.asarray(
+ 4999 * [[0, 1, 1]]
+ + 4999 * [[1, 1, 0]]
+ + 1 * [[1, 0, 1]]
+ + 1 * [[0, 0, 0]]
+ ),
+ ),
+ ],
+)
+@pytest.mark.parametrize("train_size", [100, 0.5, 200, 0.75])
def test_custom_stratified_shuffle_split_returns_unique_labels_and_maintains_size(
task, X, y, train_size
):
- splitter = CustomStratifiedShuffleSplit(
- train_size=train_size,
- random_state=1
- )
+ splitter = CustomStratifiedShuffleSplit(train_size=train_size, random_state=1)
left_idxs, _ = next(splitter.split(X=X, y=y))
y_sampled = y[left_idxs]
X_sampled = X[left_idxs]
@@ -46,5 +53,6 @@ def test_custom_stratified_shuffle_split_returns_unique_labels_and_maintains_siz
assert len(X_sampled) == n_samples
# Assert all the unique labels are present in the training set
- assert all(label in np.unique(y_sampled) for label in np.unique(y)), \
- f"{task} failed, {np.unique(y)} != {np.unique(y_sampled)}"
+ assert all(
+ label in np.unique(y_sampled) for label in np.unique(y)
+ ), f"{task} failed, {np.unique(y)} != {np.unique(y_sampled)}"
diff --git a/test/test_evaluation/test_dummy_pipelines.py b/test/test_evaluation/test_dummy_pipelines.py
index ed7c499711..3d5f1d0f59 100644
--- a/test/test_evaluation/test_dummy_pipelines.py
+++ b/test/test_evaluation/test_dummy_pipelines.py
@@ -1,20 +1,21 @@
import numpy as np
-
import pytest
-
from sklearn.base import clone
from sklearn.datasets import make_classification, make_regression
from sklearn.utils.validation import check_is_fitted
-from autosklearn.evaluation.abstract_evaluator import MyDummyClassifier, MyDummyRegressor
+from autosklearn.evaluation.abstract_evaluator import (
+ MyDummyClassifier,
+ MyDummyRegressor,
+)
-@pytest.mark.parametrize("task_type", ['classification', 'regression'])
+@pytest.mark.parametrize("task_type", ["classification", "regression"])
def test_dummy_pipeline(task_type):
- if task_type == 'classification':
+ if task_type == "classification":
estimator_class = MyDummyClassifier
data_maker = make_classification
- elif task_type == 'regression':
+ elif task_type == "regression":
estimator_class = MyDummyRegressor
data_maker = make_regression
else:
diff --git a/test/test_evaluation/test_evaluation.py b/test/test_evaluation/test_evaluation.py
index 77f6e5c4bf..67d9e0ca8b 100644
--- a/test/test_evaluation/test_evaluation.py
+++ b/test/test_evaluation/test_evaluation.py
@@ -1,6 +1,6 @@
-import os
import logging
import logging.handlers
+import os
import shutil
import sys
import time
@@ -19,20 +19,21 @@
this_directory = os.path.dirname(__file__)
sys.path.append(this_directory)
-from evaluation_util import get_multiclass_classification_datamanager, get_evaluation_backend # noqa E402
+from evaluation_util import ( # noqa E402
+ get_evaluation_backend,
+ get_multiclass_classification_datamanager,
+)
def safe_eval_success_mock(*args, **kwargs):
- queue = kwargs['queue']
- queue.put({'status': StatusType.SUCCESS,
- 'loss': 0.5,
- 'additional_run_info': ''})
+ queue = kwargs["queue"]
+ queue.put({"status": StatusType.SUCCESS, "loss": 0.5, "additional_run_info": ""})
class EvaluationTest(unittest.TestCase):
def setUp(self):
self.datamanager = get_multiclass_classification_datamanager()
- self.tmp = os.path.join(os.getcwd(), '.test_evaluation')
+ self.tmp = os.path.join(os.getcwd(), ".test_evaluation")
self.logger = logging.getLogger()
scenario_mock = unittest.mock.Mock()
scenario_mock.wallclock_limit = 10
@@ -72,142 +73,211 @@ def test_pynisher_timeout(self):
def run_over_time():
time.sleep(2)
- safe_eval = pynisher.enforce_limits(wall_time_in_s=1,
- grace_period_in_s=0)(run_over_time)
+ safe_eval = pynisher.enforce_limits(wall_time_in_s=1, grace_period_in_s=0)(
+ run_over_time
+ )
safe_eval()
self.assertEqual(safe_eval.exit_status, pynisher.TimeoutException)
############################################################################
# Test ExecuteTaFuncWithQueue.run_wrapper()
- @unittest.mock.patch('autosklearn.evaluation.train_evaluator.eval_holdout')
+ @unittest.mock.patch("autosklearn.evaluation.train_evaluator.eval_holdout")
def test_eval_with_limits_holdout(self, pynisher_mock):
pynisher_mock.side_effect = safe_eval_success_mock
config = unittest.mock.Mock()
config.config_id = 198
- ta = ExecuteTaFuncWithQueue(backend=self.backend, autosklearn_seed=1,
- port=self.logger_port,
- resampling_strategy='holdout',
- stats=self.stats,
- memory_limit=3072,
- metric=accuracy,
- cost_for_crash=get_cost_of_crash(accuracy),
- abort_on_first_run_crash=False,
- pynisher_context='fork',
- )
- info = ta.run_wrapper(RunInfo(config=config, cutoff=30, instance=None,
- instance_specific=None, seed=1, capped=False))
+ ta = ExecuteTaFuncWithQueue(
+ backend=self.backend,
+ autosklearn_seed=1,
+ port=self.logger_port,
+ resampling_strategy="holdout",
+ stats=self.stats,
+ memory_limit=3072,
+ metric=accuracy,
+ cost_for_crash=get_cost_of_crash(accuracy),
+ abort_on_first_run_crash=False,
+ pynisher_context="fork",
+ )
+ info = ta.run_wrapper(
+ RunInfo(
+ config=config,
+ cutoff=30,
+ instance=None,
+ instance_specific=None,
+ seed=1,
+ capped=False,
+ )
+ )
self.assertEqual(info[0].config.config_id, 198)
self.assertEqual(info[1].status, StatusType.SUCCESS)
self.assertEqual(info[1].cost, 0.5)
self.assertIsInstance(info[1].time, float)
- @unittest.mock.patch('pynisher.enforce_limits')
+ @unittest.mock.patch("pynisher.enforce_limits")
def test_zero_or_negative_cutoff(self, pynisher_mock):
config = unittest.mock.Mock()
config.config_id = 198
- ta = ExecuteTaFuncWithQueue(backend=self.backend, autosklearn_seed=1,
- port=self.logger_port,
- resampling_strategy='holdout',
- stats=self.stats,
- metric=accuracy,
- cost_for_crash=get_cost_of_crash(accuracy),
- abort_on_first_run_crash=False,
- pynisher_context='forkserver',
- )
+ ta = ExecuteTaFuncWithQueue(
+ backend=self.backend,
+ autosklearn_seed=1,
+ port=self.logger_port,
+ resampling_strategy="holdout",
+ stats=self.stats,
+ metric=accuracy,
+ cost_for_crash=get_cost_of_crash(accuracy),
+ abort_on_first_run_crash=False,
+ pynisher_context="forkserver",
+ )
self.scenario.wallclock_limit = 5
self.stats.submitted_ta_runs += 1
- run_info, run_value = ta.run_wrapper(RunInfo(config=config, cutoff=9, instance=None,
- instance_specific=None, seed=1, capped=False))
+ run_info, run_value = ta.run_wrapper(
+ RunInfo(
+ config=config,
+ cutoff=9,
+ instance=None,
+ instance_specific=None,
+ seed=1,
+ capped=False,
+ )
+ )
self.assertEqual(run_value.status, StatusType.STOP)
- @unittest.mock.patch('pynisher.enforce_limits')
+ @unittest.mock.patch("pynisher.enforce_limits")
def test_cutoff_lower_than_remaining_time(self, pynisher_mock):
config = unittest.mock.Mock()
config.config_id = 198
- ta = ExecuteTaFuncWithQueue(backend=self.backend, autosklearn_seed=1,
- port=self.logger_port,
- resampling_strategy='holdout',
- stats=self.stats,
- metric=accuracy,
- cost_for_crash=get_cost_of_crash(accuracy),
- abort_on_first_run_crash=False,
- pynisher_context='forkserver',
- )
+ ta = ExecuteTaFuncWithQueue(
+ backend=self.backend,
+ autosklearn_seed=1,
+ port=self.logger_port,
+ resampling_strategy="holdout",
+ stats=self.stats,
+ metric=accuracy,
+ cost_for_crash=get_cost_of_crash(accuracy),
+ abort_on_first_run_crash=False,
+ pynisher_context="forkserver",
+ )
self.stats.ta_runs = 1
- ta.run_wrapper(RunInfo(config=config, cutoff=30, instance=None, instance_specific=None,
- seed=1, capped=False))
- self.assertEqual(pynisher_mock.call_args[1]['wall_time_in_s'], 4)
- self.assertIsInstance(pynisher_mock.call_args[1]['wall_time_in_s'], int)
+ ta.run_wrapper(
+ RunInfo(
+ config=config,
+ cutoff=30,
+ instance=None,
+ instance_specific=None,
+ seed=1,
+ capped=False,
+ )
+ )
+ self.assertEqual(pynisher_mock.call_args[1]["wall_time_in_s"], 4)
+ self.assertIsInstance(pynisher_mock.call_args[1]["wall_time_in_s"], int)
- @unittest.mock.patch('autosklearn.evaluation.train_evaluator.eval_holdout')
+ @unittest.mock.patch("autosklearn.evaluation.train_evaluator.eval_holdout")
def test_eval_with_limits_holdout_fail_silent(self, pynisher_mock):
pynisher_mock.return_value = None
config = unittest.mock.Mock()
- config.origin = 'MOCK'
+ config.origin = "MOCK"
config.config_id = 198
- ta = ExecuteTaFuncWithQueue(backend=self.backend, autosklearn_seed=1,
- port=self.logger_port,
- resampling_strategy='holdout',
- stats=self.stats,
- memory_limit=3072,
- metric=accuracy,
- cost_for_crash=get_cost_of_crash(accuracy),
- abort_on_first_run_crash=False,
- pynisher_context='fork',
- )
+ ta = ExecuteTaFuncWithQueue(
+ backend=self.backend,
+ autosklearn_seed=1,
+ port=self.logger_port,
+ resampling_strategy="holdout",
+ stats=self.stats,
+ memory_limit=3072,
+ metric=accuracy,
+ cost_for_crash=get_cost_of_crash(accuracy),
+ abort_on_first_run_crash=False,
+ pynisher_context="fork",
+ )
# The following should not fail because abort_on_first_run_crash is False
- info = ta.run_wrapper(RunInfo(config=config, cutoff=60, instance=None,
- instance_specific=None, seed=1, capped=False))
+ info = ta.run_wrapper(
+ RunInfo(
+ config=config,
+ cutoff=60,
+ instance=None,
+ instance_specific=None,
+ seed=1,
+ capped=False,
+ )
+ )
self.assertEqual(info[1].status, StatusType.CRASHED)
self.assertEqual(info[1].cost, 1.0)
self.assertIsInstance(info[1].time, float)
- self.assertEqual(info[1].additional_info, {'configuration_origin': 'MOCK',
- 'error': "Result queue is empty",
- 'exit_status': 0,
- 'exitcode': 0,
- 'subprocess_stdout': '',
- 'subprocess_stderr': ''})
+ self.assertEqual(
+ info[1].additional_info,
+ {
+ "configuration_origin": "MOCK",
+ "error": "Result queue is empty",
+ "exit_status": 0,
+ "exitcode": 0,
+ "subprocess_stdout": "",
+ "subprocess_stderr": "",
+ },
+ )
self.stats.submitted_ta_runs += 1
- info = ta.run_wrapper(RunInfo(config=config, cutoff=30, instance=None,
- instance_specific=None, seed=1, capped=False))
+ info = ta.run_wrapper(
+ RunInfo(
+ config=config,
+ cutoff=30,
+ instance=None,
+ instance_specific=None,
+ seed=1,
+ capped=False,
+ )
+ )
self.assertEqual(info[1].status, StatusType.CRASHED)
self.assertEqual(info[1].cost, 1.0)
self.assertIsInstance(info[1].time, float)
- self.assertEqual(info[1].additional_info, {'configuration_origin': 'MOCK',
- 'error': "Result queue is empty",
- 'exit_status': 0,
- 'exitcode': 0,
- 'subprocess_stdout': '',
- 'subprocess_stderr': ''})
-
- @unittest.mock.patch('autosklearn.evaluation.train_evaluator.eval_holdout')
+ self.assertEqual(
+ info[1].additional_info,
+ {
+ "configuration_origin": "MOCK",
+ "error": "Result queue is empty",
+ "exit_status": 0,
+ "exitcode": 0,
+ "subprocess_stdout": "",
+ "subprocess_stderr": "",
+ },
+ )
+
+ @unittest.mock.patch("autosklearn.evaluation.train_evaluator.eval_holdout")
def test_eval_with_limits_holdout_fail_memory_error(self, pynisher_mock):
pynisher_mock.side_effect = MemoryError
config = unittest.mock.Mock()
config.config_id = 198
- ta = ExecuteTaFuncWithQueue(backend=self.backend, autosklearn_seed=1,
- port=self.logger_port,
- resampling_strategy='holdout',
- stats=self.stats,
- memory_limit=3072,
- metric=log_loss,
- cost_for_crash=get_cost_of_crash(log_loss),
- abort_on_first_run_crash=False,
- pynisher_context='fork',
- )
- info = ta.run_wrapper(RunInfo(config=config, cutoff=30, instance=None,
- instance_specific=None, seed=1, capped=False))
+ ta = ExecuteTaFuncWithQueue(
+ backend=self.backend,
+ autosklearn_seed=1,
+ port=self.logger_port,
+ resampling_strategy="holdout",
+ stats=self.stats,
+ memory_limit=3072,
+ metric=log_loss,
+ cost_for_crash=get_cost_of_crash(log_loss),
+ abort_on_first_run_crash=False,
+ pynisher_context="fork",
+ )
+ info = ta.run_wrapper(
+ RunInfo(
+ config=config,
+ cutoff=30,
+ instance=None,
+ instance_specific=None,
+ seed=1,
+ capped=False,
+ )
+ )
self.assertEqual(info[1].status, StatusType.MEMOUT)
# For logloss, worst possible result is MAXINT
worst_possible_result = MAXINT
self.assertEqual(info[1].cost, worst_possible_result)
self.assertIsInstance(info[1].time, float)
- self.assertNotIn('exitcode', info[1].additional_info)
+ self.assertNotIn("exitcode", info[1].additional_info)
- @unittest.mock.patch('pynisher.enforce_limits')
+ @unittest.mock.patch("pynisher.enforce_limits")
def test_eval_with_limits_holdout_fail_timeout(self, pynisher_mock):
config = unittest.mock.Mock()
config.config_id = 198
@@ -218,33 +288,46 @@ def test_eval_with_limits_holdout_fail_timeout(self, pynisher_mock):
pynisher_mock.return_value = m1
m2.exit_status = pynisher.TimeoutException
m2.wall_clock_time = 30
- ta = ExecuteTaFuncWithQueue(backend=self.backend, autosklearn_seed=1,
- port=self.logger_port,
- resampling_strategy='holdout',
- stats=self.stats,
- memory_limit=3072,
- metric=accuracy,
- cost_for_crash=get_cost_of_crash(accuracy),
- abort_on_first_run_crash=False,
- pynisher_context='forkserver',
- )
- info = ta.run_wrapper(RunInfo(config=config, cutoff=30, instance=None,
- instance_specific=None, seed=1, capped=False))
+ ta = ExecuteTaFuncWithQueue(
+ backend=self.backend,
+ autosklearn_seed=1,
+ port=self.logger_port,
+ resampling_strategy="holdout",
+ stats=self.stats,
+ memory_limit=3072,
+ metric=accuracy,
+ cost_for_crash=get_cost_of_crash(accuracy),
+ abort_on_first_run_crash=False,
+ pynisher_context="forkserver",
+ )
+ info = ta.run_wrapper(
+ RunInfo(
+ config=config,
+ cutoff=30,
+ instance=None,
+ instance_specific=None,
+ seed=1,
+ capped=False,
+ )
+ )
self.assertEqual(info[1].status, StatusType.TIMEOUT)
self.assertEqual(info[1].cost, 1.0)
self.assertIsInstance(info[1].time, float)
- self.assertNotIn('exitcode', info[1].additional_info)
+ self.assertNotIn("exitcode", info[1].additional_info)
- @unittest.mock.patch('pynisher.enforce_limits')
- def test_eval_with_limits_holdout_timeout_with_results_in_queue(self, pynisher_mock):
+ @unittest.mock.patch("pynisher.enforce_limits")
+ def test_eval_with_limits_holdout_timeout_with_results_in_queue(
+ self, pynisher_mock
+ ):
config = unittest.mock.Mock()
config.config_id = 198
def side_effect(**kwargs):
- queue = kwargs['queue']
- queue.put({'status': StatusType.SUCCESS,
- 'loss': 0.5,
- 'additional_run_info': {}})
+ queue = kwargs["queue"]
+ queue.put(
+ {"status": StatusType.SUCCESS, "loss": 0.5, "additional_run_info": {}}
+ )
+
m1 = unittest.mock.Mock()
m2 = unittest.mock.Mock()
m1.return_value = m2
@@ -254,137 +337,194 @@ def side_effect(**kwargs):
m2.wall_clock_time = 30
# Test for a successful run
- ta = ExecuteTaFuncWithQueue(backend=self.backend, autosklearn_seed=1,
- port=self.logger_port,
- resampling_strategy='holdout',
- stats=self.stats,
- memory_limit=3072,
- metric=accuracy,
- cost_for_crash=get_cost_of_crash(accuracy),
- abort_on_first_run_crash=False,
- pynisher_context='forkserver',
- )
- info = ta.run_wrapper(RunInfo(config=config, cutoff=30, instance=None,
- instance_specific=None, seed=1, capped=False))
+ ta = ExecuteTaFuncWithQueue(
+ backend=self.backend,
+ autosklearn_seed=1,
+ port=self.logger_port,
+ resampling_strategy="holdout",
+ stats=self.stats,
+ memory_limit=3072,
+ metric=accuracy,
+ cost_for_crash=get_cost_of_crash(accuracy),
+ abort_on_first_run_crash=False,
+ pynisher_context="forkserver",
+ )
+ info = ta.run_wrapper(
+ RunInfo(
+ config=config,
+ cutoff=30,
+ instance=None,
+ instance_specific=None,
+ seed=1,
+ capped=False,
+ )
+ )
self.assertEqual(info[1].status, StatusType.SUCCESS)
self.assertEqual(info[1].cost, 0.5)
self.assertIsInstance(info[1].time, float)
- self.assertNotIn('exitcode', info[1].additional_info)
+ self.assertNotIn("exitcode", info[1].additional_info)
# And a crashed run which is in the queue
def side_effect(**kwargs):
- queue = kwargs['queue']
- queue.put({'status': StatusType.CRASHED,
- 'loss': 2.0,
- 'additional_run_info': {}})
+ queue = kwargs["queue"]
+ queue.put(
+ {"status": StatusType.CRASHED, "loss": 2.0, "additional_run_info": {}}
+ )
+
m2.side_effect = side_effect
- ta = ExecuteTaFuncWithQueue(backend=self.backend, autosklearn_seed=1,
- port=self.logger_port,
- resampling_strategy='holdout',
- stats=self.stats,
- memory_limit=3072,
- metric=accuracy,
- cost_for_crash=get_cost_of_crash(accuracy),
- abort_on_first_run_crash=False,
- pynisher_context='forkserver',
- )
- info = ta.run_wrapper(RunInfo(config=config, cutoff=30, instance=None,
- instance_specific=None, seed=1, capped=False))
+ ta = ExecuteTaFuncWithQueue(
+ backend=self.backend,
+ autosklearn_seed=1,
+ port=self.logger_port,
+ resampling_strategy="holdout",
+ stats=self.stats,
+ memory_limit=3072,
+ metric=accuracy,
+ cost_for_crash=get_cost_of_crash(accuracy),
+ abort_on_first_run_crash=False,
+ pynisher_context="forkserver",
+ )
+ info = ta.run_wrapper(
+ RunInfo(
+ config=config,
+ cutoff=30,
+ instance=None,
+ instance_specific=None,
+ seed=1,
+ capped=False,
+ )
+ )
self.assertEqual(info[1].status, StatusType.CRASHED)
self.assertEqual(info[1].cost, 1.0)
self.assertIsInstance(info[1].time, float)
- self.assertNotIn('exitcode', info[1].additional_info)
+ self.assertNotIn("exitcode", info[1].additional_info)
- @unittest.mock.patch('autosklearn.evaluation.train_evaluator.eval_holdout')
+ @unittest.mock.patch("autosklearn.evaluation.train_evaluator.eval_holdout")
def test_eval_with_limits_holdout_2(self, eval_houldout_mock):
config = unittest.mock.Mock()
config.config_id = 198
def side_effect(*args, **kwargs):
- queue = kwargs['queue']
- queue.put({'status': StatusType.SUCCESS,
- 'loss': 0.5,
- 'additional_run_info': kwargs['instance']})
+ queue = kwargs["queue"]
+ queue.put(
+ {
+ "status": StatusType.SUCCESS,
+ "loss": 0.5,
+ "additional_run_info": kwargs["instance"],
+ }
+ )
+
eval_houldout_mock.side_effect = side_effect
- ta = ExecuteTaFuncWithQueue(backend=self.backend, autosklearn_seed=1,
- port=self.logger_port,
- resampling_strategy='holdout',
- stats=self.stats,
- memory_limit=3072,
- metric=accuracy,
- cost_for_crash=get_cost_of_crash(accuracy),
- abort_on_first_run_crash=False,
- pynisher_context='fork',
- )
+ ta = ExecuteTaFuncWithQueue(
+ backend=self.backend,
+ autosklearn_seed=1,
+ port=self.logger_port,
+ resampling_strategy="holdout",
+ stats=self.stats,
+ memory_limit=3072,
+ metric=accuracy,
+ cost_for_crash=get_cost_of_crash(accuracy),
+ abort_on_first_run_crash=False,
+ pynisher_context="fork",
+ )
self.scenario.wallclock_limit = 180
instance = "{'subsample': 30}"
- info = ta.run_wrapper(RunInfo(config=config, cutoff=30, instance=instance,
- instance_specific=None, seed=1, capped=False))
+ info = ta.run_wrapper(
+ RunInfo(
+ config=config,
+ cutoff=30,
+ instance=instance,
+ instance_specific=None,
+ seed=1,
+ capped=False,
+ )
+ )
self.assertEqual(info[1].status, StatusType.SUCCESS)
self.assertEqual(len(info[1].additional_info), 2)
- self.assertIn('configuration_origin', info[1].additional_info)
- self.assertEqual(info[1].additional_info['message'], "{'subsample': 30}")
+ self.assertIn("configuration_origin", info[1].additional_info)
+ self.assertEqual(info[1].additional_info["message"], "{'subsample': 30}")
- @unittest.mock.patch('autosklearn.evaluation.train_evaluator.eval_holdout')
+ @unittest.mock.patch("autosklearn.evaluation.train_evaluator.eval_holdout")
def test_exception_in_target_function(self, eval_holdout_mock):
config = unittest.mock.Mock()
config.config_id = 198
eval_holdout_mock.side_effect = ValueError
- ta = ExecuteTaFuncWithQueue(backend=self.backend, autosklearn_seed=1,
- port=self.logger_port,
- resampling_strategy='holdout',
- stats=self.stats,
- memory_limit=3072,
- metric=accuracy,
- cost_for_crash=get_cost_of_crash(accuracy),
- abort_on_first_run_crash=False,
- pynisher_context='fork',
- )
+ ta = ExecuteTaFuncWithQueue(
+ backend=self.backend,
+ autosklearn_seed=1,
+ port=self.logger_port,
+ resampling_strategy="holdout",
+ stats=self.stats,
+ memory_limit=3072,
+ metric=accuracy,
+ cost_for_crash=get_cost_of_crash(accuracy),
+ abort_on_first_run_crash=False,
+ pynisher_context="fork",
+ )
self.stats.submitted_ta_runs += 1
- info = ta.run_wrapper(RunInfo(config=config, cutoff=30, instance=None,
- instance_specific=None, seed=1, capped=False))
+ info = ta.run_wrapper(
+ RunInfo(
+ config=config,
+ cutoff=30,
+ instance=None,
+ instance_specific=None,
+ seed=1,
+ capped=False,
+ )
+ )
self.assertEqual(info[1].status, StatusType.CRASHED)
self.assertEqual(info[1].cost, 1.0)
self.assertIsInstance(info[1].time, float)
- self.assertEqual(info[1].additional_info['error'], 'ValueError()')
- self.assertIn('traceback', info[1].additional_info)
- self.assertNotIn('exitcode', info[1].additional_info)
+ self.assertEqual(info[1].additional_info["error"], "ValueError()")
+ self.assertIn("traceback", info[1].additional_info)
+ self.assertNotIn("exitcode", info[1].additional_info)
def test_silent_exception_in_target_function(self):
config = unittest.mock.Mock()
config.config_id = 198
- delattr(self.backend, 'save_targets_ensemble')
- ta = ExecuteTaFuncWithQueue(backend=self.backend,
- port=self.logger_port,
- autosklearn_seed=1,
- resampling_strategy='holdout',
- stats=self.stats,
- memory_limit=3072,
- metric=accuracy,
- cost_for_crash=get_cost_of_crash(accuracy),
- abort_on_first_run_crash=False,
- iterative=False,
- pynisher_context='fork',
- )
+ delattr(self.backend, "save_targets_ensemble")
+ ta = ExecuteTaFuncWithQueue(
+ backend=self.backend,
+ port=self.logger_port,
+ autosklearn_seed=1,
+ resampling_strategy="holdout",
+ stats=self.stats,
+ memory_limit=3072,
+ metric=accuracy,
+ cost_for_crash=get_cost_of_crash(accuracy),
+ abort_on_first_run_crash=False,
+ iterative=False,
+ pynisher_context="fork",
+ )
ta.pynisher_logger = unittest.mock.Mock()
self.stats.submitted_ta_runs += 1
- info = ta.run_wrapper(RunInfo(config=config, cutoff=3000, instance=None,
- instance_specific=None, seed=1, capped=False))
- self.assertEqual(info[1].status, StatusType.CRASHED, msg=str(info[1].additional_info))
+ info = ta.run_wrapper(
+ RunInfo(
+ config=config,
+ cutoff=3000,
+ instance=None,
+ instance_specific=None,
+ seed=1,
+ capped=False,
+ )
+ )
+ self.assertEqual(
+ info[1].status, StatusType.CRASHED, msg=str(info[1].additional_info)
+ )
self.assertEqual(info[1].cost, 1.0)
self.assertIsInstance(info[1].time, float)
self.assertIn(
- info[1].additional_info['error'],
+ info[1].additional_info["error"],
(
"""AttributeError("'BackendMock' object has no attribute """
"""'save_targets_ensemble'",)""",
"""AttributeError("'BackendMock' object has no attribute """
"""'save_targets_ensemble'")""",
- """AttributeError('save_targets_ensemble')"""
- )
+ """AttributeError('save_targets_ensemble')""",
+ ),
)
- self.assertNotIn('exitcode', info[1].additional_info)
- self.assertNotIn('exit_status', info[1].additional_info)
- self.assertNotIn('traceback', info[1])
+ self.assertNotIn("exitcode", info[1].additional_info)
+ self.assertNotIn("exit_status", info[1].additional_info)
+ self.assertNotIn("traceback", info[1])
diff --git a/test/test_evaluation/test_test_evaluator.py b/test/test_evaluation/test_test_evaluator.py
index 93ea0c2265..0a1b67faa9 100644
--- a/test/test_evaluation/test_test_evaluator.py
+++ b/test/test_evaluation/test_test_evaluator.py
@@ -14,24 +14,26 @@
from smac.tae import StatusType
from autosklearn.automl_common.common.utils.backend import Backend
-
-from autosklearn.constants import MULTILABEL_CLASSIFICATION, BINARY_CLASSIFICATION, \
- MULTICLASS_CLASSIFICATION, REGRESSION
+from autosklearn.constants import (
+ BINARY_CLASSIFICATION,
+ MULTICLASS_CLASSIFICATION,
+ MULTILABEL_CLASSIFICATION,
+ REGRESSION,
+)
from autosklearn.evaluation.test_evaluator import TestEvaluator, eval_t
from autosklearn.evaluation.util import read_queue
+from autosklearn.metrics import accuracy, f1_macro, r2
from autosklearn.util.pipeline import get_configuration_space
-from autosklearn.metrics import accuracy, r2, f1_macro
this_directory = os.path.dirname(__file__)
sys.path.append(this_directory)
from evaluation_util import ( # noqa (E402: module level import not at top of file)
- get_evaluation_backend,
- get_dataset_getters,
+ SCORER_LIST,
BaseEvaluatorTest,
+ get_dataset_getters,
+ get_evaluation_backend,
get_multiclass_classification_datamanager,
- SCORER_LIST
-) # noqa (E402: module level import not at top of file)
-
+)
N_TEST_RUNS = 3
@@ -45,28 +47,31 @@ class TestEvaluator_Test(BaseEvaluatorTest, unittest.TestCase):
def test_datasets(self):
for getter in get_dataset_getters():
- testname = '%s_%s' % (os.path.basename(__file__).
- replace('.pyc', '').replace('.py', ''),
- getter.__name__)
+ testname = "%s_%s" % (
+ os.path.basename(__file__).replace(".pyc", "").replace(".py", ""),
+ getter.__name__,
+ )
with self.subTest(testname):
backend_mock = get_evaluation_backend()
D = getter()
D_ = copy.deepcopy(D)
- y = D.data['Y_train']
+ y = D.data["Y_train"]
if len(y.shape) == 2 and y.shape[1] == 1:
- D_.data['Y_train'] = y.flatten()
+ D_.data["Y_train"] = y.flatten()
backend_mock.load_datamanager.return_value = D_
- metric_lookup = {MULTILABEL_CLASSIFICATION: f1_macro,
- BINARY_CLASSIFICATION: accuracy,
- MULTICLASS_CLASSIFICATION: accuracy,
- REGRESSION: r2}
+ metric_lookup = {
+ MULTILABEL_CLASSIFICATION: f1_macro,
+ BINARY_CLASSIFICATION: accuracy,
+ MULTICLASS_CLASSIFICATION: accuracy,
+ REGRESSION: r2,
+ }
queue_ = multiprocessing.Queue()
evaluator = TestEvaluator(
backend_mock,
queue_,
- metric=metric_lookup[D.info['task']],
+ metric=metric_lookup[D.info["task"]],
port=logging.handlers.DEFAULT_TCP_LOGGING_PORT,
additional_components=dict(),
)
@@ -75,22 +80,21 @@ def test_datasets(self):
rval = read_queue(evaluator.queue)
self.assertEqual(len(rval), 1)
self.assertEqual(len(rval[0]), 3)
- self.assertTrue(np.isfinite(rval[0]['loss']))
+ self.assertTrue(np.isfinite(rval[0]["loss"]))
class FunctionsTest(unittest.TestCase):
def setUp(self):
self.queue = multiprocessing.Queue()
self.configuration = get_configuration_space(
- {'task': MULTICLASS_CLASSIFICATION,
- 'is_sparse': False}).get_default_configuration()
+ {"task": MULTICLASS_CLASSIFICATION, "is_sparse": False}
+ ).get_default_configuration()
self.data = get_multiclass_classification_datamanager()
- self.tmp_dir = os.path.join(os.path.dirname(__file__),
- '.test_cv_functions')
+ self.tmp_dir = os.path.join(os.path.dirname(__file__), ".test_cv_functions")
self.backend = unittest.mock.Mock(spec=Backend)
self.backend.temporary_directory = tempfile.gettempdir()
self.backend.load_datamanager.return_value = self.data
- self.dataset_name = json.dumps({'task_id': 'test'})
+ self.dataset_name = json.dumps({"task_id": "test"})
self.port = logging.handlers.DEFAULT_TCP_LOGGING_PORT
@@ -106,7 +110,8 @@ def test_eval_test(self):
backend=self.backend,
config=self.configuration,
metric=accuracy,
- seed=1, num_run=1,
+ seed=1,
+ num_run=1,
scoring_functions=None,
output_y_hat_optimization=False,
include=None,
@@ -118,9 +123,9 @@ def test_eval_test(self):
)
rval = read_queue(self.queue)
self.assertEqual(len(rval), 1)
- self.assertAlmostEqual(rval[0]['loss'], 0.040000000000000036)
- self.assertEqual(rval[0]['status'], StatusType.SUCCESS)
- self.assertNotIn('bac_metric', rval[0]['additional_run_info'])
+ self.assertAlmostEqual(rval[0]["loss"], 0.040000000000000036)
+ self.assertEqual(rval[0]["status"], StatusType.SUCCESS)
+ self.assertNotIn("bac_metric", rval[0]["additional_run_info"])
def test_eval_test_all_loss_functions(self):
eval_t(
@@ -128,7 +133,8 @@ def test_eval_test_all_loss_functions(self):
backend=self.backend,
config=self.configuration,
metric=accuracy,
- seed=1, num_run=1,
+ seed=1,
+ num_run=1,
scoring_functions=SCORER_LIST,
output_y_hat_optimization=False,
include=None,
@@ -142,25 +148,30 @@ def test_eval_test_all_loss_functions(self):
self.assertEqual(len(rval), 1)
# Note: All metrics here should be minimized
- fixture = {'accuracy': 0.040000000000000036,
- 'balanced_accuracy': 0.02777777777777779,
- 'f1_macro': 0.0341005967604433,
- 'f1_micro': 0.040000000000000036,
- 'f1_weighted': 0.039693094629155934,
- 'log_loss': 0.13966929787769913,
- 'precision_macro': 0.03703703703703709,
- 'precision_micro': 0.040000000000000036,
- 'precision_weighted': 0.03555555555555556,
- 'recall_macro': 0.02777777777777779,
- 'recall_micro': 0.040000000000000036,
- 'recall_weighted': 0.040000000000000036,
- 'num_run': -1}
-
- additional_run_info = rval[0]['additional_run_info']
+ fixture = {
+ "accuracy": 0.040000000000000036,
+ "balanced_accuracy": 0.02777777777777779,
+ "f1_macro": 0.0341005967604433,
+ "f1_micro": 0.040000000000000036,
+ "f1_weighted": 0.039693094629155934,
+ "log_loss": 0.13966929787769913,
+ "precision_macro": 0.03703703703703709,
+ "precision_micro": 0.040000000000000036,
+ "precision_weighted": 0.03555555555555556,
+ "recall_macro": 0.02777777777777779,
+ "recall_micro": 0.040000000000000036,
+ "recall_weighted": 0.040000000000000036,
+ "num_run": -1,
+ }
+
+ additional_run_info = rval[0]["additional_run_info"]
for key, value in fixture.items():
self.assertAlmostEqual(additional_run_info[key], fixture[key], msg=key)
- self.assertEqual(len(additional_run_info), len(fixture) + 1,
- msg=sorted(additional_run_info.items()))
- self.assertIn('duration', additional_run_info)
- self.assertAlmostEqual(rval[0]['loss'], 0.040000000000000036)
- self.assertEqual(rval[0]['status'], StatusType.SUCCESS)
+ self.assertEqual(
+ len(additional_run_info),
+ len(fixture) + 1,
+ msg=sorted(additional_run_info.items()),
+ )
+ self.assertIn("duration", additional_run_info)
+ self.assertAlmostEqual(rval[0]["loss"], 0.040000000000000036)
+ self.assertEqual(rval[0]["status"], StatusType.SUCCESS)
diff --git a/test/test_evaluation/test_train_evaluator.py b/test/test_evaluation/test_train_evaluator.py
index 28bddcdb09..92e3cfcc10 100644
--- a/test/test_evaluation/test_train_evaluator.py
+++ b/test/test_evaluation/test_train_evaluator.py
@@ -1,49 +1,73 @@
import copy
import json
import logging.handlers
-import queue
import multiprocessing
import os
-import tempfile
+import queue
import shutil
import sys
+import tempfile
import unittest
import unittest.mock
-from ConfigSpace import Configuration
import numpy as np
-from sklearn.model_selection import GroupKFold, GroupShuffleSplit, \
- KFold, LeaveOneGroupOut, LeavePGroupsOut, LeaveOneOut, LeavePOut, \
- PredefinedSplit, RepeatedKFold, RepeatedStratifiedKFold, ShuffleSplit, \
- StratifiedKFold, StratifiedShuffleSplit, TimeSeriesSplit
import sklearn.model_selection
+from ConfigSpace import Configuration
+from sklearn.model_selection import (
+ GroupKFold,
+ GroupShuffleSplit,
+ KFold,
+ LeaveOneGroupOut,
+ LeaveOneOut,
+ LeavePGroupsOut,
+ LeavePOut,
+ PredefinedSplit,
+ RepeatedKFold,
+ RepeatedStratifiedKFold,
+ ShuffleSplit,
+ StratifiedKFold,
+ StratifiedShuffleSplit,
+ TimeSeriesSplit,
+)
from smac.tae import StatusType, TAEAbortException
-from autosklearn.automl_common.common.utils import backend
-
import autosklearn.evaluation.splitter
+from autosklearn.automl_common.common.utils import backend
+from autosklearn.constants import (
+ BINARY_CLASSIFICATION,
+ MULTICLASS_CLASSIFICATION,
+ MULTILABEL_CLASSIFICATION,
+ MULTIOUTPUT_REGRESSION,
+ REGRESSION,
+)
from autosklearn.data.abstract_data_manager import AbstractDataManager
+from autosklearn.evaluation.train_evaluator import (
+ TrainEvaluator,
+ eval_cv,
+ eval_holdout,
+ eval_iterative_holdout,
+ eval_partial_cv,
+ subsample_indices,
+)
from autosklearn.evaluation.util import read_queue
-from autosklearn.evaluation.train_evaluator import TrainEvaluator, \
- eval_holdout, eval_iterative_holdout, eval_cv, eval_partial_cv, subsample_indices
+from autosklearn.metrics import accuracy, f1_macro, r2
from autosklearn.util.pipeline import get_configuration_space
-from autosklearn.constants import BINARY_CLASSIFICATION, \
- MULTILABEL_CLASSIFICATION,\
- MULTICLASS_CLASSIFICATION,\
- REGRESSION,\
- MULTIOUTPUT_REGRESSION
-from autosklearn.metrics import accuracy, r2, f1_macro
this_directory = os.path.dirname(__file__)
sys.path.append(this_directory)
-from evaluation_util import get_regression_datamanager, BaseEvaluatorTest, \
- get_binary_classification_datamanager, get_dataset_getters, \
- get_multiclass_classification_datamanager, SCORER_LIST # noqa (E402: module level import not at top of file)
+from evaluation_util import ( # noqa (E402: module level import not at top of file)
+ SCORER_LIST,
+ BaseEvaluatorTest,
+ get_binary_classification_datamanager,
+ get_dataset_getters,
+ get_multiclass_classification_datamanager,
+ get_regression_datamanager,
+)
class Dummy(object):
def __init__(self):
- self.name = 'dummy'
+ self.name = "dummy"
class TestTrainEvaluator(BaseEvaluatorTest, unittest.TestCase):
@@ -54,13 +78,15 @@ def setUp(self):
Creates a backend mock
"""
tmp_dir_name = self.id()
- self.ev_path = os.path.join(this_directory, '.tmp_evaluations', tmp_dir_name)
+ self.ev_path = os.path.join(this_directory, ".tmp_evaluations", tmp_dir_name)
if os.path.exists(self.ev_path):
shutil.rmtree(self.ev_path)
os.makedirs(self.ev_path, exist_ok=False)
dummy_model_files = [os.path.join(self.ev_path, str(n)) for n in range(100)]
dummy_pred_files = [os.path.join(self.ev_path, str(n)) for n in range(100, 200)]
- dummy_cv_model_files = [os.path.join(self.ev_path, str(n)) for n in range(200, 300)]
+ dummy_cv_model_files = [
+ os.path.join(self.ev_path, str(n)) for n in range(200, 300)
+ ]
backend_mock = unittest.mock.Mock()
backend_mock.temporary_directory = tempfile.gettempdir()
backend_mock.get_model_dir.return_value = self.ev_path
@@ -70,7 +96,7 @@ def setUp(self):
backend_mock.get_prediction_output_path.side_effect = dummy_pred_files
self.backend_mock = backend_mock
- self.tmp_dir = os.path.join(self.ev_path, 'tmp_dir')
+ self.tmp_dir = os.path.join(self.ev_path, "tmp_dir")
self.port = logging.handlers.DEFAULT_TCP_LOGGING_PORT
@@ -78,15 +104,18 @@ def tearDown(self):
if os.path.exists(self.ev_path):
shutil.rmtree(self.ev_path)
- @unittest.mock.patch('autosklearn.pipeline.classification.SimpleClassificationPipeline')
+ @unittest.mock.patch(
+ "autosklearn.pipeline.classification.SimpleClassificationPipeline"
+ )
def test_holdout(self, pipeline_mock):
# Binary iris, contains 69 train samples, 25 validation samples,
# 6 test samples
D = get_binary_classification_datamanager()
- D.name = 'test'
+ D.name = "test"
- pipeline_mock.predict_proba.side_effect = \
- lambda X, batch_size=None: np.tile([0.6, 0.4], (len(X), 1))
+ pipeline_mock.predict_proba.side_effect = lambda X, batch_size=None: np.tile(
+ [0.6, 0.4], (len(X), 1)
+ )
pipeline_mock.side_effect = lambda **kwargs: pipeline_mock
pipeline_mock.get_additional_run_info.return_value = None
pipeline_mock.get_max_iter.return_value = 1
@@ -96,21 +125,23 @@ def test_holdout(self, pipeline_mock):
backend_api = backend.create(
temporary_directory=self.tmp_dir,
output_directory=None,
- prefix="auto-sklearn"
+ prefix="auto-sklearn",
)
backend_api.load_datamanager = lambda: D
queue_ = multiprocessing.Queue()
- evaluator = TrainEvaluator(backend_api, queue_,
- configuration=configuration,
- resampling_strategy='holdout',
- resampling_strategy_args={'train_size': 0.66},
- scoring_functions=None,
- output_y_hat_optimization=True,
- metric=accuracy,
- port=self.port,
- additional_components=dict(),
- )
+ evaluator = TrainEvaluator(
+ backend_api,
+ queue_,
+ configuration=configuration,
+ resampling_strategy="holdout",
+ resampling_strategy_args={"train_size": 0.66},
+ scoring_functions=None,
+ output_y_hat_optimization=True,
+ metric=accuracy,
+ port=self.port,
+ additional_components=dict(),
+ )
evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output)
evaluator.file_output.return_value = (None, {})
@@ -118,7 +149,7 @@ def test_holdout(self, pipeline_mock):
rval = read_queue(evaluator.queue)
self.assertEqual(len(rval), 1)
- result = rval[0]['loss']
+ result = rval[0]["loss"]
self.assertEqual(len(rval[0]), 3)
self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1)
@@ -129,17 +160,21 @@ def test_holdout(self, pipeline_mock):
self.assertEqual(pipeline_mock.predict_proba.call_count, 4)
self.assertEqual(evaluator.file_output.call_count, 1)
self.assertEqual(evaluator.file_output.call_args[0][0].shape[0], 24)
- self.assertEqual(evaluator.file_output.call_args[0][1].shape[0],
- D.data['Y_valid'].shape[0])
- self.assertEqual(evaluator.file_output.call_args[0][2].shape[0],
- D.data['Y_test'].shape[0])
+ self.assertEqual(
+ evaluator.file_output.call_args[0][1].shape[0], D.data["Y_valid"].shape[0]
+ )
+ self.assertEqual(
+ evaluator.file_output.call_args[0][2].shape[0], D.data["Y_test"].shape[0]
+ )
self.assertEqual(evaluator.model.fit.call_count, 1)
- @unittest.mock.patch('autosklearn.pipeline.classification.SimpleClassificationPipeline')
+ @unittest.mock.patch(
+ "autosklearn.pipeline.classification.SimpleClassificationPipeline"
+ )
def test_iterative_holdout(self, pipeline_mock):
# Regular fitting
D = get_binary_classification_datamanager()
- D.name = 'test'
+ D.name = "test"
class SideEffect(object):
def __init__(self):
@@ -152,55 +187,100 @@ def configuration_fully_fitted(self):
# final call to iterative fit
return self.fully_fitted_call_count > 18
- Xt_fixture = 'Xt_fixture'
+ Xt_fixture = "Xt_fixture"
pipeline_mock.estimator_supports_iterative_fit.return_value = True
- pipeline_mock.configuration_fully_fitted.side_effect = \
+ pipeline_mock.configuration_fully_fitted.side_effect = (
SideEffect().configuration_fully_fitted
+ )
pipeline_mock.fit_transformer.return_value = Xt_fixture, {}
- pipeline_mock.predict_proba.side_effect = \
- lambda X, batch_size=None: np.tile([0.6, 0.4], (len(X), 1))
+ pipeline_mock.predict_proba.side_effect = lambda X, batch_size=None: np.tile(
+ [0.6, 0.4], (len(X), 1)
+ )
pipeline_mock.get_additional_run_info.return_value = None
pipeline_mock.side_effect = lambda **kwargs: pipeline_mock
pipeline_mock.get_max_iter.return_value = 512
- pipeline_mock.get_current_iter.side_effect = (2, 4, 8, 16, 32, 64, 128, 256, 512)
+ pipeline_mock.get_current_iter.side_effect = (
+ 2,
+ 4,
+ 8,
+ 16,
+ 32,
+ 64,
+ 128,
+ 256,
+ 512,
+ )
configuration = unittest.mock.Mock(spec=Configuration)
backend_api = backend.create(
temporary_directory=self.tmp_dir,
output_directory=None,
- prefix="auto-sklearn"
+ prefix="auto-sklearn",
)
backend_api.load_datamanager = lambda: D
queue_ = multiprocessing.Queue()
- evaluator = TrainEvaluator(backend_api, queue_,
- port=self.port,
- configuration=configuration,
- resampling_strategy='holdout',
- scoring_functions=None,
- output_y_hat_optimization=True,
- metric=accuracy,
- budget=0.0,
- additional_components=dict(),)
+ evaluator = TrainEvaluator(
+ backend_api,
+ queue_,
+ port=self.port,
+ configuration=configuration,
+ resampling_strategy="holdout",
+ scoring_functions=None,
+ output_y_hat_optimization=True,
+ metric=accuracy,
+ budget=0.0,
+ additional_components=dict(),
+ )
evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output)
evaluator.file_output.return_value = (None, {})
class LossSideEffect(object):
def __init__(self):
- self.losses = [1.0, 1.0, 1.0, 1.0,
- 0.9, 0.9, 0.9, 0.9,
- 0.8, 0.8, 0.8, 0.8,
- 0.7, 0.7, 0.7, 0.7,
- 0.6, 0.6, 0.6, 0.6,
- 0.5, 0.5, 0.5, 0.5,
- 0.4, 0.4, 0.4, 0.4,
- 0.3, 0.3, 0.3, 0.3,
- 0.2, 0.2, 0.2, 0.2]
+ self.losses = [
+ 1.0,
+ 1.0,
+ 1.0,
+ 1.0,
+ 0.9,
+ 0.9,
+ 0.9,
+ 0.9,
+ 0.8,
+ 0.8,
+ 0.8,
+ 0.8,
+ 0.7,
+ 0.7,
+ 0.7,
+ 0.7,
+ 0.6,
+ 0.6,
+ 0.6,
+ 0.6,
+ 0.5,
+ 0.5,
+ 0.5,
+ 0.5,
+ 0.4,
+ 0.4,
+ 0.4,
+ 0.4,
+ 0.3,
+ 0.3,
+ 0.3,
+ 0.3,
+ 0.2,
+ 0.2,
+ 0.2,
+ 0.2,
+ ]
self.iteration = 0
def side_effect(self, *args, **kwargs):
self.iteration += 1
return self.losses[self.iteration - 1]
+
evaluator._loss = unittest.mock.Mock()
evaluator._loss.side_effect = LossSideEffect().side_effect
@@ -209,38 +289,42 @@ def side_effect(self, *args, **kwargs):
for i in range(1, 10):
rval = evaluator.queue.get(timeout=1)
- result = rval['loss']
+ result = rval["loss"]
self.assertAlmostEqual(result, 1.0 - (0.1 * (i - 1)))
if i < 9:
- self.assertEqual(rval['status'], StatusType.DONOTADVANCE)
+ self.assertEqual(rval["status"], StatusType.DONOTADVANCE)
self.assertEqual(len(rval), 3)
else:
- self.assertEqual(rval['status'], StatusType.SUCCESS)
+ self.assertEqual(rval["status"], StatusType.SUCCESS)
self.assertEqual(len(rval), 4)
self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1)
self.assertEqual(pipeline_mock.iterative_fit.call_count, 9)
self.assertEqual(
- [cal[1]['n_iter'] for cal in pipeline_mock.iterative_fit.call_args_list],
- [2, 2, 4, 8, 16, 32, 64, 128, 256]
+ [cal[1]["n_iter"] for cal in pipeline_mock.iterative_fit.call_args_list],
+ [2, 2, 4, 8, 16, 32, 64, 128, 256],
)
# 36 calls: predictions for the train, holdout, validation and test set
# across the nine iterations of fitting
self.assertEqual(evaluator.model.predict_proba.call_count, 36)
# 1/3 of 69
self.assertEqual(evaluator.file_output.call_args[0][0].shape[0], 23)
- self.assertEqual(evaluator.file_output.call_args[0][1].shape[0],
- D.data['Y_valid'].shape[0])
- self.assertEqual(evaluator.file_output.call_args[0][2].shape[0],
- D.data['Y_test'].shape[0])
+ self.assertEqual(
+ evaluator.file_output.call_args[0][1].shape[0], D.data["Y_valid"].shape[0]
+ )
+ self.assertEqual(
+ evaluator.file_output.call_args[0][2].shape[0], D.data["Y_test"].shape[0]
+ )
self.assertEqual(evaluator.file_output.call_count, 9)
self.assertEqual(evaluator.model.fit.call_count, 0)
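# --- Editor's sketch (illustration only, not part of the diff) --------------
# The tests above stub predict_proba with np.tile so that every sample gets
# the same fake class probabilities. A minimal, self-contained version of
# that mock pattern (the names used here are placeholders):
import unittest.mock
import numpy as np

pipeline = unittest.mock.Mock()
pipeline.predict_proba.side_effect = lambda X, batch_size=None: np.tile(
    [0.6, 0.4], (len(X), 1)
)

probs = pipeline.predict_proba(np.zeros((3, 5)))
assert probs.shape == (3, 2)               # one [0.6, 0.4] row per sample
assert np.allclose(probs.sum(axis=1), 1.0)
# ----------------------------------------------------------------------------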
- @unittest.mock.patch('autosklearn.pipeline.classification.SimpleClassificationPipeline')
+ @unittest.mock.patch(
+ "autosklearn.pipeline.classification.SimpleClassificationPipeline"
+ )
def test_iterative_holdout_interuption(self, pipeline_mock):
# Regular fitting
D = get_binary_classification_datamanager()
- D.name = 'test'
+ D.name = "test"
class SideEffect(object):
def __init__(self):
@@ -252,61 +336,93 @@ def configuration_fully_fitted(self):
# if we need to add a special indicator to show that this is the
# final call to iterative fit
if self.fully_fitted_call_count == 5:
- raise ValueError('fixture')
+ raise ValueError("fixture")
return self.fully_fitted_call_count > 10
- Xt_fixture = 'Xt_fixture'
+ Xt_fixture = "Xt_fixture"
pipeline_mock.estimator_supports_iterative_fit.return_value = True
- pipeline_mock.configuration_fully_fitted.side_effect = \
+ pipeline_mock.configuration_fully_fitted.side_effect = (
SideEffect().configuration_fully_fitted
+ )
pipeline_mock.fit_transformer.return_value = Xt_fixture, {}
- pipeline_mock.predict_proba.side_effect = \
- lambda X, batch_size=None: np.tile([0.6, 0.4], (len(X), 1))
+ pipeline_mock.predict_proba.side_effect = lambda X, batch_size=None: np.tile(
+ [0.6, 0.4], (len(X), 1)
+ )
pipeline_mock.side_effect = lambda **kwargs: pipeline_mock
pipeline_mock.get_additional_run_info.return_value = None
pipeline_mock.get_max_iter.return_value = 512
- pipeline_mock.get_current_iter.side_effect = (2, 4, 8, 16, 32, 64, 128, 256, 512)
+ pipeline_mock.get_current_iter.side_effect = (
+ 2,
+ 4,
+ 8,
+ 16,
+ 32,
+ 64,
+ 128,
+ 256,
+ 512,
+ )
configuration = unittest.mock.Mock(spec=Configuration)
backend_api = backend.create(
temporary_directory=self.tmp_dir,
output_directory=None,
- prefix="auto-sklearn"
+ prefix="auto-sklearn",
)
backend_api.load_datamanager = lambda: D
queue_ = multiprocessing.Queue()
- evaluator = TrainEvaluator(backend_api, queue_,
- port=self.port,
- configuration=configuration,
- resampling_strategy='holdout-iterative-fit',
- scoring_functions=None,
- output_y_hat_optimization=True,
- metric=accuracy,
- budget=0.0,
- additional_components=dict(),
- )
+ evaluator = TrainEvaluator(
+ backend_api,
+ queue_,
+ port=self.port,
+ configuration=configuration,
+ resampling_strategy="holdout-iterative-fit",
+ scoring_functions=None,
+ output_y_hat_optimization=True,
+ metric=accuracy,
+ budget=0.0,
+ additional_components=dict(),
+ )
evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output)
evaluator.file_output.return_value = (None, {})
class LossSideEffect(object):
def __init__(self):
- self.losses = [0.8, 0.8, 0.8, 0.8,
- 0.6, 0.6, 0.6, 0.6,
- 0.4, 0.4, 0.4, 0.4,
- 0.2, 0.2, 0.2, 0.2,
- 0.0, 0.0, 0.0, 0.0]
+ self.losses = [
+ 0.8,
+ 0.8,
+ 0.8,
+ 0.8,
+ 0.6,
+ 0.6,
+ 0.6,
+ 0.6,
+ 0.4,
+ 0.4,
+ 0.4,
+ 0.4,
+ 0.2,
+ 0.2,
+ 0.2,
+ 0.2,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ ]
self.iteration = 0
def side_effect(self, *args, **kwargs):
self.iteration += 1
return self.losses[self.iteration - 1]
+
evaluator._loss = unittest.mock.Mock()
evaluator._loss.side_effect = LossSideEffect().side_effect
self.assertRaisesRegex(
ValueError,
- 'fixture',
+ "fixture",
evaluator.fit_predict_and_loss,
iterative=True,
)
@@ -314,7 +430,7 @@ def side_effect(self, *args, **kwargs):
for i in range(1, 3):
rval = evaluator.queue.get(timeout=1)
- self.assertAlmostEqual(rval['loss'], 1.0 - (0.2 * i))
+ self.assertAlmostEqual(rval["loss"], 1.0 - (0.2 * i))
self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1)
self.assertEqual(pipeline_mock.iterative_fit.call_count, 2)
@@ -322,24 +438,29 @@ def side_effect(self, *args, **kwargs):
# and a total of two calls each because of two iterations of fitting
self.assertEqual(evaluator.model.predict_proba.call_count, 8)
self.assertEqual(evaluator.file_output.call_args[0][0].shape[0], 23)
- self.assertEqual(evaluator.file_output.call_args[0][1].shape[0],
- D.data['Y_valid'].shape[0])
- self.assertEqual(evaluator.file_output.call_args[0][2].shape[0],
- D.data['Y_test'].shape[0])
+ self.assertEqual(
+ evaluator.file_output.call_args[0][1].shape[0], D.data["Y_valid"].shape[0]
+ )
+ self.assertEqual(
+ evaluator.file_output.call_args[0][2].shape[0], D.data["Y_test"].shape[0]
+ )
self.assertEqual(evaluator.file_output.call_count, 2)
self.assertEqual(evaluator.model.fit.call_count, 0)
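# --- Editor's sketch (illustration only, not part of the diff) --------------
# The interruption test drives configuration_fully_fitted through a stateful
# callable and raises ValueError("fixture") on a chosen call to abort the
# iterative fit. A hedged, standalone analogue of that mock pattern:
import unittest.mock

state = {"calls": 0}

def fully_fitted():
    state["calls"] += 1
    if state["calls"] == 5:
        raise ValueError("fixture")       # simulated interruption on call 5
    return state["calls"] > 10

mock_pipeline = unittest.mock.Mock()
mock_pipeline.configuration_fully_fitted.side_effect = fully_fitted
mock_pipeline.get_current_iter.side_effect = (2, 4, 8, 16)   # successive values

assert mock_pipeline.get_current_iter() == 2                 # first value
assert mock_pipeline.configuration_fully_fitted() is False   # call 1 of 10+
# ----------------------------------------------------------------------------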
- @unittest.mock.patch('autosklearn.pipeline.classification.SimpleClassificationPipeline')
+ @unittest.mock.patch(
+ "autosklearn.pipeline.classification.SimpleClassificationPipeline"
+ )
def test_iterative_holdout_not_iterative(self, pipeline_mock):
# Regular fitting
D = get_binary_classification_datamanager()
- D.name = 'test'
+ D.name = "test"
- Xt_fixture = 'Xt_fixture'
+ Xt_fixture = "Xt_fixture"
pipeline_mock.estimator_supports_iterative_fit.return_value = False
pipeline_mock.fit_transformer.return_value = Xt_fixture, {}
- pipeline_mock.predict_proba.side_effect = \
- lambda X, batch_size=None: np.tile([0.6, 0.4], (len(X), 1))
+ pipeline_mock.predict_proba.side_effect = lambda X, batch_size=None: np.tile(
+ [0.6, 0.4], (len(X), 1)
+ )
pipeline_mock.side_effect = lambda **kwargs: pipeline_mock
pipeline_mock.get_additional_run_info.return_value = None
@@ -347,20 +468,22 @@ def test_iterative_holdout_not_iterative(self, pipeline_mock):
backend_api = backend.create(
temporary_directory=self.tmp_dir,
output_directory=None,
- prefix="auto-sklearn"
+ prefix="auto-sklearn",
)
backend_api.load_datamanager = lambda: D
queue_ = multiprocessing.Queue()
- evaluator = TrainEvaluator(backend_api, queue_,
- port=self.port,
- configuration=configuration,
- resampling_strategy='holdout-iterative-fit',
- scoring_functions=None,
- output_y_hat_optimization=True,
- metric=accuracy,
- additional_components=dict(),
- )
+ evaluator = TrainEvaluator(
+ backend_api,
+ queue_,
+ port=self.port,
+ configuration=configuration,
+ resampling_strategy="holdout-iterative-fit",
+ scoring_functions=None,
+ output_y_hat_optimization=True,
+ metric=accuracy,
+ additional_components=dict(),
+ )
evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output)
evaluator.file_output.return_value = (None, {})
@@ -368,26 +491,31 @@ def test_iterative_holdout_not_iterative(self, pipeline_mock):
self.assertEqual(evaluator.file_output.call_count, 1)
rval = evaluator.queue.get(timeout=1)
- self.assertAlmostEqual(rval['loss'], 0.47826086956521741)
+ self.assertAlmostEqual(rval["loss"], 0.47826086956521741)
self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1)
self.assertEqual(pipeline_mock.iterative_fit.call_count, 0)
# four calls for train, opt, valid and test
self.assertEqual(evaluator.model.predict_proba.call_count, 4)
self.assertEqual(evaluator.file_output.call_args[0][0].shape[0], 23)
- self.assertEqual(evaluator.file_output.call_args[0][1].shape[0],
- D.data['Y_valid'].shape[0])
- self.assertEqual(evaluator.file_output.call_args[0][2].shape[0],
- D.data['Y_test'].shape[0])
+ self.assertEqual(
+ evaluator.file_output.call_args[0][1].shape[0], D.data["Y_valid"].shape[0]
+ )
+ self.assertEqual(
+ evaluator.file_output.call_args[0][2].shape[0], D.data["Y_test"].shape[0]
+ )
self.assertEqual(evaluator.file_output.call_count, 1)
self.assertEqual(evaluator.model.fit.call_count, 1)
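# --- Editor's sketch (illustration only, not part of the diff) --------------
# When the stub reports estimator_supports_iterative_fit() == False, the
# assertions above expect exactly one plain fit() call and no iterative_fit()
# calls. The branch being exercised, mirrored with a bare Mock:
import unittest.mock

model = unittest.mock.Mock()
model.estimator_supports_iterative_fit.return_value = False

if model.estimator_supports_iterative_fit():
    model.iterative_fit(None, None, n_iter=2)
else:
    model.fit(None, None)

assert model.fit.call_count == 1
assert model.iterative_fit.call_count == 0
# ----------------------------------------------------------------------------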
- @unittest.mock.patch('autosklearn.pipeline.classification.SimpleClassificationPipeline')
+ @unittest.mock.patch(
+ "autosklearn.pipeline.classification.SimpleClassificationPipeline"
+ )
def test_cv(self, pipeline_mock):
D = get_binary_classification_datamanager()
- pipeline_mock.predict_proba.side_effect = \
- lambda X, batch_size=None: np.tile([0.6, 0.4], (len(X), 1))
+ pipeline_mock.predict_proba.side_effect = lambda X, batch_size=None: np.tile(
+ [0.6, 0.4], (len(X), 1)
+ )
pipeline_mock.side_effect = lambda **kwargs: pipeline_mock
pipeline_mock.get_additional_run_info.return_value = None
@@ -395,21 +523,23 @@ def test_cv(self, pipeline_mock):
backend_api = backend.create(
temporary_directory=self.tmp_dir,
output_directory=None,
- prefix="auto-sklearn"
+ prefix="auto-sklearn",
)
backend_api.load_datamanager = lambda: D
queue_ = multiprocessing.Queue()
- evaluator = TrainEvaluator(backend_api, queue_,
- port=self.port,
- configuration=configuration,
- resampling_strategy='cv',
- resampling_strategy_args={'folds': 5},
- scoring_functions=None,
- output_y_hat_optimization=True,
- metric=accuracy,
- additional_components=dict(),
- )
+ evaluator = TrainEvaluator(
+ backend_api,
+ queue_,
+ port=self.port,
+ configuration=configuration,
+ resampling_strategy="cv",
+ resampling_strategy_args={"folds": 5},
+ scoring_functions=None,
+ output_y_hat_optimization=True,
+ metric=accuracy,
+ additional_components=dict(),
+ )
evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output)
evaluator.file_output.return_value = (None, {})
@@ -417,7 +547,7 @@ def test_cv(self, pipeline_mock):
rval = read_queue(evaluator.queue)
self.assertEqual(len(rval), 1)
- result = rval[0]['loss']
+ result = rval[0]["loss"]
self.assertEqual(len(rval[0]), 3)
self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1)
@@ -427,49 +557,57 @@ def test_cv(self, pipeline_mock):
# Twenty calls because of the training, holdout, validation and
# test sets (4 sets x 5 folds = 20)
self.assertEqual(pipeline_mock.predict_proba.call_count, 20)
- self.assertEqual(evaluator.file_output.call_args[0][0].shape[0],
- D.data['Y_train'].shape[0])
- self.assertEqual(evaluator.file_output.call_args[0][1].shape[0],
- D.data['Y_valid'].shape[0])
- self.assertEqual(evaluator.file_output.call_args[0][2].shape[0],
- D.data['Y_test'].shape[0])
+ self.assertEqual(
+ evaluator.file_output.call_args[0][0].shape[0], D.data["Y_train"].shape[0]
+ )
+ self.assertEqual(
+ evaluator.file_output.call_args[0][1].shape[0], D.data["Y_valid"].shape[0]
+ )
+ self.assertEqual(
+ evaluator.file_output.call_args[0][2].shape[0], D.data["Y_test"].shape[0]
+ )
# The model prior to fitting is saved; this cannot be tested directly
# because of the way the mock module is used. Instead, we test whether
# the if block in which the model assignment is done is accessed
self.assertTrue(evaluator._added_empty_model)
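# --- Editor's sketch (illustration only, not part of the diff) --------------
# Every test above collects results from a multiprocessing.Queue with a
# timeout and then asserts that the queue is empty. The pattern in isolation:
import multiprocessing
import queue

q = multiprocessing.Queue()
q.put({"loss": 0.5, "status": "SUCCESS"})

rval = q.get(timeout=1)
assert rval["loss"] == 0.5
try:
    q.get(timeout=0.1)                    # the queue should now be empty
except queue.Empty:
    pass
# ----------------------------------------------------------------------------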
- @unittest.mock.patch('autosklearn.pipeline.classification.SimpleClassificationPipeline')
+ @unittest.mock.patch(
+ "autosklearn.pipeline.classification.SimpleClassificationPipeline"
+ )
def test_partial_cv(self, pipeline_mock):
D = get_binary_classification_datamanager()
- pipeline_mock.predict_proba.side_effect = \
- lambda X, batch_size=None: np.tile([0.6, 0.4], (len(X), 1))
+ pipeline_mock.predict_proba.side_effect = lambda X, batch_size=None: np.tile(
+ [0.6, 0.4], (len(X), 1)
+ )
pipeline_mock.side_effect = lambda **kwargs: pipeline_mock
pipeline_mock.get_additional_run_info.return_value = None
pipeline_mock.get_max_iter.return_value = 1
pipeline_mock.get_current_iter.return_value = 1
D = get_binary_classification_datamanager()
- D.name = 'test'
+ D.name = "test"
configuration = unittest.mock.Mock(spec=Configuration)
backend_api = backend.create(
temporary_directory=self.tmp_dir,
output_directory=None,
- prefix="auto-sklearn"
+ prefix="auto-sklearn",
)
backend_api.load_datamanager = lambda: D
queue_ = multiprocessing.Queue()
- evaluator = TrainEvaluator(backend_api, queue_,
- port=self.port,
- configuration=configuration,
- resampling_strategy='partial-cv',
- resampling_strategy_args={'folds': 5},
- scoring_functions=None,
- output_y_hat_optimization=True,
- metric=accuracy,
- additional_components=dict(),
- )
+ evaluator = TrainEvaluator(
+ backend_api,
+ queue_,
+ port=self.port,
+ configuration=configuration,
+ resampling_strategy="partial-cv",
+ resampling_strategy_args={"folds": 5},
+ scoring_functions=None,
+ output_y_hat_optimization=True,
+ metric=accuracy,
+ additional_components=dict(),
+ )
evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output)
evaluator.file_output.return_value = (None, {})
@@ -480,19 +618,21 @@ def test_partial_cv(self, pipeline_mock):
self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1)
self.assertEqual(evaluator.file_output.call_count, 0)
- self.assertEqual(rval['loss'], 0.5)
+ self.assertEqual(rval["loss"], 0.5)
self.assertEqual(pipeline_mock.fit.call_count, 1)
self.assertEqual(pipeline_mock.predict_proba.call_count, 4)
# The model prior to fitting is saved; this cannot be tested directly
# because of the way the mock module is used. Instead, we test whether
# the if block in which the model assignment is done is accessed
- self.assertTrue(hasattr(evaluator, 'model'))
+ self.assertTrue(hasattr(evaluator, "model"))
- @unittest.mock.patch('autosklearn.pipeline.classification.SimpleClassificationPipeline')
+ @unittest.mock.patch(
+ "autosklearn.pipeline.classification.SimpleClassificationPipeline"
+ )
def test_iterative_partial_cv(self, pipeline_mock):
# Regular fitting
D = get_binary_classification_datamanager()
- D.name = 'test'
+ D.name = "test"
class SideEffect(object):
def __init__(self):
@@ -505,57 +645,101 @@ def configuration_fully_fitted(self):
# final call to iterative fit
return self.fully_fitted_call_count > 18
- Xt_fixture = 'Xt_fixture'
+ Xt_fixture = "Xt_fixture"
pipeline_mock.estimator_supports_iterative_fit.return_value = True
- pipeline_mock.configuration_fully_fitted.side_effect = \
+ pipeline_mock.configuration_fully_fitted.side_effect = (
SideEffect().configuration_fully_fitted
+ )
pipeline_mock.fit_transformer.return_value = Xt_fixture, {}
- pipeline_mock.predict_proba.side_effect = \
- lambda X, batch_size=None: np.tile([0.6, 0.4], (len(X), 1))
+ pipeline_mock.predict_proba.side_effect = lambda X, batch_size=None: np.tile(
+ [0.6, 0.4], (len(X), 1)
+ )
pipeline_mock.get_additional_run_info.return_value = None
pipeline_mock.side_effect = lambda **kwargs: pipeline_mock
pipeline_mock.get_max_iter.return_value = 512
- pipeline_mock.get_current_iter.side_effect = (2, 4, 8, 16, 32, 64, 128, 256, 512)
+ pipeline_mock.get_current_iter.side_effect = (
+ 2,
+ 4,
+ 8,
+ 16,
+ 32,
+ 64,
+ 128,
+ 256,
+ 512,
+ )
configuration = unittest.mock.Mock(spec=Configuration)
backend_api = backend.create(
temporary_directory=self.tmp_dir,
output_directory=None,
- prefix="auto-sklearn"
+ prefix="auto-sklearn",
)
backend_api.load_datamanager = lambda: D
queue_ = multiprocessing.Queue()
- evaluator = TrainEvaluator(backend_api, queue_,
- port=self.port,
- configuration=configuration,
- resampling_strategy='partial-cv-iterative-fit',
- resampling_strategy_args={'folds': 5},
- scoring_functions=None,
- output_y_hat_optimization=True,
- metric=accuracy,
- budget=0.0,
- additional_components=dict(),
- )
+ evaluator = TrainEvaluator(
+ backend_api,
+ queue_,
+ port=self.port,
+ configuration=configuration,
+ resampling_strategy="partial-cv-iterative-fit",
+ resampling_strategy_args={"folds": 5},
+ scoring_functions=None,
+ output_y_hat_optimization=True,
+ metric=accuracy,
+ budget=0.0,
+ additional_components=dict(),
+ )
evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output)
evaluator.file_output.return_value = (None, {})
class LossSideEffect(object):
def __init__(self):
- self.losses = [1.0, 1.0, 1.0, 1.0,
- 0.9, 0.9, 0.9, 0.9,
- 0.8, 0.8, 0.8, 0.8,
- 0.7, 0.7, 0.7, 0.7,
- 0.6, 0.6, 0.6, 0.6,
- 0.5, 0.5, 0.5, 0.5,
- 0.4, 0.4, 0.4, 0.4,
- 0.3, 0.3, 0.3, 0.3,
- 0.2, 0.2, 0.2, 0.2]
+ self.losses = [
+ 1.0,
+ 1.0,
+ 1.0,
+ 1.0,
+ 0.9,
+ 0.9,
+ 0.9,
+ 0.9,
+ 0.8,
+ 0.8,
+ 0.8,
+ 0.8,
+ 0.7,
+ 0.7,
+ 0.7,
+ 0.7,
+ 0.6,
+ 0.6,
+ 0.6,
+ 0.6,
+ 0.5,
+ 0.5,
+ 0.5,
+ 0.5,
+ 0.4,
+ 0.4,
+ 0.4,
+ 0.4,
+ 0.3,
+ 0.3,
+ 0.3,
+ 0.3,
+ 0.2,
+ 0.2,
+ 0.2,
+ 0.2,
+ ]
self.iteration = 0
def side_effect(self, *args, **kwargs):
self.iteration += 1
return self.losses[self.iteration - 1]
+
evaluator._loss = unittest.mock.Mock()
evaluator._loss.side_effect = LossSideEffect().side_effect
@@ -565,118 +749,145 @@ def side_effect(self, *args, **kwargs):
for i in range(1, 10):
rval = evaluator.queue.get(timeout=1)
- self.assertAlmostEqual(rval['loss'], 1.0 - (0.1 * (i - 1)))
+ self.assertAlmostEqual(rval["loss"], 1.0 - (0.1 * (i - 1)))
if i < 9:
- self.assertEqual(rval['status'], StatusType.DONOTADVANCE)
+ self.assertEqual(rval["status"], StatusType.DONOTADVANCE)
else:
- self.assertEqual(rval['status'], StatusType.SUCCESS)
+ self.assertEqual(rval["status"], StatusType.SUCCESS)
self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1)
self.assertEqual(pipeline_mock.iterative_fit.call_count, 9)
self.assertEqual(
- [cal[1]['n_iter'] for cal in pipeline_mock.iterative_fit.call_args_list],
- [2, 2, 4, 8, 16, 32, 64, 128, 256]
+ [cal[1]["n_iter"] for cal in pipeline_mock.iterative_fit.call_args_list],
+ [2, 2, 4, 8, 16, 32, 64, 128, 256],
)
# after the nine iterations of fitting the evaluator still exposes the
# (mocked) model, and iterative_fit was called once per iteration
- self.assertTrue(hasattr(evaluator, 'model'))
+ self.assertTrue(hasattr(evaluator, "model"))
self.assertEqual(pipeline_mock.iterative_fit.call_count, 9)
# 36 calls because of the train, holdout, validation and test set
# and a total of nine iterations of fitting
self.assertEqual(pipeline_mock.predict_proba.call_count, 36)
- @unittest.mock.patch.object(TrainEvaluator, '_loss')
- @unittest.mock.patch.object(TrainEvaluator, '_get_model')
+ @unittest.mock.patch.object(TrainEvaluator, "_loss")
+ @unittest.mock.patch.object(TrainEvaluator, "_get_model")
def test_file_output(self, loss_mock, model_mock):
D = get_regression_datamanager()
- D.name = 'test'
+ D.name = "test"
self.backend_mock.load_datamanager.return_value = D
configuration = unittest.mock.Mock(spec=Configuration)
queue_ = multiprocessing.Queue()
loss_mock.return_value = None
model_mock.return_value = None
- evaluator = TrainEvaluator(self.backend_mock, queue=queue_,
- port=self.port,
- configuration=configuration,
- resampling_strategy='cv',
- resampling_strategy_args={'folds': 5},
- scoring_functions=SCORER_LIST,
- output_y_hat_optimization=True,
- metric=accuracy,
- additional_components=dict(),)
+ evaluator = TrainEvaluator(
+ self.backend_mock,
+ queue=queue_,
+ port=self.port,
+ configuration=configuration,
+ resampling_strategy="cv",
+ resampling_strategy_args={"folds": 5},
+ scoring_functions=SCORER_LIST,
+ output_y_hat_optimization=True,
+ metric=accuracy,
+ additional_components=dict(),
+ )
self.backend_mock.get_model_dir.return_value = True
- evaluator.model = 'model'
- evaluator.Y_optimization = D.data['Y_train']
+ evaluator.model = "model"
+ evaluator.Y_optimization = D.data["Y_train"]
rval = evaluator.file_output(
- D.data['Y_train'],
- D.data['Y_valid'],
- D.data['Y_test'],
+ D.data["Y_train"],
+ D.data["Y_valid"],
+ D.data["Y_test"],
)
self.assertEqual(rval, (None, {}))
self.assertEqual(self.backend_mock.save_targets_ensemble.call_count, 1)
self.assertEqual(self.backend_mock.save_numrun_to_dir.call_count, 1)
- self.assertEqual(self.backend_mock.save_numrun_to_dir.call_args_list[-1][1].keys(),
- {'seed', 'idx', 'budget', 'model', 'cv_model',
- 'ensemble_predictions', 'valid_predictions', 'test_predictions'})
- self.assertIsNotNone(self.backend_mock.save_numrun_to_dir.call_args_list[-1][1]['model'])
- self.assertIsNone(self.backend_mock.save_numrun_to_dir.call_args_list[-1][1]['cv_model'])
+ self.assertEqual(
+ self.backend_mock.save_numrun_to_dir.call_args_list[-1][1].keys(),
+ {
+ "seed",
+ "idx",
+ "budget",
+ "model",
+ "cv_model",
+ "ensemble_predictions",
+ "valid_predictions",
+ "test_predictions",
+ },
+ )
+ self.assertIsNotNone(
+ self.backend_mock.save_numrun_to_dir.call_args_list[-1][1]["model"]
+ )
+ self.assertIsNone(
+ self.backend_mock.save_numrun_to_dir.call_args_list[-1][1]["cv_model"]
+ )
- evaluator.models = ['model2', 'model2']
+ evaluator.models = ["model2", "model2"]
rval = evaluator.file_output(
- D.data['Y_train'],
- D.data['Y_valid'],
- D.data['Y_test'],
+ D.data["Y_train"],
+ D.data["Y_valid"],
+ D.data["Y_test"],
)
self.assertEqual(rval, (None, {}))
self.assertEqual(self.backend_mock.save_targets_ensemble.call_count, 2)
self.assertEqual(self.backend_mock.save_numrun_to_dir.call_count, 2)
- self.assertEqual(self.backend_mock.save_numrun_to_dir.call_args_list[-1][1].keys(),
- {'seed', 'idx', 'budget', 'model', 'cv_model',
- 'ensemble_predictions', 'valid_predictions', 'test_predictions'})
- self.assertIsNotNone(self.backend_mock.save_numrun_to_dir.call_args_list[-1][1]['model'])
- self.assertIsNotNone(self.backend_mock.save_numrun_to_dir.call_args_list[-1][1]['cv_model'])
+ self.assertEqual(
+ self.backend_mock.save_numrun_to_dir.call_args_list[-1][1].keys(),
+ {
+ "seed",
+ "idx",
+ "budget",
+ "model",
+ "cv_model",
+ "ensemble_predictions",
+ "valid_predictions",
+ "test_predictions",
+ },
+ )
+ self.assertIsNotNone(
+ self.backend_mock.save_numrun_to_dir.call_args_list[-1][1]["model"]
+ )
+ self.assertIsNotNone(
+ self.backend_mock.save_numrun_to_dir.call_args_list[-1][1]["cv_model"]
+ )
# Check that the predictions contain no NaNs, i.e. that the models do not
# predict nonsense for unseen data
- D.data['Y_valid'][0] = np.NaN
+ D.data["Y_valid"][0] = np.NaN
rval = evaluator.file_output(
- D.data['Y_train'],
- D.data['Y_valid'],
- D.data['Y_test'],
+ D.data["Y_train"],
+ D.data["Y_valid"],
+ D.data["Y_test"],
)
self.assertEqual(
rval,
(
1.0,
- {
- 'error':
- 'Model predictions for validation set contains NaNs.'
- },
- )
+ {"error": "Model predictions for validation set contains NaNs."},
+ ),
)
- D.data['Y_train'][0] = np.NaN
+ D.data["Y_train"][0] = np.NaN
rval = evaluator.file_output(
- D.data['Y_train'],
- D.data['Y_valid'],
- D.data['Y_test'],
+ D.data["Y_train"],
+ D.data["Y_valid"],
+ D.data["Y_test"],
)
self.assertEqual(
rval,
(
1.0,
- {
- 'error':
- 'Model predictions for optimization set contains NaNs.'
- },
- )
+ {"error": "Model predictions for optimization set contains NaNs."},
+ ),
)
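# --- Editor's sketch (illustration only, not part of the diff) --------------
# file_output refuses to persist predictions that contain NaNs, returning a
# loss of 1.0 plus an error dict instead. A rough approximation of the guard
# the test exercises (a sketch, not the library's exact implementation):
import numpy as np

def check_predictions(set_name, predictions):
    if predictions is not None and not np.all(np.isfinite(predictions)):
        return 1.0, {
            "error": "Model predictions for %s set contains NaNs." % set_name
        }
    return None, {}

ok = np.array([[0.1, 0.9], [0.8, 0.2]])
bad = np.array([[np.nan, 0.9]])
assert check_predictions("validation", ok) == (None, {})
assert check_predictions("validation", bad)[0] == 1.0
# ----------------------------------------------------------------------------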
- @unittest.mock.patch('autosklearn.automl_common.common.utils.backend.Backend')
- @unittest.mock.patch('autosklearn.pipeline.classification.SimpleClassificationPipeline')
+ @unittest.mock.patch("autosklearn.automl_common.common.utils.backend.Backend")
+ @unittest.mock.patch(
+ "autosklearn.pipeline.classification.SimpleClassificationPipeline"
+ )
def test_subsample_indices_classification(self, mock, backend_mock):
configuration = unittest.mock.Mock(spec=Configuration)
@@ -684,26 +895,32 @@ def test_subsample_indices_classification(self, mock, backend_mock):
D = get_binary_classification_datamanager()
backend_mock.load_datamanager.return_value = D
backend_mock.temporary_directory = tempfile.gettempdir()
- evaluator = TrainEvaluator(backend_mock, queue_,
- port=self.port,
- configuration=configuration,
- resampling_strategy='cv',
- resampling_strategy_args={'folds': 10},
- metric=accuracy,
- additional_components=dict(),
- )
+ evaluator = TrainEvaluator(
+ backend_mock,
+ queue_,
+ port=self.port,
+ configuration=configuration,
+ resampling_strategy="cv",
+ resampling_strategy_args={"folds": 10},
+ metric=accuracy,
+ additional_components=dict(),
+ )
train_indices = np.arange(69, dtype=int)
train_indices1 = subsample_indices(
- train_indices, 0.1449, evaluator.task_type, evaluator.Y_train)
+ train_indices, 0.1449, evaluator.task_type, evaluator.Y_train
+ )
evaluator.subsample = 20
train_indices2 = subsample_indices(
- train_indices, 0.2898, evaluator.task_type, evaluator.Y_train)
+ train_indices, 0.2898, evaluator.task_type, evaluator.Y_train
+ )
evaluator.subsample = 30
train_indices3 = subsample_indices(
- train_indices, 0.4347, evaluator.task_type, evaluator.Y_train)
+ train_indices, 0.4347, evaluator.task_type, evaluator.Y_train
+ )
evaluator.subsample = 67
train_indices4 = subsample_indices(
- train_indices, 0.971, evaluator.task_type, evaluator.Y_train)
+ train_indices, 0.971, evaluator.task_type, evaluator.Y_train
+ )
# Common cases
for ti in train_indices1:
self.assertIn(ti, train_indices2)
@@ -714,62 +931,98 @@ def test_subsample_indices_classification(self, mock, backend_mock):
# Corner cases
self.assertRaisesRegex(
- ValueError, 'train_size=0.0 should be either positive and smaller than the '
- r'number of samples 69 or a float in the \(0, 1\) range',
- subsample_indices, train_indices, 0.0, evaluator.task_type, evaluator.Y_train)
+ ValueError,
+ "train_size=0.0 should be either positive and smaller than the "
+ r"number of samples 69 or a float in the \(0, 1\) range",
+ subsample_indices,
+ train_indices,
+ 0.0,
+ evaluator.task_type,
+ evaluator.Y_train,
+ )
# With equal or greater it should return a non-shuffled array of indices
train_indices5 = subsample_indices(
- train_indices, 1.0, evaluator.task_type, evaluator.Y_train)
+ train_indices, 1.0, evaluator.task_type, evaluator.Y_train
+ )
self.assertTrue(np.all(train_indices5 == train_indices))
evaluator.subsample = 68
self.assertRaisesRegex(
- ValueError, 'The test_size = 1 should be greater or equal to the number of '
- 'classes = 2', subsample_indices, train_indices, 0.9999, evaluator.task_type,
- evaluator.Y_train)
+ ValueError,
+ "The test_size = 1 should be greater or equal to the number of "
+ "classes = 2",
+ subsample_indices,
+ train_indices,
+ 0.9999,
+ evaluator.task_type,
+ evaluator.Y_train,
+ )
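# --- Editor's sketch (illustration only, not part of the diff) --------------
# subsample_indices keeps a stratified fraction of the training indices, and
# the error messages asserted above are scikit-learn's own split-validation
# errors. A rough standalone analogue (an assumed mechanism, not the exact
# auto-sklearn implementation) using train_test_split:
import numpy as np
from sklearn.model_selection import train_test_split

indices = np.arange(69)
y = np.array([0, 1] * 34 + [0])           # 69 binary labels

subset, _ = train_test_split(
    indices, train_size=0.2898, stratify=y[indices], random_state=1
)
assert len(subset) < len(indices)         # roughly 29% of the 69 samples
assert set(y[subset]) == {0, 1}           # stratification keeps both classes
# ----------------------------------------------------------------------------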
- @unittest.mock.patch('autosklearn.automl_common.common.utils.backend.Backend')
- @unittest.mock.patch('autosklearn.pipeline.classification.SimpleClassificationPipeline')
+ @unittest.mock.patch("autosklearn.automl_common.common.utils.backend.Backend")
+ @unittest.mock.patch(
+ "autosklearn.pipeline.classification.SimpleClassificationPipeline"
+ )
def test_subsample_indices_regression(self, mock, backend_mock):
configuration = unittest.mock.Mock(spec=Configuration)
queue_ = multiprocessing.Queue()
backend_mock.temporary_directory = tempfile.gettempdir()
- evaluator = TrainEvaluator(backend_mock, queue_,
- port=self.port,
- configuration=configuration,
- resampling_strategy='cv',
- resampling_strategy_args={'folds': 10},
- metric=accuracy,
- additional_components=dict(),
- )
+ evaluator = TrainEvaluator(
+ backend_mock,
+ queue_,
+ port=self.port,
+ configuration=configuration,
+ resampling_strategy="cv",
+ resampling_strategy_args={"folds": 10},
+ metric=accuracy,
+ additional_components=dict(),
+ )
train_indices = np.arange(69, dtype=int)
- train_indices3 = subsample_indices(train_indices, subsample=0.4347,
- task_type=evaluator.task_type,
- Y_train=evaluator.Y_train)
+ train_indices3 = subsample_indices(
+ train_indices,
+ subsample=0.4347,
+ task_type=evaluator.task_type,
+ Y_train=evaluator.Y_train,
+ )
evaluator.subsample = 67
- train_indices4 = subsample_indices(train_indices, subsample=0.4347,
- task_type=evaluator.task_type,
- Y_train=evaluator.Y_train)
+ train_indices4 = subsample_indices(
+ train_indices,
+ subsample=0.4347,
+ task_type=evaluator.task_type,
+ Y_train=evaluator.Y_train,
+ )
# Common cases
for ti in train_indices3:
self.assertIn(ti, train_indices4)
# Corner cases
self.assertRaisesRegex(
- ValueError, 'train_size=0.0 should be either positive and smaller than the '
- r'number of samples 69 or a float in the \(0, 1\) range',
- subsample_indices, train_indices, 0.0,
- evaluator.task_type, evaluator.Y_train)
+ ValueError,
+ "train_size=0.0 should be either positive and smaller than the "
+ r"number of samples 69 or a float in the \(0, 1\) range",
+ subsample_indices,
+ train_indices,
+ 0.0,
+ evaluator.task_type,
+ evaluator.Y_train,
+ )
self.assertRaisesRegex(
- ValueError, 'Subsample must not be larger than 1, but is 1.000100',
- subsample_indices, train_indices, 1.0001,
- evaluator.task_type, evaluator.Y_train)
+ ValueError,
+ "Subsample must not be larger than 1, but is 1.000100",
+ subsample_indices,
+ train_indices,
+ 1.0001,
+ evaluator.task_type,
+ evaluator.Y_train,
+ )
# With equal or greater it should return a non-shuffled array of indices
- train_indices6 = subsample_indices(train_indices, 1.0, evaluator.task_type,
- evaluator.Y_train)
+ train_indices6 = subsample_indices(
+ train_indices, 1.0, evaluator.task_type, evaluator.Y_train
+ )
np.testing.assert_allclose(train_indices6, train_indices)
- @unittest.mock.patch('autosklearn.pipeline.classification.SimpleClassificationPipeline')
+ @unittest.mock.patch(
+ "autosklearn.pipeline.classification.SimpleClassificationPipeline"
+ )
def test_predict_proba_binary_classification(self, mock):
D = get_binary_classification_datamanager()
self.backend_mock.load_datamanager.return_value = D
@@ -781,30 +1034,38 @@ def test_predict_proba_binary_classification(self, mock):
configuration = unittest.mock.Mock(spec=Configuration)
queue_ = multiprocessing.Queue()
- evaluator = TrainEvaluator(self.backend_mock, queue_,
- port=self.port,
- configuration=configuration,
- resampling_strategy='cv',
- resampling_strategy_args={'folds': 10},
- output_y_hat_optimization=False,
- metric=accuracy,
- additional_components=dict(),
- )
+ evaluator = TrainEvaluator(
+ self.backend_mock,
+ queue_,
+ port=self.port,
+ configuration=configuration,
+ resampling_strategy="cv",
+ resampling_strategy_args={"folds": 10},
+ output_y_hat_optimization=False,
+ metric=accuracy,
+ additional_components=dict(),
+ )
evaluator.fit_predict_and_loss()
Y_optimization_pred = self.backend_mock.save_numrun_to_dir.call_args_list[0][1][
- 'ensemble_predictions']
+ "ensemble_predictions"
+ ]
for i in range(7):
self.assertEqual(0.9, Y_optimization_pred[i][1])
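# --- Editor's sketch (illustration only, not part of the diff) --------------
# Several tests above pull the keyword arguments captured by backend mocks,
# e.g. save_numrun_to_dir, out of call_args_list. The access pattern on its
# own (argument values here are placeholders):
import unittest.mock

backend = unittest.mock.Mock()
backend.save_numrun_to_dir(
    seed=1, idx=2, budget=0.0, model="m", cv_model=None,
    ensemble_predictions=[[0.1, 0.9]], valid_predictions=None,
    test_predictions=None,
)

kwargs = backend.save_numrun_to_dir.call_args_list[-1][1]
assert kwargs["model"] == "m" and kwargs["cv_model"] is None
assert "ensemble_predictions" in kwargs
# ----------------------------------------------------------------------------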
- @unittest.mock.patch.object(TrainEvaluator, 'file_output')
- @unittest.mock.patch.object(TrainEvaluator, '_partial_fit_and_predict_standard')
- @unittest.mock.patch('autosklearn.automl_common.common.utils.backend.Backend')
- @unittest.mock.patch('autosklearn.pipeline.classification.SimpleClassificationPipeline')
+ @unittest.mock.patch.object(TrainEvaluator, "file_output")
+ @unittest.mock.patch.object(TrainEvaluator, "_partial_fit_and_predict_standard")
+ @unittest.mock.patch("autosklearn.automl_common.common.utils.backend.Backend")
+ @unittest.mock.patch(
+ "autosklearn.pipeline.classification.SimpleClassificationPipeline"
+ )
def test_fit_predict_and_loss_standard_additional_run_info(
- self, mock, backend_mock, _partial_fit_and_predict_mock,
- file_output_mock,
+ self,
+ mock,
+ backend_mock,
+ _partial_fit_and_predict_mock,
+ file_output_mock,
):
D = get_binary_classification_datamanager()
backend_mock.load_datamanager.return_value = D
@@ -815,7 +1076,7 @@ def test_fit_predict_and_loss_standard_additional_run_info(
np.array([[0.1, 0.9]] * 23),
np.array([[0.1, 0.9]] * 25),
np.array([[0.1, 0.9]] * 6),
- {'a': 5},
+ {"a": 5},
)
file_output_mock.return_value = (None, {})
@@ -823,10 +1084,11 @@ def test_fit_predict_and_loss_standard_additional_run_info(
queue_ = multiprocessing.Queue()
evaluator = TrainEvaluator(
- backend_mock, queue_,
+ backend_mock,
+ queue_,
port=self.port,
configuration=configuration,
- resampling_strategy='holdout',
+ resampling_strategy="holdout",
output_y_hat_optimization=False,
metric=accuracy,
additional_components=dict(),
@@ -840,8 +1102,8 @@ def test_fit_predict_and_loss_standard_additional_run_info(
rval = evaluator.fit_predict_and_loss(iterative=False)
self.assertIsNone(rval)
element = queue_.get()
- self.assertEqual(element['status'], StatusType.SUCCESS)
- self.assertEqual(element['additional_run_info']['a'], 5)
+ self.assertEqual(element["status"], StatusType.SUCCESS)
+ self.assertEqual(element["additional_run_info"]["a"], 5)
self.assertEqual(_partial_fit_and_predict_mock.call_count, 1)
class SideEffect(object):
@@ -856,7 +1118,7 @@ def __call__(self, *args, **kwargs):
np.array([[0.1, 0.9]] * 35),
np.array([[0.1, 0.9]] * 25),
np.array([[0.1, 0.9]] * 6),
- {'a': 5}
+ {"a": 5},
)
else:
return (
@@ -864,15 +1126,17 @@ def __call__(self, *args, **kwargs):
np.array([[0.1, 0.9]] * 34),
np.array([[0.1, 0.9]] * 25),
np.array([[0.1, 0.9]] * 6),
- {'a': 5}
+ {"a": 5},
)
+
_partial_fit_and_predict_mock.side_effect = SideEffect()
evaluator = TrainEvaluator(
- backend_mock, queue_,
+ backend_mock,
+ queue_,
port=self.port,
configuration=configuration,
- resampling_strategy='cv',
- resampling_strategy_args={'folds': 2},
+ resampling_strategy="cv",
+ resampling_strategy_args={"folds": 2},
output_y_hat_optimization=False,
metric=accuracy,
additional_components=dict(),
@@ -885,28 +1149,34 @@ def __call__(self, *args, **kwargs):
self.assertRaisesRegex(
TAEAbortException,
- 'Found additional run info "{\'a\': 5}" in fold 1, '
- 'but cannot handle additional run info if fold >= 1.',
+ "Found additional run info \"{'a': 5}\" in fold 1, "
+ "but cannot handle additional run info if fold >= 1.",
evaluator.fit_predict_and_loss,
- iterative=False
+ iterative=False,
)
- @unittest.mock.patch.object(TrainEvaluator, '_loss')
- @unittest.mock.patch.object(TrainEvaluator, 'finish_up')
- @unittest.mock.patch('autosklearn.automl_common.common.utils.backend.Backend')
- @unittest.mock.patch('autosklearn.pipeline.classification.SimpleClassificationPipeline')
+ @unittest.mock.patch.object(TrainEvaluator, "_loss")
+ @unittest.mock.patch.object(TrainEvaluator, "finish_up")
+ @unittest.mock.patch("autosklearn.automl_common.common.utils.backend.Backend")
+ @unittest.mock.patch(
+ "autosklearn.pipeline.classification.SimpleClassificationPipeline"
+ )
def test_fit_predict_and_loss_iterative_additional_run_info(
- self, mock, backend_mock, finish_up_mock, loss_mock,
+ self,
+ mock,
+ backend_mock,
+ finish_up_mock,
+ loss_mock,
):
-
class Counter:
counter = 0
def __call__(self):
self.counter += 1
return False if self.counter <= 1 else True
+
mock.estimator_supports_iterative_fit.return_value = True
- mock.fit_transformer.return_value = ('Xt', {})
+ mock.fit_transformer.return_value = ("Xt", {})
mock.configuration_fully_fitted.side_effect = Counter()
mock.get_current_iter.side_effect = Counter()
mock.get_max_iter.return_value = 1
@@ -922,10 +1192,11 @@ def __call__(self):
queue_ = multiprocessing.Queue()
evaluator = TrainEvaluator(
- backend_mock, queue_,
+ backend_mock,
+ queue_,
port=self.port,
configuration=configuration,
- resampling_strategy='holdout',
+ resampling_strategy="holdout",
output_y_hat_optimization=False,
metric=accuracy,
budget=0.0,
@@ -938,17 +1209,23 @@ def __call__(self):
rval = evaluator.fit_predict_and_loss(iterative=True)
self.assertIsNone(rval)
self.assertEqual(finish_up_mock.call_count, 1)
- self.assertEqual(finish_up_mock.call_args[1]['additional_run_info'], 14678)
-
- @unittest.mock.patch.object(TrainEvaluator, '_loss')
- @unittest.mock.patch.object(TrainEvaluator, 'finish_up')
- @unittest.mock.patch('autosklearn.automl_common.common.utils.backend.Backend')
- @unittest.mock.patch('autosklearn.pipeline.classification.SimpleClassificationPipeline')
+ self.assertEqual(finish_up_mock.call_args[1]["additional_run_info"], 14678)
+
+ @unittest.mock.patch.object(TrainEvaluator, "_loss")
+ @unittest.mock.patch.object(TrainEvaluator, "finish_up")
+ @unittest.mock.patch("autosklearn.automl_common.common.utils.backend.Backend")
+ @unittest.mock.patch(
+ "autosklearn.pipeline.classification.SimpleClassificationPipeline"
+ )
def test_fit_predict_and_loss_iterative_noniterativemodel_additional_run_info(
- self, mock, backend_mock, finish_up_mock, loss_mock,
+ self,
+ mock,
+ backend_mock,
+ finish_up_mock,
+ loss_mock,
):
mock.estimator_supports_iterative_fit.return_value = False
- mock.fit_transformer.return_value = ('Xt', {})
+ mock.fit_transformer.return_value = ("Xt", {})
mock.get_additional_run_info.return_value = 14678
loss_mock.return_value = 0.5
@@ -961,10 +1238,11 @@ def test_fit_predict_and_loss_iterative_noniterativemodel_additional_run_info(
queue_ = multiprocessing.Queue()
evaluator = TrainEvaluator(
- backend_mock, queue_,
+ backend_mock,
+ queue_,
port=self.port,
configuration=configuration,
- resampling_strategy='holdout',
+ resampling_strategy="holdout",
output_y_hat_optimization=False,
metric=accuracy,
additional_components=dict(),
@@ -977,14 +1255,20 @@ def test_fit_predict_and_loss_iterative_noniterativemodel_additional_run_info(
rval = evaluator.fit_predict_and_loss(iterative=True)
self.assertIsNone(rval)
self.assertEqual(finish_up_mock.call_count, 1)
- self.assertEqual(finish_up_mock.call_args[1]['additional_run_info'], 14678)
-
- @unittest.mock.patch.object(TrainEvaluator, '_loss')
- @unittest.mock.patch.object(TrainEvaluator, 'finish_up')
- @unittest.mock.patch('autosklearn.automl_common.common.utils.backend.Backend')
- @unittest.mock.patch('autosklearn.pipeline.classification.SimpleClassificationPipeline')
+ self.assertEqual(finish_up_mock.call_args[1]["additional_run_info"], 14678)
+
+ @unittest.mock.patch.object(TrainEvaluator, "_loss")
+ @unittest.mock.patch.object(TrainEvaluator, "finish_up")
+ @unittest.mock.patch("autosklearn.automl_common.common.utils.backend.Backend")
+ @unittest.mock.patch(
+ "autosklearn.pipeline.classification.SimpleClassificationPipeline"
+ )
def test_fit_predict_and_loss_budget_additional_run_info(
- self, mock, backend_mock, finish_up_mock, loss_mock,
+ self,
+ mock,
+ backend_mock,
+ finish_up_mock,
+ loss_mock,
):
class Counter:
counter = 0
@@ -992,12 +1276,13 @@ class Counter:
def __call__(self):
self.counter += 1
return False if self.counter <= 1 else True
+
mock.configuration_fully_fitted.side_effect = Counter()
mock.get_current_iter.side_effect = Counter()
mock.get_max_iter.return_value = 1
mock.estimator_supports_iterative_fit.return_value = True
- mock.fit_transformer.return_value = ('Xt', {})
- mock.get_additional_run_info.return_value = {'val': 14678}
+ mock.fit_transformer.return_value = ("Xt", {})
+ mock.get_additional_run_info.return_value = {"val": 14678}
mock.get_max_iter.return_value = 512
loss_mock.return_value = 0.5
@@ -1010,13 +1295,14 @@ def __call__(self):
queue_ = multiprocessing.Queue()
evaluator = TrainEvaluator(
- backend_mock, queue_,
+ backend_mock,
+ queue_,
port=self.port,
configuration=configuration,
- resampling_strategy='holdout',
+ resampling_strategy="holdout",
output_y_hat_optimization=False,
metric=accuracy,
- budget_type='iterations',
+ budget_type="iterations",
budget=50,
additional_components=dict(),
)
@@ -1028,18 +1314,26 @@ def __call__(self):
rval = evaluator.fit_predict_and_loss(iterative=False)
self.assertIsNone(rval)
self.assertEqual(finish_up_mock.call_count, 1)
- self.assertEqual(finish_up_mock.call_args[1]['additional_run_info'], {'val': 14678})
+ self.assertEqual(
+ finish_up_mock.call_args[1]["additional_run_info"], {"val": 14678}
+ )
- @unittest.mock.patch.object(TrainEvaluator, '_loss')
- @unittest.mock.patch.object(TrainEvaluator, 'finish_up')
- @unittest.mock.patch('autosklearn.automl_common.common.utils.backend.Backend')
- @unittest.mock.patch('autosklearn.pipeline.classification.SimpleClassificationPipeline')
+ @unittest.mock.patch.object(TrainEvaluator, "_loss")
+ @unittest.mock.patch.object(TrainEvaluator, "finish_up")
+ @unittest.mock.patch("autosklearn.automl_common.common.utils.backend.Backend")
+ @unittest.mock.patch(
+ "autosklearn.pipeline.classification.SimpleClassificationPipeline"
+ )
def test_fit_predict_and_loss_budget_2_additional_run_info(
- self, mock, backend_mock, finish_up_mock, loss_mock,
+ self,
+ mock,
+ backend_mock,
+ finish_up_mock,
+ loss_mock,
):
mock.estimator_supports_iterative_fit.return_value = False
- mock.fit_transformer.return_value = ('Xt', {})
- mock.get_additional_run_info.return_value = {'val': 14678}
+ mock.fit_transformer.return_value = ("Xt", {})
+ mock.get_additional_run_info.return_value = {"val": 14678}
loss_mock.return_value = 0.5
D = get_binary_classification_datamanager()
@@ -1051,13 +1345,14 @@ def test_fit_predict_and_loss_budget_2_additional_run_info(
queue_ = multiprocessing.Queue()
evaluator = TrainEvaluator(
- backend_mock, queue_,
+ backend_mock,
+ queue_,
port=self.port,
configuration=configuration,
- resampling_strategy='holdout',
+ resampling_strategy="holdout",
output_y_hat_optimization=False,
metric=accuracy,
- budget_type='subsample',
+ budget_type="subsample",
budget=50,
additional_components=dict(),
)
@@ -1069,7 +1364,9 @@ def test_fit_predict_and_loss_budget_2_additional_run_info(
rval = evaluator.fit_predict_and_loss(iterative=False)
self.assertIsNone(rval)
self.assertEqual(finish_up_mock.call_count, 1)
- self.assertEqual(finish_up_mock.call_args[1]['additional_run_info'], {'val': 14678})
+ self.assertEqual(
+ finish_up_mock.call_args[1]["additional_run_info"], {"val": 14678}
+ )
def test_get_results(self):
queue_ = multiprocessing.Queue()
@@ -1082,33 +1379,39 @@ def test_get_results(self):
def test_datasets(self):
for getter in get_dataset_getters():
- testname = '%s_%s' % (os.path.basename(__file__).
- replace('.pyc', '').replace('.py', ''),
- getter.__name__)
+ testname = "%s_%s" % (
+ os.path.basename(__file__).replace(".pyc", "").replace(".py", ""),
+ getter.__name__,
+ )
with self.subTest(testname):
D = getter()
D_ = copy.deepcopy(D)
- y = D.data['Y_train']
+ y = D.data["Y_train"]
if len(y.shape) == 2 and y.shape[1] == 1:
- D_.data['Y_train'] = y.flatten()
+ D_.data["Y_train"] = y.flatten()
self.backend_mock.load_datamanager.return_value = D_
queue_ = multiprocessing.Queue()
- metric_lookup = {MULTILABEL_CLASSIFICATION: f1_macro,
- BINARY_CLASSIFICATION: accuracy,
- MULTICLASS_CLASSIFICATION: accuracy,
- REGRESSION: r2}
- evaluator = TrainEvaluator(self.backend_mock, queue_,
- port=self.port,
- resampling_strategy='cv',
- resampling_strategy_args={'folds': 2},
- output_y_hat_optimization=False,
- metric=metric_lookup[D.info['task']],
- additional_components=dict(),)
+ metric_lookup = {
+ MULTILABEL_CLASSIFICATION: f1_macro,
+ BINARY_CLASSIFICATION: accuracy,
+ MULTICLASS_CLASSIFICATION: accuracy,
+ REGRESSION: r2,
+ }
+ evaluator = TrainEvaluator(
+ self.backend_mock,
+ queue_,
+ port=self.port,
+ resampling_strategy="cv",
+ resampling_strategy_args={"folds": 2},
+ output_y_hat_optimization=False,
+ metric=metric_lookup[D.info["task"]],
+ additional_components=dict(),
+ )
evaluator.fit_predict_and_loss()
rval = evaluator.queue.get(timeout=1)
- self.assertTrue(np.isfinite(rval['loss']))
+ self.assertTrue(np.isfinite(rval["loss"]))
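# --- Editor's sketch (illustration only, not part of the diff) --------------
# test_datasets loops over dataset getters inside self.subTest so that each
# dataset reports failures independently. The subTest idiom in a minimal,
# self-contained test case:
import unittest

class SubTestExample(unittest.TestCase):
    def test_many(self):
        for name in ("binary", "multiclass", "regression"):
            with self.subTest(name):
                self.assertGreater(len(name), 0)
# ----------------------------------------------------------------------------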
############################################################################
# Test obtaining a splitter object from scikit-learn
@@ -1122,147 +1425,142 @@ def test_get_splitter(self, te_mock):
# holdout, binary classification
evaluator = TrainEvaluator()
- evaluator.resampling_strategy = 'holdout'
+ evaluator.resampling_strategy = "holdout"
evaluator.resampling_strategy_args = {}
cv = evaluator.get_splitter(D)
- self.assertIsInstance(cv,
- sklearn.model_selection.StratifiedShuffleSplit)
+ self.assertIsInstance(cv, sklearn.model_selection.StratifiedShuffleSplit)
# holdout, binary classification, no shuffle
evaluator = TrainEvaluator()
- evaluator.resampling_strategy = 'holdout'
- evaluator.resampling_strategy_args = {'shuffle': False}
+ evaluator.resampling_strategy = "holdout"
+ evaluator.resampling_strategy_args = {"shuffle": False}
cv = evaluator.get_splitter(D)
- self.assertIsInstance(cv,
- sklearn.model_selection.PredefinedSplit)
+ self.assertIsInstance(cv, sklearn.model_selection.PredefinedSplit)
# holdout, binary classification, fallback to custom shuffle split
- D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1, 2])
+ D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1, 2])
evaluator = TrainEvaluator()
- evaluator.resampling_strategy = 'holdout'
+ evaluator.resampling_strategy = "holdout"
evaluator.resampling_strategy_args = {}
cv = evaluator.get_splitter(D)
- self.assertIsInstance(cv,
- autosklearn.evaluation.splitter.CustomStratifiedShuffleSplit)
+ self.assertIsInstance(
+ cv, autosklearn.evaluation.splitter.CustomStratifiedShuffleSplit
+ )
# cv, binary classification
- D.data['Y_train'] = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
+ D.data["Y_train"] = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
evaluator = TrainEvaluator()
- evaluator.resampling_strategy = 'cv'
- evaluator.resampling_strategy_args = {'folds': 5}
+ evaluator.resampling_strategy = "cv"
+ evaluator.resampling_strategy_args = {"folds": 5}
cv = evaluator.get_splitter(D)
- self.assertIsInstance(cv,
- sklearn.model_selection._split.StratifiedKFold)
+ self.assertIsInstance(cv, sklearn.model_selection._split.StratifiedKFold)
# cv, binary classification, shuffle is True
- D.data['Y_train'] = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
+ D.data["Y_train"] = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
evaluator = TrainEvaluator()
- evaluator.resampling_strategy = 'cv'
- evaluator.resampling_strategy_args = {'folds': 5}
+ evaluator.resampling_strategy = "cv"
+ evaluator.resampling_strategy_args = {"folds": 5}
cv = evaluator.get_splitter(D)
- self.assertIsInstance(cv,
- sklearn.model_selection._split.StratifiedKFold)
+ self.assertIsInstance(cv, sklearn.model_selection._split.StratifiedKFold)
self.assertTrue(cv.shuffle)
# cv, binary classification, shuffle is False
- D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1])
+ D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1])
evaluator = TrainEvaluator()
- evaluator.resampling_strategy = 'cv'
- evaluator.resampling_strategy_args = {'folds': 5, 'shuffle': False}
+ evaluator.resampling_strategy = "cv"
+ evaluator.resampling_strategy_args = {"folds": 5, "shuffle": False}
cv = evaluator.get_splitter(D)
- self.assertIsInstance(cv,
- sklearn.model_selection._split.KFold)
+ self.assertIsInstance(cv, sklearn.model_selection._split.KFold)
self.assertFalse(cv.shuffle)
# cv, binary classification, fallback to custom splitter
- D.data['Y_train'] = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2])
+ D.data["Y_train"] = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2])
evaluator = TrainEvaluator()
- evaluator.resampling_strategy = 'cv'
- evaluator.resampling_strategy_args = {'folds': 5}
+ evaluator.resampling_strategy = "cv"
+ evaluator.resampling_strategy_args = {"folds": 5}
cv = evaluator.get_splitter(D)
- self.assertIsInstance(cv,
- autosklearn.evaluation.splitter.CustomStratifiedKFold)
+ self.assertIsInstance(cv, autosklearn.evaluation.splitter.CustomStratifiedKFold)
# regression, shuffle split
- D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
- D.info['task'] = REGRESSION
+ D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
+ D.info["task"] = REGRESSION
evaluator = TrainEvaluator()
- evaluator.resampling_strategy = 'holdout'
+ evaluator.resampling_strategy = "holdout"
evaluator.resampling_strategy_args = {}
cv = evaluator.get_splitter(D)
- self.assertIsInstance(cv,
- sklearn.model_selection._split.ShuffleSplit)
+ self.assertIsInstance(cv, sklearn.model_selection._split.ShuffleSplit)
# regression, no shuffle
- D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
- D.info['task'] = REGRESSION
+ D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
+ D.info["task"] = REGRESSION
evaluator = TrainEvaluator()
- evaluator.resampling_strategy = 'holdout'
- evaluator.resampling_strategy_args = {'shuffle': False}
+ evaluator.resampling_strategy = "holdout"
+ evaluator.resampling_strategy_args = {"shuffle": False}
cv = evaluator.get_splitter(D)
- self.assertIsInstance(cv,
- sklearn.model_selection._split.PredefinedSplit)
+ self.assertIsInstance(cv, sklearn.model_selection._split.PredefinedSplit)
# regression cv, KFold
- D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
- D.info['task'] = REGRESSION
+ D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
+ D.info["task"] = REGRESSION
evaluator = TrainEvaluator()
- evaluator.resampling_strategy = 'cv'
- evaluator.resampling_strategy_args = {'folds': 5}
+ evaluator.resampling_strategy = "cv"
+ evaluator.resampling_strategy_args = {"folds": 5}
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, sklearn.model_selection._split.KFold)
self.assertTrue(cv.shuffle)
# regression cv, KFold, no shuffling
- D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
- D.info['task'] = REGRESSION
+ D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
+ D.info["task"] = REGRESSION
evaluator = TrainEvaluator()
- evaluator.resampling_strategy = 'cv'
- evaluator.resampling_strategy_args = {'folds': 5, 'shuffle': False}
+ evaluator.resampling_strategy = "cv"
+ evaluator.resampling_strategy_args = {"folds": 5, "shuffle": False}
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, sklearn.model_selection._split.KFold)
self.assertFalse(cv.shuffle)
# multioutput regression, shuffle split
- D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5],
- [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]])
- D.info['task'] = MULTIOUTPUT_REGRESSION
+ D.data["Y_train"] = np.array(
+ [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]
+ )
+ D.info["task"] = MULTIOUTPUT_REGRESSION
evaluator = TrainEvaluator()
- evaluator.resampling_strategy = 'holdout'
+ evaluator.resampling_strategy = "holdout"
evaluator.resampling_strategy_args = {}
cv = evaluator.get_splitter(D)
- self.assertIsInstance(cv,
- sklearn.model_selection._split.ShuffleSplit)
+ self.assertIsInstance(cv, sklearn.model_selection._split.ShuffleSplit)
# multioutput regression, no shuffle
- D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5],
- [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]])
- D.info['task'] = MULTIOUTPUT_REGRESSION
+ D.data["Y_train"] = np.array(
+ [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]
+ )
+ D.info["task"] = MULTIOUTPUT_REGRESSION
evaluator = TrainEvaluator()
- evaluator.resampling_strategy = 'holdout'
- evaluator.resampling_strategy_args = {'shuffle': False}
+ evaluator.resampling_strategy = "holdout"
+ evaluator.resampling_strategy_args = {"shuffle": False}
cv = evaluator.get_splitter(D)
- self.assertIsInstance(cv,
- sklearn.model_selection._split.PredefinedSplit)
+ self.assertIsInstance(cv, sklearn.model_selection._split.PredefinedSplit)
# multioutput regression cv, KFold
- D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5],
- [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]])
- D.info['task'] = MULTIOUTPUT_REGRESSION
+ D.data["Y_train"] = np.array(
+ [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]
+ )
+ D.info["task"] = MULTIOUTPUT_REGRESSION
evaluator = TrainEvaluator()
- evaluator.resampling_strategy = 'cv'
- evaluator.resampling_strategy_args = {'folds': 5}
+ evaluator.resampling_strategy = "cv"
+ evaluator.resampling_strategy_args = {"folds": 5}
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, sklearn.model_selection._split.KFold)
self.assertTrue(cv.shuffle)
# multioutput regression cv, KFold, no shuffling
- D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5],
- [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]])
- D.info['task'] = MULTIOUTPUT_REGRESSION
+ D.data["Y_train"] = np.array(
+ [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]
+ )
+ D.info["task"] = MULTIOUTPUT_REGRESSION
evaluator = TrainEvaluator()
- evaluator.resampling_strategy = 'cv'
- evaluator.resampling_strategy_args = {'folds': 5, 'shuffle': False}
+ evaluator.resampling_strategy = "cv"
+ evaluator.resampling_strategy_args = {"folds": 5, "shuffle": False}
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, sklearn.model_selection._split.KFold)
self.assertFalse(cv.shuffle)
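# --- Editor's sketch (illustration only, not part of the diff) --------------
# The assertions above pin down which scikit-learn splitter get_splitter is
# expected to return for each resampling strategy. A condensed dispatch table
# illustrating the tested behaviour (not the actual implementation; the 0.33
# test size is an assumption for this sketch):
from sklearn.model_selection import (
    KFold,
    ShuffleSplit,
    StratifiedKFold,
    StratifiedShuffleSplit,
)

def pick_splitter(strategy, is_classification, shuffle=True, folds=5):
    if strategy == "holdout":
        cls = StratifiedShuffleSplit if is_classification else ShuffleSplit
        return cls(n_splits=1, test_size=0.33)
    if strategy == "cv":
        cls = StratifiedKFold if is_classification else KFold
        return cls(n_splits=folds, shuffle=shuffle)
    raise ValueError(strategy)

assert isinstance(pick_splitter("holdout", True), StratifiedShuffleSplit)
assert isinstance(pick_splitter("cv", False, shuffle=False), KFold)
# ----------------------------------------------------------------------------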
@@ -1276,19 +1574,26 @@ def test_get_splitter_cv_object(self, te_mock):
D.feat_type = {}
# GroupKFold, classification with args
- D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1])
- D.data['X_train'] = np.array([0, 0, 0, 1, 1, 1])
+ D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1])
+ D.data["X_train"] = np.array([0, 0, 0, 1, 1, 1])
evaluator = TrainEvaluator()
evaluator.resampling_strategy = GroupKFold(n_splits=2)
- evaluator.resampling_strategy_args = {'groups': np.array([1, 1, 2, 1, 2, 2])}
+ evaluator.resampling_strategy_args = {"groups": np.array([1, 1, 2, 1, 2, 2])}
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, GroupKFold)
- self.assertEqual(cv.get_n_splits(groups=evaluator.resampling_strategy_args['groups']), 2)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ self.assertEqual(
+ cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 2
+ )
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# GroupKFold, classification no args
- D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1])
+ D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1])
evaluator = TrainEvaluator()
evaluator.resampling_strategy = GroupKFold(n_splits=2)
evaluator.resampling_strategy_args = None
@@ -1296,23 +1601,31 @@ def test_get_splitter_cv_object(self, te_mock):
ValueError,
"The 'groups' parameter should not be None",
evaluator.get_splitter,
- D)
+ D,
+ )
# GroupKFold, regression with args
- D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
- D.info['task'] = REGRESSION
+ D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
+ D.info["task"] = REGRESSION
evaluator = TrainEvaluator()
evaluator.resampling_strategy = GroupKFold(n_splits=2)
- evaluator.resampling_strategy_args = {'groups': np.array([1, 1, 2, 1, 2, 2])}
+ evaluator.resampling_strategy_args = {"groups": np.array([1, 1, 2, 1, 2, 2])}
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, GroupKFold)
- self.assertEqual(cv.get_n_splits(groups=evaluator.resampling_strategy_args['groups']), 2)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ self.assertEqual(
+ cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 2
+ )
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# GroupKFold, regression no args
- D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
- D.info['task'] = REGRESSION
+ D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
+ D.info["task"] = REGRESSION
evaluator = TrainEvaluator()
evaluator.resampling_strategy = GroupKFold(n_splits=2)
evaluator.resampling_strategy_args = None
@@ -1320,25 +1633,35 @@ def test_get_splitter_cv_object(self, te_mock):
ValueError,
"The 'groups' parameter should not be None",
evaluator.get_splitter,
- D)
+ D,
+ )
# GroupKFold, multi-output regression with args
- D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5],
- [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]])
- D.info['task'] = MULTIOUTPUT_REGRESSION
+ D.data["Y_train"] = np.array(
+ [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]
+ )
+ D.info["task"] = MULTIOUTPUT_REGRESSION
evaluator = TrainEvaluator()
evaluator.resampling_strategy = GroupKFold(n_splits=2)
- evaluator.resampling_strategy_args = {'groups': np.array([1, 1, 2, 1, 2, 2])}
+ evaluator.resampling_strategy_args = {"groups": np.array([1, 1, 2, 1, 2, 2])}
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, GroupKFold)
- self.assertEqual(cv.get_n_splits(groups=evaluator.resampling_strategy_args['groups']), 2)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ self.assertEqual(
+ cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 2
+ )
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# GroupKFold, multi-output regression no args
- D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5],
- [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]])
- D.info['task'] = MULTIOUTPUT_REGRESSION
+ D.data["Y_train"] = np.array(
+ [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]
+ )
+ D.info["task"] = MULTIOUTPUT_REGRESSION
evaluator = TrainEvaluator()
evaluator.resampling_strategy = GroupKFold(n_splits=2)
evaluator.resampling_strategy_args = None
@@ -1346,110 +1669,154 @@ def test_get_splitter_cv_object(self, te_mock):
ValueError,
"The 'groups' parameter should not be None",
evaluator.get_splitter,
- D)
+ D,
+ )
# KFold, classification with args
- D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1])
+ D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1])
evaluator = TrainEvaluator()
evaluator.resampling_strategy_args = None
evaluator.resampling_strategy = KFold(n_splits=4, shuffle=True, random_state=5)
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, KFold)
- self.assertEqual(cv.get_n_splits(
- groups=evaluator.resampling_strategy_args['groups']), 4)
+ self.assertEqual(
+ cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 4
+ )
self.assertTrue(cv.shuffle)
self.assertEqual(cv.random_state, 5)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# KFold, classification no args
- D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1])
+ D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1])
evaluator = TrainEvaluator()
evaluator.resampling_strategy_args = None
evaluator.resampling_strategy = KFold(n_splits=3)
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, KFold)
- self.assertEqual(cv.get_n_splits(
- groups=evaluator.resampling_strategy_args['groups']), 3)
+ self.assertEqual(
+ cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 3
+ )
self.assertFalse(cv.shuffle)
self.assertIsNone(cv.random_state)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# KFold, regression with args
- D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
- D.info['task'] = REGRESSION
+ D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
+ D.info["task"] = REGRESSION
evaluator = TrainEvaluator()
evaluator.resampling_strategy_args = None
evaluator.resampling_strategy = KFold(n_splits=4, shuffle=True, random_state=5)
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, KFold)
- self.assertEqual(cv.get_n_splits(
- groups=evaluator.resampling_strategy_args['groups']), 4)
+ self.assertEqual(
+ cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 4
+ )
self.assertTrue(cv.shuffle)
self.assertEqual(cv.random_state, 5)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# KFold, regression no args
- D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
- D.info['task'] = REGRESSION
+ D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
+ D.info["task"] = REGRESSION
evaluator = TrainEvaluator()
evaluator.resampling_strategy_args = None
evaluator.resampling_strategy = KFold(n_splits=3)
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, KFold)
- self.assertEqual(cv.get_n_splits(
- groups=evaluator.resampling_strategy_args['groups']), 3)
+ self.assertEqual(
+ cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 3
+ )
self.assertFalse(cv.shuffle)
self.assertIsNone(cv.random_state)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# KFold, multi-output regression with args
- D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5],
- [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]])
- D.info['task'] = MULTIOUTPUT_REGRESSION
+ D.data["Y_train"] = np.array(
+ [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]
+ )
+ D.info["task"] = MULTIOUTPUT_REGRESSION
evaluator = TrainEvaluator()
evaluator.resampling_strategy_args = None
evaluator.resampling_strategy = KFold(n_splits=4, shuffle=True, random_state=5)
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, KFold)
- self.assertEqual(cv.get_n_splits(
- groups=evaluator.resampling_strategy_args['groups']), 4)
+ self.assertEqual(
+ cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 4
+ )
self.assertTrue(cv.shuffle)
self.assertEqual(cv.random_state, 5)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# KFold, multi-output regression no args
- D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5],
- [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]])
- D.info['task'] = MULTIOUTPUT_REGRESSION
+ D.data["Y_train"] = np.array(
+ [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]
+ )
+ D.info["task"] = MULTIOUTPUT_REGRESSION
evaluator = TrainEvaluator()
evaluator.resampling_strategy_args = None
evaluator.resampling_strategy = KFold(n_splits=3)
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, KFold)
- self.assertEqual(cv.get_n_splits(
- groups=evaluator.resampling_strategy_args['groups']), 3)
+ self.assertEqual(
+ cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 3
+ )
self.assertFalse(cv.shuffle)
self.assertIsNone(cv.random_state)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# LeaveOneGroupOut, classification with args
- D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1])
+ D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1])
evaluator = TrainEvaluator()
evaluator.resampling_strategy = LeaveOneGroupOut()
- evaluator.resampling_strategy_args = {'groups': np.array([1, 1, 2, 1, 2, 2])}
+ evaluator.resampling_strategy_args = {"groups": np.array([1, 1, 2, 1, 2, 2])}
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, LeaveOneGroupOut)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# LeaveOneGroupOut, classification no args
- D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1])
+ D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1])
evaluator = TrainEvaluator()
evaluator.resampling_strategy = LeaveOneGroupOut()
evaluator.resampling_strategy_args = None
@@ -1457,22 +1824,28 @@ def test_get_splitter_cv_object(self, te_mock):
ValueError,
"The 'groups' parameter should not be None",
evaluator.get_splitter,
- D)
+ D,
+ )
# LeaveOneGroupOut, regression with args
- D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
- D.info['task'] = REGRESSION
+ D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
+ D.info["task"] = REGRESSION
evaluator = TrainEvaluator()
evaluator.resampling_strategy = LeaveOneGroupOut()
- evaluator.resampling_strategy_args = {'groups': np.array([1, 1, 2, 1, 2, 2])}
+ evaluator.resampling_strategy_args = {"groups": np.array([1, 1, 2, 1, 2, 2])}
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, LeaveOneGroupOut)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# LeaveOneGroupOut, regression no args
- D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
- D.info['task'] = REGRESSION
+ D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
+ D.info["task"] = REGRESSION
evaluator = TrainEvaluator()
evaluator.resampling_strategy = LeaveOneGroupOut()
evaluator.resampling_strategy_args = None
@@ -1480,24 +1853,32 @@ def test_get_splitter_cv_object(self, te_mock):
ValueError,
"The 'groups' parameter should not be None",
evaluator.get_splitter,
- D)
+ D,
+ )
# LeaveOneGroupOut, multi-output regression with args
- D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5],
- [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]])
- D.info['task'] = MULTIOUTPUT_REGRESSION
+ D.data["Y_train"] = np.array(
+ [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]
+ )
+ D.info["task"] = MULTIOUTPUT_REGRESSION
evaluator = TrainEvaluator()
evaluator.resampling_strategy = LeaveOneGroupOut()
- evaluator.resampling_strategy_args = {'groups': np.array([1, 1, 2, 1, 2, 2])}
+ evaluator.resampling_strategy_args = {"groups": np.array([1, 1, 2, 1, 2, 2])}
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, LeaveOneGroupOut)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# LeaveOneGroupOut, multi-output regression no args
- D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5],
- [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]])
- D.info['task'] = MULTIOUTPUT_REGRESSION
+ D.data["Y_train"] = np.array(
+ [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]
+ )
+ D.info["task"] = MULTIOUTPUT_REGRESSION
evaluator = TrainEvaluator()
evaluator.resampling_strategy = LeaveOneGroupOut()
evaluator.resampling_strategy_args = None
@@ -1505,21 +1886,27 @@ def test_get_splitter_cv_object(self, te_mock):
ValueError,
"The 'groups' parameter should not be None",
evaluator.get_splitter,
- D)
+ D,
+ )
# LeavePGroupsOut, classification with args
- D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1])
+ D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1])
evaluator = TrainEvaluator()
evaluator.resampling_strategy = LeavePGroupsOut(n_groups=1)
- evaluator.resampling_strategy_args = {'groups': np.array([1, 1, 2, 1, 2, 2])}
+ evaluator.resampling_strategy_args = {"groups": np.array([1, 1, 2, 1, 2, 2])}
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, LeavePGroupsOut)
self.assertEqual(cv.n_groups, 1)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# LeavePGroupsOut, classification no args
- D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1])
+ D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1])
evaluator = TrainEvaluator()
        evaluator.resampling_strategy = LeavePGroupsOut(n_groups=1)
evaluator.resampling_strategy_args = None
@@ -1527,23 +1914,29 @@ def test_get_splitter_cv_object(self, te_mock):
ValueError,
"The 'groups' parameter should not be None",
evaluator.get_splitter,
- D)
+ D,
+ )
# LeavePGroupsOut, regression with args
- D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
- D.info['task'] = REGRESSION
+ D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
+ D.info["task"] = REGRESSION
evaluator = TrainEvaluator()
evaluator.resampling_strategy = LeavePGroupsOut(n_groups=1)
- evaluator.resampling_strategy_args = {'groups': np.array([1, 1, 2, 1, 2, 2])}
+ evaluator.resampling_strategy_args = {"groups": np.array([1, 1, 2, 1, 2, 2])}
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, LeavePGroupsOut)
self.assertEqual(cv.n_groups, 1)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# LeavePGroupsOut, regression no args
- D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
- D.info['task'] = REGRESSION
+ D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
+ D.info["task"] = REGRESSION
evaluator = TrainEvaluator()
evaluator.resampling_strategy = LeavePGroupsOut(n_groups=1)
evaluator.resampling_strategy_args = None
@@ -1551,25 +1944,33 @@ def test_get_splitter_cv_object(self, te_mock):
ValueError,
"The 'groups' parameter should not be None",
evaluator.get_splitter,
- D)
+ D,
+ )
# LeavePGroupsOut, multi-output regression with args
- D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5],
- [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]])
- D.info['task'] = MULTIOUTPUT_REGRESSION
+ D.data["Y_train"] = np.array(
+ [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]
+ )
+ D.info["task"] = MULTIOUTPUT_REGRESSION
evaluator = TrainEvaluator()
evaluator.resampling_strategy = LeavePGroupsOut(n_groups=1)
- evaluator.resampling_strategy_args = {'groups': np.array([1, 1, 2, 1, 2, 2])}
+ evaluator.resampling_strategy_args = {"groups": np.array([1, 1, 2, 1, 2, 2])}
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, LeavePGroupsOut)
self.assertEqual(cv.n_groups, 1)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# LeavePGroupsOut, multi-output regression no args
- D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5],
- [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]])
- D.info['task'] = MULTIOUTPUT_REGRESSION
+ D.data["Y_train"] = np.array(
+ [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]
+ )
+ D.info["task"] = MULTIOUTPUT_REGRESSION
evaluator = TrainEvaluator()
evaluator.resampling_strategy = LeavePGroupsOut(n_groups=1)
evaluator.resampling_strategy_args = None
@@ -1577,384 +1978,567 @@ def test_get_splitter_cv_object(self, te_mock):
ValueError,
"The 'groups' parameter should not be None",
evaluator.get_splitter,
- D)
+ D,
+ )
# LeaveOneOut, classification
- D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1])
+ D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1])
evaluator = TrainEvaluator()
evaluator.resampling_strategy = LeaveOneOut()
evaluator.resampling_strategy_args = None
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, LeaveOneOut)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# LeaveOneOut, regression
- D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
- D.info['task'] = REGRESSION
+ D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
+ D.info["task"] = REGRESSION
evaluator = TrainEvaluator()
evaluator.resampling_strategy = LeaveOneOut()
evaluator.resampling_strategy_args = None
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, LeaveOneOut)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# LeaveOneOut, multi-output regression
- D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5],
- [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]])
- D.info['task'] = MULTIOUTPUT_REGRESSION
+ D.data["Y_train"] = np.array(
+ [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]
+ )
+ D.info["task"] = MULTIOUTPUT_REGRESSION
evaluator = TrainEvaluator()
evaluator.resampling_strategy = LeaveOneOut()
evaluator.resampling_strategy_args = None
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, LeaveOneOut)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# LeavePOut, classification with args
- D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1])
+ D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1])
evaluator = TrainEvaluator()
evaluator.resampling_strategy_args = None
evaluator.resampling_strategy = LeavePOut(p=3)
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, LeavePOut)
self.assertEqual(cv.p, 3)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# LeavePOut, classification no args
- D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1])
+ D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1])
evaluator = TrainEvaluator()
evaluator.resampling_strategy = LeavePOut(p=2)
evaluator.resampling_strategy_args = None
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, LeavePOut)
self.assertEqual(cv.p, 2)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# LeavePOut, regression with args
- D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
- D.info['task'] = REGRESSION
+ D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
+ D.info["task"] = REGRESSION
evaluator = TrainEvaluator()
evaluator.resampling_strategy_args = None
evaluator.resampling_strategy = LeavePOut(p=3)
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, LeavePOut)
self.assertEqual(cv.p, 3)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# LeavePOut, regression no args
- D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
- D.info['task'] = REGRESSION
+ D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
+ D.info["task"] = REGRESSION
evaluator = TrainEvaluator()
evaluator.resampling_strategy = LeavePOut(p=2)
evaluator.resampling_strategy_args = None
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, LeavePOut)
self.assertEqual(cv.p, 2)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# LeavePOut, multi-output regression with args
- D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5],
- [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]])
- D.info['task'] = MULTIOUTPUT_REGRESSION
+ D.data["Y_train"] = np.array(
+ [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]
+ )
+ D.info["task"] = MULTIOUTPUT_REGRESSION
evaluator = TrainEvaluator()
evaluator.resampling_strategy_args = None
evaluator.resampling_strategy = LeavePOut(p=3)
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, LeavePOut)
self.assertEqual(cv.p, 3)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# LeavePOut, multi-output regression no args
- D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5],
- [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]])
- D.info['task'] = MULTIOUTPUT_REGRESSION
+ D.data["Y_train"] = np.array(
+ [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]
+ )
+ D.info["task"] = MULTIOUTPUT_REGRESSION
evaluator = TrainEvaluator()
evaluator.resampling_strategy = LeavePOut(p=2)
evaluator.resampling_strategy_args = None
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, LeavePOut)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# PredefinedSplit, classification with args
- D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1])
+ D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1])
evaluator = TrainEvaluator()
evaluator.resampling_strategy_args = None
- evaluator.resampling_strategy = PredefinedSplit(test_fold=np.array([0, 1, 0, 1, 0, 1]))
+ evaluator.resampling_strategy = PredefinedSplit(
+ test_fold=np.array([0, 1, 0, 1, 0, 1])
+ )
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, PredefinedSplit)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# PredefinedSplit, regression with args
- D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
- D.info['task'] = REGRESSION
+ D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
+ D.info["task"] = REGRESSION
evaluator = TrainEvaluator()
evaluator.resampling_strategy_args = None
- evaluator.resampling_strategy = PredefinedSplit(test_fold=np.array([0, 1, 0, 1, 0, 1]))
+ evaluator.resampling_strategy = PredefinedSplit(
+ test_fold=np.array([0, 1, 0, 1, 0, 1])
+ )
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, PredefinedSplit)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# PredefinedSplit, multi-output regression with args
- D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5],
- [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]])
- D.info['task'] = MULTIOUTPUT_REGRESSION
+ D.data["Y_train"] = np.array(
+ [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]
+ )
+ D.info["task"] = MULTIOUTPUT_REGRESSION
evaluator = TrainEvaluator()
evaluator.resampling_strategy_args = None
- evaluator.resampling_strategy = PredefinedSplit(test_fold=np.array([0, 1, 0, 1, 0, 1]))
+ evaluator.resampling_strategy = PredefinedSplit(
+ test_fold=np.array([0, 1, 0, 1, 0, 1])
+ )
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, PredefinedSplit)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# RepeatedKFold, classification with args
- D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1])
+ D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1])
evaluator = TrainEvaluator()
evaluator.resampling_strategy_args = None
- evaluator.resampling_strategy = RepeatedKFold(n_splits=4, n_repeats=3, random_state=5)
+ evaluator.resampling_strategy = RepeatedKFold(
+ n_splits=4, n_repeats=3, random_state=5
+ )
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, RepeatedKFold)
- self.assertEqual(cv.get_n_splits(
- groups=evaluator.resampling_strategy_args['groups']), 4*3)
+ self.assertEqual(
+ cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 4 * 3
+ )
self.assertEqual(cv.n_repeats, 3)
self.assertEqual(cv.random_state, 5)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# RepeatedKFold, classification no args
- D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1])
+ D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1])
evaluator = TrainEvaluator()
evaluator.resampling_strategy = RepeatedKFold(n_splits=5, n_repeats=10)
evaluator.resampling_strategy_args = None
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, RepeatedKFold)
- self.assertEqual(cv.get_n_splits(
- groups=evaluator.resampling_strategy_args['groups']), 5*10)
+ self.assertEqual(
+ cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 5 * 10
+ )
self.assertEqual(cv.n_repeats, 10)
self.assertIsNone(cv.random_state)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# RepeatedKFold, regression with args
- D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
- D.info['task'] = REGRESSION
+ D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
+ D.info["task"] = REGRESSION
evaluator = TrainEvaluator()
evaluator.resampling_strategy_args = None
- evaluator.resampling_strategy = RepeatedKFold(n_splits=4, n_repeats=3, random_state=5)
+ evaluator.resampling_strategy = RepeatedKFold(
+ n_splits=4, n_repeats=3, random_state=5
+ )
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, RepeatedKFold)
- self.assertEqual(cv.get_n_splits(
- groups=evaluator.resampling_strategy_args['groups']), 4*3)
+ self.assertEqual(
+ cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 4 * 3
+ )
self.assertEqual(cv.n_repeats, 3)
self.assertEqual(cv.random_state, 5)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# RepeatedKFold, regression no args
- D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
- D.info['task'] = REGRESSION
+ D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
+ D.info["task"] = REGRESSION
evaluator = TrainEvaluator()
evaluator.resampling_strategy_args = None
evaluator.resampling_strategy = RepeatedKFold(n_splits=5, n_repeats=10)
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, RepeatedKFold)
- self.assertEqual(cv.get_n_splits(
- groups=evaluator.resampling_strategy_args['groups']), 5*10)
+ self.assertEqual(
+ cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 5 * 10
+ )
self.assertEqual(cv.n_repeats, 10)
self.assertIsNone(cv.random_state)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# RepeatedKFold, multi-output regression with args
- D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5],
- [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]])
- D.info['task'] = MULTIOUTPUT_REGRESSION
+ D.data["Y_train"] = np.array(
+ [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]
+ )
+ D.info["task"] = MULTIOUTPUT_REGRESSION
evaluator = TrainEvaluator()
evaluator.resampling_strategy_args = None
- evaluator.resampling_strategy = RepeatedKFold(n_splits=4, n_repeats=3, random_state=5)
+ evaluator.resampling_strategy = RepeatedKFold(
+ n_splits=4, n_repeats=3, random_state=5
+ )
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, RepeatedKFold)
- self.assertEqual(cv.get_n_splits(
- groups=evaluator.resampling_strategy_args['groups']), 4*3)
+ self.assertEqual(
+ cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 4 * 3
+ )
self.assertEqual(cv.n_repeats, 3)
self.assertEqual(cv.random_state, 5)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# RepeatedKFold, multi-output regression no args
- D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5],
- [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]])
- D.info['task'] = MULTIOUTPUT_REGRESSION
+ D.data["Y_train"] = np.array(
+ [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]
+ )
+ D.info["task"] = MULTIOUTPUT_REGRESSION
evaluator = TrainEvaluator()
evaluator.resampling_strategy_args = None
evaluator.resampling_strategy = RepeatedKFold(n_splits=5, n_repeats=10)
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, RepeatedKFold)
- self.assertEqual(cv.get_n_splits(
- groups=evaluator.resampling_strategy_args['groups']), 5*10)
+ self.assertEqual(
+ cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 5 * 10
+ )
self.assertEqual(cv.n_repeats, 10)
self.assertIsNone(cv.random_state)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# RepeatedStratifiedKFold, classification with args
- D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1])
+ D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1])
evaluator = TrainEvaluator()
evaluator.resampling_strategy_args = None
evaluator.resampling_strategy = RepeatedStratifiedKFold(
- n_splits=2, n_repeats=3, random_state=5)
+ n_splits=2, n_repeats=3, random_state=5
+ )
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, RepeatedStratifiedKFold)
- self.assertEqual(cv.get_n_splits(
- groups=evaluator.resampling_strategy_args['groups']), 2*3)
+ self.assertEqual(
+ cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 2 * 3
+ )
self.assertEqual(cv.n_repeats, 3)
self.assertEqual(cv.random_state, 5)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# RepeatedStratifiedKFold, classification no args
- D.data['Y_train'] = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
- D.data['X_train'] = D.data['Y_train']
+ D.data["Y_train"] = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
+ D.data["X_train"] = D.data["Y_train"]
evaluator = TrainEvaluator()
- evaluator.resampling_strategy = RepeatedStratifiedKFold(n_splits=5, n_repeats=10)
+ evaluator.resampling_strategy = RepeatedStratifiedKFold(
+ n_splits=5, n_repeats=10
+ )
evaluator.resampling_strategy_args = None
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, RepeatedStratifiedKFold)
- self.assertEqual(cv.get_n_splits(
- groups=evaluator.resampling_strategy_args['groups']), 5*10)
+ self.assertEqual(
+ cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 5 * 10
+ )
self.assertEqual(cv.n_repeats, 10)
self.assertIsNone(cv.random_state)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# StratifiedKFold, classification with args
- D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1])
- D.data['X_train'] = D.data['Y_train']
+ D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1])
+ D.data["X_train"] = D.data["Y_train"]
evaluator = TrainEvaluator()
evaluator.resampling_strategy = StratifiedKFold
evaluator.resampling_strategy_args = None
- evaluator.resampling_strategy = StratifiedKFold(n_splits=2, shuffle=True, random_state=5)
+ evaluator.resampling_strategy = StratifiedKFold(
+ n_splits=2, shuffle=True, random_state=5
+ )
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, StratifiedKFold)
- self.assertEqual(cv.get_n_splits(
- groups=evaluator.resampling_strategy_args['groups']), 2)
+ self.assertEqual(
+ cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 2
+ )
self.assertTrue(cv.shuffle)
self.assertEqual(cv.random_state, 5)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# StratifiedKFold, classification no args
- D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1])
+ D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1])
evaluator = TrainEvaluator()
evaluator.resampling_strategy = StratifiedKFold(n_splits=3, shuffle=False)
evaluator.resampling_strategy_args = None
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, StratifiedKFold)
- self.assertEqual(cv.get_n_splits(
- groups=evaluator.resampling_strategy_args['groups']), 3)
+ self.assertEqual(
+ cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 3
+ )
self.assertFalse(cv.shuffle)
self.assertIsNone(cv.random_state)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# TimeSeriesSplit, multi-output regression with args
- D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5],
- [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]])
- D.info['task'] = MULTIOUTPUT_REGRESSION
+ D.data["Y_train"] = np.array(
+ [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]
+ )
+ D.info["task"] = MULTIOUTPUT_REGRESSION
evaluator = TrainEvaluator()
evaluator.resampling_strategy_args = None
evaluator.resampling_strategy = TimeSeriesSplit(n_splits=4, max_train_size=3)
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, TimeSeriesSplit)
- self.assertEqual(cv.get_n_splits(
- groups=evaluator.resampling_strategy_args['groups']), 4)
+ self.assertEqual(
+ cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 4
+ )
self.assertEqual(cv.max_train_size, 3)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
        # TimeSeriesSplit, multi-output regression no args
- D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5],
- [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]])
- D.info['task'] = MULTIOUTPUT_REGRESSION
+ D.data["Y_train"] = np.array(
+ [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]
+ )
+ D.info["task"] = MULTIOUTPUT_REGRESSION
evaluator = TrainEvaluator()
evaluator.resampling_strategy = TimeSeriesSplit(n_splits=3)
evaluator.resampling_strategy_args = None
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, TimeSeriesSplit)
- self.assertEqual(cv.get_n_splits(
- groups=evaluator.resampling_strategy_args['groups']), 3)
+ self.assertEqual(
+ cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 3
+ )
self.assertIsNone(cv.max_train_size)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# TimeSeriesSplit, regression with args
- D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
- D.info['task'] = REGRESSION
+ D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
+ D.info["task"] = REGRESSION
evaluator = TrainEvaluator()
evaluator.resampling_strategy_args = None
evaluator.resampling_strategy = TimeSeriesSplit(n_splits=4, max_train_size=3)
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, TimeSeriesSplit)
- self.assertEqual(cv.get_n_splits(
- groups=evaluator.resampling_strategy_args['groups']), 4)
+ self.assertEqual(
+ cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 4
+ )
self.assertEqual(cv.max_train_size, 3)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# TimeSeriesSplit, regression no args
- D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
- D.info['task'] = REGRESSION
+ D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
+ D.info["task"] = REGRESSION
evaluator = TrainEvaluator()
evaluator.resampling_strategy = TimeSeriesSplit(n_splits=3)
evaluator.resampling_strategy_args = None
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, TimeSeriesSplit)
- self.assertEqual(cv.get_n_splits(
- groups=evaluator.resampling_strategy_args['groups']), 3)
+ self.assertEqual(
+ cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 3
+ )
self.assertIsNone(cv.max_train_size)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# StratifiedKFold, classification no args
- D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1])
+ D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1])
evaluator = TrainEvaluator()
evaluator.resampling_strategy = StratifiedKFold(n_splits=3)
evaluator.resampling_strategy_args = None
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, StratifiedKFold)
- self.assertEqual(cv.get_n_splits(
- groups=evaluator.resampling_strategy_args['groups']), 3)
+ self.assertEqual(
+ cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 3
+ )
self.assertFalse(cv.shuffle)
self.assertIsNone(cv.random_state)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# GroupShuffleSplit, classification with args
- D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1])
+ D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1])
evaluator = TrainEvaluator()
- evaluator.resampling_strategy_args = {'groups': np.array([1, 1, 2, 1, 2, 2])}
- evaluator.resampling_strategy = GroupShuffleSplit(n_splits=2, test_size=0.3,
- random_state=5)
+ evaluator.resampling_strategy_args = {"groups": np.array([1, 1, 2, 1, 2, 2])}
+ evaluator.resampling_strategy = GroupShuffleSplit(
+ n_splits=2, test_size=0.3, random_state=5
+ )
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, GroupShuffleSplit)
- self.assertEqual(cv.get_n_splits(
- groups=evaluator.resampling_strategy_args['groups']), 2)
+ self.assertEqual(
+ cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 2
+ )
self.assertEqual(cv.test_size, 0.3)
self.assertEqual(cv.random_state, 5)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# GroupShuffleSplit, classification no args
- D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1])
+ D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1])
evaluator = TrainEvaluator()
evaluator.resampling_strategy = GroupShuffleSplit(n_splits=5)
evaluator.resampling_strategy_args = None
@@ -1962,27 +2546,35 @@ def test_get_splitter_cv_object(self, te_mock):
ValueError,
"The 'groups' parameter should not be None",
evaluator.get_splitter,
- D)
+ D,
+ )
# GroupShuffleSplit, regression with args
- D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
- D.info['task'] = REGRESSION
+ D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
+ D.info["task"] = REGRESSION
evaluator = TrainEvaluator()
- evaluator.resampling_strategy_args = {'groups': np.array([1, 1, 2, 1, 2, 2])}
- evaluator.resampling_strategy = GroupShuffleSplit(n_splits=2, test_size=0.3,
- random_state=5)
+ evaluator.resampling_strategy_args = {"groups": np.array([1, 1, 2, 1, 2, 2])}
+ evaluator.resampling_strategy = GroupShuffleSplit(
+ n_splits=2, test_size=0.3, random_state=5
+ )
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, GroupShuffleSplit)
- self.assertEqual(cv.get_n_splits(
- groups=evaluator.resampling_strategy_args['groups']), 2)
+ self.assertEqual(
+ cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 2
+ )
self.assertEqual(cv.test_size, 0.3)
self.assertEqual(cv.random_state, 5)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# GroupShuffleSplit, regression no args
- D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
- D.info['task'] = REGRESSION
+ D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
+ D.info["task"] = REGRESSION
evaluator = TrainEvaluator()
evaluator.resampling_strategy = GroupShuffleSplit(n_splits=5)
evaluator.resampling_strategy_args = None
@@ -1990,29 +2582,39 @@ def test_get_splitter_cv_object(self, te_mock):
ValueError,
"The 'groups' parameter should not be None",
evaluator.get_splitter,
- D)
+ D,
+ )
# GroupShuffleSplit, multi-output regression with args
- D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5],
- [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]])
- D.info['task'] = MULTIOUTPUT_REGRESSION
+ D.data["Y_train"] = np.array(
+ [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]
+ )
+ D.info["task"] = MULTIOUTPUT_REGRESSION
evaluator = TrainEvaluator()
- evaluator.resampling_strategy_args = {'groups': np.array([1, 1, 2, 1, 2, 2])}
- evaluator.resampling_strategy = GroupShuffleSplit(n_splits=2, test_size=0.3,
- random_state=5)
+ evaluator.resampling_strategy_args = {"groups": np.array([1, 1, 2, 1, 2, 2])}
+ evaluator.resampling_strategy = GroupShuffleSplit(
+ n_splits=2, test_size=0.3, random_state=5
+ )
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, GroupShuffleSplit)
- self.assertEqual(cv.get_n_splits(
- groups=evaluator.resampling_strategy_args['groups']), 2)
+ self.assertEqual(
+ cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 2
+ )
self.assertEqual(cv.test_size, 0.3)
self.assertEqual(cv.random_state, 5)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# GroupShuffleSplit, multi-output regression no args
- D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5],
- [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]])
- D.info['task'] = MULTIOUTPUT_REGRESSION
+ D.data["Y_train"] = np.array(
+ [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]
+ )
+ D.info["task"] = MULTIOUTPUT_REGRESSION
evaluator = TrainEvaluator()
evaluator.resampling_strategy = GroupShuffleSplit(n_splits=5)
evaluator.resampling_strategy_args = None
@@ -2020,129 +2622,188 @@ def test_get_splitter_cv_object(self, te_mock):
ValueError,
"The 'groups' parameter should not be None",
evaluator.get_splitter,
- D)
+ D,
+ )
# StratifiedShuffleSplit, classification with args
- D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1])
+ D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1])
evaluator = TrainEvaluator()
evaluator.resampling_strategy_args = None
evaluator.resampling_strategy = StratifiedShuffleSplit(
- n_splits=2, test_size=0.3, random_state=5)
+ n_splits=2, test_size=0.3, random_state=5
+ )
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, StratifiedShuffleSplit)
- self.assertEqual(cv.get_n_splits(
- groups=evaluator.resampling_strategy_args['groups']), 2)
+ self.assertEqual(
+ cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 2
+ )
self.assertEqual(cv.test_size, 0.3)
self.assertEqual(cv.random_state, 5)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# StratifiedShuffleSplit, classification no args
- D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1,
- 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1])
- D.data['X_train'] = D.data['Y_train']
+ D.data["Y_train"] = np.array(
+ [0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1]
+ )
+ D.data["X_train"] = D.data["Y_train"]
evaluator = TrainEvaluator()
evaluator.resampling_strategy = StratifiedShuffleSplit(n_splits=10)
evaluator.resampling_strategy_args = None
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, StratifiedShuffleSplit)
- self.assertEqual(cv.get_n_splits(
- groups=evaluator.resampling_strategy_args['groups']), 10)
+ self.assertEqual(
+ cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 10
+ )
self.assertIsNone(cv.test_size)
self.assertIsNone(cv.random_state)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# ShuffleSplit, classification with args
- D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1])
- D.data['X_train'] = D.data['Y_train']
+ D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1])
+ D.data["X_train"] = D.data["Y_train"]
evaluator = TrainEvaluator()
evaluator.resampling_strategy_args = None
- evaluator.resampling_strategy = ShuffleSplit(n_splits=2, test_size=0.3, random_state=5)
+ evaluator.resampling_strategy = ShuffleSplit(
+ n_splits=2, test_size=0.3, random_state=5
+ )
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, ShuffleSplit)
- self.assertEqual(cv.get_n_splits(
- groups=evaluator.resampling_strategy_args['groups']), 2)
+ self.assertEqual(
+ cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 2
+ )
self.assertEqual(cv.test_size, 0.3)
self.assertEqual(cv.random_state, 5)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# ShuffleSplit, classification no args
- D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1])
+ D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1])
evaluator = TrainEvaluator()
evaluator.resampling_strategy = ShuffleSplit(n_splits=10)
evaluator.resampling_strategy_args = None
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, ShuffleSplit)
- self.assertEqual(cv.get_n_splits(
- groups=evaluator.resampling_strategy_args['groups']), 10)
+ self.assertEqual(
+ cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 10
+ )
self.assertIsNone(cv.test_size)
self.assertIsNone(cv.random_state)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# ShuffleSplit, regression with args
- D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
- D.info['task'] = REGRESSION
+ D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
+ D.info["task"] = REGRESSION
evaluator = TrainEvaluator()
evaluator.resampling_strategy_args = None
- evaluator.resampling_strategy = ShuffleSplit(n_splits=2, test_size=0.3, random_state=5)
+ evaluator.resampling_strategy = ShuffleSplit(
+ n_splits=2, test_size=0.3, random_state=5
+ )
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, ShuffleSplit)
- self.assertEqual(cv.get_n_splits(
- groups=evaluator.resampling_strategy_args['groups']), 2)
+ self.assertEqual(
+ cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 2
+ )
self.assertEqual(cv.test_size, 0.3)
self.assertEqual(cv.random_state, 5)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# ShuffleSplit, regression no args
- D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
- D.info['task'] = REGRESSION
+ D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
+ D.info["task"] = REGRESSION
evaluator = TrainEvaluator()
evaluator.resampling_strategy = ShuffleSplit(n_splits=10)
evaluator.resampling_strategy_args = None
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, ShuffleSplit)
- self.assertEqual(cv.get_n_splits(
- groups=evaluator.resampling_strategy_args['groups']), 10)
+ self.assertEqual(
+ cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 10
+ )
self.assertIsNone(cv.test_size)
self.assertIsNone(cv.random_state)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# ShuffleSplit, multi-output regression with args
- D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5],
- [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]])
- D.info['task'] = MULTIOUTPUT_REGRESSION
+ D.data["Y_train"] = np.array(
+ [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]
+ )
+ D.info["task"] = MULTIOUTPUT_REGRESSION
evaluator = TrainEvaluator()
evaluator.resampling_strategy_args = None
- evaluator.resampling_strategy = ShuffleSplit(n_splits=2, test_size=0.3, random_state=5)
+ evaluator.resampling_strategy = ShuffleSplit(
+ n_splits=2, test_size=0.3, random_state=5
+ )
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, ShuffleSplit)
- self.assertEqual(cv.get_n_splits(
- groups=evaluator.resampling_strategy_args['groups']), 2)
+ self.assertEqual(
+ cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 2
+ )
self.assertEqual(cv.test_size, 0.3)
self.assertEqual(cv.random_state, 5)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
# ShuffleSplit, multi-output regression no args
- D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5],
- [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]])
- D.info['task'] = MULTIOUTPUT_REGRESSION
+ D.data["Y_train"] = np.array(
+ [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]
+ )
+ D.info["task"] = MULTIOUTPUT_REGRESSION
evaluator = TrainEvaluator()
evaluator.resampling_strategy = ShuffleSplit(n_splits=10)
evaluator.resampling_strategy_args = None
cv = evaluator.get_splitter(D)
self.assertIsInstance(cv, ShuffleSplit)
- self.assertEqual(cv.get_n_splits(
- groups=evaluator.resampling_strategy_args['groups']), 10)
+ self.assertEqual(
+ cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 10
+ )
self.assertIsNone(cv.test_size)
self.assertIsNone(cv.random_state)
- next(cv.split(D.data['Y_train'], D.data['Y_train'],
- groups=evaluator.resampling_strategy_args['groups']))
+ next(
+ cv.split(
+ D.data["Y_train"],
+ D.data["Y_train"],
+ groups=evaluator.resampling_strategy_args["groups"],
+ )
+ )
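The assertions above all lean on the same scikit-learn contract: group-aware splitters only split when an explicit groups array is supplied and raise otherwise. A minimal standalone sketch of that contract, assuming a stock scikit-learn install:

import numpy as np
from sklearn.model_selection import GroupKFold

y = np.array([0, 0, 0, 1, 1, 1])
groups = np.array([1, 1, 2, 1, 2, 2])

cv = GroupKFold(n_splits=2)
train_idx, test_idx = next(cv.split(y, y, groups=groups))  # one test fold per group value

try:
    next(cv.split(y, y, groups=None))
except ValueError as err:
    print(err)  # "The 'groups' parameter should not be None."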
@unittest.mock.patch.object(TrainEvaluator, "__init__")
def test_holdout_split_size(self, te_mock):
@@ -2151,102 +2812,119 @@ def test_holdout_split_size(self, te_mock):
D.feat_type = {}
evaluator = TrainEvaluator()
- evaluator.resampling_strategy = 'holdout'
+ evaluator.resampling_strategy = "holdout"
# Exact Ratio
D.data = dict(Y_train=np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]))
D.info = dict(task=BINARY_CLASSIFICATION)
- evaluator.resampling_strategy_args = {'shuffle': True,
- 'train_size': 0.7}
+ evaluator.resampling_strategy_args = {"shuffle": True, "train_size": 0.7}
cv = evaluator.get_splitter(D)
self.assertEqual(cv.get_n_splits(), 1)
- train_samples, test_samples = next(cv.split(D.data['Y_train'],
- D.data['Y_train']))
+ train_samples, test_samples = next(
+ cv.split(D.data["Y_train"], D.data["Y_train"])
+ )
self.assertEqual(len(train_samples), 7)
self.assertEqual(len(test_samples), 3)
# No Shuffle
- evaluator.resampling_strategy_args = {'shuffle': False,
- 'train_size': 0.7}
+ evaluator.resampling_strategy_args = {"shuffle": False, "train_size": 0.7}
cv = evaluator.get_splitter(D)
self.assertEqual(cv.get_n_splits(), 1)
- train_samples, test_samples = next(cv.split(D.data['Y_train'],
- D.data['Y_train']))
+ train_samples, test_samples = next(
+ cv.split(D.data["Y_train"], D.data["Y_train"])
+ )
self.assertEqual(len(train_samples), 7)
self.assertEqual(len(test_samples), 3)
# Rounded Ratio
D.data = dict(Y_train=np.array([0, 0, 0, 0, 0, 1, 1, 1, 1]))
- evaluator.resampling_strategy_args = {'shuffle': True,
- 'train_size': 0.7}
+ evaluator.resampling_strategy_args = {"shuffle": True, "train_size": 0.7}
cv = evaluator.get_splitter(D)
self.assertEqual(cv.get_n_splits(), 1)
- train_samples, test_samples = next(cv.split(D.data['Y_train'],
- D.data['Y_train']))
+ train_samples, test_samples = next(
+ cv.split(D.data["Y_train"], D.data["Y_train"])
+ )
self.assertEqual(len(train_samples), 6)
self.assertEqual(len(test_samples), 3)
# Rounded Ratio No Shuffle
- evaluator.resampling_strategy_args = {'shuffle': False,
- 'train_size': 0.7}
+ evaluator.resampling_strategy_args = {"shuffle": False, "train_size": 0.7}
cv = evaluator.get_splitter(D)
self.assertEqual(cv.get_n_splits(), 1)
- train_samples, test_samples = next(cv.split(D.data['Y_train'],
- D.data['Y_train']))
+ train_samples, test_samples = next(
+ cv.split(D.data["Y_train"], D.data["Y_train"])
+ )
self.assertEqual(len(train_samples), 6)
self.assertEqual(len(test_samples), 3)
# More data
- evaluator.resampling_strategy_args = {'shuffle': True,
- 'train_size': 0.7}
+ evaluator.resampling_strategy_args = {"shuffle": True, "train_size": 0.7}
D.data = dict(Y_train=np.zeros((900, 1)))
cv = evaluator.get_splitter(D)
self.assertEqual(cv.get_n_splits(), 1)
- train_samples, test_samples = next(cv.split(D.data['Y_train'],
- D.data['Y_train']))
+ train_samples, test_samples = next(
+ cv.split(D.data["Y_train"], D.data["Y_train"])
+ )
self.assertEqual(len(train_samples), 630)
self.assertEqual(len(test_samples), 270)
- evaluator.resampling_strategy_args = {'train_size': 0.752}
+ evaluator.resampling_strategy_args = {"train_size": 0.752}
D.data = dict(Y_train=np.zeros((900, 1)))
cv = evaluator.get_splitter(D)
self.assertEqual(cv.get_n_splits(), 1)
- train_samples, test_samples = next(cv.split(D.data['Y_train'],
- D.data['Y_train']))
+ train_samples, test_samples = next(
+ cv.split(D.data["Y_train"], D.data["Y_train"])
+ )
self.assertEqual(len(train_samples), 676)
self.assertEqual(len(test_samples), 224)
# Multilabel Exact Ratio
- D.data = dict(Y_train=np.array([[0, 0], [0, 1], [1, 1], [1, 0], [1, 1],
- [1, 1], [1, 1], [1, 0], [1, 1], [1, 1]]
- ))
+ D.data = dict(
+ Y_train=np.array(
+ [
+ [0, 0],
+ [0, 1],
+ [1, 1],
+ [1, 0],
+ [1, 1],
+ [1, 1],
+ [1, 1],
+ [1, 0],
+ [1, 1],
+ [1, 1],
+ ]
+ )
+ )
D.info = dict(task=MULTILABEL_CLASSIFICATION)
- evaluator.resampling_strategy_args = {'shuffle': True,
- 'train_size': 0.7}
+ evaluator.resampling_strategy_args = {"shuffle": True, "train_size": 0.7}
cv = evaluator.get_splitter(D)
self.assertEqual(cv.get_n_splits(), 1)
- train_samples, test_samples = next(cv.split(D.data['Y_train'],
- D.data['Y_train']))
+ train_samples, test_samples = next(
+ cv.split(D.data["Y_train"], D.data["Y_train"])
+ )
self.assertEqual(len(train_samples), 7)
self.assertEqual(len(test_samples), 3)
# Multilabel No Shuffle
- D.data = dict(Y_train=np.array([[0, 0], [0, 1], [1, 1], [1, 0], [1, 1],
- [1, 1], [1, 1], [1, 0], [1, 1]]))
- evaluator.resampling_strategy_args = {'shuffle': False,
- 'train_size': 0.7}
+ D.data = dict(
+ Y_train=np.array(
+ [[0, 0], [0, 1], [1, 1], [1, 0], [1, 1], [1, 1], [1, 1], [1, 0], [1, 1]]
+ )
+ )
+ evaluator.resampling_strategy_args = {"shuffle": False, "train_size": 0.7}
cv = evaluator.get_splitter(D)
self.assertEqual(cv.get_n_splits(), 1)
- train_samples, test_samples = next(cv.split(D.data['Y_train'],
- D.data['Y_train']))
+ train_samples, test_samples = next(
+ cv.split(D.data["Y_train"], D.data["Y_train"])
+ )
self.assertEqual(len(train_samples), 6)
self.assertEqual(len(test_samples), 3)
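The expected fold sizes in the holdout test above follow scikit-learn's shuffle-split rounding: the train side is floored and the remainder becomes the test fold. A small sketch of that rounding with a plain ShuffleSplit, independent of auto-sklearn's own holdout plumbing:

import numpy as np
from sklearn.model_selection import ShuffleSplit

for n_samples, train_size in ((10, 0.7), (9, 0.7), (900, 0.7), (900, 0.752)):
    X = np.zeros(n_samples)
    cv = ShuffleSplit(n_splits=1, train_size=train_size, test_size=None, random_state=0)
    train_idx, test_idx = next(cv.split(X))
    # floor(train_size * n_samples) training samples, the rest go to the test fold
    print(n_samples, len(train_idx), len(test_idx))  # 7/3, 6/3, 630/270, 676/224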
@@ -2255,16 +2933,17 @@ class FunctionsTest(unittest.TestCase):
def setUp(self):
self.queue = multiprocessing.Queue()
self.configuration = get_configuration_space(
- {'task': MULTICLASS_CLASSIFICATION,
- 'is_sparse': False}).get_default_configuration()
+ {"task": MULTICLASS_CLASSIFICATION, "is_sparse": False}
+ ).get_default_configuration()
self.data = get_multiclass_classification_datamanager()
- self.tmp_dir = os.path.join(os.path.dirname(__file__),
- '.test_holdout_functions')
- self.n = len(self.data.data['Y_train'])
- self.y = self.data.data['Y_train'].flatten()
+ self.tmp_dir = os.path.join(
+ os.path.dirname(__file__), ".test_holdout_functions"
+ )
+ self.n = len(self.data.data["Y_train"])
+ self.y = self.data.data["Y_train"].flatten()
tmp_dir_name = self.id()
- self.ev_path = os.path.join(this_directory, '.tmp_evaluations', tmp_dir_name)
+ self.ev_path = os.path.join(this_directory, ".tmp_evaluations", tmp_dir_name)
if os.path.exists(self.ev_path):
shutil.rmtree(self.ev_path)
os.makedirs(self.ev_path, exist_ok=False)
@@ -2274,12 +2953,14 @@ def setUp(self):
self.backend.get_cv_model_dir.return_value = self.ev_path
dummy_model_files = [os.path.join(self.ev_path, str(n)) for n in range(100)]
dummy_pred_files = [os.path.join(self.ev_path, str(n)) for n in range(100, 200)]
- dummy_cv_model_files = [os.path.join(self.ev_path, str(n)) for n in range(200, 300)]
+ dummy_cv_model_files = [
+ os.path.join(self.ev_path, str(n)) for n in range(200, 300)
+ ]
self.backend.get_model_path.side_effect = dummy_model_files
self.backend.get_cv_model_path.side_effect = dummy_cv_model_files
self.backend.get_prediction_output_path.side_effect = dummy_pred_files
self.backend.load_datamanager.return_value = self.data
- self.dataset_name = json.dumps({'task_id': 'test'})
+ self.dataset_name = json.dumps({"task_id": "test"})
self.port = logging.handlers.DEFAULT_TCP_LOGGING_PORT
def tearDown(self):
@@ -2292,7 +2973,7 @@ def test_eval_holdout(self):
port=self.port,
config=self.configuration,
backend=self.backend,
- resampling_strategy='holdout',
+ resampling_strategy="holdout",
resampling_strategy_args=None,
seed=1,
num_run=1,
@@ -2307,9 +2988,9 @@ def test_eval_holdout(self):
)
info = read_queue(self.queue)
self.assertEqual(len(info), 1)
- self.assertAlmostEqual(info[0]['loss'], 0.030303030303030276, places=3)
- self.assertEqual(info[0]['status'], StatusType.SUCCESS)
- self.assertNotIn('bac_metric', info[0]['additional_run_info'])
+ self.assertAlmostEqual(info[0]["loss"], 0.030303030303030276, places=3)
+ self.assertEqual(info[0]["status"], StatusType.SUCCESS)
+ self.assertNotIn("bac_metric", info[0]["additional_run_info"])
def test_eval_holdout_all_loss_functions(self):
eval_holdout(
@@ -2317,7 +2998,7 @@ def test_eval_holdout_all_loss_functions(self):
port=self.port,
config=self.configuration,
backend=self.backend,
- resampling_strategy='holdout',
+ resampling_strategy="holdout",
resampling_strategy_args=None,
seed=1,
num_run=1,
@@ -2334,34 +3015,36 @@ def test_eval_holdout_all_loss_functions(self):
self.assertEqual(len(rval), 1)
fixture = {
- 'accuracy': 0.030303030303030276,
- 'balanced_accuracy': 0.033333333333333326,
- 'f1_macro': 0.032036613272311221,
- 'f1_micro': 0.030303030303030276,
- 'f1_weighted': 0.030441716940572849,
- 'log_loss': 0.06376745642134637,
- 'precision_macro': 0.02777777777777779,
- 'precision_micro': 0.030303030303030276,
- 'precision_weighted': 0.027777777777777901,
- 'recall_macro': 0.033333333333333326,
- 'recall_micro': 0.030303030303030276,
- 'recall_weighted': 0.030303030303030276,
- 'num_run': 1,
- 'validation_loss': 0.0,
- 'test_loss': 0.04,
- 'train_loss': 0.0,
+ "accuracy": 0.030303030303030276,
+ "balanced_accuracy": 0.033333333333333326,
+ "f1_macro": 0.032036613272311221,
+ "f1_micro": 0.030303030303030276,
+ "f1_weighted": 0.030441716940572849,
+ "log_loss": 0.06376745642134637,
+ "precision_macro": 0.02777777777777779,
+ "precision_micro": 0.030303030303030276,
+ "precision_weighted": 0.027777777777777901,
+ "recall_macro": 0.033333333333333326,
+ "recall_micro": 0.030303030303030276,
+ "recall_weighted": 0.030303030303030276,
+ "num_run": 1,
+ "validation_loss": 0.0,
+ "test_loss": 0.04,
+ "train_loss": 0.0,
}
- additional_run_info = rval[0]['additional_run_info']
+ additional_run_info = rval[0]["additional_run_info"]
for key, value in fixture.items():
- self.assertAlmostEqual(additional_run_info[key], fixture[key],
- msg=key)
- self.assertIn('duration', additional_run_info)
- self.assertEqual(len(additional_run_info), len(fixture) + 1,
- msg=sorted(additional_run_info.items()))
+ self.assertAlmostEqual(additional_run_info[key], fixture[key], msg=key)
+ self.assertIn("duration", additional_run_info)
+ self.assertEqual(
+ len(additional_run_info),
+ len(fixture) + 1,
+ msg=sorted(additional_run_info.items()),
+ )
- self.assertAlmostEqual(rval[0]['loss'], 0.030303030303030276, places=3)
- self.assertEqual(rval[0]['status'], StatusType.SUCCESS)
+ self.assertAlmostEqual(rval[0]["loss"], 0.030303030303030276, places=3)
+ self.assertEqual(rval[0]["status"], StatusType.SUCCESS)
def test_eval_holdout_iterative_fit_no_timeout(self):
eval_iterative_holdout(
@@ -2369,7 +3052,7 @@ def test_eval_holdout_iterative_fit_no_timeout(self):
port=self.port,
config=self.configuration,
backend=self.backend,
- resampling_strategy='holdout',
+ resampling_strategy="holdout",
resampling_strategy_args=None,
seed=1,
num_run=1,
@@ -2384,9 +3067,9 @@ def test_eval_holdout_iterative_fit_no_timeout(self):
)
rval = read_queue(self.queue)
self.assertEqual(len(rval), 9)
- self.assertAlmostEqual(rval[-1]['loss'], 0.030303030303030276)
- self.assertEqual(rval[0]['status'], StatusType.DONOTADVANCE)
- self.assertEqual(rval[-1]['status'], StatusType.SUCCESS)
+ self.assertAlmostEqual(rval[-1]["loss"], 0.030303030303030276)
+ self.assertEqual(rval[0]["status"], StatusType.DONOTADVANCE)
+ self.assertEqual(rval[-1]["status"], StatusType.SUCCESS)
def test_eval_holdout_budget_iterations(self):
eval_holdout(
@@ -2394,7 +3077,7 @@ def test_eval_holdout_budget_iterations(self):
port=self.port,
config=self.configuration,
backend=self.backend,
- resampling_strategy='holdout',
+ resampling_strategy="holdout",
resampling_strategy_args=None,
seed=1,
num_run=1,
@@ -2406,45 +3089,45 @@ def test_eval_holdout_budget_iterations(self):
instance=self.dataset_name,
metric=accuracy,
budget=1,
- budget_type='iterations',
+ budget_type="iterations",
additional_components=dict(),
)
info = read_queue(self.queue)
self.assertEqual(len(info), 1)
- self.assertAlmostEqual(info[0]['loss'], 0.06060606060606055, places=3)
- self.assertEqual(info[0]['status'], StatusType.SUCCESS)
- self.assertNotIn('bac_metric', info[0]['additional_run_info'])
+ self.assertAlmostEqual(info[0]["loss"], 0.06060606060606055, places=3)
+ self.assertEqual(info[0]["status"], StatusType.SUCCESS)
+ self.assertNotIn("bac_metric", info[0]["additional_run_info"])
def test_eval_holdout_budget_iterations_converged(self):
configuration = get_configuration_space(
- exclude={'classifier': ['random_forest', 'liblinear_svc']},
- info={'task': MULTICLASS_CLASSIFICATION, 'is_sparse': False},
+ exclude={"classifier": ["random_forest", "liblinear_svc"]},
+ info={"task": MULTICLASS_CLASSIFICATION, "is_sparse": False},
).get_default_configuration()
eval_holdout(
queue=self.queue,
port=self.port,
config=configuration,
backend=self.backend,
- resampling_strategy='holdout',
+ resampling_strategy="holdout",
resampling_strategy_args=None,
seed=1,
num_run=1,
scoring_functions=None,
output_y_hat_optimization=True,
include=None,
- exclude={'classifier': ['random_forest', 'liblinear_svc']},
+ exclude={"classifier": ["random_forest", "liblinear_svc"]},
disable_file_output=False,
instance=self.dataset_name,
metric=accuracy,
budget=80,
- budget_type='iterations',
+ budget_type="iterations",
additional_components=dict(),
)
info = read_queue(self.queue)
self.assertEqual(len(info), 1)
- self.assertAlmostEqual(info[0]['loss'], 0.18181818181818177, places=3)
- self.assertEqual(info[0]['status'], StatusType.DONOTADVANCE)
- self.assertNotIn('bac_metric', info[0]['additional_run_info'])
+ self.assertAlmostEqual(info[0]["loss"], 0.18181818181818177, places=3)
+ self.assertEqual(info[0]["status"], StatusType.DONOTADVANCE)
+ self.assertNotIn("bac_metric", info[0]["additional_run_info"])
def test_eval_holdout_budget_subsample(self):
eval_holdout(
@@ -2452,7 +3135,7 @@ def test_eval_holdout_budget_subsample(self):
port=self.port,
config=self.configuration,
backend=self.backend,
- resampling_strategy='holdout',
+ resampling_strategy="holdout",
resampling_strategy_args=None,
seed=1,
num_run=1,
@@ -2464,14 +3147,14 @@ def test_eval_holdout_budget_subsample(self):
instance=self.dataset_name,
metric=accuracy,
budget=30,
- budget_type='subsample',
+ budget_type="subsample",
additional_components=dict(),
)
info = read_queue(self.queue)
self.assertEqual(len(info), 1)
- self.assertAlmostEqual(info[0]['loss'], 0.0)
- self.assertEqual(info[0]['status'], StatusType.SUCCESS)
- self.assertNotIn('bac_metric', info[0]['additional_run_info'])
+ self.assertAlmostEqual(info[0]["loss"], 0.0)
+ self.assertEqual(info[0]["status"], StatusType.SUCCESS)
+ self.assertNotIn("bac_metric", info[0]["additional_run_info"])
def test_eval_holdout_budget_mixed_iterations(self):
print(self.configuration)
@@ -2480,7 +3163,7 @@ def test_eval_holdout_budget_mixed_iterations(self):
port=self.port,
config=self.configuration,
backend=self.backend,
- resampling_strategy='holdout',
+ resampling_strategy="holdout",
resampling_strategy_args=None,
seed=1,
num_run=1,
@@ -2492,44 +3175,44 @@ def test_eval_holdout_budget_mixed_iterations(self):
instance=self.dataset_name,
metric=accuracy,
budget=1,
- budget_type='mixed',
- additional_components=dict()
+ budget_type="mixed",
+ additional_components=dict(),
)
info = read_queue(self.queue)
self.assertEqual(len(info), 1)
- self.assertAlmostEqual(info[0]['loss'], 0.06060606060606055)
+ self.assertAlmostEqual(info[0]["loss"], 0.06060606060606055)
def test_eval_holdout_budget_mixed_subsample(self):
configuration = get_configuration_space(
- exclude={'classifier': ['random_forest']},
- info={'task': MULTICLASS_CLASSIFICATION, 'is_sparse': False},
+ exclude={"classifier": ["random_forest"]},
+ info={"task": MULTICLASS_CLASSIFICATION, "is_sparse": False},
).get_default_configuration()
- self.assertEqual(configuration['classifier:__choice__'], 'liblinear_svc')
+ self.assertEqual(configuration["classifier:__choice__"], "liblinear_svc")
eval_holdout(
queue=self.queue,
port=self.port,
config=configuration,
backend=self.backend,
- resampling_strategy='holdout',
+ resampling_strategy="holdout",
resampling_strategy_args=None,
seed=1,
num_run=1,
scoring_functions=None,
output_y_hat_optimization=True,
include=None,
- exclude={'classifier': ['random_forest']},
+ exclude={"classifier": ["random_forest"]},
disable_file_output=False,
instance=self.dataset_name,
metric=accuracy,
budget=40,
- budget_type='mixed',
+ budget_type="mixed",
additional_components=dict(),
)
info = read_queue(self.queue)
self.assertEqual(len(info), 1)
- self.assertAlmostEqual(info[0]['loss'], 0.06060606060606055)
- self.assertEqual(info[0]['status'], StatusType.SUCCESS)
- self.assertNotIn('bac_metric', info[0]['additional_run_info'])
+ self.assertAlmostEqual(info[0]["loss"], 0.06060606060606055)
+ self.assertEqual(info[0]["status"], StatusType.SUCCESS)
+ self.assertNotIn("bac_metric", info[0]["additional_run_info"])
def test_eval_cv(self):
eval_cv(
@@ -2539,8 +3222,8 @@ def test_eval_cv(self):
backend=self.backend,
seed=1,
num_run=1,
- resampling_strategy='cv',
- resampling_strategy_args={'folds': 3},
+ resampling_strategy="cv",
+ resampling_strategy_args={"folds": 3},
scoring_functions=None,
output_y_hat_optimization=True,
include=None,
@@ -2552,9 +3235,9 @@ def test_eval_cv(self):
)
rval = read_queue(self.queue)
self.assertEqual(len(rval), 1)
- self.assertAlmostEqual(rval[0]['loss'], 0.04999999999999997)
- self.assertEqual(rval[0]['status'], StatusType.SUCCESS)
- self.assertNotIn('bac_metric', rval[0]['additional_run_info'])
+ self.assertAlmostEqual(rval[0]["loss"], 0.04999999999999997)
+ self.assertEqual(rval[0]["status"], StatusType.SUCCESS)
+ self.assertNotIn("bac_metric", rval[0]["additional_run_info"])
def test_eval_cv_all_loss_functions(self):
eval_cv(
@@ -2564,8 +3247,8 @@ def test_eval_cv_all_loss_functions(self):
backend=self.backend,
seed=1,
num_run=1,
- resampling_strategy='cv',
- resampling_strategy_args={'folds': 3},
+ resampling_strategy="cv",
+ resampling_strategy_args={"folds": 3},
scoring_functions=SCORER_LIST,
output_y_hat_optimization=True,
include=None,
@@ -2579,33 +3262,36 @@ def test_eval_cv_all_loss_functions(self):
self.assertEqual(len(rval), 1)
fixture = {
- 'accuracy': 0.04999999999999997,
- 'balanced_accuracy': 0.05130303030303027,
- 'f1_macro': 0.052793650793650775,
- 'f1_micro': 0.04999999999999997,
- 'f1_weighted': 0.050090909090909096,
- 'log_loss': 0.12108563414774837,
- 'precision_macro': 0.04963636363636359,
- 'precision_micro': 0.04999999999999997,
- 'precision_weighted': 0.045757575757575664,
- 'recall_macro': 0.05130303030303027,
- 'recall_micro': 0.04999999999999997,
- 'recall_weighted': 0.04999999999999997,
- 'num_run': 1,
- 'validation_loss': 0.04,
- 'test_loss': 0.04,
- 'train_loss': 0.0,
+ "accuracy": 0.04999999999999997,
+ "balanced_accuracy": 0.05130303030303027,
+ "f1_macro": 0.052793650793650775,
+ "f1_micro": 0.04999999999999997,
+ "f1_weighted": 0.050090909090909096,
+ "log_loss": 0.12108563414774837,
+ "precision_macro": 0.04963636363636359,
+ "precision_micro": 0.04999999999999997,
+ "precision_weighted": 0.045757575757575664,
+ "recall_macro": 0.05130303030303027,
+ "recall_micro": 0.04999999999999997,
+ "recall_weighted": 0.04999999999999997,
+ "num_run": 1,
+ "validation_loss": 0.04,
+ "test_loss": 0.04,
+ "train_loss": 0.0,
}
- additional_run_info = rval[0]['additional_run_info']
+ additional_run_info = rval[0]["additional_run_info"]
for key, value in fixture.items():
self.assertAlmostEqual(additional_run_info[key], fixture[key], msg=key)
- self.assertIn('duration', additional_run_info)
- self.assertEqual(len(additional_run_info), len(fixture) + 1,
- msg=sorted(additional_run_info.items()))
+ self.assertIn("duration", additional_run_info)
+ self.assertEqual(
+ len(additional_run_info),
+ len(fixture) + 1,
+ msg=sorted(additional_run_info.items()),
+ )
- self.assertAlmostEqual(rval[0]['loss'], 0.04999999999999997)
- self.assertEqual(rval[0]['status'], StatusType.SUCCESS)
+ self.assertAlmostEqual(rval[0]["loss"], 0.04999999999999997)
+ self.assertEqual(rval[0]["status"], StatusType.SUCCESS)
# def test_eval_cv_on_subset(self):
# backend_api = backend.create(self.tmp_dir, self.tmp_dir)
@@ -2619,13 +3305,15 @@ def test_eval_cv_all_loss_functions(self):
# self.assertEqual(info[2], 1)
def test_eval_partial_cv(self):
- results = [0.050000000000000044,
- 0.0,
- 0.09999999999999998,
- 0.09999999999999998,
- 0.050000000000000044]
+ results = [
+ 0.050000000000000044,
+ 0.0,
+ 0.09999999999999998,
+ 0.09999999999999998,
+ 0.050000000000000044,
+ ]
for fold in range(5):
- instance = json.dumps({'task_id': 'data', 'fold': fold})
+ instance = json.dumps({"task_id": "data", "fold": fold})
eval_partial_cv(
port=self.port,
queue=self.queue,
@@ -2634,8 +3322,8 @@ def test_eval_partial_cv(self):
seed=1,
num_run=1,
instance=instance,
- resampling_strategy='partial-cv',
- resampling_strategy_args={'folds': 5},
+ resampling_strategy="partial-cv",
+ resampling_strategy_args={"folds": 5},
scoring_functions=None,
output_y_hat_optimization=True,
include=None,
@@ -2646,5 +3334,5 @@ def test_eval_partial_cv(self):
)
rval = read_queue(self.queue)
self.assertEqual(len(rval), 1)
- self.assertAlmostEqual(rval[0]['loss'], results[fold])
- self.assertEqual(rval[0]['status'], StatusType.SUCCESS)
+ self.assertAlmostEqual(rval[0]["loss"], results[fold])
+ self.assertEqual(rval[0]["status"], StatusType.SUCCESS)
diff --git a/test/test_metalearning/__init__.py b/test/test_metalearning/__init__.py
index cc3cd7becd..e298f0f075 100644
--- a/test/test_metalearning/__init__.py
+++ b/test/test_metalearning/__init__.py
@@ -1,2 +1,2 @@
# -*- encoding: utf-8 -*-
-__author__ = 'feurerm'
+__author__ = "feurerm"
diff --git a/test/test_metalearning/pyMetaLearn/__init__.py b/test/test_metalearning/pyMetaLearn/__init__.py
index 8f0ce6cb7c..92bf78f389 100644
--- a/test/test_metalearning/pyMetaLearn/__init__.py
+++ b/test/test_metalearning/pyMetaLearn/__init__.py
@@ -1 +1 @@
-__author__ = 'feurerm'
+__author__ = "feurerm"
diff --git a/test/test_metalearning/pyMetaLearn/metalearning/test_kND.py b/test/test_metalearning/pyMetaLearn/metalearning/test_kND.py
index 6733dca93f..4877379440 100644
--- a/test/test_metalearning/pyMetaLearn/metalearning/test_kND.py
+++ b/test/test_metalearning/pyMetaLearn/metalearning/test_kND.py
@@ -1,8 +1,9 @@
import logging
import unittest
-import numpy as np
+import numpy as np
import pandas as pd
+
from autosklearn.metalearning.metalearning.kNearestDatasets.kND import KNearestDatasets
from autosklearn.metalearning.metalearning.metrics.misc import get_random_metric
@@ -11,15 +12,35 @@ class kNDTest(unittest.TestCase):
_multiprocess_can_split_ = True
def setUp(self):
- self.anneal = pd.Series({"number_of_instances": 898., "number_of_classes": 5.,
- "number_of_features": 38.}, name=232)
- self.krvskp = pd.Series({"number_of_instances": 3196., "number_of_classes":
- 2., "number_of_features": 36.}, name=233)
- self.labor = pd.Series({"number_of_instances": 57., "number_of_classes":
- 2., "number_of_features": 16.}, name=234)
- self.runs = {232: [0.1, 0.5, 0.7],
- 233: [np.NaN, 0.1, 0.7],
- 234: [0.5, 0.7, 0.1]}
+ self.anneal = pd.Series(
+ {
+ "number_of_instances": 898.0,
+ "number_of_classes": 5.0,
+ "number_of_features": 38.0,
+ },
+ name=232,
+ )
+ self.krvskp = pd.Series(
+ {
+ "number_of_instances": 3196.0,
+ "number_of_classes": 2.0,
+ "number_of_features": 36.0,
+ },
+ name=233,
+ )
+ self.labor = pd.Series(
+ {
+ "number_of_instances": 57.0,
+ "number_of_classes": 2.0,
+ "number_of_features": 16.0,
+ },
+ name=234,
+ )
+ self.runs = {
+ 232: [0.1, 0.5, 0.7],
+ 233: [np.NaN, 0.1, 0.7],
+ 234: [0.5, 0.7, 0.1],
+ }
self.runs = pd.DataFrame(self.runs)
self.logger = logging.getLogger()
@@ -30,43 +51,47 @@ def test_fit_l1_distance(self):
self.assertEqual(kND.best_configuration_per_dataset[232], 0)
self.assertEqual(kND.best_configuration_per_dataset[233], 1)
self.assertEqual(kND.best_configuration_per_dataset[234], 2)
- self.assertTrue((kND.metafeatures ==
- pd.DataFrame([self.anneal, self.krvskp, self.labor])).all().all())
+ self.assertTrue(
+ (kND.metafeatures == pd.DataFrame([self.anneal, self.krvskp, self.labor]))
+ .all()
+ .all()
+ )
# TODO: rename to kNearestTasks or something
def test_kNearestDatasets(self):
kND = KNearestDatasets(logger=self.logger)
- kND.fit(pd.DataFrame([self.krvskp, self.labor]),
- self.runs.loc[:, [233, 234]])
+ kND.fit(pd.DataFrame([self.krvskp, self.labor]), self.runs.loc[:, [233, 234]])
neighbor = kND.kNearestDatasets(self.anneal, 1)
self.assertEqual([233], neighbor)
- neighbor, distance = kND.kNearestDatasets(self.anneal, 1,
- return_distance=True)
+ neighbor, distance = kND.kNearestDatasets(self.anneal, 1, return_distance=True)
self.assertEqual([233], neighbor)
np.testing.assert_array_almost_equal([3.8320802803440586], distance)
neighbors = kND.kNearestDatasets(self.anneal, 2)
self.assertEqual([233, 234], neighbors)
- neighbors, distance = kND.kNearestDatasets(self.anneal, 2,
- return_distance=True)
+ neighbors, distance = kND.kNearestDatasets(self.anneal, 2, return_distance=True)
self.assertEqual([233, 234], neighbors)
- np.testing.assert_array_almost_equal([3.8320802803440586, 4.367919719655942], distance)
+ np.testing.assert_array_almost_equal(
+ [3.8320802803440586, 4.367919719655942], distance
+ )
neighbors = kND.kNearestDatasets(self.anneal, -1)
self.assertEqual([233, 234], neighbors)
- neighbors, distance = kND.kNearestDatasets(self.anneal, -1,
- return_distance=True)
+ neighbors, distance = kND.kNearestDatasets(
+ self.anneal, -1, return_distance=True
+ )
self.assertEqual([233, 234], neighbors)
- np.testing.assert_array_almost_equal([3.8320802803440586, 4.367919719655942], distance)
+ np.testing.assert_array_almost_equal(
+ [3.8320802803440586, 4.367919719655942], distance
+ )
self.assertRaises(ValueError, kND.kNearestDatasets, self.anneal, 0)
self.assertRaises(ValueError, kND.kNearestDatasets, self.anneal, -2)
def test_kBestSuggestions(self):
kND = KNearestDatasets(logger=self.logger)
- kND.fit(pd.DataFrame([self.krvskp, self.labor]),
- self.runs.loc[:, [233, 234]])
+ kND.fit(pd.DataFrame([self.krvskp, self.labor]), self.runs.loc[:, [233, 234]])
neighbor = kND.kBestSuggestions(self.anneal, 1)
np.testing.assert_array_almost_equal(
[(233, 3.8320802803440586, 1)],
@@ -87,10 +112,10 @@ def test_kBestSuggestions(self):
self.assertRaises(ValueError, kND.kBestSuggestions, self.anneal, -2)
def test_random_metric(self):
- kND = KNearestDatasets(logger=self.logger,
- metric=get_random_metric(random_state=1))
- kND.fit(pd.DataFrame([self.krvskp, self.labor]),
- self.runs.loc[:, [233, 234]])
+ kND = KNearestDatasets(
+ logger=self.logger, metric=get_random_metric(random_state=1)
+ )
+ kND.fit(pd.DataFrame([self.krvskp, self.labor]), self.runs.loc[:, [233, 234]])
distances = []
for i in range(20):
neighbor = kND.kBestSuggestions(self.anneal, 1)
diff --git a/test/test_metalearning/pyMetaLearn/test_meta_base.py b/test/test_metalearning/pyMetaLearn/test_meta_base.py
index b1ac39ee2a..1c6788e816 100644
--- a/test/test_metalearning/pyMetaLearn/test_meta_base.py
+++ b/test/test_metalearning/pyMetaLearn/test_meta_base.py
@@ -14,7 +14,7 @@ class MetaBaseTest(unittest.TestCase):
def setUp(self):
self.cwd = os.getcwd()
data_dir = os.path.dirname(__file__)
- data_dir = os.path.join(data_dir, 'test_meta_base_data')
+ data_dir = os.path.join(data_dir, "test_meta_base_data")
os.chdir(data_dir)
pipeline = autosklearn.pipeline.classification.SimpleClassificationPipeline()
@@ -33,33 +33,34 @@ def test_get_all_runs(self):
self.assertEqual((125, 125), runs.shape)
def test_get_runs(self):
- runs = self.base.get_runs('233')
+ runs = self.base.get_runs("233")
# TODO update this ASAP
self.assertEqual(125, len(runs))
self.assertIsInstance(runs, pd.Series)
def test_get_metafeatures_single_dataset(self):
- mf = self.base.get_metafeatures('233')
+ mf = self.base.get_metafeatures("233")
self.assertIsInstance(mf, pd.Series)
- self.assertEqual(mf.name, '233')
- self.assertEqual(mf.loc['NumberOfInstances'], 2142.0)
+ self.assertEqual(mf.name, "233")
+ self.assertEqual(mf.loc["NumberOfInstances"], 2142.0)
def test_get_metafeatures_single_feature(self):
- mf = self.base.get_metafeatures(features='NumberOfInstances')
+ mf = self.base.get_metafeatures(features="NumberOfInstances")
self.assertIsInstance(mf, pd.Series)
- self.assertEqual(mf.shape, (132, ))
+ self.assertEqual(mf.shape, (132,))
def test_get_metafeatures_single_dataset_and_single_feature(self):
- mf = self.base.get_metafeatures('233', features='NumberOfInstances')
+ mf = self.base.get_metafeatures("233", features="NumberOfInstances")
self.assertEqual(mf.shape, ())
def test_get_metafeatures_multiple_datasets(self):
- mf = self.base.get_metafeatures(['233', '236'])
+ mf = self.base.get_metafeatures(["233", "236"])
self.assertIsInstance(mf, pd.DataFrame)
self.assertEqual(mf.shape, (2, 46))
def test_get_metafeatures_multiple_features(self):
- mf = self.base.get_metafeatures(features=['NumberOfInstances',
- 'NumberOfClasses'])
+ mf = self.base.get_metafeatures(
+ features=["NumberOfInstances", "NumberOfClasses"]
+ )
self.assertIsInstance(mf, pd.DataFrame)
self.assertEqual(mf.shape, (132, 2))
diff --git a/test/test_metalearning/pyMetaLearn/test_meta_features.py b/test/test_metalearning/pyMetaLearn/test_meta_features.py
index d31f3d0227..6a9bec4dcf 100644
--- a/test/test_metalearning/pyMetaLearn/test_meta_features.py
+++ b/test/test_metalearning/pyMetaLearn/test_meta_features.py
@@ -3,25 +3,21 @@
import tempfile
import unittest
+import arff
+import numpy as np
import pandas as pd
-
import pytest
-
-import arff
from joblib import Memory
-import numpy as np
-from sklearn.datasets import make_multilabel_classification, fetch_openml
+from sklearn.datasets import fetch_openml, make_multilabel_classification
-from autosklearn.pipeline.components.data_preprocessing.feature_type \
- import FeatTypeSplit
-from autosklearn.metalearning.metafeatures.metafeature import MetaFeatureValue
import autosklearn.metalearning.metafeatures.metafeatures as meta_features
+from autosklearn.metalearning.metafeatures.metafeature import MetaFeatureValue
+from autosklearn.pipeline.components.data_preprocessing.feature_type import (
+ FeatTypeSplit,
+)
-@pytest.fixture(
- scope='class',
- params=('pandas', 'numpy')
-)
+@pytest.fixture(scope="class", params=("pandas", "numpy"))
def multilabel_train_data(request):
cache = Memory(location=tempfile.gettempdir())
cached_func = cache.cache(make_multilabel_classification)
@@ -31,20 +27,17 @@ def multilabel_train_data(request):
n_classes=5,
n_labels=5,
return_indicator=True,
- random_state=1
+ random_state=1,
)
- if request.param == 'numpy':
+ if request.param == "numpy":
return X, y
- elif request.param == 'pandas':
+ elif request.param == "pandas":
return pd.DataFrame(X), y
else:
raise ValueError(request.param)
-@pytest.fixture(
- scope='class',
- params=('pandas', 'numpy')
-)
+@pytest.fixture(scope="class", params=("pandas", "numpy"))
def meta_train_data(request):
tests_dir = __file__
os.chdir(os.path.dirname(tests_dir))
@@ -55,40 +48,41 @@ def meta_train_data(request):
# -1 because the last attribute is the class
attribute_types = [
- 'numeric' if type(type_) != list else 'nominal'
- for name, type_ in dataset['attributes'][:-1]]
+ "numeric" if type(type_) != list else "nominal"
+ for name, type_ in dataset["attributes"][:-1]
+ ]
- categorical = {i: True if attribute == 'nominal' else False
- for i, attribute in enumerate(attribute_types)}
+ categorical = {
+ i: True if attribute == "nominal" else False
+ for i, attribute in enumerate(attribute_types)
+ }
- data = np.array(dataset['data'], dtype=np.float64)
+ data = np.array(dataset["data"], dtype=np.float64)
X = data[:, :-1]
y = data[:, -1].reshape((-1,))
- logger = logging.getLogger('Meta')
+ logger = logging.getLogger("Meta")
meta_features.helper_functions.set_value(
- "MissingValues", meta_features.helper_functions["MissingValues"](X, y, logger, categorical),
- )
+ "MissingValues",
+ meta_features.helper_functions["MissingValues"](X, y, logger, categorical),
+ )
meta_features.helper_functions.set_value(
"NumSymbols",
- meta_features.helper_functions["NumSymbols"](X, y, logger, categorical),
+ meta_features.helper_functions["NumSymbols"](X, y, logger, categorical),
)
meta_features.helper_functions.set_value(
"ClassOccurences",
meta_features.helper_functions["ClassOccurences"](X, y, logger),
)
- if request.param == 'numpy':
+ if request.param == "numpy":
return X, y, categorical
- elif request.param == 'pandas':
+ elif request.param == "pandas":
return pd.DataFrame(X), y, categorical
else:
raise ValueError(request.param)
-@pytest.fixture(
- scope='class',
- params=('pandas', 'numpy')
-)
+@pytest.fixture(scope="class", params=("pandas", "numpy"))
def meta_train_data_transformed(request):
tests_dir = __file__
os.chdir(os.path.dirname(tests_dir))
@@ -99,53 +93,67 @@ def meta_train_data_transformed(request):
# -1 because the last attribute is the class
attribute_types = [
- 'numeric' if type(type_) != list else 'nominal'
- for name, type_ in dataset['attributes'][:-1]]
- categorical = {i: True if attribute == 'nominal' else False
- for i, attribute in enumerate(attribute_types)}
+ "numeric" if type(type_) != list else "nominal"
+ for name, type_ in dataset["attributes"][:-1]
+ ]
+ categorical = {
+ i: True if attribute == "nominal" else False
+ for i, attribute in enumerate(attribute_types)
+ }
- data = np.array(dataset['data'], dtype=np.float64)
+ data = np.array(dataset["data"], dtype=np.float64)
X = data[:, :-1]
y = data[:, -1].reshape((-1,))
- logger = logging.getLogger('Meta')
+ logger = logging.getLogger("Meta")
meta_features.helper_functions.set_value(
- "MissingValues", meta_features.helper_functions["MissingValues"](X, y, logger, categorical),
- )
+ "MissingValues",
+ meta_features.helper_functions["MissingValues"](X, y, logger, categorical),
+ )
meta_features.helper_functions.set_value(
"NumSymbols",
- meta_features.helper_functions["NumSymbols"](X, y, logger, categorical),
+ meta_features.helper_functions["NumSymbols"](X, y, logger, categorical),
)
meta_features.helper_functions.set_value(
"ClassOccurences",
meta_features.helper_functions["ClassOccurences"](X, y, logger),
)
- DPP = FeatTypeSplit(feat_type={
- col: 'categorical' if category else 'numerical' for col, category in categorical.items()
- })
+ DPP = FeatTypeSplit(
+ feat_type={
+ col: "categorical" if category else "numerical"
+ for col, category in categorical.items()
+ }
+ )
X_transformed = DPP.fit_transform(X)
number_numerical = np.sum(~np.array(list(categorical.values())))
- categorical_transformed = {i: True if i < (X_transformed.shape[1] - number_numerical) else False
- for i in range(X_transformed.shape[1])}
+ categorical_transformed = {
+ i: True if i < (X_transformed.shape[1] - number_numerical) else False
+ for i in range(X_transformed.shape[1])
+ }
# pre-compute values for transformed inputs
meta_features.helper_functions.set_value(
- "PCA", meta_features.helper_functions["PCA"](X_transformed, y, logger),
+ "PCA",
+ meta_features.helper_functions["PCA"](X_transformed, y, logger),
)
meta_features.helper_functions.set_value(
- "Skewnesses", meta_features.helper_functions["Skewnesses"](
- X_transformed, y, logger, categorical_transformed),
+ "Skewnesses",
+ meta_features.helper_functions["Skewnesses"](
+ X_transformed, y, logger, categorical_transformed
+ ),
)
meta_features.helper_functions.set_value(
- "Kurtosisses", meta_features.helper_functions["Kurtosisses"](
- X_transformed, y, logger, categorical_transformed)
+ "Kurtosisses",
+ meta_features.helper_functions["Kurtosisses"](
+ X_transformed, y, logger, categorical_transformed
+ ),
)
- if request.param == 'numpy':
+ if request.param == "numpy":
return X_transformed, y, categorical_transformed
- elif request.param == 'pandas':
+ elif request.param == "pandas":
return pd.DataFrame(X_transformed), y, categorical_transformed
else:
raise ValueError(request.param)
@@ -154,7 +162,8 @@ def meta_train_data_transformed(request):
def test_number_of_instance(meta_train_data):
X, y, categorical = meta_train_data
mf = meta_features.metafeatures["NumberOfInstances"](
- X, y, logging.getLogger('Meta'), categorical)
+ X, y, logging.getLogger("Meta"), categorical
+ )
assert mf.value == 898
assert isinstance(mf, MetaFeatureValue)
@@ -162,7 +171,8 @@ def test_number_of_instance(meta_train_data):
def test_number_of_classes(meta_train_data):
X, y, categorical = meta_train_data
mf = meta_features.metafeatures["NumberOfClasses"](
- X, y, logging.getLogger('Meta'), categorical)
+ X, y, logging.getLogger("Meta"), categorical
+ )
assert mf.value == 5
assert isinstance(mf, MetaFeatureValue)
@@ -170,7 +180,8 @@ def test_number_of_classes(meta_train_data):
def test_number_of_features(meta_train_data):
X, y, categorical = meta_train_data
mf = meta_features.metafeatures["NumberOfFeatures"](
- X, y, logging.getLogger('Meta'), categorical)
+ X, y, logging.getLogger("Meta"), categorical
+ )
assert mf.value == 38
assert isinstance(mf, MetaFeatureValue)
@@ -178,8 +189,9 @@ def test_number_of_features(meta_train_data):
def test_missing_values(meta_train_data):
X, y, categorical = meta_train_data
mf = meta_features.helper_functions["MissingValues"](
- X, y, logging.getLogger('Meta'), categorical)
- assert isinstance(mf.value, pd.DataFrame if hasattr(X, 'iloc') else np.ndarray)
+ X, y, logging.getLogger("Meta"), categorical
+ )
+ assert isinstance(mf.value, pd.DataFrame if hasattr(X, "iloc") else np.ndarray)
assert mf.value.shape == X.shape
assert 22175 == np.count_nonzero(mf.value)
@@ -187,7 +199,8 @@ def test_missing_values(meta_train_data):
def test_number_of_Instances_with_missing_values(meta_train_data):
X, y, categorical = meta_train_data
mf = meta_features.metafeatures["NumberOfInstancesWithMissingValues"](
- X, y, logging.getLogger('Meta'), categorical)
+ X, y, logging.getLogger("Meta"), categorical
+ )
assert mf.value == 898
assert isinstance(mf, MetaFeatureValue)
@@ -197,10 +210,12 @@ def test_percentage_of_Instances_with_missing_values(meta_train_data):
meta_features.metafeatures.set_value(
"NumberOfInstancesWithMissingValues",
meta_features.metafeatures["NumberOfInstancesWithMissingValues"](
- X, y, logging.getLogger('Meta'), categorical),
- )
+ X, y, logging.getLogger("Meta"), categorical
+ ),
+ )
mf = meta_features.metafeatures["PercentageOfInstancesWithMissingValues"](
- X, y, logging.getLogger('Meta'), categorical)
+ X, y, logging.getLogger("Meta"), categorical
+ )
assert pytest.approx(mf.value) == 1.0
assert isinstance(mf, MetaFeatureValue)
@@ -208,7 +223,8 @@ def test_percentage_of_Instances_with_missing_values(meta_train_data):
def test_number_of_features_with_missing_values(meta_train_data):
X, y, categorical = meta_train_data
mf = meta_features.metafeatures["NumberOfFeaturesWithMissingValues"](
- X, y, logging.getLogger('Meta'), categorical)
+ X, y, logging.getLogger("Meta"), categorical
+ )
assert mf.value == 29
assert isinstance(mf, MetaFeatureValue)
@@ -218,18 +234,22 @@ def test_percentage_of_features_with_missing_values(meta_train_data):
meta_features.metafeatures.set_value(
"NumberOfFeaturesWithMissingValues",
meta_features.metafeatures["NumberOfFeaturesWithMissingValues"](
- X, y, logging.getLogger('Meta'), categorical))
+ X, y, logging.getLogger("Meta"), categorical
+ ),
+ )
mf = meta_features.metafeatures["PercentageOfFeaturesWithMissingValues"](
- X, y, logging.getLogger('Meta'), categorical)
- assert pytest.approx(mf.value) == float(29)/float(38)
+ X, y, logging.getLogger("Meta"), categorical
+ )
+ assert pytest.approx(mf.value) == float(29) / float(38)
assert isinstance(mf, MetaFeatureValue)
def test_number_of_missing_values(meta_train_data):
X, y, categorical = meta_train_data
- np.save('/tmp/debug', X)
+ np.save("/tmp/debug", X)
mf = meta_features.metafeatures["NumberOfMissingValues"](
- X, y, logging.getLogger('Meta'), categorical)
+ X, y, logging.getLogger("Meta"), categorical
+ )
assert mf.value == 22175
assert isinstance(mf, MetaFeatureValue)
@@ -237,18 +257,23 @@ def test_number_of_missing_values(meta_train_data):
def test_percentage_missing_values(meta_train_data):
X, y, categorical = meta_train_data
meta_features.metafeatures.set_value(
- "NumberOfMissingValues", meta_features.metafeatures["NumberOfMissingValues"](
- X, y, logging.getLogger('Meta'), categorical))
+ "NumberOfMissingValues",
+ meta_features.metafeatures["NumberOfMissingValues"](
+ X, y, logging.getLogger("Meta"), categorical
+ ),
+ )
mf = meta_features.metafeatures["PercentageOfMissingValues"](
- X, y, logging.getLogger('Meta'), categorical)
- assert pytest.approx(mf.value) == (float(22175)/float(38*898))
+ X, y, logging.getLogger("Meta"), categorical
+ )
+ assert pytest.approx(mf.value) == (float(22175) / float(38 * 898))
assert isinstance(mf, MetaFeatureValue)
def test_number_of_numeric_features(meta_train_data):
X, y, categorical = meta_train_data
mf = meta_features.metafeatures["NumberOfNumericFeatures"](
- X, y, logging.getLogger('Meta'), categorical)
+ X, y, logging.getLogger("Meta"), categorical
+ )
assert mf.value == 6
assert isinstance(mf, MetaFeatureValue)
@@ -256,7 +281,8 @@ def test_number_of_numeric_features(meta_train_data):
def test_number_of_categorical_features(meta_train_data):
X, y, categorical = meta_train_data
mf = meta_features.metafeatures["NumberOfCategoricalFeatures"](
- X, y, logging.getLogger('Meta'), categorical)
+ X, y, logging.getLogger("Meta"), categorical
+ )
assert mf.value == 32
assert isinstance(mf, MetaFeatureValue)
@@ -264,62 +290,70 @@ def test_number_of_categorical_features(meta_train_data):
def test_ratio_numerical_to_categorical(meta_train_data):
X, y, categorical = meta_train_data
mf = meta_features.metafeatures["RatioNumericalToNominal"](
- X, y, logging.getLogger('Meta'), categorical)
- assert pytest.approx(mf.value) == (float(6)/float(32))
+ X, y, logging.getLogger("Meta"), categorical
+ )
+ assert pytest.approx(mf.value) == (float(6) / float(32))
assert isinstance(mf, MetaFeatureValue)
def test_ratio_categorical_to_numerical(meta_train_data):
X, y, categorical = meta_train_data
mf = meta_features.metafeatures["RatioNominalToNumerical"](
- X, y, logging.getLogger('Meta'), categorical)
- assert pytest.approx(mf.value) == (float(32)/float(6))
+ X, y, logging.getLogger("Meta"), categorical
+ )
+ assert pytest.approx(mf.value) == (float(32) / float(6))
assert isinstance(mf, MetaFeatureValue)
def test_dataset_ratio(meta_train_data):
X, y, categorical = meta_train_data
mf = meta_features.metafeatures["DatasetRatio"](
- X, y, logging.getLogger('Meta'), categorical)
- assert pytest.approx(mf.value) == (float(38)/float(898))
+ X, y, logging.getLogger("Meta"), categorical
+ )
+ assert pytest.approx(mf.value) == (float(38) / float(898))
assert isinstance(mf, MetaFeatureValue)
def test_inverse_dataset_ratio(meta_train_data):
X, y, categorical = meta_train_data
mf = meta_features.metafeatures["InverseDatasetRatio"](
- X, y, logging.getLogger('Meta'), categorical)
- assert pytest.approx(mf.value) == (float(898)/float(38))
+ X, y, logging.getLogger("Meta"), categorical
+ )
+ assert pytest.approx(mf.value) == (float(898) / float(38))
assert isinstance(mf, MetaFeatureValue)
def test_class_occurences(meta_train_data):
X, y, categorical = meta_train_data
mf = meta_features.helper_functions["ClassOccurences"](
- X, y, logging.getLogger('Meta'), categorical)
+ X, y, logging.getLogger("Meta"), categorical
+ )
assert mf.value == {0.0: 8.0, 1.0: 99.0, 2.0: 684.0, 4.0: 67.0, 5.0: 40.0}
def test_class_probability_min(meta_train_data):
X, y, categorical = meta_train_data
mf = meta_features.metafeatures["ClassProbabilityMin"](
- X, y, logging.getLogger('Meta'), categorical)
- assert pytest.approx(mf.value) == (float(8)/float(898))
+ X, y, logging.getLogger("Meta"), categorical
+ )
+ assert pytest.approx(mf.value) == (float(8) / float(898))
assert isinstance(mf, MetaFeatureValue)
def test_class_probability_max(meta_train_data):
X, y, categorical = meta_train_data
mf = meta_features.metafeatures["ClassProbabilityMax"](
- X, y, logging.getLogger('Meta'), categorical)
- assert pytest.approx(mf.value) == (float(684)/float(898))
+ X, y, logging.getLogger("Meta"), categorical
+ )
+ assert pytest.approx(mf.value) == (float(684) / float(898))
assert isinstance(mf, MetaFeatureValue)
def test_class_probability_mean(meta_train_data):
X, y, categorical = meta_train_data
mf = meta_features.metafeatures["ClassProbabilityMean"](
- X, y, logging.getLogger('Meta'), categorical)
+ X, y, logging.getLogger("Meta"), categorical
+ )
classes = np.array((8, 99, 684, 67, 40), dtype=np.float64)
prob_mean = (classes / float(898)).mean()
assert pytest.approx(mf.value) == prob_mean
@@ -329,7 +363,8 @@ def test_class_probability_mean(meta_train_data):
def test_class_probability_std(meta_train_data):
X, y, categorical = meta_train_data
mf = meta_features.metafeatures["ClassProbabilitySTD"](
- X, y, logging.getLogger('Meta'), categorical)
+ X, y, logging.getLogger("Meta"), categorical
+ )
classes = np.array((8, 99, 684, 67, 40), dtype=np.float64)
prob_std = (classes / float(898)).std()
assert pytest.approx(mf.value) == prob_std
@@ -339,53 +374,148 @@ def test_class_probability_std(meta_train_data):
def test_num_symbols(meta_train_data):
X, y, categorical = meta_train_data
mf = meta_features.helper_functions["NumSymbols"](
- X, y, logging.getLogger('Meta'), categorical)
- symbol_frequency = [2, 1, 7, 1, 2, 4, 1, 1, 4, 2, 1, 1, 1, 2, 1, 0,
- 1, 1, 1, 0, 1, 1, 0, 3, 1, 0, 0, 0, 2, 2, 3, 2]
+ X, y, logging.getLogger("Meta"), categorical
+ )
+ symbol_frequency = [
+ 2,
+ 1,
+ 7,
+ 1,
+ 2,
+ 4,
+ 1,
+ 1,
+ 4,
+ 2,
+ 1,
+ 1,
+ 1,
+ 2,
+ 1,
+ 0,
+ 1,
+ 1,
+ 1,
+ 0,
+ 1,
+ 1,
+ 0,
+ 3,
+ 1,
+ 0,
+ 0,
+ 0,
+ 2,
+ 2,
+ 3,
+ 2,
+ ]
assert mf.value == symbol_frequency
def test_symbols_min(meta_train_data):
X, y, categorical = meta_train_data
- mf = meta_features.metafeatures["SymbolsMin"](X, y, logging.getLogger('Meta'), categorical)
+ mf = meta_features.metafeatures["SymbolsMin"](
+ X, y, logging.getLogger("Meta"), categorical
+ )
assert mf.value == 1
def test_symbols_max(meta_train_data):
X, y, categorical = meta_train_data
# this is attribute steel
- mf = meta_features.metafeatures["SymbolsMax"](X, y, logging.getLogger('Meta'), categorical)
+ mf = meta_features.metafeatures["SymbolsMax"](
+ X, y, logging.getLogger("Meta"), categorical
+ )
assert mf.value == 7
def test_symbols_mean(meta_train_data):
X, y, categorical = meta_train_data
mf = meta_features.metafeatures["SymbolsMean"](
- X, y, logging.getLogger('Meta'), categorical)
+ X, y, logging.getLogger("Meta"), categorical
+ )
# Empty looking spaces denote empty attributes
- symbol_frequency = [2, 1, 7, 1, 2, 4, 1, 1, 4, 2, 1, 1, 1, 2, 1, #
- 1, 1, 1, 1, 1, 3, 1, 2, 2, 3, 2]
+ symbol_frequency = [
+ 2,
+ 1,
+ 7,
+ 1,
+ 2,
+ 4,
+ 1,
+ 1,
+ 4,
+ 2,
+ 1,
+ 1,
+ 1,
+ 2,
+ 1, #
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 3,
+ 1,
+ 2,
+ 2,
+ 3,
+ 2,
+ ]
assert pytest.approx(mf.value) == np.mean(symbol_frequency)
def test_symbols_std(meta_train_data):
X, y, categorical = meta_train_data
- mf = meta_features.metafeatures["SymbolsSTD"](X, y, logging.getLogger('Meta'), categorical)
- symbol_frequency = [2, 1, 7, 1, 2, 4, 1, 1, 4, 2, 1, 1, 1, 2, 1, #
- 1, 1, 1, 1, 1, 3, 1, 2, 2, 3, 2]
+ mf = meta_features.metafeatures["SymbolsSTD"](
+ X, y, logging.getLogger("Meta"), categorical
+ )
+ symbol_frequency = [
+ 2,
+ 1,
+ 7,
+ 1,
+ 2,
+ 4,
+ 1,
+ 1,
+ 4,
+ 2,
+ 1,
+ 1,
+ 1,
+ 2,
+ 1, #
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 3,
+ 1,
+ 2,
+ 2,
+ 3,
+ 2,
+ ]
assert pytest.approx(mf.value) == np.std(symbol_frequency)
def test_symbols_sum(meta_train_data):
X, y, categorical = meta_train_data
- mf = meta_features.metafeatures["SymbolsSum"](X, y, logging.getLogger('Meta'), categorical)
+ mf = meta_features.metafeatures["SymbolsSum"](
+ X, y, logging.getLogger("Meta"), categorical
+ )
assert mf.value == 49
def test_class_entropy(meta_train_data):
X, y, categorical = meta_train_data
mf = meta_features.metafeatures["ClassEntropy"](
- X, y, logging.getLogger('Meta'), categorical)
+ X, y, logging.getLogger("Meta"), categorical
+ )
classes = np.array((8, 99, 684, 67, 40), dtype=np.float64)
classes = classes / sum(classes)
entropy = -np.sum([c * np.log2(c) for c in classes])
@@ -396,15 +526,17 @@ def test_class_entropy(meta_train_data):
def test_calculate_all_metafeatures(meta_train_data):
X, y, categorical = meta_train_data
mf = meta_features.calculate_all_metafeatures(
- X, y, categorical, "2", logger=logging.getLogger('Meta'))
+ X, y, categorical, "2", logger=logging.getLogger("Meta")
+ )
assert 52 == len(mf.metafeature_values)
- assert mf.metafeature_values['NumberOfCategoricalFeatures'].value == 32
+ assert mf.metafeature_values["NumberOfCategoricalFeatures"].value == 32
def test_kurtosisses(meta_train_data_transformed):
X_transformed, y, categorical_transformed = meta_train_data_transformed
mf = meta_features.helper_functions["Kurtosisses"](
- X_transformed, y, logging.getLogger('Meta'), categorical_transformed)
+ X_transformed, y, logging.getLogger("Meta"), categorical_transformed
+ )
assert 6 == len(mf.value)
@@ -412,34 +544,39 @@ def test_kurtosis_min(meta_train_data_transformed):
X_transformed, y, categorical_transformed = meta_train_data_transformed
# TODO: somehow compute the expected output?
meta_features.metafeatures["KurtosisMin"](
- X_transformed, y, logging.getLogger('Meta'), categorical_transformed)
+ X_transformed, y, logging.getLogger("Meta"), categorical_transformed
+ )
def test_kurtosis_max(meta_train_data_transformed):
X_transformed, y, categorical_transformed = meta_train_data_transformed
# TODO: somehow compute the expected output?
meta_features.metafeatures["KurtosisMax"](
- X_transformed, y, logging.getLogger('Meta'), categorical_transformed)
+ X_transformed, y, logging.getLogger("Meta"), categorical_transformed
+ )
def test_kurtosis_mean(meta_train_data_transformed):
X_transformed, y, categorical_transformed = meta_train_data_transformed
# TODO: somehow compute the expected output?
meta_features.metafeatures["KurtosisMean"](
- X_transformed, y, logging.getLogger('Meta'), categorical_transformed)
+ X_transformed, y, logging.getLogger("Meta"), categorical_transformed
+ )
def test_kurtosis_std(meta_train_data_transformed):
X_transformed, y, categorical_transformed = meta_train_data_transformed
# TODO: somehow compute the expected output?
meta_features.metafeatures["KurtosisSTD"](
- X_transformed, y, logging.getLogger('Meta'), categorical_transformed)
+ X_transformed, y, logging.getLogger("Meta"), categorical_transformed
+ )
def test_skewnesses(meta_train_data_transformed):
X_transformed, y, categorical_transformed = meta_train_data_transformed
mf = meta_features.helper_functions["Skewnesses"](
- X_transformed, y, logging.getLogger('Meta'), categorical_transformed)
+ X_transformed, y, logging.getLogger("Meta"), categorical_transformed
+ )
assert 6 == len(mf.value)
@@ -447,62 +584,72 @@ def test_skewness_min(meta_train_data_transformed):
X_transformed, y, categorical_transformed = meta_train_data_transformed
# TODO: somehow compute the expected output?
meta_features.metafeatures["SkewnessMin"](
- X_transformed, y, logging.getLogger('Meta'), categorical_transformed)
+ X_transformed, y, logging.getLogger("Meta"), categorical_transformed
+ )
def test_skewness_max(meta_train_data_transformed):
X_transformed, y, categorical_transformed = meta_train_data_transformed
# TODO: somehow compute the expected output?
meta_features.metafeatures["SkewnessMax"](
- X_transformed, y, logging.getLogger('Meta'), categorical_transformed)
+ X_transformed, y, logging.getLogger("Meta"), categorical_transformed
+ )
def test_skewness_mean(meta_train_data_transformed):
X_transformed, y, categorical_transformed = meta_train_data_transformed
# TODO: somehow compute the expected output?
meta_features.metafeatures["SkewnessMean"](
- X_transformed, y, logging.getLogger('Meta'), categorical_transformed)
+ X_transformed, y, logging.getLogger("Meta"), categorical_transformed
+ )
def test_skewness_std(meta_train_data_transformed):
X_transformed, y, categorical_transformed = meta_train_data_transformed
# TODO: somehow compute the expected output?
meta_features.metafeatures["SkewnessSTD"](
- X_transformed, y, logging.getLogger('Meta'), categorical_transformed)
+ X_transformed, y, logging.getLogger("Meta"), categorical_transformed
+ )
def test_landmark_lda(meta_train_data_transformed):
X_transformed, y, categorical_transformed = meta_train_data_transformed
# TODO: somehow compute the expected output?
- meta_features.metafeatures["LandmarkLDA"](X_transformed, y, logging.getLogger('Meta'))
+ meta_features.metafeatures["LandmarkLDA"](
+ X_transformed, y, logging.getLogger("Meta")
+ )
def test_landmark_naive_bayes(meta_train_data_transformed):
X_transformed, y, categorical_transformed = meta_train_data_transformed
# TODO: somehow compute the expected output?
meta_features.metafeatures["LandmarkNaiveBayes"](
- X_transformed, y, logging.getLogger('Meta'))
+ X_transformed, y, logging.getLogger("Meta")
+ )
def test_landmark_decision_tree(meta_train_data_transformed):
X_transformed, y, categorical_transformed = meta_train_data_transformed
# TODO: somehow compute the expected output?
meta_features.metafeatures["LandmarkDecisionTree"](
- X_transformed, y, logging.getLogger('Meta'))
+ X_transformed, y, logging.getLogger("Meta")
+ )
def test_decision_node(meta_train_data_transformed):
X_transformed, y, categorical_transformed = meta_train_data_transformed
# TODO: somehow compute the expected output?
meta_features.metafeatures["LandmarkDecisionNodeLearner"](
- X_transformed, y, logging.getLogger('Meta'))
+ X_transformed, y, logging.getLogger("Meta")
+ )
def test_random_node(meta_train_data_transformed):
X_transformed, y, categorical_transformed = meta_train_data_transformed
# TODO: somehow compute the expected output?
meta_features.metafeatures["LandmarkRandomNodeLearner"](
- X_transformed, y, logging.getLogger('Meta'))
+ X_transformed, y, logging.getLogger("Meta")
+ )
@unittest.skip("Currently not implemented!")
@@ -510,57 +657,72 @@ def test_worst_node(meta_train_data_transformed):
X_transformed, y, categorical_transformed = meta_train_data_transformed
# TODO: somehow compute the expected output?
meta_features.metafeatures["LandmarkWorstNodeLearner"](
- X_transformed, y, logging.getLogger('Meta'))
+ X_transformed, y, logging.getLogger("Meta")
+ )
def test_1NN(meta_train_data_transformed):
X_transformed, y, categorical_transformed = meta_train_data_transformed
# TODO: somehow compute the expected output?
- meta_features.metafeatures["Landmark1NN"](X_transformed, y, logging.getLogger('Meta'))
+ meta_features.metafeatures["Landmark1NN"](
+ X_transformed, y, logging.getLogger("Meta")
+ )
def test_pca(meta_train_data_transformed):
X_transformed, y, categorical_transformed = meta_train_data_transformed
- meta_features.helper_functions["PCA"](X_transformed, y, logging.getLogger('Meta'))
+ meta_features.helper_functions["PCA"](X_transformed, y, logging.getLogger("Meta"))
def test_pca_95percent(meta_train_data_transformed):
X_transformed, y, categorical_transformed = meta_train_data_transformed
mf = meta_features.metafeatures["PCAFractionOfComponentsFor95PercentVariance"](
- X_transformed, y, logging.getLogger('Meta'))
+ X_transformed, y, logging.getLogger("Meta")
+ )
assert pytest.approx(0.2716049382716049) == mf.value
def test_pca_kurtosis_first_pc(meta_train_data_transformed):
X_transformed, y, categorical_transformed = meta_train_data_transformed
mf = meta_features.metafeatures["PCAKurtosisFirstPC"](
- X_transformed, y, logging.getLogger('Meta'))
+ X_transformed, y, logging.getLogger("Meta")
+ )
assert pytest.approx(-0.702850) != mf.value
def test_pca_skewness_first_pc(meta_train_data_transformed):
X_transformed, y, categorical_transformed = meta_train_data_transformed
mf = meta_features.metafeatures["PCASkewnessFirstPC"](
- X_transformed, y, logging.getLogger('Meta'))
+ X_transformed, y, logging.getLogger("Meta")
+ )
assert pytest.approx(0.051210) != mf.value
def test_class_occurences_multilabel(multilabel_train_data):
X, y = multilabel_train_data
- mf = meta_features.helper_functions["ClassOccurences"](X, y, logging.getLogger('Meta'))
- assert mf.value == [{0: 16.0, 1: 84.0},
- {0: 8.0, 1: 92.0},
- {0: 68.0, 1: 32.0},
- {0: 15.0, 1: 85.0},
- {0: 28.0, 1: 72.0}]
+ mf = meta_features.helper_functions["ClassOccurences"](
+ X, y, logging.getLogger("Meta")
+ )
+ assert mf.value == [
+ {0: 16.0, 1: 84.0},
+ {0: 8.0, 1: 92.0},
+ {0: 68.0, 1: 32.0},
+ {0: 15.0, 1: 85.0},
+ {0: 28.0, 1: 72.0},
+ ]
def test_class_probability_min_multilabel(multilabel_train_data):
X, y = multilabel_train_data
meta_features.helper_functions.set_value(
- "ClassOccurences", meta_features.helper_functions["ClassOccurences"](
- X, y, logging.getLogger('Meta')))
- mf = meta_features.metafeatures["ClassProbabilityMin"](X, y, logging.getLogger('Meta'))
+ "ClassOccurences",
+ meta_features.helper_functions["ClassOccurences"](
+ X, y, logging.getLogger("Meta")
+ ),
+ )
+ mf = meta_features.metafeatures["ClassProbabilityMin"](
+ X, y, logging.getLogger("Meta")
+ )
assert pytest.approx(mf.value) == (float(8) / float(100))
assert isinstance(mf, MetaFeatureValue)
@@ -568,9 +730,14 @@ def test_class_probability_min_multilabel(multilabel_train_data):
def test_class_probability_max_multilabel(multilabel_train_data):
X, y = multilabel_train_data
meta_features.helper_functions.set_value(
- "ClassOccurences", meta_features.helper_functions["ClassOccurences"](
- X, y, logging.getLogger('Meta')))
- mf = meta_features.metafeatures["ClassProbabilityMax"](X, y, logging.getLogger('Meta'))
+ "ClassOccurences",
+ meta_features.helper_functions["ClassOccurences"](
+ X, y, logging.getLogger("Meta")
+ ),
+ )
+ mf = meta_features.metafeatures["ClassProbabilityMax"](
+ X, y, logging.getLogger("Meta")
+ )
assert pytest.approx(mf.value) == (float(92) / float(100))
assert isinstance(mf, MetaFeatureValue)
@@ -578,9 +745,14 @@ def test_class_probability_max_multilabel(multilabel_train_data):
def test_class_probability_mean_multilabel(multilabel_train_data):
X, y = multilabel_train_data
meta_features.helper_functions.set_value(
- "ClassOccurences", meta_features.helper_functions["ClassOccurences"](
- X, y, logging.getLogger('Meta')))
- mf = meta_features.metafeatures["ClassProbabilityMean"](X, y, logging.getLogger('Meta'))
+ "ClassOccurences",
+ meta_features.helper_functions["ClassOccurences"](
+ X, y, logging.getLogger("Meta")
+ ),
+ )
+ mf = meta_features.metafeatures["ClassProbabilityMean"](
+ X, y, logging.getLogger("Meta")
+ )
classes = [(16, 84), (8, 92), (68, 32), (15, 85), (28, 72)]
probas = np.mean([np.mean(np.array(cls_)) / 100 for cls_ in classes])
assert mf.value == pytest.approx(probas)
@@ -589,7 +761,7 @@ def test_class_probability_mean_multilabel(multilabel_train_data):
def test_number_of_classes_multilabel(multilabel_train_data):
X, y = multilabel_train_data
- mf = meta_features.metafeatures["NumberOfClasses"](X, y, logging.getLogger('Meta'))
+ mf = meta_features.metafeatures["NumberOfClasses"](X, y, logging.getLogger("Meta"))
assert mf.value == 5
assert isinstance(mf, MetaFeatureValue)
@@ -597,18 +769,23 @@ def test_number_of_classes_multilabel(multilabel_train_data):
def test_class_probability_std_multilabel(multilabel_train_data):
X, y = multilabel_train_data
meta_features.helper_functions.set_value(
- "ClassOccurences", meta_features.helper_functions["ClassOccurences"](
- X, y, logging.getLogger('Meta')))
- mf = meta_features.metafeatures["ClassProbabilitySTD"](X, y, logging.getLogger('Meta'))
+ "ClassOccurences",
+ meta_features.helper_functions["ClassOccurences"](
+ X, y, logging.getLogger("Meta")
+ ),
+ )
+ mf = meta_features.metafeatures["ClassProbabilitySTD"](
+ X, y, logging.getLogger("Meta")
+ )
classes = [(16, 84), (8, 92), (68, 32), (15, 85), (28, 72)]
- probas = np.mean([np.std(np.array(cls_) / 100.) for cls_ in classes])
+ probas = np.mean([np.std(np.array(cls_) / 100.0) for cls_ in classes])
assert pytest.approx(mf.value) == probas
assert isinstance(mf, MetaFeatureValue)
def test_class_entropy_multilabel(multilabel_train_data):
X, y = multilabel_train_data
- mf = meta_features.metafeatures["ClassEntropy"](X, y, logging.getLogger('Meta'))
+ mf = meta_features.metafeatures["ClassEntropy"](X, y, logging.getLogger("Meta"))
classes = [(16, 84), (8, 92), (68, 32), (15, 85), (28, 72)]
entropies = []
@@ -623,39 +800,45 @@ def test_class_entropy_multilabel(multilabel_train_data):
def test_landmark_lda_multilabel(multilabel_train_data):
X, y = multilabel_train_data
- mf = meta_features.metafeatures["LandmarkLDA"](X, y, logging.getLogger('Meta'))
+ mf = meta_features.metafeatures["LandmarkLDA"](X, y, logging.getLogger("Meta"))
assert np.isfinite(mf.value)
def test_landmark_naive_bayes_multilabel(multilabel_train_data):
X, y = multilabel_train_data
- mf = meta_features.metafeatures["LandmarkNaiveBayes"](X, y, logging.getLogger('Meta'))
+ mf = meta_features.metafeatures["LandmarkNaiveBayes"](
+ X, y, logging.getLogger("Meta")
+ )
assert np.isfinite(mf.value)
def test_landmark_decision_tree_multilabel(multilabel_train_data):
X, y = multilabel_train_data
- mf = meta_features.metafeatures["LandmarkDecisionTree"](X, y, logging.getLogger('Meta'))
+ mf = meta_features.metafeatures["LandmarkDecisionTree"](
+ X, y, logging.getLogger("Meta")
+ )
assert np.isfinite(mf.value)
def test_landmark_decision_node_multilabel(multilabel_train_data):
X, y = multilabel_train_data
mf = meta_features.metafeatures["LandmarkDecisionNodeLearner"](
- X, y, logging.getLogger('Meta'))
+ X, y, logging.getLogger("Meta")
+ )
assert np.isfinite(mf.value)
def test_landmark_random_node_multilabel(multilabel_train_data):
X, y = multilabel_train_data
mf = meta_features.metafeatures["LandmarkRandomNodeLearner"](
- X, y, logging.getLogger('Meta'))
+ X, y, logging.getLogger("Meta")
+ )
assert np.isfinite(mf.value)
def test_1NN_multilabel(multilabel_train_data):
X, y = multilabel_train_data
- mf = meta_features.metafeatures["Landmark1NN"](X, y, logging.getLogger('TestMeta'))
+ mf = meta_features.metafeatures["Landmark1NN"](X, y, logging.getLogger("TestMeta"))
assert np.isfinite(mf.value)
@@ -664,7 +847,8 @@ def test_calculate_all_metafeatures_multilabel(multilabel_train_data):
X, y = multilabel_train_data
categorical = {i: False for i in range(10)}
mf = meta_features.calculate_all_metafeatures(
- X, y, categorical, "Generated", logger=logging.getLogger('TestMeta'))
+ X, y, categorical, "Generated", logger=logging.getLogger("TestMeta")
+ )
assert 52 == len(mf.metafeature_values)
@@ -675,77 +859,84 @@ def test_calculate_all_metafeatures_same_results_across_datatypes():
all metafeatures work in this complex dataset
"""
X, y = fetch_openml(data_id=2, return_X_y=True, as_frame=True)
- categorical = {col: True if X[col].dtype.name == 'category' else False
- for col in X.columns}
+ categorical = {
+ col: True if X[col].dtype.name == "category" else False for col in X.columns
+ }
mf = meta_features.calculate_all_metafeatures(
- X, y, categorical, "2", logger=logging.getLogger('Meta'))
+ X, y, categorical, "2", logger=logging.getLogger("Meta")
+ )
assert 52 == len(mf.metafeature_values)
expected = {
- 'PCASkewnessFirstPC': 0.41897660337677867,
- 'PCAKurtosisFirstPC': -0.677692541156901,
- 'PCAFractionOfComponentsFor95PercentVariance': 0.2716049382716049,
- 'ClassEntropy': 1.1898338562043977,
- 'SkewnessSTD': 7.540418815675546,
- 'SkewnessMean': 1.47397188548894,
- 'SkewnessMax': 29.916569235579203,
- 'SkewnessMin': -29.916569235579203,
- 'KurtosisSTD': 153.0563504598898,
- 'KurtosisMean': 56.998860939761165,
- 'KurtosisMax': 893.0011148272025,
- 'KurtosisMin': -3.0,
- 'SymbolsSum': 49,
- 'SymbolsSTD': 1.3679553264445183,
- 'SymbolsMean': 1.8846153846153846,
- 'SymbolsMax': 7,
- 'SymbolsMin': 1,
- 'ClassProbabilitySTD': 0.28282850691819206,
- 'ClassProbabilityMean': 0.2,
- 'ClassProbabilityMax': 0.7616926503340757,
- 'ClassProbabilityMin': 0.008908685968819599,
- 'InverseDatasetRatio': 23.63157894736842,
- 'DatasetRatio': 0.042316258351893093,
- 'RatioNominalToNumerical': 5.333333333333333,
- 'RatioNumericalToNominal': 0.1875,
- 'NumberOfCategoricalFeatures': 32,
- 'NumberOfNumericFeatures': 6,
- 'NumberOfMissingValues': 22175.0,
- 'NumberOfFeaturesWithMissingValues': 29.0,
- 'NumberOfInstancesWithMissingValues': 898.0,
- 'NumberOfFeatures': 38.0,
- 'NumberOfClasses': 5.0,
- 'NumberOfInstances': 898.0,
- 'LogInverseDatasetRatio': 3.162583908575814,
- 'LogDatasetRatio': -3.162583908575814,
- 'PercentageOfMissingValues': 0.6498358926268901,
- 'PercentageOfFeaturesWithMissingValues': 0.7631578947368421,
- 'PercentageOfInstancesWithMissingValues': 1.0,
- 'LogNumberOfFeatures': 3.6375861597263857,
- 'LogNumberOfInstances': 6.8001700683022,
+ "PCASkewnessFirstPC": 0.41897660337677867,
+ "PCAKurtosisFirstPC": -0.677692541156901,
+ "PCAFractionOfComponentsFor95PercentVariance": 0.2716049382716049,
+ "ClassEntropy": 1.1898338562043977,
+ "SkewnessSTD": 7.540418815675546,
+ "SkewnessMean": 1.47397188548894,
+ "SkewnessMax": 29.916569235579203,
+ "SkewnessMin": -29.916569235579203,
+ "KurtosisSTD": 153.0563504598898,
+ "KurtosisMean": 56.998860939761165,
+ "KurtosisMax": 893.0011148272025,
+ "KurtosisMin": -3.0,
+ "SymbolsSum": 49,
+ "SymbolsSTD": 1.3679553264445183,
+ "SymbolsMean": 1.8846153846153846,
+ "SymbolsMax": 7,
+ "SymbolsMin": 1,
+ "ClassProbabilitySTD": 0.28282850691819206,
+ "ClassProbabilityMean": 0.2,
+ "ClassProbabilityMax": 0.7616926503340757,
+ "ClassProbabilityMin": 0.008908685968819599,
+ "InverseDatasetRatio": 23.63157894736842,
+ "DatasetRatio": 0.042316258351893093,
+ "RatioNominalToNumerical": 5.333333333333333,
+ "RatioNumericalToNominal": 0.1875,
+ "NumberOfCategoricalFeatures": 32,
+ "NumberOfNumericFeatures": 6,
+ "NumberOfMissingValues": 22175.0,
+ "NumberOfFeaturesWithMissingValues": 29.0,
+ "NumberOfInstancesWithMissingValues": 898.0,
+ "NumberOfFeatures": 38.0,
+ "NumberOfClasses": 5.0,
+ "NumberOfInstances": 898.0,
+ "LogInverseDatasetRatio": 3.162583908575814,
+ "LogDatasetRatio": -3.162583908575814,
+ "PercentageOfMissingValues": 0.6498358926268901,
+ "PercentageOfFeaturesWithMissingValues": 0.7631578947368421,
+ "PercentageOfInstancesWithMissingValues": 1.0,
+ "LogNumberOfFeatures": 3.6375861597263857,
+ "LogNumberOfInstances": 6.8001700683022,
}
assert {k: mf[k].value for k in expected.keys()} == pytest.approx(expected)
expected_landmarks = {
- 'Landmark1NN': 0.9721601489757914,
- 'LandmarkRandomNodeLearner': 0.7616945996275606,
- 'LandmarkDecisionNodeLearner': 0.7827932960893855,
- 'LandmarkDecisionTree': 0.9899875853507139,
- 'LandmarkNaiveBayes': 0.9287150837988827,
- 'LandmarkLDA': 0.9610242085661079,
+ "Landmark1NN": 0.9721601489757914,
+ "LandmarkRandomNodeLearner": 0.7616945996275606,
+ "LandmarkDecisionNodeLearner": 0.7827932960893855,
+ "LandmarkDecisionTree": 0.9899875853507139,
+ "LandmarkNaiveBayes": 0.9287150837988827,
+ "LandmarkLDA": 0.9610242085661079,
}
assert {k: mf[k].value for k in expected_landmarks.keys()} == pytest.approx(
- expected_landmarks, rel=1e-5)
+ expected_landmarks, rel=1e-5
+ )
# Then do numpy!
X, y = fetch_openml(data_id=2, return_X_y=True, as_frame=False)
- categorical = {i: True if category else False
- for i, category in enumerate(categorical.values())}
+ categorical = {
+ i: True if category else False
+ for i, category in enumerate(categorical.values())
+ }
mf = meta_features.calculate_all_metafeatures(
- X, y, categorical, "2", logger=logging.getLogger('Meta'))
+ X, y, categorical, "2", logger=logging.getLogger("Meta")
+ )
assert {k: mf[k].value for k in expected.keys()} == pytest.approx(expected)
    # The column order produced by the pandas and numpy code paths differs
    # after data preprocessing, so the values cannot be compared directly,
    # and landmarking is sensitive to column order
- expected_landmarks['LandmarkDecisionTree'] = 0.9922098075729361
+ expected_landmarks["LandmarkDecisionTree"] = 0.9922098075729361
assert {k: mf[k].value for k in expected_landmarks.keys()} == pytest.approx(
- expected_landmarks, rel=1e-5)
+ expected_landmarks, rel=1e-5
+ )
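The dictionary assertions in the test above rely on pytest.approx accepting a whole mapping of expected floats; a minimal, self-contained sketch of that comparison idiom, with made-up metafeature values:

```python
import pytest

# pytest.approx can wrap a dict of expected floats; each entry is then
# compared element-wise using the given relative tolerance.
expected = {"SkewnessMean": 1.5, "KurtosisMean": 57.0}  # illustrative numbers
observed = {"SkewnessMean": 1.5000001, "KurtosisMean": 57.000002}
assert observed == pytest.approx(expected, rel=1e-5)
```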
diff --git a/test/test_metalearning/pyMetaLearn/test_meta_features_sparse.py b/test/test_metalearning/pyMetaLearn/test_meta_features_sparse.py
index 3239184469..856fd595cb 100644
--- a/test/test_metalearning/pyMetaLearn/test_meta_features_sparse.py
+++ b/test/test_metalearning/pyMetaLearn/test_meta_features_sparse.py
@@ -2,19 +2,16 @@
import os
import arff
-
import numpy as np
-
import pytest
-
from scipy import sparse
-
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
-from autosklearn.pipeline.components.data_preprocessing.feature_type \
- import FeatTypeSplit
import autosklearn.metalearning.metafeatures.metafeatures as meta_features
+from autosklearn.pipeline.components.data_preprocessing.feature_type import (
+ FeatTypeSplit,
+)
@pytest.fixture
@@ -28,12 +25,15 @@ def sparse_data():
# -1 because the last attribute is the class
attribute_types = [
- 'numeric' if type(type_) != list else 'nominal'
- for name, type_ in dataset['attributes'][:-1]]
- categorical = {i: True if attribute == 'nominal' else False
- for i, attribute in enumerate(attribute_types)}
+ "numeric" if type(type_) != list else "nominal"
+ for name, type_ in dataset["attributes"][:-1]
+ ]
+ categorical = {
+ i: True if attribute == "nominal" else False
+ for i, attribute in enumerate(attribute_types)
+ }
- data = np.array(dataset['data'], dtype=np.float64)
+ data = np.array(dataset["data"], dtype=np.float64)
X = data[:, :-1]
y = data[:, -1].reshape((-1,))
@@ -53,19 +53,19 @@ def sparse_data():
helpers.set_value(
"MissingValues",
helpers["MissingValues"](X, y, logger, categorical),
- )
+ )
mf.set_value(
"NumberOfMissingValues",
mf["NumberOfMissingValues"](X, y, logger, categorical),
- )
+ )
helpers.set_value(
"NumSymbols",
helpers["NumSymbols"](X, y, logger, categorical),
- )
+ )
helpers.set_value(
"ClassOccurences",
helpers["ClassOccurences"](X, y, logger),
- )
+ )
return X, y, categorical
@@ -80,12 +80,15 @@ def sparse_data_transformed():
# -1 because the last attribute is the class
attribute_types = [
- 'numeric' if type(type_) != list else 'nominal'
- for name, type_ in dataset['attributes'][:-1]]
- categorical = {i: True if attribute == 'nominal' else False
- for i, attribute in enumerate(attribute_types)}
+ "numeric" if type(type_) != list else "nominal"
+ for name, type_ in dataset["attributes"][:-1]
+ ]
+ categorical = {
+ i: True if attribute == "nominal" else False
+ for i, attribute in enumerate(attribute_types)
+ }
- data = np.array(dataset['data'], dtype=np.float64)
+ data = np.array(dataset["data"], dtype=np.float64)
X = data[:, :-1]
y = data[:, -1].reshape((-1,))
@@ -96,10 +99,12 @@ def sparse_data_transformed():
X_sparse[NaNs] = 0
X_sparse = sparse.csr_matrix(X_sparse)
- ohe = FeatTypeSplit(feat_type={
- col: 'categorical' if category else 'numerical'
- for col, category in categorical.items()
- })
+ ohe = FeatTypeSplit(
+ feat_type={
+ col: "categorical" if category else "numerical"
+ for col, category in categorical.items()
+ }
+ )
X_transformed = X_sparse.copy()
X_transformed = ohe.fit_transform(X_transformed)
imp = SimpleImputer(copy=False)
@@ -109,8 +114,10 @@ def sparse_data_transformed():
# Transform the array which indicates the categorical metafeatures
number_numerical = np.sum(~np.array(list(categorical.values())))
- categorical_transformed = {i: True if i < (X_transformed.shape[1] - number_numerical) else False
- for i in range(X_transformed.shape[1])}
+ categorical_transformed = {
+ i: True if i < (X_transformed.shape[1] - number_numerical) else False
+ for i in range(X_transformed.shape[1])
+ }
X = X_sparse
X_transformed = X_transformed
@@ -123,28 +130,27 @@ def sparse_data_transformed():
helpers.set_value(
"PCA",
helpers["PCA"](X_transformed, y, logger),
- )
+ )
helpers.set_value(
"MissingValues",
helpers["MissingValues"](X, y, logger, categorical),
- )
+ )
mf.set_value(
"NumberOfMissingValues",
mf["NumberOfMissingValues"](X, y, logger, categorical),
- )
+ )
helpers.set_value(
"NumSymbols",
helpers["NumSymbols"](X, y, logger, categorical),
- )
+ )
helpers.set_value(
"ClassOccurences",
helpers["ClassOccurences"](X, y, logger),
- )
+ )
helpers.set_value(
"Skewnesses",
- helpers["Skewnesses"](X_transformed, y, logger,
- categorical_transformed),
- )
+ helpers["Skewnesses"](X_transformed, y, logger, categorical_transformed),
+ )
helpers.set_value(
"Kurtosisses",
helpers["Kurtosisses"](X_transformed, y, logger, categorical_transformed),
@@ -155,7 +161,8 @@ def sparse_data_transformed():
def test_missing_values(sparse_data):
X, y, categorical = sparse_data
mf = meta_features.helper_functions["MissingValues"](
- X, y, logging.getLogger('Meta'), categorical)
+ X, y, logging.getLogger("Meta"), categorical
+ )
assert sparse.issparse(mf.value)
assert mf.value.shape == X.shape
assert mf.value.dtype == bool
@@ -165,21 +172,24 @@ def test_missing_values(sparse_data):
def test_number_of_missing_values(sparse_data):
X, y, categorical = sparse_data
mf = meta_features.metafeatures["NumberOfMissingValues"](
- X, y, logging.getLogger('Meta'), categorical)
+ X, y, logging.getLogger("Meta"), categorical
+ )
assert 0 == mf.value
def test_percentage_missing_values(sparse_data):
X, y, categorical = sparse_data
mf = meta_features.metafeatures["PercentageOfMissingValues"](
- X, y, logging.getLogger('Meta'), categorical)
+ X, y, logging.getLogger("Meta"), categorical
+ )
assert 0 == mf.value
def test_number_of_Instances_with_missing_values(sparse_data):
X, y, categorical = sparse_data
mf = meta_features.metafeatures["NumberOfInstancesWithMissingValues"](
- X, y, logging.getLogger('Meta'), categorical)
+ X, y, logging.getLogger("Meta"), categorical
+ )
assert 0 == mf.value
@@ -188,16 +198,20 @@ def test_percentage_of_Instances_with_missing_values(sparse_data):
meta_features.metafeatures.set_value(
"NumberOfInstancesWithMissingValues",
meta_features.metafeatures["NumberOfInstancesWithMissingValues"](
- X, y, logging.getLogger('Meta'), categorical))
+ X, y, logging.getLogger("Meta"), categorical
+ ),
+ )
mf = meta_features.metafeatures["PercentageOfInstancesWithMissingValues"](
- X, y, logging.getLogger('Meta'), categorical)
+ X, y, logging.getLogger("Meta"), categorical
+ )
assert pytest.approx(0) == mf.value
def test_number_of_features_with_missing_values(sparse_data):
X, y, categorical = sparse_data
mf = meta_features.metafeatures["NumberOfFeaturesWithMissingValues"](
- X, y, logging.getLogger('Meta'), categorical)
+ X, y, logging.getLogger("Meta"), categorical
+ )
assert 0 == mf.value
@@ -206,33 +220,72 @@ def test_percentage_of_features_with_missing_values(sparse_data):
meta_features.metafeatures.set_value(
"NumberOfFeaturesWithMissingValues",
meta_features.metafeatures["NumberOfFeaturesWithMissingValues"](
- X, y, logging.getLogger('Meta'), categorical))
+ X, y, logging.getLogger("Meta"), categorical
+ ),
+ )
mf = meta_features.metafeatures["PercentageOfFeaturesWithMissingValues"](
- X, y, logging.getLogger('Meta'), categorical)
+ X, y, logging.getLogger("Meta"), categorical
+ )
    assert pytest.approx(0) == mf.value
def test_num_symbols(sparse_data):
X, y, categorical = sparse_data
mf = meta_features.helper_functions["NumSymbols"](
- X, y, logging.getLogger('Meta'), categorical)
+ X, y, logging.getLogger("Meta"), categorical
+ )
- symbol_frequency = [2, 0, 6, 0, 1, 3, 0, 0, 3, 1, 0, 0, 0, 1, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 1, 2, 2]
+ symbol_frequency = [
+ 2,
+ 0,
+ 6,
+ 0,
+ 1,
+ 3,
+ 0,
+ 0,
+ 3,
+ 1,
+ 0,
+ 0,
+ 0,
+ 1,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 2,
+ 0,
+ 0,
+ 0,
+ 0,
+ 1,
+ 1,
+ 2,
+ 2,
+ ]
assert mf.value == symbol_frequency
def test_symbols_max(sparse_data):
X, y, categorical = sparse_data
# this is attribute steel
- mf = meta_features.metafeatures["SymbolsMax"](X, y, logging.getLogger('Meta'), categorical)
+ mf = meta_features.metafeatures["SymbolsMax"](
+ X, y, logging.getLogger("Meta"), categorical
+ )
assert mf.value == 6
def test_symbols_mean(sparse_data):
X, y, categorical = sparse_data
mf = meta_features.metafeatures["SymbolsMean"](
- X, y, logging.getLogger('Meta'), categorical)
+ X, y, logging.getLogger("Meta"), categorical
+ )
    # Attributes without any symbols are omitted from this list
symbol_frequency = [2, 6, 1, 3, 3, 1, 1, 2, 1, 1, 2, 2]
assert pytest.approx(mf.value) == np.mean(symbol_frequency)
@@ -241,7 +294,8 @@ def test_symbols_mean(sparse_data):
def test_symbols_std(sparse_data):
X, y, categorical = sparse_data
mf = meta_features.metafeatures["SymbolsSTD"](
- X, y, logging.getLogger('Meta'), categorical)
+ X, y, logging.getLogger("Meta"), categorical
+ )
symbol_frequency = [2, 6, 1, 3, 3, 1, 1, 2, 1, 1, 2, 2]
assert pytest.approx(mf.value) == np.std(symbol_frequency)
@@ -249,19 +303,49 @@ def test_symbols_std(sparse_data):
def test_symbols_sum(sparse_data):
X, y, categorical = sparse_data
mf = meta_features.metafeatures["SymbolsSum"](
- X, y, logging.getLogger('Meta'), categorical)
+ X, y, logging.getLogger("Meta"), categorical
+ )
assert mf.value == 25
def test_skewnesses(sparse_data_transformed):
X_transformed, y, categorical_transformed = sparse_data_transformed
fixture = [
- 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
- 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
- -0.696970849903357, 0.626346013011262, 0.38099875966240554,
- 1.4762248835141032, 0.07687661087633788, 0.3688979783036015
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ -0.696970849903357,
+ 0.626346013011262,
+ 0.38099875966240554,
+ 1.4762248835141032,
+ 0.07687661087633788,
+ 0.3688979783036015,
]
- mf = meta_features.helper_functions["Skewnesses"](X_transformed, y, logging.getLogger('Meta'))
+ mf = meta_features.helper_functions["Skewnesses"](
+ X_transformed, y, logging.getLogger("Meta")
+ )
print(mf.value)
print(fixture)
np.testing.assert_allclose(mf.value, fixture)
@@ -269,13 +353,42 @@ def test_skewnesses(sparse_data_transformed):
def test_kurtosisses(sparse_data_transformed):
fixture = [
- -3.0, -3.0, -3.0, -3.0, -3.0, -3.0, -3.0, -3.0, -3.0, -3.0, -3.0, -3.0,
- -3.0, -3.0, -3.0, -3.0, -3.0, -3.0, -3.0, -3.0, -3.0, -3.0, -3.0, -3.0,
- -3.0, -1.1005836114255763, -1.1786325509475744, -1.23879983823279,
- 1.3934382644137013, -0.9768209837948336, -1.7937072296512784
+ -3.0,
+ -3.0,
+ -3.0,
+ -3.0,
+ -3.0,
+ -3.0,
+ -3.0,
+ -3.0,
+ -3.0,
+ -3.0,
+ -3.0,
+ -3.0,
+ -3.0,
+ -3.0,
+ -3.0,
+ -3.0,
+ -3.0,
+ -3.0,
+ -3.0,
+ -3.0,
+ -3.0,
+ -3.0,
+ -3.0,
+ -3.0,
+ -3.0,
+ -1.1005836114255763,
+ -1.1786325509475744,
+ -1.23879983823279,
+ 1.3934382644137013,
+ -0.9768209837948336,
+ -1.7937072296512784,
]
X_transformed, y, categorical_transformed = sparse_data_transformed
- mf = meta_features.helper_functions["Kurtosisses"](X_transformed, y, logging.getLogger('Meta'))
+ mf = meta_features.helper_functions["Kurtosisses"](
+ X_transformed, y, logging.getLogger("Meta")
+ )
print(mf.value)
np.testing.assert_allclose(mf.value, fixture)
@@ -283,26 +396,30 @@ def test_kurtosisses(sparse_data_transformed):
def test_pca_95percent(sparse_data_transformed):
X_transformed, y, categorical_transformed = sparse_data_transformed
mf = meta_features.metafeatures["PCAFractionOfComponentsFor95PercentVariance"](
- X_transformed, y, logging.getLogger('Meta'))
+ X_transformed, y, logging.getLogger("Meta")
+ )
assert pytest.approx(0.7741935483870968) == mf.value
def test_pca_kurtosis_first_pc(sparse_data_transformed):
X_transformed, y, categorical_transformed = sparse_data_transformed
mf = meta_features.metafeatures["PCAKurtosisFirstPC"](
- X_transformed, y, logging.getLogger('Meta'))
+ X_transformed, y, logging.getLogger("Meta")
+ )
assert pytest.approx(-0.15444516166802469) == mf.value
def test_pca_skewness_first_pc(sparse_data_transformed):
X_transformed, y, categorical_transformed = sparse_data_transformed
mf = meta_features.metafeatures["PCASkewnessFirstPC"](
- X_transformed, y, logging.getLogger('Meta'))
+ X_transformed, y, logging.getLogger("Meta")
+ )
assert pytest.approx(0.026514792083623905) == mf.value
def test_calculate_all_metafeatures(sparse_data):
X, y, categorical = sparse_data
mf = meta_features.calculate_all_metafeatures(
- X, y, categorical, "2", logger=logging.getLogger('Meta'))
+ X, y, categorical, "2", logger=logging.getLogger("Meta")
+ )
assert 52 == len(mf.metafeature_values)
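The sparse-data fixtures above derive their categorical mask from the ARFF attribute declarations; a small standalone sketch of that dict-comprehension pattern, with a hypothetical attribute list (liac-arff reports nominal attributes as lists of values and numeric ones as plain strings):

```python
# Hypothetical ARFF header entries: (name, type) pairs where nominal
# attributes carry a list of allowed values and numeric ones a string.
attributes = [("V1", "REAL"), ("steel", ["A", "R"]), ("V3", "NUMERIC")]

attribute_types = [
    "numeric" if type(type_) != list else "nominal" for _, type_ in attributes
]
# The fixtures spell this as `True if attr == "nominal" else False`,
# which reduces to the bare comparison below.
categorical = {i: attr == "nominal" for i, attr in enumerate(attribute_types)}
assert categorical == {0: False, 1: True, 2: False}
```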
diff --git a/test/test_metalearning/pyMetaLearn/test_metalearner.py b/test/test_metalearning/pyMetaLearn/test_metalearner.py
index 58f2ce800a..a8b7d604cb 100644
--- a/test/test_metalearning/pyMetaLearn/test_metalearner.py
+++ b/test/test_metalearning/pyMetaLearn/test_metalearner.py
@@ -1,14 +1,13 @@
import logging
-import numpy as np
import os
import unittest
+import numpy as np
import pandas as pd
-
from ConfigSpace.configuration_space import Configuration
-import autosklearn.pipeline.classification
-import autosklearn.metalearning.optimizers.metalearn_optimizer.metalearner as metalearner
+import autosklearn.metalearning.optimizers.metalearn_optimizer.metalearner as metalearner # noqa: E501
+import autosklearn.pipeline.classification
from autosklearn.metalearning.metalearning.meta_base import MetaBase
logging.basicConfig()
@@ -20,7 +19,7 @@ class MetaLearnerTest(unittest.TestCase):
def setUp(self):
self.cwd = os.getcwd()
data_dir = os.path.dirname(__file__)
- data_dir = os.path.join(data_dir, 'test_meta_base_data')
+ data_dir = os.path.join(data_dir, "test_meta_base_data")
os.chdir(data_dir)
pipeline = autosklearn.pipeline.classification.SimpleClassificationPipeline()
@@ -29,7 +28,8 @@ def setUp(self):
self.logger = logging.getLogger()
meta_base = MetaBase(self.cs, data_dir, logger=self.logger)
self.meta_optimizer = metalearner.MetaLearningOptimizer(
- '233', self.cs, meta_base, logger=self.logger)
+ "233", self.cs, meta_base, logger=self.logger
+ )
def tearDown(self):
os.chdir(self.cwd)
@@ -38,8 +38,8 @@ def test_metalearning_suggest_all(self):
ret = self.meta_optimizer.metalearning_suggest_all()
self.assertEqual(124, len(ret))
# Reduced to 17 as we changed QDA searchspace
- self.assertEqual('gradient_boosting', ret[0]['classifier:__choice__'])
- self.assertEqual('adaboost', ret[1]['classifier:__choice__'])
+ self.assertEqual("gradient_boosting", ret[0]["classifier:__choice__"])
+ self.assertEqual("adaboost", ret[1]["classifier:__choice__"])
# There is no test for exclude_double_configuration as it's not present
# in the test data
@@ -48,17 +48,17 @@ def test_metalearning_suggest_all_nan_metafeatures(self):
ret = self.meta_optimizer.metalearning_suggest_all()
self.assertEqual(124, len(ret))
# Reduced to 17 as we changed QDA searchspace
- self.assertEqual('gradient_boosting', ret[0]['classifier:__choice__'])
- self.assertEqual('gradient_boosting', ret[1]['classifier:__choice__'])
+ self.assertEqual("gradient_boosting", ret[0]["classifier:__choice__"])
+ self.assertEqual("gradient_boosting", ret[1]["classifier:__choice__"])
def test_metalearning_suggest(self):
ret = self.meta_optimizer.metalearning_suggest([])
self.assertIsInstance(ret, Configuration)
- self.assertEqual('gradient_boosting', ret['classifier:__choice__'])
+ self.assertEqual("gradient_boosting", ret["classifier:__choice__"])
ret2 = self.meta_optimizer.metalearning_suggest([ret])
self.assertIsInstance(ret2, Configuration)
- self.assertEqual('adaboost', ret2['classifier:__choice__'])
+ self.assertEqual("adaboost", ret2["classifier:__choice__"])
def test_learn(self):
# Test only some special cases which are probably not yet handled
@@ -67,8 +67,10 @@ def test_learn(self):
self.meta_optimizer._learn()
def test_split_metafeature_array(self):
- ds_metafeatures, other_metafeatures = self.meta_optimizer. \
- _split_metafeature_array()
+ (
+ ds_metafeatures,
+ other_metafeatures,
+ ) = self.meta_optimizer._split_metafeature_array()
self.assertIsInstance(ds_metafeatures, pd.Series)
self.assertEqual(ds_metafeatures.shape, (46,))
self.assertIsInstance(other_metafeatures, pd.DataFrame)
diff --git a/test/test_metalearning/pyMetaLearn/test_optimizer_base.py b/test/test_metalearning/pyMetaLearn/test_optimizer_base.py
index a78a6a7f61..63dc2184da 100644
--- a/test/test_metalearning/pyMetaLearn/test_optimizer_base.py
+++ b/test/test_metalearning/pyMetaLearn/test_optimizer_base.py
@@ -1,5 +1,5 @@
-from collections import OrderedDict
import unittest
+from collections import OrderedDict
from autosklearn.metalearning.optimizers import optimizer_base
@@ -14,8 +14,9 @@ def setUp(self):
def test_parse_hyperopt_string(self):
hyperparameter_string = "x {-5, 0, 5, 10}\ny {0, 5, 10, 15}"
- expected = OrderedDict([["x", ["-5", "0", "5", "10"]],
- ["y", ["0", "5", "10", "15"]]])
+ expected = OrderedDict(
+ [["x", ["-5", "0", "5", "10"]], ["y", ["0", "5", "10", "15"]]]
+ )
ret = optimizer_base.parse_hyperparameter_string(hyperparameter_string)
self.assertEqual(ret, expected)
@@ -28,8 +29,11 @@ def test_parse_hyperopt_string(self):
self.assertEqual(ret, expected)
hyperparameter_string = "x {-5, 0, 5, 10}\ny 0, 5, 10, 15} [5]"
- self.assertRaises(ValueError, optimizer_base.parse_hyperparameter_string,
- hyperparameter_string)
+ self.assertRaises(
+ ValueError,
+ optimizer_base.parse_hyperparameter_string,
+ hyperparameter_string,
+ )
def test_construct_cli_call(self):
cli_call = optimizer_base.construct_cli_call("cv.py", {"x": -5, "y": 0})
diff --git a/test/test_metalearning/test_metalearning.py b/test/test_metalearning/test_metalearning.py
index 6a7e87511d..3ec847a8f5 100644
--- a/test/test_metalearning/test_metalearning.py
+++ b/test/test_metalearning/test_metalearning.py
@@ -1,18 +1,17 @@
# -*- encoding: utf-8 -*-
import unittest
-from autosklearn.pipeline.util import get_dataset
-from autosklearn.classification import AutoSklearnClassifier
+from sklearn.datasets import load_breast_cancer
-from autosklearn.smbo import _calculate_metafeatures, _calculate_metafeatures_encoded
-from autosklearn.constants import REGRESSION, MULTICLASS_CLASSIFICATION
+from autosklearn.classification import AutoSklearnClassifier
+from autosklearn.constants import MULTICLASS_CLASSIFICATION, REGRESSION
from autosklearn.metalearning.mismbo import suggest_via_metalearning
+from autosklearn.pipeline.util import get_dataset
+from autosklearn.smbo import _calculate_metafeatures, _calculate_metafeatures_encoded
from autosklearn.util.pipeline import get_configuration_space
-from sklearn.datasets import load_breast_cancer
class MetafeatureValueDummy(object):
-
def __init__(self, name, value):
self.name = name
self.value = value
@@ -22,83 +21,93 @@ class Test(unittest.TestCase):
_multiprocess_can_split_ = True
def setUp(self):
- self.X_train, self.Y_train, self.X_test, self.Y_test = \
- get_dataset('iris')
+ self.X_train, self.Y_train, self.X_test, self.Y_test = get_dataset("iris")
eliminate_class_two = self.Y_train != 2
self.X_train = self.X_train[eliminate_class_two]
self.Y_train = self.Y_train[eliminate_class_two]
- @unittest.skip('TODO refactor!')
+ @unittest.skip("TODO refactor!")
def test_metalearning(self):
- dataset_name_classification = 'digits'
+ dataset_name_classification = "digits"
initial_challengers_classification = {
- "ACC_METRIC": "--initial-challengers \" "
- "-balancing:strategy 'weighting' "
- "-classifier:__choice__ 'proj_logit'",
- "AUC_METRIC": "--initial-challengers \" "
- "-balancing:strategy 'weighting' "
- "-classifier:__choice__ 'liblinear_svc'",
- "BAC_METRIC": "--initial-challengers \" "
- "-balancing:strategy 'weighting' "
- "-classifier:__choice__ 'proj_logit'",
- "F1_METRIC": "--initial-challengers \" "
- "-balancing:strategy 'weighting' "
- "-classifier:__choice__ 'proj_logit'",
- "PAC_METRIC": "--initial-challengers \" "
- "-balancing:strategy 'none' "
- "-classifier:__choice__ 'random_forest'"
+ "ACC_METRIC": '--initial-challengers " '
+ "-balancing:strategy 'weighting' "
+ "-classifier:__choice__ 'proj_logit'",
+ "AUC_METRIC": '--initial-challengers " '
+ "-balancing:strategy 'weighting' "
+ "-classifier:__choice__ 'liblinear_svc'",
+ "BAC_METRIC": '--initial-challengers " '
+ "-balancing:strategy 'weighting' "
+ "-classifier:__choice__ 'proj_logit'",
+ "F1_METRIC": '--initial-challengers " '
+ "-balancing:strategy 'weighting' "
+ "-classifier:__choice__ 'proj_logit'",
+ "PAC_METRIC": '--initial-challengers " '
+ "-balancing:strategy 'none' "
+ "-classifier:__choice__ 'random_forest'",
}
- dataset_name_regression = 'diabetes'
+ dataset_name_regression = "diabetes"
initial_challengers_regression = {
- "A_METRIC": "--initial-challengers \" "
- "-imputation:strategy 'mean' "
- "-one_hot_encoding:minimum_fraction '0.01' "
- "-one_hot_encoding:use_minimum_fraction 'True' "
- "-preprocessor:__choice__ 'no_preprocessing' "
- "-regressor:__choice__ 'random_forest'",
- "R2_METRIC": "--initial-challengers \" "
- "-imputation:strategy 'mean' "
- "-one_hot_encoding:minimum_fraction '0.01' "
- "-one_hot_encoding:use_minimum_fraction 'True' "
- "-preprocessor:__choice__ 'no_preprocessing' "
- "-regressor:__choice__ 'random_forest'",
+ "A_METRIC": '--initial-challengers " '
+ "-imputation:strategy 'mean' "
+ "-one_hot_encoding:minimum_fraction '0.01' "
+ "-one_hot_encoding:use_minimum_fraction 'True' "
+ "-preprocessor:__choice__ 'no_preprocessing' "
+ "-regressor:__choice__ 'random_forest'",
+ "R2_METRIC": '--initial-challengers " '
+ "-imputation:strategy 'mean' "
+ "-one_hot_encoding:minimum_fraction '0.01' "
+ "-one_hot_encoding:use_minimum_fraction 'True' "
+ "-preprocessor:__choice__ 'no_preprocessing' "
+ "-regressor:__choice__ 'random_forest'",
}
for dataset_name, task, initial_challengers in [
(dataset_name_regression, REGRESSION, initial_challengers_regression),
- (dataset_name_classification, MULTICLASS_CLASSIFICATION,
- initial_challengers_classification)]:
+ (
+ dataset_name_classification,
+ MULTICLASS_CLASSIFICATION,
+ initial_challengers_classification,
+ ),
+ ]:
for metric in initial_challengers:
configuration_space = get_configuration_space(
- {
- 'metric': metric,
- 'task': task,
- 'is_sparse': False
- },
- include={'feature_preprocessor': ['no_preprocessing']})
+ {"metric": metric, "task": task, "is_sparse": False},
+ include={"feature_preprocessor": ["no_preprocessing"]},
+ )
X_train, Y_train, X_test, Y_test = get_dataset(dataset_name)
categorical = {i: False for i in range(X_train.shape[1])}
meta_features_label = _calculate_metafeatures(
- X_train, Y_train, categorical, dataset_name, task)
+ X_train, Y_train, categorical, dataset_name, task
+ )
meta_features_encoded_label = _calculate_metafeatures_encoded(
- X_train, Y_train, categorical, dataset_name, task)
-
- initial_configuration_strings_for_smac = \
- suggest_via_metalearning(
- meta_features_label,
- meta_features_encoded_label,
- configuration_space, dataset_name, metric,
- task, False, 1, None)
+ X_train, Y_train, categorical, dataset_name, task
+ )
+
+ initial_configuration_strings_for_smac = suggest_via_metalearning(
+ meta_features_label,
+ meta_features_encoded_label,
+ configuration_space,
+ dataset_name,
+ metric,
+ task,
+ False,
+ 1,
+ None,
+ )
print(metric)
print(initial_configuration_strings_for_smac[0])
- self.assertTrue(initial_configuration_strings_for_smac[
- 0].startswith(initial_challengers[metric]))
+ self.assertTrue(
+ initial_configuration_strings_for_smac[0].startswith(
+ initial_challengers[metric]
+ )
+ )
def test_metadata_directory(self):
# Test that metadata directory is set correctly (if user specifies,
@@ -108,11 +117,10 @@ def test_metadata_directory(self):
automl1 = AutoSklearnClassifier(
time_left_for_this_task=30,
per_run_time_limit=5,
- metadata_directory="pyMetaLearn/metadata_dir", # user specified metadata_dir
+ metadata_directory="pyMetaLearn/metadata_dir", # user metadata_dir
dask_client=dask_client,
)
- self.assertEqual(automl1.metadata_directory,
- "pyMetaLearn/metadata_dir")
+ self.assertEqual(automl1.metadata_directory, "pyMetaLearn/metadata_dir")
automl2 = AutoSklearnClassifier( # default metadata_dir
time_left_for_this_task=30,
@@ -130,6 +138,11 @@ def test_metadata_directory(self):
ensemble_size=0,
)
X, y = load_breast_cancer(return_X_y=True)
- self.assertRaisesRegex(ValueError, "The specified metadata directory "
- "\'%s\' does not exist!" % nonexistent_dir,
- automl3.fit, X=X, y=y)
+ self.assertRaisesRegex(
+ ValueError,
+ "The specified metadata directory "
+ "'%s' does not exist!" % nonexistent_dir,
+ automl3.fit,
+ X=X,
+ y=y,
+ )
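The expected-exception check above interpolates a directory path into the assertRaisesRegex pattern, which is matched as a regular expression. The fixed path used by the test contains no regex metacharacters, so this is only a general-purpose sketch of keeping such a match literal with re.escape (the exception and message below are illustrative):

```python
import re
import unittest


class RegexEscapeDemo(unittest.TestCase):
    def test_literal_path_match(self) -> None:
        # Without re.escape, '.' in the path would act as a regex wildcard.
        path = "/tmp/does.not.exist"
        pattern = re.escape(
            "The specified metadata directory '%s' does not exist!" % path
        )
        with self.assertRaisesRegex(ValueError, pattern):
            raise ValueError(
                "The specified metadata directory '/tmp/does.not.exist' "
                "does not exist!"
            )


if __name__ == "__main__":
    unittest.main()
```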
diff --git a/test/test_metric/__init__.py b/test/test_metric/__init__.py
index cc3cd7becd..e298f0f075 100644
--- a/test/test_metric/__init__.py
+++ b/test/test_metric/__init__.py
@@ -1,2 +1,2 @@
# -*- encoding: utf-8 -*-
-__author__ = 'feurerm'
+__author__ = "feurerm"
diff --git a/test/test_metric/test_metrics.py b/test/test_metric/test_metrics.py
index 3c6ff73c2b..334a485fe3 100644
--- a/test/test_metric/test_metrics.py
+++ b/test/test_metric/test_metrics.py
@@ -1,27 +1,24 @@
import unittest
import warnings
-import pytest
-
import numpy as np
+import pytest
import sklearn.metrics
+from smac.utils.constants import MAXINT
import autosklearn.metrics
-
-from autosklearn.metrics import calculate_score, calculate_loss, calculate_metric
from autosklearn.constants import BINARY_CLASSIFICATION, REGRESSION
-
-from smac.utils.constants import MAXINT
+from autosklearn.metrics import calculate_loss, calculate_metric, calculate_score
class TestScorer(unittest.TestCase):
-
def test_predict_scorer_binary(self):
y_true = np.array([0, 0, 1, 1])
y_pred = np.array([[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]])
scorer = autosklearn.metrics._PredictScorer(
- 'accuracy', sklearn.metrics.accuracy_score, 1, 0, 1, {})
+ "accuracy", sklearn.metrics.accuracy_score, 1, 0, 1, {}
+ )
score = scorer(y_true, y_pred)
self.assertAlmostEqual(score, 1.0)
@@ -35,15 +32,20 @@ def test_predict_scorer_binary(self):
self.assertAlmostEqual(score, 0.5)
scorer = autosklearn.metrics._PredictScorer(
- 'bac', sklearn.metrics.balanced_accuracy_score,
- 1, 0, 1, {})
+ "bac", sklearn.metrics.balanced_accuracy_score, 1, 0, 1, {}
+ )
score = scorer(y_true, y_pred)
self.assertAlmostEqual(score, 0.5)
scorer = autosklearn.metrics._PredictScorer(
- name='accuracy', score_func=sklearn.metrics.accuracy_score,
- optimum=1, worst_possible_result=0, sign=-1, kwargs={})
+ name="accuracy",
+ score_func=sklearn.metrics.accuracy_score,
+ optimum=1,
+ worst_possible_result=0,
+ sign=-1,
+ kwargs={},
+ )
y_pred = np.array([[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]])
score = scorer(y_true, y_pred)
@@ -54,7 +56,8 @@ def test_predict_scorer_multiclass(self):
y_pred = np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]])
scorer = autosklearn.metrics._PredictScorer(
- 'accuracy', sklearn.metrics.accuracy_score, 1, 0, 1, {})
+ "accuracy", sklearn.metrics.accuracy_score, 1, 0, 1, {}
+ )
score = scorer(y_true, y_pred)
self.assertAlmostEqual(score, 1.0)
@@ -68,14 +71,15 @@ def test_predict_scorer_multiclass(self):
self.assertAlmostEqual(score, 0.333333333)
scorer = autosklearn.metrics._PredictScorer(
- 'bac', sklearn.metrics.balanced_accuracy_score,
- 1, 0, 1, {})
+ "bac", sklearn.metrics.balanced_accuracy_score, 1, 0, 1, {}
+ )
score = scorer(y_true, y_pred)
self.assertAlmostEqual(score, 0.333333333)
scorer = autosklearn.metrics._PredictScorer(
- 'accuracy', sklearn.metrics.accuracy_score, 1, 0, -1, {})
+ "accuracy", sklearn.metrics.accuracy_score, 1, 0, -1, {}
+ )
y_pred = np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]])
score = scorer(y_true, y_pred)
@@ -86,7 +90,8 @@ def test_predict_scorer_multilabel(self):
y_pred = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]])
scorer = autosklearn.metrics._PredictScorer(
- 'accuracy', sklearn.metrics.accuracy_score, 1, 0, 1, {})
+ "accuracy", sklearn.metrics.accuracy_score, 1, 0, 1, {}
+ )
score = scorer(y_true, y_pred)
self.assertAlmostEqual(score, 1.0)
@@ -100,7 +105,8 @@ def test_predict_scorer_multilabel(self):
self.assertAlmostEqual(score, 0.25)
scorer = autosklearn.metrics._PredictScorer(
- 'accuracy', sklearn.metrics.accuracy_score, 1, 0, -1, {})
+ "accuracy", sklearn.metrics.accuracy_score, 1, 0, -1, {}
+ )
y_pred = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]])
score = scorer(y_true, y_pred)
@@ -111,7 +117,8 @@ def test_predict_scorer_regression(self):
y_pred = y_true.copy()
scorer = autosklearn.metrics._PredictScorer(
- 'r2', sklearn.metrics.r2_score, 1, 0, 1, {})
+ "r2", sklearn.metrics.r2_score, 1, 0, 1, {}
+ )
score = scorer(y_true, y_pred)
self.assertAlmostEqual(score, 1.0)
@@ -125,7 +132,8 @@ def test_proba_scorer_binary(self):
y_pred = [[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]]
scorer = autosklearn.metrics._ProbaScorer(
- 'log_loss', sklearn.metrics.log_loss, 0, MAXINT, 1, {})
+ "log_loss", sklearn.metrics.log_loss, 0, MAXINT, 1, {}
+ )
score = scorer(y_true, y_pred)
self.assertAlmostEqual(score, 0.0)
@@ -139,7 +147,8 @@ def test_proba_scorer_binary(self):
self.assertAlmostEqual(score, 0.69314718055994529)
scorer = autosklearn.metrics._ProbaScorer(
- 'log_loss', sklearn.metrics.log_loss, 0, MAXINT, -1, {})
+ "log_loss", sklearn.metrics.log_loss, 0, MAXINT, -1, {}
+ )
y_pred = [[1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0]]
score = scorer(y_true, y_pred)
@@ -150,7 +159,8 @@ def test_proba_scorer_multiclass(self):
y_pred = [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]
scorer = autosklearn.metrics._ProbaScorer(
- 'log_loss', sklearn.metrics.log_loss, 0, MAXINT, 1, {})
+ "log_loss", sklearn.metrics.log_loss, 0, MAXINT, 1, {}
+ )
score = scorer(y_true, y_pred)
self.assertAlmostEqual(score, 0.0)
@@ -164,7 +174,8 @@ def test_proba_scorer_multiclass(self):
self.assertAlmostEqual(score, 1.0986122886681096)
scorer = autosklearn.metrics._ProbaScorer(
- 'log_loss', sklearn.metrics.log_loss, 0, MAXINT, -1, {})
+ "log_loss", sklearn.metrics.log_loss, 0, MAXINT, -1, {}
+ )
y_pred = [[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0]]
score = scorer(y_true, y_pred)
@@ -175,7 +186,8 @@ def test_proba_scorer_multilabel(self):
y_pred = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]])
scorer = autosklearn.metrics._ProbaScorer(
- 'log_loss', sklearn.metrics.log_loss, 0, MAXINT, 1, {})
+ "log_loss", sklearn.metrics.log_loss, 0, MAXINT, 1, {}
+ )
score = scorer(y_true, y_pred)
self.assertAlmostEqual(score, 0.34657359027997314)
@@ -189,7 +201,8 @@ def test_proba_scorer_multilabel(self):
self.assertAlmostEqual(score, 0.69314718055994529)
scorer = autosklearn.metrics._ProbaScorer(
- 'log_loss', sklearn.metrics.log_loss, 0, MAXINT, -1, {})
+ "log_loss", sklearn.metrics.log_loss, 0, MAXINT, -1, {}
+ )
y_pred = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]])
score = scorer(y_true, y_pred)
@@ -200,7 +213,8 @@ def test_threshold_scorer_binary(self):
y_pred = np.array([[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]])
scorer = autosklearn.metrics._ThresholdScorer(
- 'roc_auc', sklearn.metrics.roc_auc_score, 1, 0, 1, {})
+ "roc_auc", sklearn.metrics.roc_auc_score, 1, 0, 1, {}
+ )
score = scorer(y_true, y_pred)
self.assertAlmostEqual(score, 1.0)
@@ -214,7 +228,8 @@ def test_threshold_scorer_binary(self):
self.assertAlmostEqual(score, 0.5)
scorer = autosklearn.metrics._ThresholdScorer(
- 'roc_auc', sklearn.metrics.roc_auc_score, 1, 0, -1, {})
+ "roc_auc", sklearn.metrics.roc_auc_score, 1, 0, -1, {}
+ )
y_pred = np.array([[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]])
score = scorer(y_true, y_pred)
@@ -225,7 +240,8 @@ def test_threshold_scorer_multilabel(self):
y_pred = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]])
scorer = autosklearn.metrics._ThresholdScorer(
- 'roc_auc', sklearn.metrics.roc_auc_score, 1, 0, 1, {})
+ "roc_auc", sklearn.metrics.roc_auc_score, 1, 0, 1, {}
+ )
score = scorer(y_true, y_pred)
self.assertAlmostEqual(score, 1.0)
@@ -239,7 +255,8 @@ def test_threshold_scorer_multilabel(self):
self.assertAlmostEqual(score, 0.5)
scorer = autosklearn.metrics._ThresholdScorer(
- 'roc_auc', sklearn.metrics.roc_auc_score, 1, 0, -1, {})
+ "roc_auc", sklearn.metrics.roc_auc_score, 1, 0, -1, {}
+ )
y_pred = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]])
score = scorer(y_true, y_pred)
@@ -250,7 +267,8 @@ def test_sign_flip(self):
y_pred = y_true.copy()
scorer = autosklearn.metrics.make_scorer(
- 'r2', sklearn.metrics.r2_score, greater_is_better=True)
+ "r2", sklearn.metrics.r2_score, greater_is_better=True
+ )
score = scorer(y_true, y_pred + 1.0)
self.assertAlmostEqual(score, -9.0)
@@ -262,7 +280,8 @@ def test_sign_flip(self):
self.assertAlmostEqual(score, 1.0)
scorer = autosklearn.metrics.make_scorer(
- 'r2', sklearn.metrics.r2_score, greater_is_better=False)
+ "r2", sklearn.metrics.r2_score, greater_is_better=False
+ )
score = scorer(y_true, y_pred + 1.0)
self.assertAlmostEqual(score, 9.0)
@@ -275,49 +294,44 @@ def test_sign_flip(self):
class TestMetricsDoNotAlterInput(unittest.TestCase):
-
def test_regression_metrics(self):
for metric, scorer in autosklearn.metrics.REGRESSION_METRICS.items():
y_true = np.random.random(100).reshape((-1, 1))
y_pred = y_true.copy() + np.random.randn(100, 1) * 0.1
- if metric == 'mean_squared_log_error':
+ if metric == "mean_squared_log_error":
y_true = np.abs(y_true)
y_pred = np.abs(y_pred)
y_true_2 = y_true.copy()
y_pred_2 = y_pred.copy()
self.assertTrue(np.isfinite(scorer(y_true_2, y_pred_2)))
- np.testing.assert_array_almost_equal(y_true, y_true_2,
- err_msg=metric)
- np.testing.assert_array_almost_equal(y_pred, y_pred_2,
- err_msg=metric)
+ np.testing.assert_array_almost_equal(y_true, y_true_2, err_msg=metric)
+ np.testing.assert_array_almost_equal(y_pred, y_pred_2, err_msg=metric)
def test_classification_metrics(self):
for metric, scorer in autosklearn.metrics.CLASSIFICATION_METRICS.items():
y_true = np.random.randint(0, 2, size=(100, 1))
y_pred = np.random.random(200).reshape((-1, 2))
- y_pred = np.array([y_pred[i] / np.sum(y_pred[i])
- for i in range(100)])
+ y_pred = np.array([y_pred[i] / np.sum(y_pred[i]) for i in range(100)])
y_true_2 = y_true.copy()
y_pred_2 = y_pred.copy()
try:
self.assertTrue(np.isfinite(scorer(y_true_2, y_pred_2)))
- np.testing.assert_array_almost_equal(y_true, y_true_2,
- err_msg=metric)
- np.testing.assert_array_almost_equal(y_pred, y_pred_2,
- err_msg=metric)
+ np.testing.assert_array_almost_equal(y_true, y_true_2, err_msg=metric)
+ np.testing.assert_array_almost_equal(y_pred, y_pred_2, err_msg=metric)
except ValueError as e:
- if e.args[0] == 'Samplewise metrics are not available outside' \
- ' of multilabel classification.':
+ if (
+ e.args[0] == "Samplewise metrics are not available outside"
+ " of multilabel classification."
+ ):
pass
else:
raise e
class TestMetric(unittest.TestCase):
-
def test_regression_all(self):
for metric, scorer in autosklearn.metrics.REGRESSION_METRICS.items():
@@ -331,7 +345,7 @@ def test_regression_all(self):
current_score = scorer(y_true, y_pred)
self.assertLess(current_score, previous_score)
- if scorer.name == 'mean_squared_log_error':
+ if scorer.name == "mean_squared_log_error":
continue
y_pred = np.array([-1, 0, -1, 0])
@@ -352,31 +366,39 @@ def test_classification_binary(self):
# TODO: but its behavior is not right. When y_pred is completely
# TODO: wrong, it does return 0.5, but when it is not completely
        # TODO: wrong, it returns a value smaller than 0.5.
- if metric in ['average_precision',
- 'precision_samples', 'recall_samples', 'f1_samples']:
+ if metric in [
+ "average_precision",
+ "precision_samples",
+ "recall_samples",
+ "f1_samples",
+ ]:
continue
y_true = np.array([1.0, 1.0, 1.0, 0.0, 0.0, 0.0])
- y_pred = \
- np.array([[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0]])
+ y_pred = np.array(
+ [[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0]]
+ )
previous_score = scorer._optimum
current_score = scorer(y_true, y_pred)
self.assertAlmostEqual(current_score, previous_score)
- y_pred = \
- np.array([[0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0]])
+ y_pred = np.array(
+ [[0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0]]
+ )
previous_score = current_score
current_score = scorer(y_true, y_pred)
self.assertLess(current_score, previous_score)
- y_pred = \
- np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0]])
+ y_pred = np.array(
+ [[0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0]]
+ )
previous_score = current_score
current_score = scorer(y_true, y_pred)
self.assertLess(current_score, previous_score)
- y_pred = \
- np.array([[1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0]])
+ y_pred = np.array(
+ [[1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0]]
+ )
previous_score = current_score
current_score = scorer(y_true, y_pred)
self.assertLess(current_score, previous_score)
@@ -390,76 +412,86 @@ def test_classification_multiclass(self):
#
# This test should be parameterized so we can identify which metrics
# cause which warning specifically and rectify if needed.
- ignored_warnings = [
- (UserWarning, 'y_pred contains classes not in y_true')
- ]
+ ignored_warnings = [(UserWarning, "y_pred contains classes not in y_true")]
for metric, scorer in autosklearn.metrics.CLASSIFICATION_METRICS.items():
# Skip functions not applicable for multiclass classification.
- if metric in ['roc_auc', 'average_precision',
- 'precision', 'recall', 'f1', 'precision_samples',
- 'recall_samples', 'f1_samples']:
+ if metric in [
+ "roc_auc",
+ "average_precision",
+ "precision",
+ "recall",
+ "f1",
+ "precision_samples",
+ "recall_samples",
+ "f1_samples",
+ ]:
continue
- y_true = np.array(
- [0.0, 0.0, 1.0, 1.0, 2.0]
- )
+ y_true = np.array([0.0, 0.0, 1.0, 1.0, 2.0])
- y_pred = np.array([
- [1.0, 0.0, 0.0],
- [1.0, 0.0, 0.0],
- [0.0, 1.0, 0.0],
- [0.0, 1.0, 0.0],
- [0.0, 0.0, 1.0]
- ])
+ y_pred = np.array(
+ [
+ [1.0, 0.0, 0.0],
+ [1.0, 0.0, 0.0],
+ [0.0, 1.0, 0.0],
+ [0.0, 1.0, 0.0],
+ [0.0, 0.0, 1.0],
+ ]
+ )
previous_score = scorer._optimum
current_score = scorer(y_true, y_pred)
self.assertAlmostEqual(current_score, previous_score)
- y_pred = np.array([
- [1.0, 0.0, 0.0],
- [1.0, 0.0, 0.0],
- [1.0, 0.0, 0.0],
- [0.0, 1.0, 0.0],
- [0.0, 0.0, 1.0],
- ])
+ y_pred = np.array(
+ [
+ [1.0, 0.0, 0.0],
+ [1.0, 0.0, 0.0],
+ [1.0, 0.0, 0.0],
+ [0.0, 1.0, 0.0],
+ [0.0, 0.0, 1.0],
+ ]
+ )
previous_score = current_score
current_score = scorer(y_true, y_pred)
self.assertLess(current_score, previous_score)
- y_pred = np.array([
- [0.0, 0.0, 1.0],
- [0.0, 1.0, 0.0],
- [1.0, 0.0, 0.0],
- [0.0, 1.0, 0.0],
- [0.0, 1.0, 0.0]
- ])
+ y_pred = np.array(
+ [
+ [0.0, 0.0, 1.0],
+ [0.0, 1.0, 0.0],
+ [1.0, 0.0, 0.0],
+ [0.0, 1.0, 0.0],
+ [0.0, 1.0, 0.0],
+ ]
+ )
previous_score = current_score
current_score = scorer(y_true, y_pred)
self.assertLess(current_score, previous_score)
- y_pred = np.array([
- [0.0, 0.0, 1.0],
- [0.0, 0.0, 1.0],
- [1.0, 0.0, 0.0],
- [1.0, 0.0, 0.0],
- [0.0, 1.0, 0.0]
- ])
+ y_pred = np.array(
+ [
+ [0.0, 0.0, 1.0],
+ [0.0, 0.0, 1.0],
+ [1.0, 0.0, 0.0],
+ [1.0, 0.0, 0.0],
+ [0.0, 1.0, 0.0],
+ ]
+ )
previous_score = current_score
current_score = scorer(y_true, y_pred)
self.assertLess(current_score, previous_score)
        # fewer labels in the targets than in the predictions
y_true = np.array([0.0, 0.0, 1.0, 1.0])
- y_pred = np.array([
- [1.0, 0.0, 0.0], [1.0, 0.0, 0.0],
- [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]
+ y_pred = np.array(
+ [[1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]
)
with warnings.catch_warnings():
for category, message in ignored_warnings:
warnings.filterwarnings(
- 'ignore', category=category, message=message
+ "ignore", category=category, message=message
)
score = scorer(y_true, y_pred)
@@ -469,8 +501,14 @@ def test_classification_multilabel(self):
for metric, scorer in autosklearn.metrics.CLASSIFICATION_METRICS.items():
# Skip functions not applicable for multi-label classification.
- if metric in ['roc_auc', 'log_loss',
- 'precision', 'recall', 'f1', 'balanced_accuracy']:
+ if metric in [
+ "roc_auc",
+ "log_loss",
+ "precision",
+ "recall",
+ "f1",
+ "balanced_accuracy",
+ ]:
continue
y_true = np.array([[1, 0, 0], [1, 1, 0], [0, 1, 1], [1, 1, 1]])
y_pred = y_true.copy()
@@ -495,11 +533,11 @@ def test_classification_multilabel(self):
class TestCalculateScore(unittest.TestCase):
-
def test_unsupported_task_type(self):
y_true = np.array([1.0, 1.0, 1.0, 0.0, 0.0, 0.0])
- y_pred = \
- np.array([[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0]])
+ y_pred = np.array(
+ [[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0]]
+ )
scorer = autosklearn.metrics.accuracy
raised = False
@@ -513,17 +551,22 @@ def test_classification_scoring_functions(self):
scoring_functions = list(autosklearn.metrics.CLASSIFICATION_METRICS.values())
scoring_functions.remove(autosklearn.metrics.accuracy)
- fail_metrics = ['precision_samples', 'recall_samples', 'f1_samples']
+ fail_metrics = ["precision_samples", "recall_samples", "f1_samples"]
success_metrics = list(autosklearn.metrics.CLASSIFICATION_METRICS.keys())
for metric in fail_metrics:
success_metrics.remove(metric)
y_true = np.array([1.0, 1.0, 1.0, 0.0, 0.0, 0.0])
- y_pred = \
- np.array([[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0]])
- score_dict = calculate_score(y_true, y_pred, BINARY_CLASSIFICATION,
- autosklearn.metrics.accuracy,
- scoring_functions)
+ y_pred = np.array(
+ [[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0]]
+ )
+ score_dict = calculate_score(
+ y_true,
+ y_pred,
+ BINARY_CLASSIFICATION,
+ autosklearn.metrics.accuracy,
+ scoring_functions,
+ )
self.assertIsInstance(score_dict, dict)
        self.assertEqual(len(success_metrics), len(score_dict))
@@ -531,8 +574,10 @@ def test_classification_scoring_functions(self):
self.assertNotIn(metric, score_dict.keys())
for metric in success_metrics:
self.assertIn(metric, score_dict.keys())
- self.assertAlmostEqual(autosklearn.metrics.CLASSIFICATION_METRICS[metric]._optimum,
- score_dict[metric])
+ self.assertAlmostEqual(
+ autosklearn.metrics.CLASSIFICATION_METRICS[metric]._optimum,
+ score_dict[metric],
+ )
def test_regression_scoring_functions(self):
@@ -540,26 +585,33 @@ def test_regression_scoring_functions(self):
scoring_functions.remove(autosklearn.metrics.root_mean_squared_error)
metrics = list(autosklearn.metrics.REGRESSION_METRICS.keys())
- metrics.remove('mean_squared_log_error')
+ metrics.remove("mean_squared_log_error")
y_true = np.array([1, 2, 3, -4])
y_pred = y_true.copy()
- score_dict = calculate_score(y_true, y_pred, REGRESSION,
- autosklearn.metrics.root_mean_squared_error,
- scoring_functions)
+ score_dict = calculate_score(
+ y_true,
+ y_pred,
+ REGRESSION,
+ autosklearn.metrics.root_mean_squared_error,
+ scoring_functions,
+ )
self.assertIsInstance(score_dict, dict)
        self.assertEqual(len(metrics), len(score_dict))
for metric in metrics:
self.assertIn(metric, score_dict.keys())
- self.assertAlmostEqual(autosklearn.metrics.REGRESSION_METRICS[metric]._optimum,
- score_dict[metric])
+ self.assertAlmostEqual(
+ autosklearn.metrics.REGRESSION_METRICS[metric]._optimum,
+ score_dict[metric],
+ )
def test_classification_only_metric(self):
y_true = np.array([1.0, 1.0, 1.0, 0.0, 0.0, 0.0])
- y_pred = \
- np.array([[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0]])
+ y_pred = np.array(
+ [[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0]]
+ )
scorer = autosklearn.metrics.accuracy
score = calculate_score(y_true, y_pred, BINARY_CLASSIFICATION, scorer)
@@ -602,22 +654,28 @@ def test_calculate_loss():
prediction=y_pred,
task_type=BINARY_CLASSIFICATION,
metric=autosklearn.metrics.accuracy,
- scoring_functions=[autosklearn.metrics.accuracy, autosklearn.metrics.balanced_accuracy]
+ scoring_functions=[
+ autosklearn.metrics.accuracy,
+ autosklearn.metrics.balanced_accuracy,
+ ],
)
expected_score_dict = {
- 'accuracy': 0.9,
- 'balanced_accuracy': 0.9285714285714286,
+ "accuracy": 0.9,
+ "balanced_accuracy": 0.9285714285714286,
}
loss_dict = calculate_loss(
solution=y_true,
prediction=y_pred,
task_type=BINARY_CLASSIFICATION,
metric=autosklearn.metrics.accuracy,
- scoring_functions=[autosklearn.metrics.accuracy, autosklearn.metrics.balanced_accuracy]
+ scoring_functions=[
+ autosklearn.metrics.accuracy,
+ autosklearn.metrics.balanced_accuracy,
+ ],
)
for expected_metric, expected_score in expected_score_dict.items():
assert pytest.approx(expected_score) == score_dict[expected_metric]
- assert pytest.approx(1-expected_score) == loss_dict[expected_metric]
+ assert pytest.approx(1 - expected_score) == loss_dict[expected_metric]
# Lastly make sure that metrics whose optimum is zero
# are also properly working
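The paired score/loss assertions above only establish that, for accuracy-style metrics whose optimum is 1, the reported loss equals 1 minus the score; a plain-Python sketch of that bookkeeping (generalising it to optimum minus score for other metrics is an assumption, not something these tests show):

```python
# Losses here are modelled as optimum - score; the tests above verify this
# only for metrics with optimum 1.0 (accuracy, balanced_accuracy).
def score_to_loss(score: float, optimum: float = 1.0) -> float:
    return optimum - score


expected_scores = {"accuracy": 0.9, "balanced_accuracy": 0.9285714285714286}
expected_losses = {name: score_to_loss(s) for name, s in expected_scores.items()}
assert abs(expected_losses["accuracy"] - 0.1) < 1e-12
assert abs(expected_losses["balanced_accuracy"] - (1 - 0.9285714285714286)) < 1e-12
```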
diff --git a/test/test_optimizer/test_smbo.py b/test/test_optimizer/test_smbo.py
index 4b7f0ffd79..fafd7b5a42 100644
--- a/test/test_optimizer/test_smbo.py
+++ b/test/test_optimizer/test_smbo.py
@@ -1,36 +1,39 @@
import logging.handlers
-from ConfigSpace.configuration_space import Configuration
-
import pytest
+from ConfigSpace.configuration_space import Configuration
import autosklearn.metrics
-from autosklearn.smbo import AutoMLSMBO
import autosklearn.pipeline.util as putil
from autosklearn.automl import AutoML
from autosklearn.constants import BINARY_CLASSIFICATION
from autosklearn.data.xy_data_manager import XYDataManager
+from autosklearn.smbo import AutoMLSMBO
from autosklearn.util.stopwatch import StopWatch
-@pytest.mark.parametrize("context", ['fork', 'forkserver'])
+@pytest.mark.parametrize("context", ["fork", "forkserver"])
def test_smbo_metalearning_configurations(backend, context, dask_client):
# Get the inputs to the optimizer
- X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
- config_space = AutoML(delete_tmp_folder_after_terminate=False,
- metric=autosklearn.metrics.accuracy,
- time_left_for_this_task=20,
- per_run_time_limit=5).fit(
- X_train, Y_train,
- task=BINARY_CLASSIFICATION,
- only_return_configuration_space=True)
+ X_train, Y_train, X_test, Y_test = putil.get_dataset("iris")
+ config_space = AutoML(
+ delete_tmp_folder_after_terminate=False,
+ metric=autosklearn.metrics.accuracy,
+ time_left_for_this_task=20,
+ per_run_time_limit=5,
+ ).fit(
+ X_train,
+ Y_train,
+ task=BINARY_CLASSIFICATION,
+ only_return_configuration_space=True,
+ )
watcher = StopWatch()
# Create an optimizer
smbo = AutoMLSMBO(
config_space=config_space,
- dataset_name='iris',
+ dataset_name="iris",
backend=backend,
total_walltime_limit=10,
func_eval_time_limit=5,
@@ -49,11 +52,13 @@ def test_smbo_metalearning_configurations(backend, context, dask_client):
# Create the inputs to metalearning
datamanager = XYDataManager(
- X_train, Y_train,
- X_test, Y_test,
+ X_train,
+ Y_train,
+ X_test,
+ Y_test,
task=BINARY_CLASSIFICATION,
- dataset_name='iris',
- feat_type={i: 'numerical' for i in range(X_train.shape[1])},
+ dataset_name="iris",
+ feat_type={i: "numerical" for i in range(X_train.shape[1])},
)
backend.save_datamanager(datamanager)
smbo.task = BINARY_CLASSIFICATION
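The XYDataManager call above passes feat_type as a mapping from column index to "numerical" or "categorical"; a short sketch of building such a mapping from a pandas frame, mirroring the dtype check used earlier in this diff (the frame itself is made up):

```python
import pandas as pd

# Columns with pandas "category" dtype are flagged categorical; everything
# else is treated as numerical, matching the {index: "numerical"/"categorical"}
# shape used by the tests above.
X = pd.DataFrame(
    {"age": [23, 31, 57], "color": pd.Categorical(["red", "blue", "red"])}
)
feat_type = {
    i: "categorical" if X[col].dtype.name == "category" else "numerical"
    for i, col in enumerate(X.columns)
}
assert feat_type == {0: "numerical", 1: "categorical"}
```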
diff --git a/test/test_pipeline/components/__init__.py b/test/test_pipeline/components/__init__.py
index 8f0ce6cb7c..92bf78f389 100644
--- a/test/test_pipeline/components/__init__.py
+++ b/test/test_pipeline/components/__init__.py
@@ -1 +1 @@
-__author__ = 'feurerm'
+__author__ = "feurerm"
diff --git a/test/test_pipeline/components/classification/__init__.py b/test/test_pipeline/components/classification/__init__.py
index 8f0ce6cb7c..92bf78f389 100644
--- a/test/test_pipeline/components/classification/__init__.py
+++ b/test/test_pipeline/components/classification/__init__.py
@@ -1 +1 @@
-__author__ = 'feurerm'
+__author__ = "feurerm"
diff --git a/test/test_pipeline/components/classification/test_adaboost.py b/test/test_pipeline/components/classification/test_adaboost.py
index 3c0d96f9a6..f41ba3319f 100644
--- a/test/test_pipeline/components/classification/test_adaboost.py
+++ b/test/test_pipeline/components/classification/test_adaboost.py
@@ -1,7 +1,7 @@
import sklearn.ensemble
-from autosklearn.pipeline.components.classification.adaboost import \
- AdaboostClassifier
+from autosklearn.pipeline.components.classification.adaboost import AdaboostClassifier
+
from .test_base import BaseClassificationComponentTest
diff --git a/test/test_pipeline/components/classification/test_base.py b/test/test_pipeline/components/classification/test_base.py
index 4fc381af56..a524759bc5 100644
--- a/test/test_pipeline/components/classification/test_base.py
+++ b/test/test_pipeline/components/classification/test_base.py
@@ -1,15 +1,18 @@
-from typing import Optional, Dict
+from typing import Dict, Optional
import unittest
-from autosklearn.pipeline.util import _test_classifier, \
- _test_classifier_predict_proba, _test_classifier_iterative_fit
-from autosklearn.pipeline.constants import SPARSE
-
-import sklearn.metrics
import numpy as np
+import sklearn.metrics
-from test.test_pipeline.ignored_warnings import ignore_warnings, classifier_warnings
+from autosklearn.pipeline.constants import SPARSE
+from autosklearn.pipeline.util import (
+ _test_classifier,
+ _test_classifier_iterative_fit,
+ _test_classifier_predict_proba,
+)
+
+from test.test_pipeline.ignored_warnings import classifier_warnings, ignore_warnings
class BaseClassificationComponentTest(unittest.TestCase):
@@ -29,14 +32,14 @@ def test_default_iris(self):
return
for i in range(2):
- predictions, targets, n_calls = \
- _test_classifier(dataset="iris",
- classifier=self.module)
- self.assertAlmostEqual(self.res["default_iris"],
- sklearn.metrics.accuracy_score(targets,
- predictions),
- places=self.res.get(
- "default_iris_places", 7))
+ predictions, targets, n_calls = _test_classifier(
+ dataset="iris", classifier=self.module
+ )
+ self.assertAlmostEqual(
+ self.res["default_iris"],
+ sklearn.metrics.accuracy_score(targets, predictions),
+ places=self.res.get("default_iris_places", 7),
+ )
if self.res.get("iris_n_calls"):
self.assertEqual(self.res["iris_n_calls"], n_calls)
@@ -45,7 +48,7 @@ def test_get_max_iter(self):
if self.__class__ == BaseClassificationComponentTest:
return
- if not hasattr(self.module, 'iterative_fit'):
+ if not hasattr(self.module, "iterative_fit"):
return
self.module.get_max_iter()
@@ -55,23 +58,25 @@ def test_default_iris_iterative_fit(self):
if self.__class__ == BaseClassificationComponentTest:
return
- if not hasattr(self.module, 'iterative_fit'):
+ if not hasattr(self.module, "iterative_fit"):
return
for i in range(2):
- predictions, targets, classifier = \
- _test_classifier_iterative_fit(dataset="iris",
- classifier=self.module)
- self.assertAlmostEqual(self.res["default_iris_iterative"],
- sklearn.metrics.accuracy_score(targets,
- predictions),
- places=self.res.get(
- "default_iris_iterative_places", 7))
+ predictions, targets, classifier = _test_classifier_iterative_fit(
+ dataset="iris", classifier=self.module
+ )
+ self.assertAlmostEqual(
+ self.res["default_iris_iterative"],
+ sklearn.metrics.accuracy_score(targets, predictions),
+ places=self.res.get("default_iris_iterative_places", 7),
+ )
if self.step_hyperparameter is not None:
self.assertEqual(
- getattr(classifier.estimator, self.step_hyperparameter['name']),
- self.res.get("iris_iterative_n_iter", self.step_hyperparameter['value'])
+ getattr(classifier.estimator, self.step_hyperparameter["name"]),
+ self.res.get(
+ "iris_iterative_n_iter", self.step_hyperparameter["value"]
+ ),
)
def test_default_iris_predict_proba(self):
@@ -86,7 +91,7 @@ def test_default_iris_predict_proba(self):
self.assertAlmostEqual(
self.res["default_iris_proba"],
sklearn.metrics.log_loss(targets, predictions),
- places=self.res.get("default_iris_proba_places", 7)
+ places=self.res.get("default_iris_proba_places", 7),
)
def test_default_iris_sparse(self):
@@ -98,15 +103,14 @@ def test_default_iris_sparse(self):
return
for i in range(2):
- predictions, targets, _ = \
- _test_classifier(dataset="iris",
- classifier=self.module,
- sparse=True)
- self.assertAlmostEqual(self.res["default_iris_sparse"],
- sklearn.metrics.accuracy_score(targets,
- predictions),
- places=self.res.get(
- "default_iris_sparse_places", 7))
+ predictions, targets, _ = _test_classifier(
+ dataset="iris", classifier=self.module, sparse=True
+ )
+ self.assertAlmostEqual(
+ self.res["default_iris_sparse"],
+ sklearn.metrics.accuracy_score(targets, predictions),
+ places=self.res.get("default_iris_sparse_places", 7),
+ )
def test_default_digits_binary(self):
@@ -114,15 +118,14 @@ def test_default_digits_binary(self):
return
for i in range(2):
- predictions, targets, _ = \
- _test_classifier(classifier=self.module,
- dataset='digits', sparse=False,
- make_binary=True)
- self.assertAlmostEqual(self.res["default_digits_binary"],
- sklearn.metrics.accuracy_score(
- targets, predictions),
- places=self.res.get(
- "default_digits_binary_places", 7))
+ predictions, targets, _ = _test_classifier(
+ classifier=self.module, dataset="digits", sparse=False, make_binary=True
+ )
+ self.assertAlmostEqual(
+ self.res["default_digits_binary"],
+ sklearn.metrics.accuracy_score(targets, predictions),
+ places=self.res.get("default_digits_binary_places", 7),
+ )
def test_default_digits(self):
@@ -130,14 +133,14 @@ def test_default_digits(self):
return
for i in range(2):
- predictions, targets, n_calls = \
- _test_classifier(dataset="digits",
- classifier=self.module)
- self.assertAlmostEqual(self.res["default_digits"],
- sklearn.metrics.accuracy_score(targets,
- predictions),
- places=self.res.get(
- "default_digits_places", 7))
+ predictions, targets, n_calls = _test_classifier(
+ dataset="digits", classifier=self.module
+ )
+ self.assertAlmostEqual(
+ self.res["default_digits"],
+ sklearn.metrics.accuracy_score(targets, predictions),
+ places=self.res.get("default_digits_places", 7),
+ )
if self.res.get("digits_n_calls"):
self.assertEqual(self.res["digits_n_calls"], n_calls)
@@ -147,23 +150,25 @@ def test_default_digits_iterative_fit(self):
if self.__class__ == BaseClassificationComponentTest:
return
- if not hasattr(self.module, 'iterative_fit'):
+ if not hasattr(self.module, "iterative_fit"):
return
for i in range(2):
- predictions, targets, classifier = \
- _test_classifier_iterative_fit(dataset="digits",
- classifier=self.module)
- self.assertAlmostEqual(self.res["default_digits_iterative"],
- sklearn.metrics.accuracy_score(targets,
- predictions),
- places=self.res.get(
- "default_digits_iterative_places", 7))
+ predictions, targets, classifier = _test_classifier_iterative_fit(
+ dataset="digits", classifier=self.module
+ )
+ self.assertAlmostEqual(
+ self.res["default_digits_iterative"],
+ sklearn.metrics.accuracy_score(targets, predictions),
+ places=self.res.get("default_digits_iterative_places", 7),
+ )
if self.step_hyperparameter is not None:
self.assertEqual(
- getattr(classifier.estimator, self.step_hyperparameter['name']),
- self.res.get("digits_iterative_n_iter", self.step_hyperparameter['value'])
+ getattr(classifier.estimator, self.step_hyperparameter["name"]),
+ self.res.get(
+ "digits_iterative_n_iter", self.step_hyperparameter["value"]
+ ),
)
def test_default_digits_multilabel(self):
@@ -176,15 +181,16 @@ def test_default_digits_multilabel(self):
for _ in range(2):
predictions, targets, _ = _test_classifier(
- classifier=self.module, dataset='digits', make_multilabel=True
+ classifier=self.module, dataset="digits", make_multilabel=True
)
score = sklearn.metrics.precision_score(
- targets, predictions, average='macro', zero_division=0
+ targets, predictions, average="macro", zero_division=0
)
self.assertAlmostEqual(
- self.res["default_digits_multilabel"], score,
- places=self.res.get("default_digits_multilabel_places", 7)
+ self.res["default_digits_multilabel"],
+ score,
+ places=self.res.get("default_digits_multilabel_places", 7),
)
def test_default_digits_multilabel_predict_proba(self):
@@ -196,15 +202,15 @@ def test_default_digits_multilabel_predict_proba(self):
return
for i in range(2):
- predictions, targets = \
- _test_classifier_predict_proba(classifier=self.module,
- make_multilabel=True)
+ predictions, targets = _test_classifier_predict_proba(
+ classifier=self.module, make_multilabel=True
+ )
self.assertEqual(predictions.shape, ((50, 3)))
- self.assertAlmostEqual(self.res["default_digits_multilabel_proba"],
- sklearn.metrics.roc_auc_score(
- targets, predictions, average='macro'),
- places=self.res.get(
- "default_digits_multilabel_proba_places", 7))
+ self.assertAlmostEqual(
+ self.res["default_digits_multilabel_proba"],
+ sklearn.metrics.roc_auc_score(targets, predictions, average="macro"),
+ places=self.res.get("default_digits_multilabel_proba_places", 7),
+ )
def test_target_algorithm_multioutput_multiclass_support(self):
@@ -218,42 +224,66 @@ def test_target_algorithm_multioutput_multiclass_support(self):
X = np.random.random((10, 10))
y = np.random.randint(0, 1, size=(10, 10))
self.assertRaisesRegex(
- ValueError,
- 'bad input shape \\(10, 10\\)',
- cls.fit,
- X,
- y
+ ValueError, "bad input shape \\(10, 10\\)", cls.fit, X, y
)
else:
return
def test_module_idempotent(self):
- """ Fitting twice with the same config gives the same model params.
+ """Fitting twice with the same config gives the same model params.
- This is only valid when the random_state passed is an int. If a
- RandomState object is passed then repeated calls to fit will have
- different results. See the section on "Controlling Randomness" in the
- sklearn docs.
+ This is only valid when the random_state passed is an int. If a
+ RandomState object is passed then repeated calls to fit will have
+ different results. See the section on "Controlling Randomness" in the
+ sklearn docs.
- https://scikit-learn.org/0.24/common_pitfalls.html#controlling-randomness
+ https://scikit-learn.org/0.24/common_pitfalls.html#controlling-randomness
"""
if self.__class__ == BaseClassificationComponentTest:
return
classifier_cls = self.module
- X = np.array([
- [0, 0], [0, 1], [1, 0], [1, 1],
- [0, 0], [0, 1], [1, 0], [1, 1],
- [0, 0], [0, 1], [1, 0], [1, 1],
- [0, 0], [0, 1], [1, 0], [1, 1],
- ])
- y = np.array([
- 0, 1, 1, 0,
- 0, 1, 1, 0,
- 0, 1, 1, 0,
- 0, 1, 1, 0,
- ])
+ X = np.array(
+ [
+ [0, 0],
+ [0, 1],
+ [1, 0],
+ [1, 1],
+ [0, 0],
+ [0, 1],
+ [1, 0],
+ [1, 1],
+ [0, 0],
+ [0, 1],
+ [1, 0],
+ [1, 1],
+ [0, 0],
+ [0, 1],
+ [1, 0],
+ [1, 1],
+ ]
+ )
+ y = np.array(
+ [
+ 0,
+ 1,
+ 1,
+ 0,
+ 0,
+ 1,
+ 1,
+ 0,
+ 0,
+ 1,
+ 1,
+ 0,
+ 0,
+ 1,
+ 1,
+ 0,
+ ]
+ )
# There are certain errors we ignore so we wrap this in a function
def fitted_params(model) -> Optional[Dict]:
@@ -268,12 +298,18 @@ def is_QDA_error(err):
# We are okay if the BaseClassifier in AdaBoostClassifier is worse
# than random so no ensemble can be fit
def is_AdaBoostClassifier_error(err):
- return ("BaseClassifier in AdaBoostClassifier ensemble is worse"
- + " than random, ensemble can not be fit." in err.args[0])
+ return (
+ "BaseClassifier in AdaBoostClassifier ensemble is worse"
+ + " than random, ensemble can not be fit."
+ in err.args[0]
+ )
def is_unset_param_raw_predictions_val_error(err):
- return ("local variable 'raw_predictions_val' referenced before"
- + " assignment" in err.args[0])
+ return (
+ "local variable 'raw_predictions_val' referenced before"
+ + " assignment"
+ in err.args[0]
+ )
try:
with ignore_warnings(classifier_warnings):
@@ -288,7 +324,7 @@ def is_unset_param_raw_predictions_val_error(err):
return model.estimator.get_params()
# We ignore certain keys when comparing
- param_keys_ignored = ['base_estimator']
+ param_keys_ignored = ["base_estimator"]
# We use the default config + sampled ones
configuration_space = classifier_cls.get_hyperparameter_search_space()
@@ -302,12 +338,12 @@ def is_unset_param_raw_predictions_val_error(err):
# Get the parameters on the first and second fit with config params
params_first = fitted_params(classifier)
- if hasattr(classifier.estimator, 'random_state'):
+ if hasattr(classifier.estimator, "random_state"):
rs_1 = classifier.random_state
rs_estimator_1 = classifier.estimator.random_state
params_second = fitted_params(classifier)
- if hasattr(classifier.estimator, 'random_state'):
+ if hasattr(classifier.estimator, "random_state"):
rs_2 = classifier.random_state
rs_estimator_2 = classifier.estimator.random_state
@@ -322,10 +358,13 @@ def is_unset_param_raw_predictions_val_error(err):
del params[key]
# They should have equal parameters
- self.assertEqual(params_first, params_second,
- f"Failed with model args {model_args}")
- if hasattr(classifier.estimator, 'random_state'):
- assert all([
- seed == random_state
- for random_state in [rs_1, rs_estimator_1, rs_2, rs_estimator_2]
- ])
+ self.assertEqual(
+ params_first, params_second, f"Failed with model args {model_args}"
+ )
+ if hasattr(classifier.estimator, "random_state"):
+ assert all(
+ [
+ seed == random_state
+ for random_state in [rs_1, rs_estimator_1, rs_2, rs_estimator_2]
+ ]
+ )
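The idempotency test above leans on a scikit-learn subtlety that its docstring only references: an integer random_state re-seeds the estimator on every call to fit, whereas a RandomState instance is consumed and advances between calls. A minimal sketch of that distinction, using plain scikit-learn (SGDClassifier is an arbitrary estimator that uses randomness during fit, not something taken from the patch):

import numpy as np
from sklearn.linear_model import SGDClassifier

X = np.random.RandomState(0).random_sample((20, 3))
y = np.array([0, 1] * 10)

# Integer seed: each fit starts from the same RNG state, so refitting
# reproduces exactly the same coefficients.
clf = SGDClassifier(random_state=1)
first = clf.fit(X, y).coef_.copy()
second = clf.fit(X, y).coef_.copy()
assert np.allclose(first, second)

# RandomState instance: the second fit continues from wherever the first one
# left the generator, so the coefficients typically differ (see the
# "Controlling randomness" section of the scikit-learn user guide).
rng = np.random.RandomState(1)
clf = SGDClassifier(random_state=rng)
first = clf.fit(X, y).coef_.copy()
second = clf.fit(X, y).coef_.copy()
# np.allclose(first, second) is usually False here.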
diff --git a/test/test_pipeline/components/classification/test_bernoulli_nb.py b/test/test_pipeline/components/classification/test_bernoulli_nb.py
index 8384119393..2def3a385f 100644
--- a/test/test_pipeline/components/classification/test_bernoulli_nb.py
+++ b/test/test_pipeline/components/classification/test_bernoulli_nb.py
@@ -1,7 +1,6 @@
import sklearn.naive_bayes
-from autosklearn.pipeline.components.classification.bernoulli_nb import \
- BernoulliNB
+from autosklearn.pipeline.components.classification.bernoulli_nb import BernoulliNB
from .test_base import BaseClassificationComponentTest
diff --git a/test/test_pipeline/components/classification/test_decision_tree.py b/test/test_pipeline/components/classification/test_decision_tree.py
index e32a6536c7..546040e645 100644
--- a/test/test_pipeline/components/classification/test_decision_tree.py
+++ b/test/test_pipeline/components/classification/test_decision_tree.py
@@ -1,7 +1,6 @@
import sklearn.tree
-from autosklearn.pipeline.components.classification.decision_tree import \
- DecisionTree
+from autosklearn.pipeline.components.classification.decision_tree import DecisionTree
from .test_base import BaseClassificationComponentTest
diff --git a/test/test_pipeline/components/classification/test_extra_trees.py b/test/test_pipeline/components/classification/test_extra_trees.py
index e7b1935db0..213bfbd916 100644
--- a/test/test_pipeline/components/classification/test_extra_trees.py
+++ b/test/test_pipeline/components/classification/test_extra_trees.py
@@ -1,6 +1,8 @@
import sklearn.ensemble
-from autosklearn.pipeline.components.classification.extra_trees import ExtraTreesClassifier
+from autosklearn.pipeline.components.classification.extra_trees import (
+ ExtraTreesClassifier,
+)
from .test_base import BaseClassificationComponentTest
@@ -12,12 +14,12 @@ class ExtraTreesComponentTest(BaseClassificationComponentTest):
res = dict()
res["default_iris"] = 0.96
res["iris_n_calls"] = 9
- res["default_iris_iterative"] = res['default_iris']
+ res["default_iris_iterative"] = res["default_iris"]
res["default_iris_proba"] = 0.10053485167017469
res["default_iris_sparse"] = 0.74
res["default_digits"] = 0.9216757741347905
res["digits_n_calls"] = 9
- res["default_digits_iterative"] = res['default_digits']
+ res["default_digits_iterative"] = res["default_digits"]
res["default_digits_iterative_places"] = 3
res["default_digits_binary"] = 0.994535519125683
res["default_digits_multilabel"] = 0.9983621593291405
@@ -26,6 +28,6 @@ class ExtraTreesComponentTest(BaseClassificationComponentTest):
sk_mod = sklearn.ensemble.ExtraTreesClassifier
module = ExtraTreesClassifier
step_hyperparameter = {
- 'name': 'n_estimators',
- 'value': module.get_max_iter(),
+ "name": "n_estimators",
+ "value": module.get_max_iter(),
}
diff --git a/test/test_pipeline/components/classification/test_gaussian_nb.py b/test/test_pipeline/components/classification/test_gaussian_nb.py
index ea5ce7cc5b..2f813b4293 100644
--- a/test/test_pipeline/components/classification/test_gaussian_nb.py
+++ b/test/test_pipeline/components/classification/test_gaussian_nb.py
@@ -1,7 +1,6 @@
import sklearn.naive_bayes
-from autosklearn.pipeline.components.classification.gaussian_nb import \
- GaussianNB
+from autosklearn.pipeline.components.classification.gaussian_nb import GaussianNB
from .test_base import BaseClassificationComponentTest
diff --git a/test/test_pipeline/components/classification/test_gradient_boosting.py b/test/test_pipeline/components/classification/test_gradient_boosting.py
index efa3a3cca8..4bfadfa74c 100644
--- a/test/test_pipeline/components/classification/test_gradient_boosting.py
+++ b/test/test_pipeline/components/classification/test_gradient_boosting.py
@@ -1,7 +1,8 @@
import sklearn.ensemble
-from autosklearn.pipeline.components.classification.gradient_boosting import \
- GradientBoostingClassifier
+from autosklearn.pipeline.components.classification.gradient_boosting import (
+ GradientBoostingClassifier,
+)
from .test_base import BaseClassificationComponentTest
@@ -24,6 +25,6 @@ class GradientBoostingComponentTest(BaseClassificationComponentTest):
sk_mod = sklearn.ensemble.ExtraTreesClassifier
module = GradientBoostingClassifier
step_hyperparameter = {
- 'name': 'max_iter',
- 'value': module.get_max_iter(),
+ "name": "max_iter",
+ "value": module.get_max_iter(),
}
diff --git a/test/test_pipeline/components/classification/test_k_nearest_neighbor.py b/test/test_pipeline/components/classification/test_k_nearest_neighbor.py
index 8209e2a674..d09512d07d 100644
--- a/test/test_pipeline/components/classification/test_k_nearest_neighbor.py
+++ b/test/test_pipeline/components/classification/test_k_nearest_neighbor.py
@@ -1,7 +1,8 @@
import sklearn.neighbors
-from autosklearn.pipeline.components.classification.k_nearest_neighbors import \
- KNearestNeighborsClassifier
+from autosklearn.pipeline.components.classification.k_nearest_neighbors import (
+ KNearestNeighborsClassifier,
+)
from .test_base import BaseClassificationComponentTest
diff --git a/test/test_pipeline/components/classification/test_liblinear.py b/test/test_pipeline/components/classification/test_liblinear.py
index bb2d2a1894..1aec8e227e 100644
--- a/test/test_pipeline/components/classification/test_liblinear.py
+++ b/test/test_pipeline/components/classification/test_liblinear.py
@@ -1,7 +1,6 @@
import sklearn.svm
-from autosklearn.pipeline.components.classification.liblinear_svc import \
- LibLinear_SVC
+from autosklearn.pipeline.components.classification.liblinear_svc import LibLinear_SVC
from .test_base import BaseClassificationComponentTest
diff --git a/test/test_pipeline/components/classification/test_libsvm_svc.py b/test/test_pipeline/components/classification/test_libsvm_svc.py
index dcab429fc1..6fe95f5b62 100644
--- a/test/test_pipeline/components/classification/test_libsvm_svc.py
+++ b/test/test_pipeline/components/classification/test_libsvm_svc.py
@@ -2,8 +2,7 @@
import sklearn.svm
from autosklearn.pipeline.components.classification.libsvm_svc import LibSVM_SVC
-from autosklearn.pipeline.util import get_dataset, \
- _test_classifier_predict_proba
+from autosklearn.pipeline.util import _test_classifier_predict_proba, get_dataset
from .test_base import BaseClassificationComponentTest
@@ -30,22 +29,23 @@ def test_default_configuration_predict_proba_individual(self):
# Leave this additional test here
for i in range(2):
predictions, targets = _test_classifier_predict_proba(
- LibSVM_SVC, sparse=True, dataset='digits',
- train_size_maximum=500)
- self.assertAlmostEqual(5.273502056835706,
- sklearn.metrics.log_loss(targets,
- predictions))
+ LibSVM_SVC, sparse=True, dataset="digits", train_size_maximum=500
+ )
+ self.assertAlmostEqual(
+ 5.273502056835706, sklearn.metrics.log_loss(targets, predictions)
+ )
for i in range(2):
predictions, targets = _test_classifier_predict_proba(
- LibSVM_SVC, sparse=True, dataset='iris')
- self.assertAlmostEqual(0.8408320837510618,
- sklearn.metrics.log_loss(targets,
- predictions))
+ LibSVM_SVC, sparse=True, dataset="iris"
+ )
+ self.assertAlmostEqual(
+ 0.8408320837510618, sklearn.metrics.log_loss(targets, predictions)
+ )
# 2 class
for i in range(2):
- X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris')
+ X_train, Y_train, X_test, Y_test = get_dataset(dataset="iris")
remove_training_data = Y_train == 2
remove_test_data = Y_test == 2
X_train = X_train[~remove_training_data]
@@ -57,11 +57,19 @@ def test_default_configuration_predict_proba_individual(self):
configuration_space = LibSVM_SVC.get_hyperparameter_search_space()
default = configuration_space.get_default_configuration()
- cls = LibSVM_SVC(random_state=1, **{hp_name: default[hp_name]
- for hp_name in default
- if default[hp_name] is not None})
+ cls = LibSVM_SVC(
+ random_state=1,
+ **{
+ hp_name: default[hp_name]
+ for hp_name in default
+ if default[hp_name] is not None
+ },
+ )
cls = cls.fit(X_train, Y_train)
prediction = cls.predict_proba(X_test)
- self.assertAlmostEqual(sklearn.metrics.log_loss(Y_test, prediction),
- 0.6927962762794081, places=4)
+ self.assertAlmostEqual(
+ sklearn.metrics.log_loss(Y_test, prediction),
+ 0.6927962762794081,
+ places=4,
+ )
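The way cls is built above — take the default configuration of the component's search space and drop entries whose value is None before unpacking them as keyword arguments — is a recurring pattern in these tests (test_multinomial_nb.py below uses it too). A hedged sketch of the same pattern as a helper; the function name is invented for illustration and is not part of the code base:

def component_from_default(component_cls, random_state=1):
    # Instantiate a pipeline component from its default configuration,
    # skipping hyperparameters whose default value is None.
    cs = component_cls.get_hyperparameter_search_space()
    default = cs.get_default_configuration()
    params = {name: default[name] for name in default if default[name] is not None}
    return component_cls(random_state=random_state, **params)

# e.g. component_from_default(LibSVM_SVC).fit(X_train, Y_train)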
diff --git a/test/test_pipeline/components/classification/test_mlp.py b/test/test_pipeline/components/classification/test_mlp.py
index b8d559b1bc..e1c4286d83 100644
--- a/test/test_pipeline/components/classification/test_mlp.py
+++ b/test/test_pipeline/components/classification/test_mlp.py
@@ -43,6 +43,6 @@ class MLPComponentTest(BaseClassificationComponentTest):
sk_mod = sklearn.neural_network.MLPClassifier
module = MLPClassifier
step_hyperparameter = {
- 'name': 'n_iter_',
- 'value': module.get_max_iter(),
+ "name": "n_iter_",
+ "value": module.get_max_iter(),
}
diff --git a/test/test_pipeline/components/classification/test_multinomial_nb.py b/test/test_pipeline/components/classification/test_multinomial_nb.py
index 2c982c41ef..c82b938679 100644
--- a/test/test_pipeline/components/classification/test_multinomial_nb.py
+++ b/test/test_pipeline/components/classification/test_multinomial_nb.py
@@ -1,10 +1,8 @@
import numpy as np
-
import sklearn.naive_bayes
import sklearn.preprocessing
-from autosklearn.pipeline.components.classification.multinomial_nb import \
- MultinomialNB
+from autosklearn.pipeline.components.classification.multinomial_nb import MultinomialNB
from autosklearn.pipeline.util import get_dataset
from .test_base import BaseClassificationComponentTest
@@ -32,17 +30,21 @@ class MultinomialNBComponentTest(BaseClassificationComponentTest):
def test_default_configuration_negative_values(self):
# Custon preprocessing test to check if clipping to zero works
- X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
+ X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits")
ss = sklearn.preprocessing.StandardScaler()
X_train = ss.fit_transform(X_train)
configuration_space = MultinomialNB.get_hyperparameter_search_space()
default = configuration_space.get_default_configuration()
- cls = MultinomialNB(random_state=1, **{hp_name: default[hp_name]
- for hp_name in default
- if default[hp_name] is not None})
+ cls = MultinomialNB(
+ random_state=1,
+ **{
+ hp_name: default[hp_name]
+ for hp_name in default
+ if default[hp_name] is not None
+ },
+ )
cls = cls.fit(X_train, Y_train)
prediction = cls.predict(X_test)
- self.assertAlmostEqual(np.nanmean(prediction == Y_test),
- 0.88888888888888884)
+ self.assertAlmostEqual(np.nanmean(prediction == Y_test), 0.88888888888888884)
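The "clipping to zero" that the test comment refers to matters because scikit-learn's MultinomialNB rejects negative feature values, and the StandardScaler step deliberately introduces them. A small illustration of the underlying constraint, using only the public scikit-learn API (the exact error text may vary between versions):

import numpy as np
import sklearn.naive_bayes

X = np.array([[1.0, -0.5], [2.0, 0.3]])
y = np.array([0, 1])

try:
    sklearn.naive_bayes.MultinomialNB().fit(X, y)
except ValueError as err:
    print(err)  # complains about negative values passed to MultinomialNB

# Clipping the negatives away (what the component is expected to do) makes it fit.
sklearn.naive_bayes.MultinomialNB().fit(np.clip(X, 0, None), y)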
diff --git a/test/test_pipeline/components/classification/test_passive_aggressive.py b/test/test_pipeline/components/classification/test_passive_aggressive.py
index d904f9e569..b83dbaf120 100644
--- a/test/test_pipeline/components/classification/test_passive_aggressive.py
+++ b/test/test_pipeline/components/classification/test_passive_aggressive.py
@@ -1,7 +1,8 @@
import sklearn.linear_model
-from autosklearn.pipeline.components.classification.passive_aggressive import \
- PassiveAggressive
+from autosklearn.pipeline.components.classification.passive_aggressive import (
+ PassiveAggressive,
+)
from .test_base import BaseClassificationComponentTest
@@ -13,13 +14,13 @@ class PassiveAggressiveComponentTest(BaseClassificationComponentTest):
res = dict()
res["default_iris"] = 0.98
res["iris_n_calls"] = 6
- res["default_iris_iterative"] = res['default_iris']
+ res["default_iris_iterative"] = res["default_iris"]
res["iris_iterative_n_iter"] = 64
res["default_iris_proba"] = 0.27840521921952033
res["default_iris_sparse"] = 0.48
res["default_digits"] = 0.9162112932604736
res["digits_n_calls"] = 6
- res["default_digits_iterative"] = res['default_digits']
+ res["default_digits_iterative"] = res["default_digits"]
res["digits_iterative_n_iter"] = 64
res["default_digits_binary"] = 0.99210686095932
res["default_digits_multilabel"] = 0.910908768565592
@@ -29,6 +30,6 @@ class PassiveAggressiveComponentTest(BaseClassificationComponentTest):
module = PassiveAggressive
step_hyperparameter = {
- 'name': 'max_iter',
- 'value': module.get_max_iter(),
+ "name": "max_iter",
+ "value": module.get_max_iter(),
}
diff --git a/test/test_pipeline/components/classification/test_random_forest.py b/test/test_pipeline/components/classification/test_random_forest.py
index 8e2c1136d3..f96869c270 100644
--- a/test/test_pipeline/components/classification/test_random_forest.py
+++ b/test/test_pipeline/components/classification/test_random_forest.py
@@ -1,7 +1,6 @@
import sklearn.ensemble
-from autosklearn.pipeline.components.classification.random_forest import \
- RandomForest
+from autosklearn.pipeline.components.classification.random_forest import RandomForest
from .test_base import BaseClassificationComponentTest
@@ -13,12 +12,12 @@ class RandomForestComponentTest(BaseClassificationComponentTest):
res = dict()
res["default_iris"] = 0.96
res["iris_n_calls"] = 9
- res["default_iris_iterative"] = res['default_iris']
+ res["default_iris_iterative"] = res["default_iris"]
res["default_iris_proba"] = 0.0996785324703419
res["default_iris_sparse"] = 0.85999999999999999
res["default_digits"] = 0.8998178506375227
res["digits_n_calls"] = 9
- res["default_digits_iterative"] = res['default_digits']
+ res["default_digits_iterative"] = res["default_digits"]
res["default_digits_binary"] = 0.9896782027929569
res["default_digits_multilabel"] = 0.9973653110879388
res["default_digits_multilabel_proba"] = 0.9965660960196189
@@ -26,6 +25,6 @@ class RandomForestComponentTest(BaseClassificationComponentTest):
sk_mod = sklearn.ensemble.RandomForestClassifier
module = RandomForest
step_hyperparameter = {
- 'name': 'n_estimators',
- 'value': module.get_max_iter(),
+ "name": "n_estimators",
+ "value": module.get_max_iter(),
}
diff --git a/test/test_pipeline/components/classification/test_sgd.py b/test/test_pipeline/components/classification/test_sgd.py
index defe8af81d..8f1d7821e1 100644
--- a/test/test_pipeline/components/classification/test_sgd.py
+++ b/test/test_pipeline/components/classification/test_sgd.py
@@ -1,6 +1,7 @@
import sklearn.linear_model
from autosklearn.pipeline.components.classification.sgd import SGD
+
from .test_base import BaseClassificationComponentTest
@@ -11,12 +12,12 @@ class SGDComponentTest(BaseClassificationComponentTest):
res = dict()
res["default_iris"] = 0.69999999999999996
res["iris_n_calls"] = 9
- res["default_iris_iterative"] = res['default_iris']
+ res["default_iris_iterative"] = res["default_iris"]
res["default_iris_proba"] = 0.5996114465819011
res["default_iris_sparse"] = 0.54
res["default_digits"] = 0.9198542805100182
res["digits_n_calls"] = 7
- res["default_digits_iterative"] = res['default_digits']
+ res["default_digits_iterative"] = res["default_digits"]
res["default_digits_binary"] = 0.9951426836672739
res["default_digits_multilabel"] = -1
res["default_digits_multilabel_proba"] = -1
diff --git a/test/test_pipeline/components/data_preprocessing/__init__.py b/test/test_pipeline/components/data_preprocessing/__init__.py
index 8f0ce6cb7c..92bf78f389 100644
--- a/test/test_pipeline/components/data_preprocessing/__init__.py
+++ b/test/test_pipeline/components/data_preprocessing/__init__.py
@@ -1 +1 @@
-__author__ = 'feurerm'
+__author__ = "feurerm"
diff --git a/test/test_pipeline/components/data_preprocessing/test_balancing.py b/test/test_pipeline/components/data_preprocessing/test_balancing.py
index 268a8ea542..cf8dc103b8 100644
--- a/test/test_pipeline/components/data_preprocessing/test_balancing.py
+++ b/test/test_pipeline/components/data_preprocessing/test_balancing.py
@@ -1,4 +1,4 @@
-__author__ = 'feurerm'
+__author__ = "feurerm"
import copy
import unittest
@@ -7,86 +7,111 @@
import sklearn.datasets
import sklearn.metrics
-from autosklearn.pipeline.components.data_preprocessing.balancing.balancing \
- import Balancing
from autosklearn.pipeline.classification import SimpleClassificationPipeline
from autosklearn.pipeline.components.classification.adaboost import AdaboostClassifier
from autosklearn.pipeline.components.classification.decision_tree import DecisionTree
-from autosklearn.pipeline.components.classification.extra_trees import ExtraTreesClassifier
-from autosklearn.pipeline.components.classification.random_forest import RandomForest
+from autosklearn.pipeline.components.classification.extra_trees import (
+ ExtraTreesClassifier,
+)
+from autosklearn.pipeline.components.classification.gradient_boosting import (
+ GradientBoostingClassifier,
+)
from autosklearn.pipeline.components.classification.liblinear_svc import LibLinear_SVC
from autosklearn.pipeline.components.classification.libsvm_svc import LibSVM_SVC
+from autosklearn.pipeline.components.classification.passive_aggressive import (
+ PassiveAggressive,
+)
+from autosklearn.pipeline.components.classification.random_forest import RandomForest
from autosklearn.pipeline.components.classification.sgd import SGD
-from autosklearn.pipeline.components.classification.gradient_boosting \
- import GradientBoostingClassifier
-from autosklearn.pipeline.components.classification.passive_aggressive import PassiveAggressive
-from autosklearn.pipeline.components.feature_preprocessing\
- .extra_trees_preproc_for_classification import ExtraTreesPreprocessorClassification
-from autosklearn.pipeline.components.feature_preprocessing.liblinear_svc_preprocessor import \
- LibLinear_Preprocessor
+from autosklearn.pipeline.components.data_preprocessing.balancing.balancing import (
+ Balancing,
+)
+from autosklearn.pipeline.components.feature_preprocessing.extra_trees_preproc_for_classification import ( # noqa: E501
+ ExtraTreesPreprocessorClassification,
+)
+from autosklearn.pipeline.components.feature_preprocessing.liblinear_svc_preprocessor import ( # noqa: E501
+ LibLinear_Preprocessor,
+)
class BalancingComponentTest(unittest.TestCase):
def test_balancing_get_weights_treed_single_label(self):
Y = np.array([0] * 80 + [1] * 20)
- balancing = Balancing(strategy='weighting')
- init_params, fit_params = balancing.get_weights(
- Y, 'adaboost', None, None, None)
+ balancing = Balancing(strategy="weighting")
+ init_params, fit_params = balancing.get_weights(Y, "adaboost", None, None, None)
self.assertAlmostEqual(
- np.mean(fit_params['classifier:sample_weight']), 1,
+ np.mean(fit_params["classifier:sample_weight"]),
+ 1,
)
np.testing.assert_allclose(
- fit_params['classifier:sample_weight'],
+ fit_params["classifier:sample_weight"],
np.array([0.625] * 80 + [2.5] * 20),
)
def test_balancing_get_weights_treed_multilabel(self):
- Y = np.array([[0, 0, 0]] * 100 + [[1, 0, 0]] * 100 + [[0, 1, 0]] * 100 +
- [[1, 1, 0]] * 100 + [[0, 0, 1]] * 100 + [[1, 0, 1]] * 10)
- balancing = Balancing(strategy='weighting')
- init_params, fit_params = balancing.get_weights(
- Y, 'adaboost', None, None, None)
- print(fit_params['classifier:sample_weight'])
+ Y = np.array(
+ [[0, 0, 0]] * 100
+ + [[1, 0, 0]] * 100
+ + [[0, 1, 0]] * 100
+ + [[1, 1, 0]] * 100
+ + [[0, 0, 1]] * 100
+ + [[1, 0, 1]] * 10
+ )
+ balancing = Balancing(strategy="weighting")
+ init_params, fit_params = balancing.get_weights(Y, "adaboost", None, None, None)
+ print(fit_params["classifier:sample_weight"])
self.assertAlmostEqual(
- np.mean(fit_params['classifier:sample_weight']), 1,
+ np.mean(fit_params["classifier:sample_weight"]),
+ 1,
)
np.testing.assert_allclose(
- fit_params['classifier:sample_weight'],
+ fit_params["classifier:sample_weight"],
np.array([0.85] * 500 + [8.5] * 10),
)
def test_balancing_get_weights_svm_sgd(self):
Y = np.array([0] * 80 + [1] * 20)
- balancing = Balancing(strategy='weighting')
+ balancing = Balancing(strategy="weighting")
init_params, fit_params = balancing.get_weights(
- Y, 'libsvm_svc', None, None, None)
- self.assertEqual(("classifier:class_weight", "balanced"),
- list(init_params.items())[0])
+ Y, "libsvm_svc", None, None, None
+ )
+ self.assertEqual(
+ ("classifier:class_weight", "balanced"), list(init_params.items())[0]
+ )
init_params, fit_params = balancing.get_weights(
- Y, None, 'liblinear_svc_preprocessor', None, None)
- self.assertEqual(("feature_preprocessor:class_weight", "balanced"),
- list(init_params.items())[0])
+ Y, None, "liblinear_svc_preprocessor", None, None
+ )
+ self.assertEqual(
+ ("feature_preprocessor:class_weight", "balanced"),
+ list(init_params.items())[0],
+ )
def test_weighting_effect(self):
data = sklearn.datasets.make_classification(
- n_samples=200, n_features=10, n_redundant=2, n_informative=2,
- n_repeated=2, n_clusters_per_class=2, weights=[0.8, 0.2],
- random_state=1)
+ n_samples=200,
+ n_features=10,
+ n_redundant=2,
+ n_informative=2,
+ n_repeated=2,
+ n_clusters_per_class=2,
+ weights=[0.8, 0.2],
+ random_state=1,
+ )
for name, clf, acc_no_weighting, acc_weighting, places in [
- ('adaboost', AdaboostClassifier, 0.810, 0.735, 3),
- ('decision_tree', DecisionTree, 0.780, 0.643, 3),
- ('extra_trees', ExtraTreesClassifier, 0.78, 0.8, 3),
- ('random_forest', RandomForest, 0.75, 0.789, 3),
- ('libsvm_svc', LibSVM_SVC, 0.769, 0.72, 3),
- ('liblinear_svc', LibLinear_SVC, 0.762, 0.735, 3),
- ('passive_aggressive', PassiveAggressive, 0.16, 0.222, 3),
- ('sgd', SGD, 0.818, 0.567, 2),
- ('gradient_boosting', GradientBoostingClassifier, 0.666, 0.682, 2)
- ]:
+ ("adaboost", AdaboostClassifier, 0.810, 0.735, 3),
+ ("decision_tree", DecisionTree, 0.780, 0.643, 3),
+ ("extra_trees", ExtraTreesClassifier, 0.78, 0.8, 3),
+ ("random_forest", RandomForest, 0.75, 0.789, 3),
+ ("libsvm_svc", LibSVM_SVC, 0.769, 0.72, 3),
+ ("liblinear_svc", LibLinear_SVC, 0.762, 0.735, 3),
+ ("passive_aggressive", PassiveAggressive, 0.16, 0.222, 3),
+ ("sgd", SGD, 0.818, 0.567, 2),
+ ("gradient_boosting", GradientBoostingClassifier, 0.666, 0.682, 2),
+ ]:
for strategy, acc in [
- ('none', acc_no_weighting),
- ('weighting', acc_weighting)
+ ("none", acc_no_weighting),
+ ("weighting", acc_weighting),
]:
# Fit
data_ = copy.copy(data)
@@ -98,23 +123,25 @@ def test_weighting_effect(self):
model_args = {
"random_state": 1,
"include": {
- 'classifier': [name],
- 'feature_preprocessor': ['no_preprocessing']
- }
+ "classifier": [name],
+ "feature_preprocessor": ["no_preprocessing"],
+ },
}
classifier = SimpleClassificationPipeline(**model_args)
cs = classifier.get_hyperparameter_search_space()
default = cs.get_default_configuration()
- default._values['balancing:strategy'] = strategy
+ default._values["balancing:strategy"] = strategy
classifier = SimpleClassificationPipeline(config=default, **model_args)
classifier.fit(X_train, Y_train)
predictions1 = classifier.predict(X_test)
self.assertAlmostEqual(
- sklearn.metrics.f1_score(predictions1, Y_test), acc,
- places=places, msg=(name, strategy)
+ sklearn.metrics.f1_score(predictions1, Y_test),
+ acc,
+ places=places,
+ msg=(name, strategy),
)
# fit_transformer and fit_estimator
@@ -130,39 +157,53 @@ def test_weighting_effect(self):
predictions2 = classifier.predict(X_test)
np.testing.assert_allclose(
- predictions1, predictions2,
- err_msg=f"name = {name}, strategy = {strategy}"
+ predictions1,
+ predictions2,
+ err_msg=f"name = {name}, strategy = {strategy}",
)
self.assertAlmostEqual(
- sklearn.metrics.f1_score(predictions2, Y_test), acc,
- places=places, msg=(name, strategy)
+ sklearn.metrics.f1_score(predictions2, Y_test),
+ acc,
+ places=places,
+ msg=(name, strategy),
)
- for name, pre, acc_no_weighting, acc_weighting in \
- [('extra_trees_preproc_for_classification',
- ExtraTreesPreprocessorClassification, 0.810, 0.590),
- ('liblinear_svc_preprocessor', LibLinear_Preprocessor,
- 0.837, 0.562)]:
- for strategy, acc in [('none', acc_no_weighting),
- ('weighting', acc_weighting)]:
+ for name, pre, acc_no_weighting, acc_weighting in [
+ (
+ "extra_trees_preproc_for_classification",
+ ExtraTreesPreprocessorClassification,
+ 0.810,
+ 0.590,
+ ),
+ ("liblinear_svc_preprocessor", LibLinear_Preprocessor, 0.837, 0.562),
+ ]:
+ for strategy, acc in [
+ ("none", acc_no_weighting),
+ ("weighting", acc_weighting),
+ ]:
data_ = copy.copy(data)
X_train = data_[0][:100]
Y_train = data_[1][:100]
X_test = data_[0][100:]
Y_test = data_[1][100:]
- include = {'classifier': ['sgd'], 'feature_preprocessor': [name]}
+ include = {"classifier": ["sgd"], "feature_preprocessor": [name]}
- classifier = SimpleClassificationPipeline(random_state=1, include=include)
+ classifier = SimpleClassificationPipeline(
+ random_state=1, include=include
+ )
cs = classifier.get_hyperparameter_search_space()
default = cs.get_default_configuration()
- default._values['balancing:strategy'] = strategy
+ default._values["balancing:strategy"] = strategy
classifier.set_hyperparameters(default)
predictor = classifier.fit(X_train, Y_train)
predictions = predictor.predict(X_test)
self.assertAlmostEqual(
- sklearn.metrics.f1_score(predictions, Y_test), acc,
- places=3, msg=(name, strategy))
+ sklearn.metrics.f1_score(predictions, Y_test),
+ acc,
+ places=3,
+ msg=(name, strategy),
+ )
# fit_transformer and fit_estimator
data_ = copy.copy(data)
@@ -171,11 +212,13 @@ def test_weighting_effect(self):
X_test = data_[0][100:]
Y_test = data_[1][100:]
- default._values['balancing:strategy'] = strategy
- classifier = SimpleClassificationPipeline(default, random_state=1, include=include)
+ default._values["balancing:strategy"] = strategy
+ classifier = SimpleClassificationPipeline(
+ default, random_state=1, include=include
+ )
Xt, fit_params = classifier.fit_transformer(X_train, Y_train)
classifier.fit_estimator(Xt, Y_train, **fit_params)
predictions = classifier.predict(X_test)
self.assertAlmostEqual(
- sklearn.metrics.f1_score(predictions, Y_test), acc,
- places=3)
+ sklearn.metrics.f1_score(predictions, Y_test), acc, places=3
+ )
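The expected weights in the balancing tests above (0.625 for the 80-sample majority class and 2.5 for the 20-sample minority class, with mean 1.0) coincide with scikit-learn's "balanced" heuristic n_samples / (n_classes * n_c); in the multilabel case each distinct label combination plays the role of a class, giving 510 / (6 * 100) = 0.85 and 510 / (6 * 10) = 8.5. A quick arithmetic check using only the public scikit-learn API:

import numpy as np
from sklearn.utils.class_weight import compute_class_weight

y = np.array([0] * 80 + [1] * 20)
weights = compute_class_weight(class_weight="balanced", classes=np.array([0, 1]), y=y)
print(weights)            # [0.625 2.5] -> 100 / (2 * 80) and 100 / (2 * 20)
print(weights[y].mean())  # 1.0, matching the assertion in the test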
diff --git a/test/test_pipeline/components/data_preprocessing/test_categorical_imputation.py b/test/test_pipeline/components/data_preprocessing/test_categorical_imputation.py
index 2767093179..d50e8cf842 100644
--- a/test/test_pipeline/components/data_preprocessing/test_categorical_imputation.py
+++ b/test/test_pipeline/components/data_preprocessing/test_categorical_imputation.py
@@ -1,11 +1,11 @@
import numpy as np
-from scipy import sparse
-
import pandas as pd
import pytest
+from scipy import sparse
-from autosklearn.pipeline.components.data_preprocessing.imputation.categorical_imputation\
- import CategoricalImputation
+from autosklearn.pipeline.components.data_preprocessing.imputation.categorical_imputation import ( # noqa: E501
+ CategoricalImputation,
+)
@pytest.fixture
@@ -14,15 +14,15 @@ def input_data_imputation(request):
X = np.array(np.random.randint(3, 10, size=size), dtype=float)
mask = np.logical_not(np.random.randint(0, 5, size=size), dtype=bool)
X[mask] = np.nan
- if request.param == 'numpy':
+ if request.param == "numpy":
pass
- elif request.param == 'pandas':
+ elif request.param == "pandas":
X = pd.DataFrame(X)
return X, mask
-@pytest.mark.parametrize('input_data_imputation', ('numpy', 'pandas'), indirect=True)
-@pytest.mark.parametrize('categorical', (True, False))
+@pytest.mark.parametrize("input_data_imputation", ("numpy", "pandas"), indirect=True)
+@pytest.mark.parametrize("categorical", (True, False))
def test_default_imputation(input_data_imputation, categorical):
"""
Makes sure that imputation works for both numerical and categorical data.
@@ -30,8 +30,8 @@ def test_default_imputation(input_data_imputation, categorical):
"""
X, mask = input_data_imputation
if categorical:
- imputation_value = 'missing_value'
- X = X.astype('str').astype('object')
+ imputation_value = "missing_value"
+ X = X.astype("str").astype("object")
X[mask] = np.nan
else:
imputation_value = min(np.unique(X)) - 1
@@ -42,15 +42,15 @@ def test_default_imputation(input_data_imputation, categorical):
assert np.array_equal(Y != imputation_value, ~mask)
-@pytest.mark.parametrize('format_type', ('numpy', 'pandas'))
+@pytest.mark.parametrize("format_type", ("numpy", "pandas"))
def test_nonzero_numerical_imputation(format_type):
# First try with an array with 0 as only valid category. The imputation should
# happen with -1
X = np.full(fill_value=np.nan, shape=(10, 10))
X[0, :] = 0
- if 'pandas' in format_type:
+ if "pandas" in format_type:
X = pd.DataFrame(X)
- elif 'numpy' in format_type:
+ elif "numpy" in format_type:
pass
else:
pytest.fail(format_type)
@@ -61,13 +61,13 @@ def test_nonzero_numerical_imputation(format_type):
X = np.full(fill_value=np.nan, shape=(10, 10))
X[0, :] = 0
X[1, :] = -1
- if 'pandas' in format_type:
+ if "pandas" in format_type:
X = pd.DataFrame(X)
Y = CategoricalImputation().fit_transform(X.copy())
np.testing.assert_equal(np.nan_to_num(X, nan=-2, copy=True), Y)
-@pytest.mark.parametrize('input_data_imputation', ('numpy'), indirect=True)
+@pytest.mark.parametrize("input_data_imputation", ("numpy"), indirect=True)
def test_default_sparse(input_data_imputation):
X, mask = input_data_imputation
X = sparse.csr_matrix(X)
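Read together, the numerical cases above imply a simple fill-value rule: the imputer picks a value strictly below anything observed (-1 when 0 is the only category, -2 once -1 is already taken, one below the smallest value in the fixture-based test), while string/object data is filled with the literal "missing_value". A sketch of that inferred rule — an assumption read off the expected outputs, not the component's actual source:

import numpy as np

def fill_value_implied_by_tests(X: np.ndarray):
    # Hypothetical helper mirroring the behaviour the tests above expect.
    if X.dtype == object:        # string categories
        return "missing_value"
    finite = X[~np.isnan(X)]
    return finite.min() - 1      # one below the smallest observed value

print(fill_value_implied_by_tests(np.array([[0.0, np.nan], [np.nan, 0.0]])))   # -1.0
print(fill_value_implied_by_tests(np.array([[0.0, -1.0], [np.nan, np.nan]])))  # -2.0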
diff --git a/test/test_pipeline/components/data_preprocessing/test_category_shift.py b/test/test_pipeline/components/data_preprocessing/test_category_shift.py
index d49e6a84f0..ce637f50d4 100644
--- a/test/test_pipeline/components/data_preprocessing/test_category_shift.py
+++ b/test/test_pipeline/components/data_preprocessing/test_category_shift.py
@@ -1,19 +1,21 @@
import unittest
+
import numpy as np
import scipy.sparse
-from autosklearn.pipeline.components.data_preprocessing.category_shift.\
- category_shift import CategoryShift
+from autosklearn.pipeline.components.data_preprocessing.category_shift.category_shift import ( # noqa: E501
+ CategoryShift,
+)
class CategoryShiftTest(unittest.TestCase):
-
def test_data_type_consistency(self):
X = np.random.randint(0, 255, (3, 4))
Y = CategoryShift().fit_transform(X)
self.assertFalse(scipy.sparse.issparse(Y))
X = scipy.sparse.csc_matrix(
- ([1, 2, 0, 4], ([0, 1, 2, 1], [3, 2, 1, 0])), shape=(3, 4))
+ ([1, 2, 0, 4], ([0, 1, 2, 1], [3, 2, 1, 0])), shape=(3, 4)
+ )
Y = CategoryShift().fit_transform(X)
self.assertTrue(scipy.sparse.issparse(Y))
diff --git a/test/test_pipeline/components/data_preprocessing/test_data_preprocessing.py b/test/test_pipeline/components/data_preprocessing/test_data_preprocessing.py
index 5e6f89ad3a..ac8e9abbe2 100644
--- a/test/test_pipeline/components/data_preprocessing/test_data_preprocessing.py
+++ b/test/test_pipeline/components/data_preprocessing/test_data_preprocessing.py
@@ -1,13 +1,14 @@
import unittest
+
import numpy as np
from scipy import sparse
-from autosklearn.pipeline.components.data_preprocessing.feature_type \
- import FeatTypeSplit
+from autosklearn.pipeline.components.data_preprocessing.feature_type import (
+ FeatTypeSplit,
+)
class PreprocessingPipelineTest(unittest.TestCase):
-
def do_a_fit_transform(self, sparse_input):
# X will be the input and Y is what we expect after transform. categ_feat stores
# indicators of feature type (True if categorical, False if numerical)
@@ -21,58 +22,57 @@ def do_a_fit_transform(self, sparse_input):
# This feature should be normalized by having its mean subtracted from all
# elements and by having them divided by the standard deviation.
categ_feat.append(False)
- nf = np.array([1., 2., 3.]).reshape(3, 1) # mean = 2.
- sdev = np.sqrt(2. / 3.)
- shift = 0 if sparse_input else 2. # if sparse_input, there is no mean subtraction
+ nf = np.array([1.0, 2.0, 3.0]).reshape(3, 1) # mean = 2.
+ sdev = np.sqrt(2.0 / 3.0)
+ shift = (
+ 0 if sparse_input else 2.0
+ ) # if sparse_input, there is no mean subtraction
nft = (nf - shift) / sdev
X.append(nf)
Y.append(nft)
# Feature 3 (numerical):
- # This feature has a missing value that should be imputed by the mean of the other
- # values (2.). This feature should also be normalized as in the previous feature.
+ # This feature has a missing value that should be imputed by the mean of the
+ # other values (2.).
+ # This feature should also be normalized as in the previous feature.
categ_feat.append(False)
- X.append(np.array([1., np.nan, 3.]).reshape(3, 1))
+ X.append(np.array([1.0, np.nan, 3.0]).reshape(3, 1))
Y.append(nft.copy())
# Feature 4 (categorical)
# This feature should be one hot encoded.
categ_feat.append(True)
X.append(np.array([1, 3, 2]).reshape(3, 1))
- Y.append(np.array([
- [1, 0, 0],
- [0, 0, 1],
- [0, 1, 0]]))
+ Y.append(np.array([[1, 0, 0], [0, 0, 1], [0, 1, 0]]))
# Feature 5 (categorical)
# This feature should be one hot encoded. (A discontinuous category set or
# a category 0 shouldn't be problems.)
categ_feat.append(True)
X.append(np.array([2, 1, 9]).reshape(3, 1))
- Y.append(np.array([
- [0, 1, 0],
- [1, 0, 0],
- [0, 0, 1]]))
+ Y.append(np.array([[0, 1, 0], [1, 0, 0], [0, 0, 1]]))
# Feature 6 (categorical)
# This feature should be one hot encoded. The missing value gets imputed as
# a category on its own.
categ_feat.append(True)
X.append(np.array([1, 1, np.nan]).reshape(3, 1))
- Y.append(np.array([
- [0, 1],
- [0, 1],
- [1, 0]]))
+ Y.append(np.array([[0, 1], [0, 1], [1, 0]]))
# Combine datasets and shuffle columns:
n_feats = len(categ_feat)
random_order = np.random.choice(np.arange(n_feats), size=n_feats, replace=False)
# Shuffle X according to random_order
X = np.array(X)[random_order]
X_comb = np.hstack(X)
- # Shuffle Y according to random_order and reorder it as the PreprocessingPipeline
- # does (i.e. categorical features come first in Y).
+ # Shuffle Y according to random_order and reorder it as the
+ # PreprocessingPipeline does (i.e. categorical features come first in Y).
- categ_feat = {i: 'categorical' if categ_feat[order] else 'numerical'
- for i, order in enumerate(random_order)}
- cat_to_left_order = [index for col, index in sorted(
- [(col_type, i) for i, col_type in categ_feat.items()]
- )]
+ categ_feat = {
+ i: "categorical" if categ_feat[order] else "numerical"
+ for i, order in enumerate(random_order)
+ }
+ cat_to_left_order = [
+ index
+ for col, index in sorted(
+ [(col_type, i) for i, col_type in categ_feat.items()]
+ )
+ ]
# Sort so that Y Matches the random ordering
Y = [Y[n] for n in random_order]
# Then move the categorical columns to the left
@@ -101,15 +101,21 @@ def test_fit_transform_sparse(self):
def test_string_categories(self):
# Numerical dataset (as used in NumericalPreprocessingPipelineTest)
- X_num = np.array([
- [3.14, 1., 1.], # noqa : matrix legibility
- [3.14, 2., np.nan], # noqa : matrix legibility
- [3.14, 3., 3.]]) # noqa : matrix legibility
+ X_num = np.array(
+ [
+ [3.14, 1.0, 1.0], # noqa : matrix legibility
+ [3.14, 2.0, np.nan], # noqa : matrix legibility
+ [3.14, 3.0, 3.0],
+ ]
+ ) # noqa : matrix legibility
# Categorical string dataset
- X_cat = np.array([
- ['red', 'medium', 'small'],
- ['blue', 'short', 'big'],
- ['white', 'tall', np.nan]])
+ X_cat = np.array(
+ [
+ ["red", "medium", "small"],
+ ["blue", "short", "big"],
+ ["white", "tall", np.nan],
+ ]
+ )
# Combined dataset with shuffled columns:
X_comb = np.hstack((X_num, X_cat))
categ_feat = [False] * 3 + [True] * 3
@@ -118,6 +124,8 @@ def test_string_categories(self):
categ_feat = [categ_feat[order] for order in random_order]
# Strings are not allowed, therefore:
with self.assertRaises(ValueError):
- categ_feat = {i: 'categorical' if feat else 'numerical'
- for i, feat in enumerate(categ_feat)}
+ categ_feat = {
+ i: "categorical" if feat else "numerical"
+ for i, feat in enumerate(categ_feat)
+ }
FeatTypeSplit(feat_type=categ_feat).fit_transform(X_comb)
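The standardization constants used throughout do_a_fit_transform follow from the population statistics of the column [1, 2, 3]: mean 2 and standard deviation sqrt(2/3), with the mean subtraction skipped for sparse input. A quick numeric check in plain NumPy, independent of the patch:

import numpy as np

nf = np.array([1.0, 2.0, 3.0])
sdev = np.sqrt(2.0 / 3.0)

print(np.isclose(nf.std(), sdev))  # True: np.std defaults to the population std (ddof=0)
print((nf - nf.mean()) / sdev)     # [-1.2247, 0., 1.2247] -- dense case
print(nf / sdev)                   # [1.2247, 2.4495, 3.6742] -- sparse case, no centering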
diff --git a/test/test_pipeline/components/data_preprocessing/test_data_preprocessing_categorical.py b/test/test_pipeline/components/data_preprocessing/test_data_preprocessing_categorical.py
index dbffe26f51..1d693eb150 100644
--- a/test/test_pipeline/components/data_preprocessing/test_data_preprocessing_categorical.py
+++ b/test/test_pipeline/components/data_preprocessing/test_data_preprocessing_categorical.py
@@ -1,34 +1,35 @@
import unittest
-import numpy as np
-from scipy import sparse
+import numpy as np
import pytest
+from scipy import sparse
-from autosklearn.pipeline.components.data_preprocessing.feature_type_categorical \
- import CategoricalPreprocessingPipeline
+from autosklearn.pipeline.components.data_preprocessing.feature_type_categorical import ( # noqa: E501
+ CategoricalPreprocessingPipeline,
+)
class CategoricalPreprocessingPipelineTest(unittest.TestCase):
-
def test_data_type_consistency(self):
X = np.random.randint(3, 6, (3, 4))
Y = CategoricalPreprocessingPipeline().fit_transform(X)
self.assertFalse(sparse.issparse(Y))
X = sparse.csc_matrix(
- ([3, 6, 4, 5], ([0, 1, 2, 1], [3, 2, 1, 0])), shape=(3, 4))
+ ([3, 6, 4, 5], ([0, 1, 2, 1], [3, 2, 1, 0])), shape=(3, 4)
+ )
Y = CategoricalPreprocessingPipeline().fit_transform(X)
self.assertTrue(sparse.issparse(Y))
def test_fit_transform(self):
- X = np.array([
- [1, 2, 1],
- [3, 1, 1],
- [2, 9, np.nan]])
- Y = np.array([
- [1, 0, 0, 0, 1, 0, 0, 1],
- [0, 0, 1, 1, 0, 0, 0, 1],
- [0, 1, 0, 0, 0, 1, 1, 0]])
+ X = np.array([[1, 2, 1], [3, 1, 1], [2, 9, np.nan]])
+ Y = np.array(
+ [
+ [1, 0, 0, 0, 1, 0, 0, 1],
+ [0, 0, 1, 1, 0, 0, 0, 1],
+ [0, 1, 0, 0, 0, 1, 1, 0],
+ ]
+ )
# dense input
# Notice the X.copy() here as the imputation
# is in place to save resources
@@ -41,30 +42,30 @@ def test_fit_transform(self):
np.testing.assert_array_equal(Yt, Y)
def test_transform(self):
- X1 = np.array([
- [1, 2, 0],
- [3, 0, 0],
- [2, 9, np.nan]])
- Y1 = np.array([
- [1, 0, 0, 0, 1, 0, 0, 1],
- [0, 0, 1, 1, 0, 0, 0, 1],
- [0, 1, 0, 0, 0, 1, 1, 0]])
- X2 = np.array([
- [2, 2, 1],
- [3, 0, 0],
- [2, np.nan, np.nan]])
- Y2 = np.array([
- [0, 1, 0, 0, 1, 0, 0, 0],
- [0, 0, 1, 1, 0, 0, 0, 1],
- [0, 1, 0, 0, 0, 0, 1, 0]])
- X3 = np.array([
- [3, np.nan, 0],
- [3, 9, np.nan],
- [2, 2, 5]])
- Y3 = np.array([
- [0, 0, 1, 0, 0, 0, 0, 1],
- [0, 0, 1, 0, 0, 1, 1, 0],
- [0, 1, 0, 0, 1, 0, 0, 0]])
+ X1 = np.array([[1, 2, 0], [3, 0, 0], [2, 9, np.nan]])
+ Y1 = np.array(
+ [
+ [1, 0, 0, 0, 1, 0, 0, 1],
+ [0, 0, 1, 1, 0, 0, 0, 1],
+ [0, 1, 0, 0, 0, 1, 1, 0],
+ ]
+ )
+ X2 = np.array([[2, 2, 1], [3, 0, 0], [2, np.nan, np.nan]])
+ Y2 = np.array(
+ [
+ [0, 1, 0, 0, 1, 0, 0, 0],
+ [0, 0, 1, 1, 0, 0, 0, 1],
+ [0, 1, 0, 0, 0, 0, 1, 0],
+ ]
+ )
+ X3 = np.array([[3, np.nan, 0], [3, 9, np.nan], [2, 2, 5]])
+ Y3 = np.array(
+ [
+ [0, 0, 1, 0, 0, 0, 0, 1],
+ [0, 0, 1, 0, 0, 1, 1, 0],
+ [0, 1, 0, 0, 1, 0, 0, 0],
+ ]
+ )
# "fit"
CPPL = CategoricalPreprocessingPipeline()
CPPL.fit_transform(X1)
@@ -81,13 +82,15 @@ def test_transform(self):
def test_transform_with_coalescence(self):
# Generates an array with categories 0, 20, 5, 6, 10, and occurences of 60%,
# 30%, 19% 0.5% and 0.5% respectively
- X = np.vstack((
- np.ones((120, 10)) * 0,
- np.ones((60, 10)) * 20,
- np.ones((18, 10)) * 5,
- np.ones((1, 10)) * 6,
- np.ones((1, 10)) * 10,
- ))
+ X = np.vstack(
+ (
+ np.ones((120, 10)) * 0,
+ np.ones((60, 10)) * 20,
+ np.ones((18, 10)) * 5,
+ np.ones((1, 10)) * 6,
+ np.ones((1, 10)) * 10,
+ )
+ )
for col in range(X.shape[1]):
np.random.shuffle(X[:, col])
@@ -100,10 +103,12 @@ def test_transform_with_coalescence(self):
Y2t = CPPL.transform(X)
np.testing.assert_array_equal(Y1t, Y2t)
- @pytest.mark.xfail(reason=(
- "Encoding step does not support sparse matrices to convert negative labels to"
- " positive ones as it does with non-sparse matrices"
- ))
+ @pytest.mark.xfail(
+ reason=(
+ "Encoding step does not support sparse matrices to convert negative labels"
+ " to positive ones as it does with non-sparse matrices"
+ )
+ )
def test_transform_with_sparse_column_with_negative_labels(self):
X = sparse.csr_matrix([[0], [-1]])
CategoricalPreprocessingPipeline().fit_transform(X)
diff --git a/test/test_pipeline/components/data_preprocessing/test_data_preprocessing_numerical.py b/test/test_pipeline/components/data_preprocessing/test_data_preprocessing_numerical.py
index 6a0b9d37fc..5a0a840501 100644
--- a/test/test_pipeline/components/data_preprocessing/test_data_preprocessing_numerical.py
+++ b/test/test_pipeline/components/data_preprocessing/test_data_preprocessing_numerical.py
@@ -1,69 +1,64 @@
import unittest
-import numpy as np
+import numpy as np
from scipy import sparse
-from autosklearn.pipeline.components.data_preprocessing.feature_type_numerical \
- import NumericalPreprocessingPipeline
+from autosklearn.pipeline.components.data_preprocessing.feature_type_numerical import (
+ NumericalPreprocessingPipeline,
+)
class NumericalPreprocessingPipelineTest(unittest.TestCase):
-
def test_data_type_consistency(self):
X = np.random.rand(3, 4)
Y = NumericalPreprocessingPipeline().fit_transform(X)
self.assertFalse(sparse.issparse(Y))
X = sparse.csc_matrix(
- ([3., 6., 4., 5.], ([0, 1, 2, 1], [3, 2, 1, 0])), shape=(3, 4))
+ ([3.0, 6.0, 4.0, 5.0], ([0, 1, 2, 1], [3, 2, 1, 0])), shape=(3, 4)
+ )
Y = NumericalPreprocessingPipeline().fit_transform(X)
self.assertTrue(sparse.issparse(Y))
def test_fit_transform(self):
- X = np.array([
- [3.14, 1., 1.],
- [3.14, 2., np.nan],
- [3.14, 3., 3.]]) # noqa : matrix legibility
+ X = np.array(
+ [[3.14, 1.0, 1.0], [3.14, 2.0, np.nan], [3.14, 3.0, 3.0]]
+ ) # noqa : matrix legibility
# 1st column should be droped due to low variance
# The 2nd should be be standardized (default rescaling algorithm)
- # The 3rd will get a value imputed by the mean (2.), therefore the transformation
- # here will have the same effect as on the the 2nd column
+ # The 3rd will get a value imputed by the mean (2.), therefore the
+ # transformation here will have the same effect as on the the 2nd column
sdev = np.sqrt(2 / 3)
- Y1 = np.array([
- [-1/sdev, -1/sdev],
- [ 0., 0.], # noqa : matrix legibility
- [ 1/sdev, 1/sdev]]) # noqa : matrix legibility
+ Y1 = np.array(
+ [
+ [-1 / sdev, -1 / sdev],
+ [0.0, 0.0], # noqa : matrix legibility
+ [1 / sdev, 1 / sdev],
+ ]
+ ) # noqa : matrix legibility
# dense input
Yt = NumericalPreprocessingPipeline().fit_transform(X)
np.testing.assert_array_almost_equal(Yt, Y1)
# sparse input (uses with_mean=False)
- Y2 = np.array([
- [1., 1.],
- [2., 2.],
- [3., 3.]]) / sdev
+ Y2 = np.array([[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]]) / sdev
X_sparse = sparse.csc_matrix(X)
Yt = NumericalPreprocessingPipeline().fit_transform(X_sparse)
np.testing.assert_array_almost_equal(Yt.todense(), Y2)
def test_transform(self):
- X1 = np.array([
- [3.14, 1., 1.],
- [3.14, 2., np.nan],
- [3.14, 3., 3.]]) # noqa : matrix legibility
+ X1 = np.array(
+ [[3.14, 1.0, 1.0], [3.14, 2.0, np.nan], [3.14, 3.0, 3.0]]
+ ) # noqa : matrix legibility
sdev = np.sqrt(2 / 3)
# fit
NPP = NumericalPreprocessingPipeline()
NPP.fit_transform(X1)
# transform
- X2 = np.array([
- [1., 5., 8.],
- [2., 6., 9.],
- [3., 7., np.nan]])
+ X2 = np.array([[1.0, 5.0, 8.0], [2.0, 6.0, 9.0], [3.0, 7.0, np.nan]])
Yt = NPP.transform(X2)
# imputation, variance_threshold and rescaling are done using the data already
# fitted, therefore:
- Y2 = np.array([
- [3/sdev, 6/sdev],
- [4/sdev, 7/sdev],
- [5/sdev, 0.]]) # noqa : matrix legibility
+ Y2 = np.array(
+ [[3 / sdev, 6 / sdev], [4 / sdev, 7 / sdev], [5 / sdev, 0.0]]
+ ) # noqa : matrix legibility
np.testing.assert_array_almost_equal(Yt, Y2)
diff --git a/test/test_pipeline/components/data_preprocessing/test_data_preprocessing_text.py b/test/test_pipeline/components/data_preprocessing/test_data_preprocessing_text.py
index 0e39c5d7e9..0a2e3d5188 100644
--- a/test/test_pipeline/components/data_preprocessing/test_data_preprocessing_text.py
+++ b/test/test_pipeline/components/data_preprocessing/test_data_preprocessing_text.py
@@ -1,33 +1,57 @@
import unittest
+
import numpy as np
import pandas as pd
-from autosklearn.pipeline.components.data_preprocessing.text_encoding.bag_of_word_encoding import \
- BagOfWordEncoder as BOW
-from autosklearn.pipeline.components.data_preprocessing.\
- text_encoding.bag_of_word_encoding_distinct import BagOfWordEncoder as BOW_distinct
+from autosklearn.pipeline.components.data_preprocessing.text_encoding.bag_of_word_encoding import ( # noqa: E501
+ BagOfWordEncoder as BOW,
+)
+from autosklearn.pipeline.components.data_preprocessing.text_encoding.bag_of_word_encoding_distinct import ( # noqa: E501
+ BagOfWordEncoder as BOW_distinct,
+)
class TextPreprocessingPipelineTest(unittest.TestCase):
-
def test_fit_transform(self):
- X = pd.DataFrame({"col1": ["hello world",
- "This is a test"],
- "col2": ["hello mars",
- "This is the second column"]}).astype({"col1": "string",
- "col2": "string"})
- BOW_fitted = BOW(ngram_range=1, min_df_choice="min_df_absolute", min_df_absolute=0,
- min_df_relative=0, random_state=1).fit(X.copy())
+ X = pd.DataFrame(
+ {
+ "col1": ["hello world", "This is a test"],
+ "col2": ["hello mars", "This is the second column"],
+ }
+ ).astype({"col1": "string", "col2": "string"})
+ BOW_fitted = BOW(
+ ngram_range=1,
+ min_df_choice="min_df_absolute",
+ min_df_absolute=0,
+ min_df_relative=0,
+ random_state=1,
+ ).fit(X.copy())
Yt = BOW_fitted.preprocessor.vocabulary_
- words = sorted(["hello", "world", "this", "is", "test", # "a" is not added, len(...)=1
- "mars", "the", "second", "column"]) # is ignored by CountVectorizer
+ words = sorted(
+ [
+ "hello",
+ "world",
+ "this",
+ "is",
+ "test", # "a" is not added, len(...)=1
+ "mars",
+ "the",
+ "second",
+ "column",
+ ]
+ ) # is ignored by CountVectorizer
Y = {key: idx for idx, key in enumerate(words)}
np.testing.assert_array_equal(Yt, Y)
- BOW_fitted = BOW_distinct(ngram_range=1, min_df_choice="min_df_absolute", min_df_absolute=0,
- min_df_relative=0, random_state=1).fit(X.copy())
+ BOW_fitted = BOW_distinct(
+ ngram_range=1,
+ min_df_choice="min_df_absolute",
+ min_df_absolute=0,
+ min_df_relative=0,
+ random_state=1,
+ ).fit(X.copy())
for key in BOW_fitted.preprocessor:
y = []
@@ -38,58 +62,89 @@ def test_fit_transform(self):
np.testing.assert_array_equal(yt, y)
def test_transform(self):
- X = pd.DataFrame({"col1": ["hello world",
- "this is a test"],
- "col2": ["hello mars",
- "this is the second column"]}).astype({"col1": "string",
- "col2": "string"})
- X_t = BOW(ngram_range=1, min_df_choice="min_df_absolute", min_df_absolute=0,
- min_df_relative=0, random_state=1).fit_transform(X.copy())
+ X = pd.DataFrame(
+ {
+ "col1": ["hello world", "this is a test"],
+ "col2": ["hello mars", "this is the second column"],
+ }
+ ).astype({"col1": "string", "col2": "string"})
+ X_t = BOW(
+ ngram_range=1,
+ min_df_choice="min_df_absolute",
+ min_df_absolute=0,
+ min_df_relative=0,
+ random_state=1,
+ ).fit_transform(X.copy())
# ['column', 'hello', 'is', 'mars', 'second', 'test', 'the', 'this', 'world']
- y = np.array([[0, 2, 0, 1, 0, 0, 0, 0, 1],
- [1, 0, 2, 0, 1, 1, 1, 2, 0]])
+ y = np.array([[0, 2, 0, 1, 0, 0, 0, 0, 1], [1, 0, 2, 0, 1, 1, 1, 2, 0]])
np.testing.assert_array_equal(X_t.toarray(), y)
- X_t = BOW_distinct(ngram_range=1, min_df_choice="min_df_absolute", min_df_absolute=0,
- min_df_relative=0, random_state=1).fit_transform(X.copy())
+ X_t = BOW_distinct(
+ ngram_range=1,
+ min_df_choice="min_df_absolute",
+ min_df_absolute=0,
+ min_df_relative=0,
+ random_state=1,
+ ).fit_transform(X.copy())
# 'hello', 'is', 'test', 'this', 'world',
# 'column', 'hello', 'is', 'mars', 'second', 'the', 'this'
- y = np.array([[1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0],
- [0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1]])
+ y = np.array(
+ [[1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0], [0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1]]
+ )
np.testing.assert_array_equal(X_t.toarray(), y)
def test_check_shape(self):
- X = pd.DataFrame({"col1": ["hello world",
- "this is test"],
- "col2": ["test test",
- "test test"]}).astype({"col1": "string",
- "col2": "string"})
- X_t = BOW(ngram_range=1, min_df_choice="min_df_absolute", min_df_absolute=0,
- min_df_relative=0, random_state=1).fit_transform(X.copy())
+ X = pd.DataFrame(
+ {
+ "col1": ["hello world", "this is test"],
+ "col2": ["test test", "test test"],
+ }
+ ).astype({"col1": "string", "col2": "string"})
+ X_t = BOW(
+ ngram_range=1,
+ min_df_choice="min_df_absolute",
+ min_df_absolute=0,
+ min_df_relative=0,
+ random_state=1,
+ ).fit_transform(X.copy())
self.assertEqual(X_t.shape, (2, 5))
- X_t = BOW_distinct(ngram_range=1, min_df_choice="min_df_absolute", min_df_absolute=0,
- min_df_relative=0, random_state=1).fit_transform(X.copy())
+ X_t = BOW_distinct(
+ ngram_range=1,
+ min_df_choice="min_df_absolute",
+ min_df_absolute=0,
+ min_df_relative=0,
+ random_state=1,
+ ).fit_transform(X.copy())
self.assertEqual(X_t.shape, (2, 6))
def test_check_nan(self):
- X = pd.DataFrame({"col1": ["hello world",
- "this is test",
- None],
- "col2": ["test test",
- "test test",
- "test"]}).astype({"col1": "string",
- "col2": "string"})
- X_t = BOW(ngram_range=1, min_df_choice="min_df_absolute", min_df_absolute=0,
- min_df_relative=0, random_state=1).fit_transform(X.copy())
+ X = pd.DataFrame(
+ {
+ "col1": ["hello world", "this is test", None],
+ "col2": ["test test", "test test", "test"],
+ }
+ ).astype({"col1": "string", "col2": "string"})
+ X_t = BOW(
+ ngram_range=1,
+ min_df_choice="min_df_absolute",
+ min_df_absolute=0,
+ min_df_relative=0,
+ random_state=1,
+ ).fit_transform(X.copy())
self.assertEqual(X_t.shape, (3, 5))
- X_t = BOW_distinct(ngram_range=1, min_df_choice="min_df_absolute", min_df_absolute=0,
- min_df_relative=0, random_state=1).fit_transform(X.copy())
+ X_t = BOW_distinct(
+ ngram_range=1,
+ min_df_choice="min_df_absolute",
+ min_df_absolute=0,
+ min_df_relative=0,
+ random_state=1,
+ ).fit_transform(X.copy())
self.assertEqual(X_t.shape, (3, 6))
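Two details of these bag-of-words tests are easy to miss: BOW fits a single vocabulary over all text columns combined while BOW_distinct keeps one vocabulary per column (hence the 9-column versus 5 + 7 = 12-column expected outputs), and the note about "a" not being added comes from CountVectorizer's default token pattern, which only keeps tokens of two or more word characters. A minimal check of the latter, straight against scikit-learn:

from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer()         # default token_pattern r"(?u)\b\w\w+\b"
vec.fit(["This is a test"])
print(sorted(vec.vocabulary_))  # ['is', 'test', 'this'] -- 'a' never enters the vocabulary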
diff --git a/test/test_pipeline/components/data_preprocessing/test_minority_coalescence.py b/test/test_pipeline/components/data_preprocessing/test_minority_coalescence.py
index 7b3e35763e..8e73e963ab 100644
--- a/test/test_pipeline/components/data_preprocessing/test_minority_coalescence.py
+++ b/test/test_pipeline/components/data_preprocessing/test_minority_coalescence.py
@@ -1,23 +1,25 @@
import unittest
-import numpy as np
+import numpy as np
import scipy.sparse
-from autosklearn.pipeline.components.data_preprocessing.minority_coalescense\
- .minority_coalescer import MinorityCoalescer
-from autosklearn.pipeline.components.data_preprocessing.minority_coalescense\
- .no_coalescense import NoCoalescence
+from autosklearn.pipeline.components.data_preprocessing.minority_coalescense.minority_coalescer import ( # noqa: E501
+ MinorityCoalescer,
+)
+from autosklearn.pipeline.components.data_preprocessing.minority_coalescense.no_coalescense import ( # noqa: E501
+ NoCoalescence,
+)
class MinorityCoalescerTest(unittest.TestCase):
-
def test_data_type_consistency(self):
X = np.random.randint(3, 6, (3, 4))
Y = MinorityCoalescer().fit_transform(X)
self.assertFalse(scipy.sparse.issparse(Y))
X = scipy.sparse.csc_matrix(
- ([3, 6, 4, 5], ([0, 1, 2, 1], [3, 2, 1, 0])), shape=(3, 4))
+ ([3, 6, 4, 5], ([0, 1, 2, 1], [3, 2, 1, 0])), shape=(3, 4)
+ )
Y = MinorityCoalescer().fit_transform(X)
self.assertTrue(scipy.sparse.issparse(Y))
diff --git a/test/test_pipeline/components/data_preprocessing/test_numerical_imputation.py b/test/test_pipeline/components/data_preprocessing/test_numerical_imputation.py
index 35d9d23a6d..d3354c3730 100644
--- a/test/test_pipeline/components/data_preprocessing/test_numerical_imputation.py
+++ b/test/test_pipeline/components/data_preprocessing/test_numerical_imputation.py
@@ -1,8 +1,9 @@
from scipy import sparse
-from autosklearn.pipeline.components.data_preprocessing.imputation.numerical_imputation\
- import NumericalImputation
-from autosklearn.pipeline.util import _test_preprocessing, PreprocessingTestCase
+from autosklearn.pipeline.components.data_preprocessing.imputation.numerical_imputation import ( # noqa: E501
+ NumericalImputation,
+)
+from autosklearn.pipeline.util import PreprocessingTestCase, _test_preprocessing
class NumericalImputationTest(PreprocessingTestCase):
@@ -14,13 +15,13 @@ def test_default_configuration(self):
self.assertTrue((transformation == original).all())
transformations.append(transformation)
if len(transformations) > 1:
- self.assertTrue(
- (transformations[-1] == transformations[-2]).all())
+ self.assertTrue((transformations[-1] == transformations[-2]).all())
def test_default_configuration_sparse_data(self):
transformations = []
- transformation, original = _test_preprocessing(NumericalImputation,
- make_sparse=True)
+ transformation, original = _test_preprocessing(
+ NumericalImputation, make_sparse=True
+ )
self.assertEqual(transformation.shape, original.shape)
self.assertTrue((transformation.data == original.data).all())
self.assertIsInstance(transformation, sparse.csc_matrix)
@@ -28,4 +29,5 @@ def test_default_configuration_sparse_data(self):
def test_preprocessing_dtype(self):
super(NumericalImputationTest, self)._test_preprocessing_dtype(
- NumericalImputation, add_NaNs=True)
+ NumericalImputation, add_NaNs=True
+ )
diff --git a/test/test_pipeline/components/data_preprocessing/test_one_hot_encoding.py b/test/test_pipeline/components/data_preprocessing/test_one_hot_encoding.py
index ba99724964..08d2cadd9e 100644
--- a/test/test_pipeline/components/data_preprocessing/test_one_hot_encoding.py
+++ b/test/test_pipeline/components/data_preprocessing/test_one_hot_encoding.py
@@ -3,10 +3,12 @@
import numpy as np
from scipy import sparse
-from autosklearn.pipeline.components.data_preprocessing.categorical_encoding.\
- one_hot_encoding import OneHotEncoder
-from autosklearn.pipeline.components.data_preprocessing.categorical_encoding.\
- no_encoding import NoEncoding
+from autosklearn.pipeline.components.data_preprocessing.categorical_encoding.no_encoding import ( # noqa: E501
+ NoEncoding,
+)
+from autosklearn.pipeline.components.data_preprocessing.categorical_encoding.one_hot_encoding import ( # noqa: E501
+ OneHotEncoder,
+)
from autosklearn.pipeline.util import _test_preprocessing
@@ -18,7 +20,6 @@ def create_X(instances=1000, n_feats=10, categs_per_feat=5, seed=0):
class OneHotEncoderTest(unittest.TestCase):
-
def setUp(self):
self.X_train = create_X()
@@ -28,7 +29,8 @@ def test_data_type_consistency(self):
self.assertFalse(sparse.issparse(Y))
X = sparse.csc_matrix(
- ([3, 6, 4, 5], ([0, 1, 2, 1], [3, 2, 1, 0])), shape=(3, 4))
+ ([3, 6, 4, 5], ([0, 1, 2, 1], [3, 2, 1, 0])), shape=(3, 4)
+ )
Y = OneHotEncoder().fit_transform(X)
self.assertTrue(sparse.issparse(Y))
@@ -54,8 +56,7 @@ def test_default_configuration_no_encoding(self):
self.assertTrue((transformation == original).all())
transformations.append(transformation)
if len(transformations) > 1:
- self.assertTrue(
- (transformations[-1] == transformations[-2]).all())
+ self.assertTrue((transformations[-1] == transformations[-2]).all())
def test_default_configuration_sparse_data(self):
transformations = []
@@ -74,17 +75,18 @@ def test_default_configuration_sparse_data(self):
transformations.append(Xt)
if len(transformations) > 1:
self.assertEqual(
- (transformations[-1] != transformations[-2]).count_nonzero(), 0)
+ (transformations[-1] != transformations[-2]).count_nonzero(), 0
+ )
def test_default_configuration_sparse_no_encoding(self):
transformations = []
for i in range(2):
- transformation, original = _test_preprocessing(NoEncoding,
- make_sparse=True)
+ transformation, original = _test_preprocessing(NoEncoding, make_sparse=True)
self.assertEqual(transformation.shape, original.shape)
self.assertTrue((transformation.todense() == original.todense()).all())
transformations.append(transformation)
if len(transformations) > 1:
self.assertEqual(
- (transformations[-1] != transformations[-2]).count_nonzero(), 0)
+ (transformations[-1] != transformations[-2]).count_nonzero(), 0
+ )
diff --git a/test/test_pipeline/components/data_preprocessing/test_scaling.py b/test/test_pipeline/components/data_preprocessing/test_scaling.py
index f800930dda..7f8249e3f1 100644
--- a/test/test_pipeline/components/data_preprocessing/test_scaling.py
+++ b/test/test_pipeline/components/data_preprocessing/test_scaling.py
@@ -12,13 +12,14 @@ def _test_helper(self, Preprocessor, dataset=None, make_sparse=False):
X_train, Y_train, X_test, Y_test = get_dataset(
dataset=dataset,
make_sparse=make_sparse,
- )
+ )
- dataset_properties = {'sparse': make_sparse}
+ dataset_properties = {"sparse": make_sparse}
original_X_train = X_train.copy()
- configuration_space = Preprocessor(dataset_properties).\
- get_hyperparameter_search_space(dataset_properties)
+ configuration_space = Preprocessor(
+ dataset_properties
+ ).get_hyperparameter_search_space(dataset_properties)
default = configuration_space.get_default_configuration()
preprocessor = Preprocessor(dataset_properties, random_state=1)
@@ -28,31 +29,31 @@ def _test_helper(self, Preprocessor, dataset=None, make_sparse=False):
return transformer.transform(X_train), original_X_train
def test_boston_is_not_scaled(self):
- data = sklearn.datasets.load_boston()['data']
+ data = sklearn.datasets.load_boston()["data"]
self.assertGreaterEqual(np.max(data), 100)
def test_default_configuration(self):
transformations = []
for i in range(2):
- transformation, original = self._test_helper(RescalingChoice,
- dataset='boston')
+ transformation, original = self._test_helper(
+ RescalingChoice, dataset="boston"
+ )
# The maximum is around 1.95 for the transformed array...
self.assertAlmostEqual(np.mean(transformation), 0, places=5)
self.assertAlmostEqual(np.std(transformation), 1, places=5)
self.assertFalse((original == transformation).all())
transformations.append(transformation)
if len(transformations) > 1:
- self.assertTrue(
- (transformations[-1] == transformations[-2]).all())
+ self.assertTrue((transformations[-1] == transformations[-2]).all())
def test_default_configuration_with_sparse_data(self):
- preprocessing = self._test_helper(RescalingChoice, dataset='boston',
- make_sparse=True)
+ preprocessing = self._test_helper(
+ RescalingChoice, dataset="boston", make_sparse=True
+ )
transformation, original = preprocessing
self.assertEqual(original.getnnz(), transformation.getnnz())
self.assertTrue(~np.allclose(original.data, transformation.data))
@unittest.skip("Does not work at the moment.")
def test_preprocessing_dtype(self):
- super(ScalingComponentTest, self)._test_helper(
- RescalingChoice)
+ super(ScalingComponentTest, self)._test_helper(RescalingChoice)
diff --git a/test/test_pipeline/components/data_preprocessing/test_variance_threshold.py b/test/test_pipeline/components/data_preprocessing/test_variance_threshold.py
index 4da441828d..a9ba4083ca 100644
--- a/test/test_pipeline/components/data_preprocessing/test_variance_threshold.py
+++ b/test/test_pipeline/components/data_preprocessing/test_variance_threshold.py
@@ -1,8 +1,9 @@
from scipy import sparse
-from autosklearn.pipeline.components.data_preprocessing.variance_threshold.variance_threshold \
- import VarianceThreshold
-from autosklearn.pipeline.util import _test_preprocessing, PreprocessingTestCase
+from autosklearn.pipeline.components.data_preprocessing.variance_threshold.variance_threshold import ( # noqa: E501
+ VarianceThreshold,
+)
+from autosklearn.pipeline.util import PreprocessingTestCase, _test_preprocessing
class VarianceThresholdTest(PreprocessingTestCase):
@@ -14,13 +15,13 @@ def test_default_configuration(self):
self.assertTrue((transformation == original).all())
transformations.append(transformation)
if len(transformations) > 1:
- self.assertTrue(
- (transformations[-1] == transformations[-2]).all())
+ self.assertTrue((transformations[-1] == transformations[-2]).all())
def test_default_configuration_sparse_data(self):
transformations = []
- transformation, original = _test_preprocessing(VarianceThreshold,
- make_sparse=True)
+ transformation, original = _test_preprocessing(
+ VarianceThreshold, make_sparse=True
+ )
self.assertEqual(transformation.shape, (100, 3))
self.assertTrue((transformation.toarray() == original.toarray()[:, 1:]).all())
self.assertIsInstance(transformation, sparse.csr_matrix)
diff --git a/test/test_pipeline/components/dummy_components/dummy_component_1.py b/test/test_pipeline/components/dummy_components/dummy_component_1.py
index 06074db983..0af3466787 100644
--- a/test/test_pipeline/components/dummy_components/dummy_component_1.py
+++ b/test/test_pipeline/components/dummy_components/dummy_component_1.py
@@ -5,7 +5,7 @@
# Add the parent directory to the path to import the parent component
this_directory = os.path.dirname(os.path.abspath(__file__))
-parent_directory = os.path.abspath(os.path.join(this_directory, '..'))
+parent_directory = os.path.abspath(os.path.join(this_directory, ".."))
sys.path.append(parent_directory)
diff --git a/test/test_pipeline/components/dummy_components/dummy_component_2.py b/test/test_pipeline/components/dummy_components/dummy_component_2.py
index 9b67230e4c..f941dcdb40 100644
--- a/test/test_pipeline/components/dummy_components/dummy_component_2.py
+++ b/test/test_pipeline/components/dummy_components/dummy_component_2.py
@@ -6,7 +6,7 @@
# Add the parent directory to the path to import the parent component as
# dummy_components.dummy_component_2.DummyComponent1
this_directory = os.path.dirname(os.path.abspath(__file__))
-parent_directory = os.path.abspath(os.path.join(this_directory, '..'))
+parent_directory = os.path.abspath(os.path.join(this_directory, ".."))
sys.path.append(parent_directory)
diff --git a/test/test_pipeline/components/dummy_components/dummy_component_import.py b/test/test_pipeline/components/dummy_components/dummy_component_import.py
index f7981a40a3..a4cb764215 100644
--- a/test/test_pipeline/components/dummy_components/dummy_component_import.py
+++ b/test/test_pipeline/components/dummy_components/dummy_component_import.py
@@ -1,2 +1,4 @@
-from autosklearn.pipeline.components.base import find_components # noqa
-from autosklearn.pipeline.components.base import AutoSklearnClassificationAlgorithm # noqa
+from autosklearn.pipeline.components.base import find_components # noqa
+from autosklearn.pipeline.components.base import ( # noqa
+ AutoSklearnClassificationAlgorithm,
+)
diff --git a/test/test_pipeline/components/feature_preprocessing/__init__.py b/test/test_pipeline/components/feature_preprocessing/__init__.py
index 8f0ce6cb7c..92bf78f389 100644
--- a/test/test_pipeline/components/feature_preprocessing/__init__.py
+++ b/test/test_pipeline/components/feature_preprocessing/__init__.py
@@ -1 +1 @@
-__author__ = 'feurerm'
+__author__ = "feurerm"
diff --git a/test/test_pipeline/components/feature_preprocessing/test_NoPreprocessing.py b/test/test_pipeline/components/feature_preprocessing/test_NoPreprocessing.py
index 22811e75bb..440f2fd50d 100644
--- a/test/test_pipeline/components/feature_preprocessing/test_NoPreprocessing.py
+++ b/test/test_pipeline/components/feature_preprocessing/test_NoPreprocessing.py
@@ -1,7 +1,9 @@
import numpy as np
-from autosklearn.pipeline.components.feature_preprocessing.no_preprocessing import NoPreprocessing
-from autosklearn.pipeline.util import _test_preprocessing, PreprocessingTestCase
+from autosklearn.pipeline.components.feature_preprocessing.no_preprocessing import (
+ NoPreprocessing,
+)
+from autosklearn.pipeline.util import PreprocessingTestCase, _test_preprocessing
class NoneComponentTest(PreprocessingTestCase):
diff --git a/test/test_pipeline/components/feature_preprocessing/test_choice.py b/test/test_pipeline/components/feature_preprocessing/test_choice.py
index 525ec38356..516cf318bf 100644
--- a/test/test_pipeline/components/feature_preprocessing/test_choice.py
+++ b/test/test_pipeline/components/feature_preprocessing/test_choice.py
@@ -6,27 +6,27 @@
class FeatureProcessingTest(unittest.TestCase):
def test_get_available_components(self):
# Target type
- for target_type, num_values in [('classification', 15),
- ('regression', 14)]:
- data_properties = {'target_type': target_type}
+ for target_type, num_values in [("classification", 15), ("regression", 14)]:
+ data_properties = {"target_type": target_type}
- available_components = fp.FeaturePreprocessorChoice(data_properties)\
- .get_available_components(data_properties)
+ available_components = fp.FeaturePreprocessorChoice(
+ data_properties
+ ).get_available_components(data_properties)
self.assertEqual(len(available_components), num_values)
# Multiclass
- data_properties = {'target_type': 'classification',
- 'multiclass': True}
- available_components = fp.FeaturePreprocessorChoice(data_properties) \
- .get_available_components(data_properties)
+ data_properties = {"target_type": "classification", "multiclass": True}
+ available_components = fp.FeaturePreprocessorChoice(
+ data_properties
+ ).get_available_components(data_properties)
self.assertEqual(len(available_components), 15)
# Multilabel
- data_properties = {'target_type': 'classification',
- 'multilabel': True}
- available_components = fp.FeaturePreprocessorChoice(data_properties) \
- .get_available_components(data_properties)
+ data_properties = {"target_type": "classification", "multilabel": True}
+ available_components = fp.FeaturePreprocessorChoice(
+ data_properties
+ ).get_available_components(data_properties)
self.assertEqual(len(available_components), 12)
diff --git a/test/test_pipeline/components/feature_preprocessing/test_densifier.py b/test/test_pipeline/components/feature_preprocessing/test_densifier.py
index 6f02ee0e5b..9831a53e57 100644
--- a/test/test_pipeline/components/feature_preprocessing/test_densifier.py
+++ b/test/test_pipeline/components/feature_preprocessing/test_densifier.py
@@ -1,7 +1,7 @@
import numpy as np
from autosklearn.pipeline.components.feature_preprocessing.densifier import Densifier
-from autosklearn.pipeline.util import _test_preprocessing, PreprocessingTestCase
+from autosklearn.pipeline.util import PreprocessingTestCase, _test_preprocessing
class DensifierComponentTest(PreprocessingTestCase):
diff --git a/test/test_pipeline/components/feature_preprocessing/test_extra_trees_classification.py b/test/test_pipeline/components/feature_preprocessing/test_extra_trees_classification.py
index 6b69462fec..2db52679c7 100644
--- a/test/test_pipeline/components/feature_preprocessing/test_extra_trees_classification.py
+++ b/test/test_pipeline/components/feature_preprocessing/test_extra_trees_classification.py
@@ -1,29 +1,36 @@
-from sklearn.linear_model import RidgeClassifier
-from autosklearn.pipeline.components.feature_preprocessing.\
- extra_trees_preproc_for_classification import \
- ExtraTreesPreprocessorClassification
-from autosklearn.pipeline.util import _test_preprocessing, \
- PreprocessingTestCase, get_dataset
import sklearn.metrics
+from sklearn.linear_model import RidgeClassifier
+
+from autosklearn.pipeline.components.feature_preprocessing.extra_trees_preproc_for_classification import ( # noqa: E501
+ ExtraTreesPreprocessorClassification,
+)
+from autosklearn.pipeline.util import (
+ PreprocessingTestCase,
+ _test_preprocessing,
+ get_dataset,
+)
class ExtreTreesClassificationComponentTest(PreprocessingTestCase):
def test_default_configuration(self):
transformation, original = _test_preprocessing(
- ExtraTreesPreprocessorClassification)
+ ExtraTreesPreprocessorClassification
+ )
self.assertEqual(transformation.shape[0], original.shape[0])
self.assertFalse((transformation == 0).all())
def test_default_configuration_classify(self):
for i in range(2):
- X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
- make_sparse=False)
- configuration_space = ExtraTreesPreprocessorClassification.\
- get_hyperparameter_search_space()
+ X_train, Y_train, X_test, Y_test = get_dataset(
+ dataset="digits", make_sparse=False
+ )
+ configuration_space = (
+ ExtraTreesPreprocessorClassification.get_hyperparameter_search_space()
+ )
default = configuration_space.get_default_configuration()
preprocessor = ExtraTreesPreprocessorClassification(
- random_state=1,
- **{hp_name: default[hp_name] for hp_name in default})
+ random_state=1, **{hp_name: default[hp_name] for hp_name in default}
+ )
preprocessor.fit(X_train, Y_train)
X_train_trans = preprocessor.transform(X_train)
X_test_trans = preprocessor.transform(X_test)
@@ -37,14 +44,16 @@ def test_default_configuration_classify(self):
def test_default_configuration_classify_sparse(self):
for i in range(2):
- X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
- make_sparse=True)
- configuration_space = ExtraTreesPreprocessorClassification.\
- get_hyperparameter_search_space()
+ X_train, Y_train, X_test, Y_test = get_dataset(
+ dataset="digits", make_sparse=True
+ )
+ configuration_space = (
+ ExtraTreesPreprocessorClassification.get_hyperparameter_search_space()
+ )
default = configuration_space.get_default_configuration()
preprocessor = ExtraTreesPreprocessorClassification(
- random_state=1,
- **{hp_name: default[hp_name] for hp_name in default})
+ random_state=1, **{hp_name: default[hp_name] for hp_name in default}
+ )
preprocessor.fit(X_train, Y_train)
X_train_trans = preprocessor.transform(X_train)
X_test_trans = preprocessor.transform(X_test)
@@ -57,5 +66,6 @@ def test_default_configuration_classify_sparse(self):
self.assertAlmostEqual(accuracy, 0.43715846994535518, places=2)
def test_preprocessing_dtype(self):
- super(ExtreTreesClassificationComponentTest, self).\
- _test_preprocessing_dtype(ExtraTreesPreprocessorClassification)
+ super(ExtreTreesClassificationComponentTest, self)._test_preprocessing_dtype(
+ ExtraTreesPreprocessorClassification
+ )
diff --git a/test/test_pipeline/components/feature_preprocessing/test_extra_trees_regression.py b/test/test_pipeline/components/feature_preprocessing/test_extra_trees_regression.py
index b850d5aa99..cd6ae3dd21 100644
--- a/test/test_pipeline/components/feature_preprocessing/test_extra_trees_regression.py
+++ b/test/test_pipeline/components/feature_preprocessing/test_extra_trees_regression.py
@@ -1,29 +1,34 @@
-from sklearn.ensemble import ExtraTreesRegressor
-from autosklearn.pipeline.components.feature_preprocessing.\
- extra_trees_preproc_for_regression import \
- ExtraTreesPreprocessorRegression
-from autosklearn.pipeline.util import _test_preprocessing, \
- PreprocessingTestCase, get_dataset
import sklearn.metrics
+from sklearn.ensemble import ExtraTreesRegressor
+
+from autosklearn.pipeline.components.feature_preprocessing.extra_trees_preproc_for_regression import ( # noqa: E501
+ ExtraTreesPreprocessorRegression,
+)
+from autosklearn.pipeline.util import (
+ PreprocessingTestCase,
+ _test_preprocessing,
+ get_dataset,
+)
class ExtraTreesRegressionComponentTest(PreprocessingTestCase):
def test_default_configuration(self):
- transformation, original = _test_preprocessing(
- ExtraTreesPreprocessorRegression)
+ transformation, original = _test_preprocessing(ExtraTreesPreprocessorRegression)
self.assertEqual(transformation.shape[0], original.shape[0])
self.assertFalse((transformation == 0).all())
def test_default_configuration_regression(self):
for i in range(2):
- X_train, Y_train, X_test, Y_test = get_dataset(dataset='boston',
- make_sparse=False)
- configuration_space = ExtraTreesPreprocessorRegression.\
- get_hyperparameter_search_space()
+ X_train, Y_train, X_test, Y_test = get_dataset(
+ dataset="boston", make_sparse=False
+ )
+ configuration_space = (
+ ExtraTreesPreprocessorRegression.get_hyperparameter_search_space()
+ )
default = configuration_space.get_default_configuration()
preprocessor = ExtraTreesPreprocessorRegression(
- random_state=1,
- **{hp_name: default[hp_name] for hp_name in default})
+ random_state=1, **{hp_name: default[hp_name] for hp_name in default}
+ )
preprocessor.fit(X_train, Y_train)
X_train_trans = preprocessor.transform(X_train)
X_test_trans = preprocessor.transform(X_test)
@@ -37,14 +42,16 @@ def test_default_configuration_regression(self):
def test_default_configuration_classify_sparse(self):
for i in range(2):
- X_train, Y_train, X_test, Y_test = get_dataset(dataset='boston',
- make_sparse=True)
- configuration_space = ExtraTreesPreprocessorRegression.\
- get_hyperparameter_search_space()
+ X_train, Y_train, X_test, Y_test = get_dataset(
+ dataset="boston", make_sparse=True
+ )
+ configuration_space = (
+ ExtraTreesPreprocessorRegression.get_hyperparameter_search_space()
+ )
default = configuration_space.get_default_configuration()
preprocessor = ExtraTreesPreprocessorRegression(
- random_state=1,
- **{hp_name: default[hp_name] for hp_name in default})
+ random_state=1, **{hp_name: default[hp_name] for hp_name in default}
+ )
preprocessor.fit(X_train, Y_train)
X_train_trans = preprocessor.transform(X_train)
X_test_trans = preprocessor.transform(X_test)
@@ -57,5 +64,6 @@ def test_default_configuration_classify_sparse(self):
self.assertAlmostEqual(error, 55.69613978965742, places=2)
def test_preprocessing_dtype(self):
- super(ExtraTreesRegressionComponentTest, self).\
- _test_preprocessing_dtype(ExtraTreesPreprocessorRegression)
+ super(ExtraTreesRegressionComponentTest, self)._test_preprocessing_dtype(
+ ExtraTreesPreprocessorRegression
+ )
diff --git a/test/test_pipeline/components/feature_preprocessing/test_fast_ica.py b/test/test_pipeline/components/feature_preprocessing/test_fast_ica.py
index ae22d65c54..a38097a60e 100644
--- a/test/test_pipeline/components/feature_preprocessing/test_fast_ica.py
+++ b/test/test_pipeline/components/feature_preprocessing/test_fast_ica.py
@@ -1,28 +1,30 @@
import unittest
-from sklearn.linear_model import Ridge
-from autosklearn.pipeline.components.feature_preprocessing.fast_ica import \
- FastICA
-from autosklearn.pipeline.util import _test_preprocessing, PreprocessingTestCase, \
- get_dataset
import sklearn.metrics
+from sklearn.linear_model import Ridge
+
+from autosklearn.pipeline.components.feature_preprocessing.fast_ica import FastICA
+from autosklearn.pipeline.util import (
+ PreprocessingTestCase,
+ _test_preprocessing,
+ get_dataset,
+)
class FastICAComponentTest(PreprocessingTestCase):
def test_default_configuration(self):
- transformation, original = _test_preprocessing(FastICA,
- dataset="diabetes")
+ transformation, original = _test_preprocessing(FastICA, dataset="diabetes")
self.assertEqual(transformation.shape[0], original.shape[0])
self.assertFalse((transformation == 0).all())
def test_default_configuration_regression(self):
for i in range(5):
- X_train, Y_train, X_test, Y_test = get_dataset(dataset='diabetes')
+ X_train, Y_train, X_test, Y_test = get_dataset(dataset="diabetes")
configuration_space = FastICA.get_hyperparameter_search_space()
default = configuration_space.get_default_configuration()
- preprocessor = FastICA(random_state=1,
- **{hp_name: default[hp_name] for hp_name in
- default})
+ preprocessor = FastICA(
+ random_state=1, **{hp_name: default[hp_name] for hp_name in default}
+ )
preprocessor.fit(X_train, Y_train)
X_train_trans = preprocessor.transform(X_train)
X_test_trans = preprocessor.transform(X_test)
@@ -36,5 +38,6 @@ def test_default_configuration_regression(self):
@unittest.skip("Always returns float64")
def test_preprocessing_dtype(self):
- super(FastICAComponentTest,
- self)._test_preprocessing_dtype(FastICA, dataset='diabetes')
+ super(FastICAComponentTest, self)._test_preprocessing_dtype(
+ FastICA, dataset="diabetes"
+ )
diff --git a/test/test_pipeline/components/feature_preprocessing/test_feature_agglomeration.py b/test/test_pipeline/components/feature_preprocessing/test_feature_agglomeration.py
index 0cac9426d2..afccd79c31 100644
--- a/test/test_pipeline/components/feature_preprocessing/test_feature_agglomeration.py
+++ b/test/test_pipeline/components/feature_preprocessing/test_feature_agglomeration.py
@@ -1,9 +1,14 @@
-from sklearn.ensemble import RandomForestClassifier
-from autosklearn.pipeline.components.feature_preprocessing.feature_agglomeration import \
- FeatureAgglomeration
-from autosklearn.pipeline.util import _test_preprocessing, PreprocessingTestCase, \
- get_dataset
import sklearn.metrics
+from sklearn.ensemble import RandomForestClassifier
+
+from autosklearn.pipeline.components.feature_preprocessing.feature_agglomeration import ( # noqa: E501
+ FeatureAgglomeration,
+)
+from autosklearn.pipeline.util import (
+ PreprocessingTestCase,
+ _test_preprocessing,
+ get_dataset,
+)
class FeatureAgglomerationComponentTest(PreprocessingTestCase):
@@ -14,13 +19,14 @@ def test_default_configuration(self):
def test_default_configuration_classify(self):
for i in range(3):
- X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
- make_sparse=False)
+ X_train, Y_train, X_test, Y_test = get_dataset(
+ dataset="digits", make_sparse=False
+ )
configuration_space = FeatureAgglomeration.get_hyperparameter_search_space()
default = configuration_space.get_default_configuration()
- preprocessor = FeatureAgglomeration(random_state=1,
- **{hp_name: default[hp_name] for
- hp_name in default})
+ preprocessor = FeatureAgglomeration(
+ random_state=1, **{hp_name: default[hp_name] for hp_name in default}
+ )
preprocessor.fit(X_train, Y_train)
X_train_trans = preprocessor.transform(X_train)
X_test_trans = preprocessor.transform(X_test)
@@ -33,6 +39,6 @@ def test_default_configuration_classify(self):
self.assertAlmostEqual(accuracy, 0.8761384335154827)
def test_preprocessing_dtype(self):
- super(FeatureAgglomerationComponentTest,
- self)._test_preprocessing_dtype(FeatureAgglomeration,
- test_sparse=False)
+ super(FeatureAgglomerationComponentTest, self)._test_preprocessing_dtype(
+ FeatureAgglomeration, test_sparse=False
+ )
diff --git a/test/test_pipeline/components/feature_preprocessing/test_kernel_pca.py b/test/test_pipeline/components/feature_preprocessing/test_kernel_pca.py
index 19b1368a49..2c5a8c865b 100644
--- a/test/test_pipeline/components/feature_preprocessing/test_kernel_pca.py
+++ b/test/test_pipeline/components/feature_preprocessing/test_kernel_pca.py
@@ -1,38 +1,46 @@
import unittest
-from sklearn.linear_model import RidgeClassifier
-from autosklearn.pipeline.components.feature_preprocessing.kernel_pca import \
- KernelPCA
-from autosklearn.pipeline.util import _test_preprocessing, PreprocessingTestCase, \
- get_dataset
import sklearn.metrics
+from sklearn.linear_model import RidgeClassifier
+
+from autosklearn.pipeline.components.feature_preprocessing.kernel_pca import KernelPCA
+from autosklearn.pipeline.util import (
+ PreprocessingTestCase,
+ _test_preprocessing,
+ get_dataset,
+)
class KernelPCAComponentTest(PreprocessingTestCase):
def test_default_configuration(self):
- transformation, original = _test_preprocessing(KernelPCA,
- dataset='digits',
- train_size_maximum=2000)
+ transformation, original = _test_preprocessing(
+ KernelPCA, dataset="digits", train_size_maximum=2000
+ )
self.assertEqual(transformation.shape[0], original.shape[0])
self.assertFalse((transformation == 0).all())
def test_default_configuration_sparse(self):
- transformation, original = _test_preprocessing(KernelPCA,
- make_sparse=True,
- dataset='digits')
+ transformation, original = _test_preprocessing(
+ KernelPCA, make_sparse=True, dataset="digits"
+ )
self.assertEqual(transformation.shape[0], original.shape[0])
self.assertFalse((transformation == 0).all())
def test_default_configuration_classify(self):
for i in range(5):
- X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
- make_sparse=False,
- train_size_maximum=1000)
+ X_train, Y_train, X_test, Y_test = get_dataset(
+ dataset="digits", make_sparse=False, train_size_maximum=1000
+ )
configuration_space = KernelPCA.get_hyperparameter_search_space()
default = configuration_space.get_default_configuration()
- preprocessor = KernelPCA(random_state=1,
- **{hp_name: default[hp_name] for hp_name in
- default if default[hp_name] is not None})
+ preprocessor = KernelPCA(
+ random_state=1,
+ **{
+ hp_name: default[hp_name]
+ for hp_name in default
+ if default[hp_name] is not None
+ },
+ )
preprocessor.fit(X_train, Y_train)
X_train_trans = preprocessor.transform(X_train)
X_test_trans = preprocessor.transform(X_test)
@@ -46,5 +54,4 @@ def test_default_configuration_classify(self):
@unittest.skip("Always returns float64")
def test_preprocessing_dtype(self):
- super(KernelPCAComponentTest,
- self)._test_preprocessing_dtype(KernelPCA)
+ super(KernelPCAComponentTest, self)._test_preprocessing_dtype(KernelPCA)
diff --git a/test/test_pipeline/components/feature_preprocessing/test_kitchen_sinks.py b/test/test_pipeline/components/feature_preprocessing/test_kitchen_sinks.py
index c94e6f9a55..16ef41198d 100644
--- a/test/test_pipeline/components/feature_preprocessing/test_kitchen_sinks.py
+++ b/test/test_pipeline/components/feature_preprocessing/test_kitchen_sinks.py
@@ -1,7 +1,9 @@
import unittest
-from autosklearn.pipeline.components.feature_preprocessing.kitchen_sinks import RandomKitchenSinks
-from autosklearn.pipeline.util import _test_preprocessing, PreprocessingTestCase
+from autosklearn.pipeline.components.feature_preprocessing.kitchen_sinks import (
+ RandomKitchenSinks,
+)
+from autosklearn.pipeline.util import PreprocessingTestCase, _test_preprocessing
class KitchenSinkComponent(PreprocessingTestCase):
@@ -13,5 +15,4 @@ def test_default_configuration(self):
@unittest.skip("Right now, the RBFSampler returns a float64 array!")
def test_preprocessing_dtype(self):
- super(KitchenSinkComponent,
- self)._test_preprocessing_dtype(RandomKitchenSinks)
+ super(KitchenSinkComponent, self)._test_preprocessing_dtype(RandomKitchenSinks)
diff --git a/test/test_pipeline/components/feature_preprocessing/test_liblinear.py b/test/test_pipeline/components/feature_preprocessing/test_liblinear.py
index 19b56b6eac..0195dfb701 100644
--- a/test/test_pipeline/components/feature_preprocessing/test_liblinear.py
+++ b/test/test_pipeline/components/feature_preprocessing/test_liblinear.py
@@ -1,15 +1,22 @@
-from sklearn.linear_model import RidgeClassifier
-from autosklearn.pipeline.components.feature_preprocessing.liblinear_svc_preprocessor import \
- LibLinear_Preprocessor
-from autosklearn.pipeline.util import _test_preprocessing, PreprocessingTestCase, \
- get_dataset
import sklearn.metrics
+from sklearn.linear_model import RidgeClassifier
-from test.test_pipeline.ignored_warnings import ignore_warnings, feature_preprocessing_warnings
+from autosklearn.pipeline.components.feature_preprocessing.liblinear_svc_preprocessor import ( # noqa: E501
+ LibLinear_Preprocessor,
+)
+from autosklearn.pipeline.util import (
+ PreprocessingTestCase,
+ _test_preprocessing,
+ get_dataset,
+)
+from test.test_pipeline.ignored_warnings import (
+ feature_preprocessing_warnings,
+ ignore_warnings,
+)
-class LiblinearComponentTest(PreprocessingTestCase):
+class LiblinearComponentTest(PreprocessingTestCase):
def test_default_configuration(self):
with ignore_warnings(feature_preprocessing_warnings):
transformation, original = _test_preprocessing(LibLinear_Preprocessor)
@@ -19,15 +26,21 @@ def test_default_configuration(self):
def test_default_configuration_classify(self):
for i in range(2):
- X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
- make_sparse=False)
- configuration_space = LibLinear_Preprocessor.get_hyperparameter_search_space()
+ X_train, Y_train, X_test, Y_test = get_dataset(
+ dataset="digits", make_sparse=False
+ )
+ configuration_space = (
+ LibLinear_Preprocessor.get_hyperparameter_search_space()
+ )
default = configuration_space.get_default_configuration()
- preprocessor = LibLinear_Preprocessor(random_state=1,
- **{hp_name: default[hp_name]
- for hp_name in
- default if default[
- hp_name] is not None})
+ preprocessor = LibLinear_Preprocessor(
+ random_state=1,
+ **{
+ hp_name: default[hp_name]
+ for hp_name in default
+ if default[hp_name] is not None
+ },
+ )
with ignore_warnings(feature_preprocessing_warnings):
preprocessor.fit(X_train, Y_train)
diff --git a/test/test_pipeline/components/feature_preprocessing/test_nystroem_sampler.py b/test/test_pipeline/components/feature_preprocessing/test_nystroem_sampler.py
index b3db49ebca..d6244c362f 100644
--- a/test/test_pipeline/components/feature_preprocessing/test_nystroem_sampler.py
+++ b/test/test_pipeline/components/feature_preprocessing/test_nystroem_sampler.py
@@ -3,8 +3,9 @@
import numpy as np
import sklearn.preprocessing
-from autosklearn.pipeline.components.feature_preprocessing.nystroem_sampler import \
- Nystroem
+from autosklearn.pipeline.components.feature_preprocessing.nystroem_sampler import (
+ Nystroem,
+)
from autosklearn.pipeline.util import _test_preprocessing, get_dataset
@@ -16,7 +17,7 @@ def test_default_configuration(self):
self.assertFalse((transformation == 0).all())
# Custon preprocessing test to check if clipping to zero works
- X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
+ X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits")
original_X_train = X_train.copy()
ss = sklearn.preprocessing.StandardScaler()
X_train = ss.fit_transform(X_train)
@@ -25,12 +26,15 @@ def test_default_configuration(self):
preprocessor = Nystroem(
random_state=1,
- **{hp_name: default[hp_name] for hp_name in default if default[hp_name] is not None},
- )
+ **{
+ hp_name: default[hp_name]
+ for hp_name in default
+ if default[hp_name] is not None
+ },
+ )
transformer = preprocessor.fit(X_train, Y_train)
- transformation, original = transformer.transform(
- X_train), original_X_train
+ transformation, original = transformer.transform(X_train), original_X_train
self.assertEqual(transformation.shape[0], original.shape[0])
self.assertEqual(transformation.shape[1], 100)
@@ -46,7 +50,7 @@ def _test_preprocessing_dtype(self):
preprocessor = Nystroem(
random_state=1,
**{hp.hyperparameter.name: hp.value for hp in default.values.values()},
- )
+ )
preprocessor.fit(X_train)
Xt = preprocessor.transform(X_train)
self.assertEqual(Xt.dtype, np.float32)
@@ -59,7 +63,7 @@ def _test_preprocessing_dtype(self):
preprocessor = Nystroem(
random_state=1,
**{hp.hyperparameter.name: hp.value for hp in default.values.values()},
- )
+ )
preprocessor.fit(X_train, Y_train)
Xt = preprocessor.transform(X_train)
self.assertEqual(Xt.dtype, np.float64)
@@ -73,7 +77,7 @@ def _test_preprocessing_dtype(self):
preprocessor = Nystroem(
random_state=1,
**{hp.hyperparameter.name: hp.value for hp in default.values.values()},
- )
+ )
preprocessor.fit(X_train)
Xt = preprocessor.transform(X_train)
self.assertEqual(Xt.dtype, np.float32)
@@ -86,7 +90,7 @@ def _test_preprocessing_dtype(self):
preprocessor = Nystroem(
random_state=1,
**{hp.hyperparameter.name: hp.value for hp in default.values.values()},
- )
+ )
preprocessor.fit(X_train)
Xt = preprocessor.transform(X_train)
self.assertEqual(Xt.dtype, np.float64)
diff --git a/test/test_pipeline/components/feature_preprocessing/test_pca.py b/test/test_pipeline/components/feature_preprocessing/test_pca.py
index 02ab8bdd0e..b73da8aa64 100644
--- a/test/test_pipeline/components/feature_preprocessing/test_pca.py
+++ b/test/test_pipeline/components/feature_preprocessing/test_pca.py
@@ -1,7 +1,7 @@
import numpy as np
from autosklearn.pipeline.components.feature_preprocessing.pca import PCA
-from autosklearn.pipeline.util import _test_preprocessing, PreprocessingTestCase
+from autosklearn.pipeline.util import PreprocessingTestCase, _test_preprocessing
class PCAComponentTest(PreprocessingTestCase):
@@ -13,9 +13,9 @@ def test_default_configuration(self):
self.assertFalse((transformation == original).all())
transformations.append(transformation)
if len(transformations) > 1:
- np.testing.assert_allclose(transformations[-1],
- transformations[-2], rtol=1e-4)
+ np.testing.assert_allclose(
+ transformations[-1], transformations[-2], rtol=1e-4
+ )
def test_preprocessing_dtype(self):
- super(PCAComponentTest, self)._test_preprocessing_dtype(PCA,
- test_sparse=False)
+ super(PCAComponentTest, self)._test_preprocessing_dtype(PCA, test_sparse=False)
diff --git a/test/test_pipeline/components/feature_preprocessing/test_polynomial.py b/test/test_pipeline/components/feature_preprocessing/test_polynomial.py
index 28f84bc595..3c9e93a49c 100644
--- a/test/test_pipeline/components/feature_preprocessing/test_polynomial.py
+++ b/test/test_pipeline/components/feature_preprocessing/test_polynomial.py
@@ -1,9 +1,14 @@
-from sklearn.tree import DecisionTreeClassifier
-from autosklearn.pipeline.components.feature_preprocessing.polynomial import \
- PolynomialFeatures
-from autosklearn.pipeline.util import _test_preprocessing, PreprocessingTestCase, \
- get_dataset
import sklearn.metrics
+from sklearn.tree import DecisionTreeClassifier
+
+from autosklearn.pipeline.components.feature_preprocessing.polynomial import (
+ PolynomialFeatures,
+)
+from autosklearn.pipeline.util import (
+ PreprocessingTestCase,
+ _test_preprocessing,
+ get_dataset,
+)
class PolynomialFeaturesComponentTest(PreprocessingTestCase):
@@ -14,13 +19,14 @@ def test_default_configuration(self):
def test_default_configuration_classify(self):
for i in range(2):
- X_train, Y_train, X_test, Y_test = get_dataset(dataset='breast_cancer',
- make_sparse=False)
+ X_train, Y_train, X_test, Y_test = get_dataset(
+ dataset="breast_cancer", make_sparse=False
+ )
configuration_space = PolynomialFeatures.get_hyperparameter_search_space()
default = configuration_space.get_default_configuration()
- preprocessor = PolynomialFeatures(random_state=1,
- **{hp_name: default[hp_name] for
- hp_name in default})
+ preprocessor = PolynomialFeatures(
+ random_state=1, **{hp_name: default[hp_name] for hp_name in default}
+ )
preprocessor.fit(X_train, Y_train)
X_train_trans = preprocessor.transform(X_train)
X_test_trans = preprocessor.transform(X_test)
@@ -34,13 +40,14 @@ def test_default_configuration_classify(self):
def test_default_configuration_classify_sparse(self):
for i in range(2):
- X_train, Y_train, X_test, Y_test = get_dataset(dataset='breast_cancer',
- make_sparse=True)
+ X_train, Y_train, X_test, Y_test = get_dataset(
+ dataset="breast_cancer", make_sparse=True
+ )
configuration_space = PolynomialFeatures.get_hyperparameter_search_space()
default = configuration_space.get_default_configuration()
- preprocessor = PolynomialFeatures(random_state=1,
- **{hp_name: default[hp_name] for
- hp_name in default})
+ preprocessor = PolynomialFeatures(
+ random_state=1, **{hp_name: default[hp_name] for hp_name in default}
+ )
preprocessor.fit(X_train, Y_train)
X_train_trans = preprocessor.transform(X_train)
X_test_trans = preprocessor.transform(X_test)
@@ -53,6 +60,6 @@ def test_default_configuration_classify_sparse(self):
self.assertAlmostEqual(accuracy, 0.8544152744630071, places=2)
def test_preprocessing_dtype(self):
- super(PolynomialFeaturesComponentTest,
- self)._test_preprocessing_dtype(PolynomialFeatures,
- test_sparse=False)
+ super(PolynomialFeaturesComponentTest, self)._test_preprocessing_dtype(
+ PolynomialFeatures, test_sparse=False
+ )
diff --git a/test/test_pipeline/components/feature_preprocessing/test_random_trees_embedding.py b/test/test_pipeline/components/feature_preprocessing/test_random_trees_embedding.py
index 681c319830..f84675dc1a 100644
--- a/test/test_pipeline/components/feature_preprocessing/test_random_trees_embedding.py
+++ b/test/test_pipeline/components/feature_preprocessing/test_random_trees_embedding.py
@@ -3,8 +3,9 @@
import numpy as np
import scipy.sparse
-from autosklearn.pipeline.components.feature_preprocessing.random_trees_embedding import \
- RandomTreesEmbedding
+from autosklearn.pipeline.components.feature_preprocessing.random_trees_embedding import ( # noqa: E501
+ RandomTreesEmbedding,
+)
from autosklearn.pipeline.util import _test_preprocessing, get_dataset
@@ -26,10 +27,9 @@ def test_preprocessing_dtype(self):
configuration_space = RandomTreesEmbedding.get_hyperparameter_search_space()
default = configuration_space.get_default_configuration()
- preprocessor = RandomTreesEmbedding(random_state=1,
- **{hp_name: default[hp_name] for
- hp_name in
- default})
+ preprocessor = RandomTreesEmbedding(
+ random_state=1, **{hp_name: default[hp_name] for hp_name in default}
+ )
preprocessor.fit(X_train)
Xt = preprocessor.transform(X_train)
@@ -40,10 +40,9 @@ def test_preprocessing_dtype(self):
X_train = X_train.astype(np.float64)
configuration_space = RandomTreesEmbedding.get_hyperparameter_search_space()
default = configuration_space.get_default_configuration()
- preprocessor = RandomTreesEmbedding(random_state=1,
- **{hp_name: default[hp_name] for
- hp_name in
- default})
+ preprocessor = RandomTreesEmbedding(
+ random_state=1, **{hp_name: default[hp_name] for hp_name in default}
+ )
preprocessor.fit(X_train, Y_train)
Xt = preprocessor.transform(X_train)
self.assertEqual(Xt.dtype, np.float64)
diff --git a/test/test_pipeline/components/feature_preprocessing/test_select_percentile_classification.py b/test/test_pipeline/components/feature_preprocessing/test_select_percentile_classification.py
index d7cde925b0..b177e4f4ba 100644
--- a/test/test_pipeline/components/feature_preprocessing/test_select_percentile_classification.py
+++ b/test/test_pipeline/components/feature_preprocessing/test_select_percentile_classification.py
@@ -4,8 +4,9 @@
import scipy.sparse
import sklearn.preprocessing
-from autosklearn.pipeline.components.feature_preprocessing.select_percentile_classification \
- import SelectPercentileClassification
+from autosklearn.pipeline.components.feature_preprocessing.select_percentile_classification import ( # noqa: E501
+ SelectPercentileClassification,
+)
from autosklearn.pipeline.util import _test_preprocessing, get_dataset
@@ -13,29 +14,35 @@ class SelectPercentileClassificationTest(unittest.TestCase):
def test_default_configuration(self):
transformation, original = _test_preprocessing(SelectPercentileClassification)
self.assertEqual(transformation.shape[0], original.shape[0])
- self.assertEqual(transformation.shape[1], int(original.shape[1]/2))
+ self.assertEqual(transformation.shape[1], int(original.shape[1] / 2))
self.assertFalse((transformation == 0).all())
transformation, original = _test_preprocessing(
- SelectPercentileClassification,
- make_sparse=True,
- )
+ SelectPercentileClassification,
+ make_sparse=True,
+ )
self.assertTrue(scipy.sparse.issparse(transformation))
self.assertEqual(transformation.shape[0], original.shape[0])
- self.assertEqual(transformation.shape[1], int(original.shape[1]/2))
+ self.assertEqual(transformation.shape[1], int(original.shape[1] / 2))
# Custon preprocessing test to check if clipping to zero works
- X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
+ X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits")
original_X_train = X_train.copy()
ss = sklearn.preprocessing.StandardScaler()
X_train = ss.fit_transform(X_train)
- configuration_space = SelectPercentileClassification.get_hyperparameter_search_space()
+ configuration_space = (
+ SelectPercentileClassification.get_hyperparameter_search_space()
+ )
default = configuration_space.get_default_configuration()
preprocessor = SelectPercentileClassification(
- random_state=1,
- **{hp_name: default[hp_name] for hp_name in default if default[hp_name] is not None},
- )
+ random_state=1,
+ **{
+ hp_name: default[hp_name]
+ for hp_name in default
+ if default[hp_name] is not None
+ },
+ )
transformer = preprocessor.fit(X_train, Y_train)
transformation, original = transformer.transform(X_train), original_X_train
@@ -48,11 +55,13 @@ def test_preprocessing_dtype(self):
X_train, Y_train, X_test, Y_test = get_dataset("iris")
self.assertEqual(X_train.dtype, np.float32)
- configuration_space = SelectPercentileClassification.get_hyperparameter_search_space()
+ configuration_space = (
+ SelectPercentileClassification.get_hyperparameter_search_space()
+ )
default = configuration_space.get_default_configuration()
- preprocessor = SelectPercentileClassification(random_state=1,
- **{hp_name: default[hp_name]
- for hp_name in default})
+ preprocessor = SelectPercentileClassification(
+ random_state=1, **{hp_name: default[hp_name] for hp_name in default}
+ )
preprocessor.fit(X_train, Y_train)
Xt = preprocessor.transform(X_train)
self.assertEqual(Xt.dtype, np.float32)
@@ -60,11 +69,13 @@ def test_preprocessing_dtype(self):
# np.float64
X_train, Y_train, X_test, Y_test = get_dataset("iris")
X_train = X_train.astype(np.float64)
- configuration_space = SelectPercentileClassification.get_hyperparameter_search_space()
+ configuration_space = (
+ SelectPercentileClassification.get_hyperparameter_search_space()
+ )
default = configuration_space.get_default_configuration()
- preprocessor = SelectPercentileClassification(random_state=1,
- **{hp_name: default[hp_name]
- for hp_name in default})
+ preprocessor = SelectPercentileClassification(
+ random_state=1, **{hp_name: default[hp_name] for hp_name in default}
+ )
preprocessor.fit(X_train, Y_train)
Xt = preprocessor.transform(X_train)
self.assertEqual(Xt.dtype, np.float64)
@@ -73,11 +84,13 @@ def test_preprocessing_dtype(self):
# np.float32
X_train, Y_train, X_test, Y_test = get_dataset("iris", make_sparse=True)
self.assertEqual(X_train.dtype, np.float32)
- configuration_space = SelectPercentileClassification.get_hyperparameter_search_space()
+ configuration_space = (
+ SelectPercentileClassification.get_hyperparameter_search_space()
+ )
default = configuration_space.get_default_configuration()
- preprocessor = SelectPercentileClassification(random_state=1,
- **{hp_name: default[hp_name]
- for hp_name in default})
+ preprocessor = SelectPercentileClassification(
+ random_state=1, **{hp_name: default[hp_name] for hp_name in default}
+ )
preprocessor.fit(X_train, Y_train)
Xt = preprocessor.transform(X_train)
self.assertEqual(Xt.dtype, np.float32)
@@ -85,11 +98,13 @@ def test_preprocessing_dtype(self):
# np.float64
X_train, Y_train, X_test, Y_test = get_dataset("iris", make_sparse=True)
X_train = X_train.astype(np.float64)
- configuration_space = SelectPercentileClassification.get_hyperparameter_search_space()
+ configuration_space = (
+ SelectPercentileClassification.get_hyperparameter_search_space()
+ )
default = configuration_space.get_default_configuration()
- preprocessor = SelectPercentileClassification(random_state=1,
- **{hp_name: default[hp_name]
- for hp_name in default})
+ preprocessor = SelectPercentileClassification(
+ random_state=1, **{hp_name: default[hp_name] for hp_name in default}
+ )
preprocessor.fit(X_train, Y_train)
Xt = preprocessor.transform(X_train)
self.assertEqual(Xt.dtype, np.float64)
diff --git a/test/test_pipeline/components/feature_preprocessing/test_select_percentile_regression.py b/test/test_pipeline/components/feature_preprocessing/test_select_percentile_regression.py
index a76a15c5a3..0fd335fd83 100644
--- a/test/test_pipeline/components/feature_preprocessing/test_select_percentile_regression.py
+++ b/test/test_pipeline/components/feature_preprocessing/test_select_percentile_regression.py
@@ -2,8 +2,9 @@
import numpy as np
-from autosklearn.pipeline.components.feature_preprocessing.select_percentile_regression \
- import SelectPercentileRegression
+from autosklearn.pipeline.components.feature_preprocessing.select_percentile_regression import ( # noqa: E501
+ SelectPercentileRegression,
+)
from autosklearn.pipeline.util import _test_preprocessing, get_dataset
@@ -12,9 +13,9 @@ def test_default_configuration(self):
transformation, original = _test_preprocessing(
dataset="boston",
Preprocessor=SelectPercentileRegression,
- )
+ )
self.assertEqual(transformation.shape[0], original.shape[0])
- self.assertEqual(transformation.shape[1], int(original.shape[1]/2))
+ self.assertEqual(transformation.shape[1], int(original.shape[1] / 2))
self.assertFalse((transformation == 0).all())
def test_preprocessing_dtype(self):
@@ -23,11 +24,13 @@ def test_preprocessing_dtype(self):
X_train, Y_train, X_test, Y_test = get_dataset("iris")
self.assertEqual(X_train.dtype, np.float32)
- configuration_space = SelectPercentileRegression.get_hyperparameter_search_space()
+ configuration_space = (
+ SelectPercentileRegression.get_hyperparameter_search_space()
+ )
default = configuration_space.get_default_configuration()
- preprocessor = SelectPercentileRegression(random_state=1,
- **{hp_name: default[hp_name]
- for hp_name in default})
+ preprocessor = SelectPercentileRegression(
+ random_state=1, **{hp_name: default[hp_name] for hp_name in default}
+ )
preprocessor.fit(X_train, Y_train)
Xt = preprocessor.transform(X_train)
self.assertEqual(Xt.dtype, np.float32)
@@ -35,11 +38,13 @@ def test_preprocessing_dtype(self):
# np.float64
X_train, Y_train, X_test, Y_test = get_dataset("iris")
X_train = X_train.astype(np.float64)
- configuration_space = SelectPercentileRegression.get_hyperparameter_search_space()
+ configuration_space = (
+ SelectPercentileRegression.get_hyperparameter_search_space()
+ )
default = configuration_space.get_default_configuration()
- preprocessor = SelectPercentileRegression(random_state=1,
- **{hp_name: default[hp_name]
- for hp_name in default})
+ preprocessor = SelectPercentileRegression(
+ random_state=1, **{hp_name: default[hp_name] for hp_name in default}
+ )
preprocessor.fit(X_train, Y_train)
Xt = preprocessor.transform(X_train)
self.assertEqual(Xt.dtype, np.float64)
diff --git a/test/test_pipeline/components/feature_preprocessing/test_select_rates_classification.py b/test/test_pipeline/components/feature_preprocessing/test_select_rates_classification.py
index 2497b5174a..2d1c2aaf78 100644
--- a/test/test_pipeline/components/feature_preprocessing/test_select_rates_classification.py
+++ b/test/test_pipeline/components/feature_preprocessing/test_select_rates_classification.py
@@ -4,8 +4,9 @@
import scipy.sparse
import sklearn.preprocessing
-from autosklearn.pipeline.components.feature_preprocessing.select_rates_classification import \
- SelectClassificationRates
+from autosklearn.pipeline.components.feature_preprocessing.select_rates_classification import ( # noqa: E501
+ SelectClassificationRates,
+)
from autosklearn.pipeline.util import _test_preprocessing, get_dataset
@@ -17,27 +18,33 @@ def test_default_configuration(self):
self.assertFalse((transformation == 0).all())
transformation, original = _test_preprocessing(
- SelectClassificationRates, make_sparse=True)
+ SelectClassificationRates, make_sparse=True
+ )
self.assertTrue(scipy.sparse.issparse(transformation))
self.assertEqual(transformation.shape[0], original.shape[0])
self.assertEqual(transformation.shape[1], int(original.shape[1] / 2))
# Custom preprocessing test to check if clipping to zero works
- X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
+ X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits")
original_X_train = X_train.copy()
ss = sklearn.preprocessing.StandardScaler()
X_train = ss.fit_transform(X_train)
- configuration_space = SelectClassificationRates.get_hyperparameter_search_space()
+ configuration_space = (
+ SelectClassificationRates.get_hyperparameter_search_space()
+ )
default = configuration_space.get_default_configuration()
- preprocessor = SelectClassificationRates(random_state=1,
- **{hp_name: default[hp_name]
- for hp_name in default
- if default[hp_name] is not None})
+ preprocessor = SelectClassificationRates(
+ random_state=1,
+ **{
+ hp_name: default[hp_name]
+ for hp_name in default
+ if default[hp_name] is not None
+ },
+ )
transformer = preprocessor.fit(X_train, Y_train)
- transformation, original = transformer.transform(
- X_train), original_X_train
+ transformation, original = transformer.transform(X_train), original_X_train
self.assertEqual(transformation.shape[0], original.shape[0])
# I don't know why it's 52 here and not 32 which would be half of the
# number of features. Seems to be related to a runtime warning raised
@@ -50,11 +57,13 @@ def test_preprocessing_dtype(self):
X_train, Y_train, X_test, Y_test = get_dataset("iris")
self.assertEqual(X_train.dtype, np.float32)
- configuration_space = SelectClassificationRates.get_hyperparameter_search_space()
+ configuration_space = (
+ SelectClassificationRates.get_hyperparameter_search_space()
+ )
default = configuration_space.get_default_configuration()
- preprocessor = SelectClassificationRates(random_state=1,
- **{hp_name: default[hp_name] for hp_name in
- default})
+ preprocessor = SelectClassificationRates(
+ random_state=1, **{hp_name: default[hp_name] for hp_name in default}
+ )
preprocessor.fit(X_train, Y_train)
Xt = preprocessor.transform(X_train)
self.assertEqual(Xt.dtype, np.float32)
@@ -62,11 +71,13 @@ def test_preprocessing_dtype(self):
# np.float64
X_train, Y_train, X_test, Y_test = get_dataset("iris")
X_train = X_train.astype(np.float64)
- configuration_space = SelectClassificationRates.get_hyperparameter_search_space()
+ configuration_space = (
+ SelectClassificationRates.get_hyperparameter_search_space()
+ )
default = configuration_space.get_default_configuration()
- preprocessor = SelectClassificationRates(random_state=1,
- **{hp_name: default[hp_name] for hp_name in
- default})
+ preprocessor = SelectClassificationRates(
+ random_state=1, **{hp_name: default[hp_name] for hp_name in default}
+ )
preprocessor.fit(X_train, Y_train)
Xt = preprocessor.transform(X_train)
self.assertEqual(Xt.dtype, np.float64)
@@ -75,11 +86,13 @@ def test_preprocessing_dtype(self):
# np.float32
X_train, Y_train, X_test, Y_test = get_dataset("iris", make_sparse=True)
self.assertEqual(X_train.dtype, np.float32)
- configuration_space = SelectClassificationRates.get_hyperparameter_search_space()
+ configuration_space = (
+ SelectClassificationRates.get_hyperparameter_search_space()
+ )
default = configuration_space.get_default_configuration()
- preprocessor = SelectClassificationRates(random_state=1,
- **{hp_name: default[hp_name] for hp_name in
- default})
+ preprocessor = SelectClassificationRates(
+ random_state=1, **{hp_name: default[hp_name] for hp_name in default}
+ )
preprocessor.fit(X_train, Y_train)
Xt = preprocessor.transform(X_train)
self.assertEqual(Xt.dtype, np.float32)
@@ -87,11 +100,13 @@ def test_preprocessing_dtype(self):
# np.float64
X_train, Y_train, X_test, Y_test = get_dataset("iris", make_sparse=True)
X_train = X_train.astype(np.float64)
- configuration_space = SelectClassificationRates.get_hyperparameter_search_space()
+ configuration_space = (
+ SelectClassificationRates.get_hyperparameter_search_space()
+ )
default = configuration_space.get_default_configuration()
- preprocessor = SelectClassificationRates(random_state=1,
- **{hp_name: default[hp_name] for hp_name in
- default})
+ preprocessor = SelectClassificationRates(
+ random_state=1, **{hp_name: default[hp_name] for hp_name in default}
+ )
preprocessor.fit(X_train, Y_train)
Xt = preprocessor.transform(X_train)
self.assertEqual(Xt.dtype, np.float64)
diff --git a/test/test_pipeline/components/feature_preprocessing/test_select_rates_regression.py b/test/test_pipeline/components/feature_preprocessing/test_select_rates_regression.py
index 573bab32ce..869d7fbee2 100644
--- a/test/test_pipeline/components/feature_preprocessing/test_select_rates_regression.py
+++ b/test/test_pipeline/components/feature_preprocessing/test_select_rates_regression.py
@@ -4,8 +4,9 @@
import scipy.sparse
import sklearn.preprocessing
-from autosklearn.pipeline.components.feature_preprocessing.select_rates_regression import \
- SelectRegressionRates
+from autosklearn.pipeline.components.feature_preprocessing.select_rates_regression import ( # noqa: E501
+ SelectRegressionRates,
+)
from autosklearn.pipeline.util import _test_preprocessing, get_dataset
@@ -17,34 +18,38 @@ def test_default_configuration(self):
self.assertFalse((transformation == 0).all())
transformation, original = _test_preprocessing(
- SelectRegressionRates, make_sparse=True)
+ SelectRegressionRates, make_sparse=True
+ )
self.assertTrue(scipy.sparse.issparse(transformation))
self.assertEqual(transformation.shape[0], original.shape[0])
self.assertEqual(transformation.shape[1], int(original.shape[1] / 2))
# Makes sure that the features are reduced, not the number of samples
- X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
+ X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits")
original_X_train = X_train.copy()
ss = sklearn.preprocessing.StandardScaler()
X_train = ss.fit_transform(X_train)
configuration_space = SelectRegressionRates.get_hyperparameter_search_space()
default = configuration_space.get_default_configuration()
- preprocessor = SelectRegressionRates(random_state=1,
- **{hp_name: default[hp_name]
- for hp_name in default
- if default[hp_name] is not None})
+ preprocessor = SelectRegressionRates(
+ random_state=1,
+ **{
+ hp_name: default[hp_name]
+ for hp_name in default
+ if default[hp_name] is not None
+ },
+ )
transformer = preprocessor.fit(X_train, Y_train)
- transformation, original = transformer.transform(
- X_train), original_X_train
+ transformation, original = transformer.transform(X_train), original_X_train
self.assertEqual(transformation.shape[0], original.shape[0])
self.assertEqual(transformation.shape[1], 21)
def test_default_configuration_regression(self):
transformation, original = _test_preprocessing(
SelectRegressionRates,
- dataset='boston',
+ dataset="boston",
)
self.assertEqual(transformation.shape[0], original.shape[0])
# From 13 to 12 features
@@ -57,15 +62,15 @@ def test_preprocessing_dtype_regression(self):
X_train, Y_train, X_test, Y_test = get_dataset("boston")
self.assertEqual(X_train.dtype, np.float32)
- dataset_properties = {'target_type': 'regression'}
+ dataset_properties = {"target_type": "regression"}
configuration_space = SelectRegressionRates.get_hyperparameter_search_space(
dataset_properties
)
default = configuration_space.get_default_configuration()
- preprocessor = SelectRegressionRates(random_state=1,
- **{hp_name: default[hp_name] for hp_name in
- default})
+ preprocessor = SelectRegressionRates(
+ random_state=1, **{hp_name: default[hp_name] for hp_name in default}
+ )
preprocessor.fit(X_train, Y_train)
Xt = preprocessor.transform(X_train)
self.assertEqual(Xt.dtype, np.float32)
@@ -77,9 +82,9 @@ def test_preprocessing_dtype_regression(self):
dataset_properties
)
default = configuration_space.get_default_configuration()
- preprocessor = SelectRegressionRates(random_state=1,
- **{hp_name: default[hp_name] for hp_name in
- default})
+ preprocessor = SelectRegressionRates(
+ random_state=1, **{hp_name: default[hp_name] for hp_name in default}
+ )
preprocessor.fit(X_train, Y_train)
Xt = preprocessor.transform(X_train)
self.assertEqual(Xt.dtype, np.float64)
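
The SelectRegressionRates checks above all revolve around univariate feature selection controlled by a mode/param pair (false-positive rate, FDR, FWE or percentile). As a point of reference, here is a minimal sketch of that idea using a plain scikit-learn primitive that exposes the same knobs, GenericUnivariateSelect, on synthetic regression data; the dataset and thresholds are illustrative only.

from sklearn.datasets import make_regression
from sklearn.feature_selection import GenericUnivariateSelect, f_regression

# 5 informative columns out of 20; mode="fpr" keeps the features whose
# univariate f_regression p-value falls below `param`.
X, y = make_regression(n_samples=200, n_features=20, n_informative=5, random_state=0)
selector = GenericUnivariateSelect(score_func=f_regression, mode="fpr", param=0.05)
X_selected = selector.fit_transform(X, y)
print(X.shape, "->", X_selected.shape)  # columns shrink, rows are untouched
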
diff --git a/test/test_pipeline/components/feature_preprocessing/test_truncatedSVD.py b/test/test_pipeline/components/feature_preprocessing/test_truncatedSVD.py
index df1f1d2fe6..7e16fa7fa5 100644
--- a/test/test_pipeline/components/feature_preprocessing/test_truncatedSVD.py
+++ b/test/test_pipeline/components/feature_preprocessing/test_truncatedSVD.py
@@ -1,11 +1,16 @@
import unittest
-from sklearn.linear_model import RidgeClassifier
-from autosklearn.pipeline.components.feature_preprocessing.truncatedSVD import \
- TruncatedSVD
-from autosklearn.pipeline.util import _test_preprocessing, PreprocessingTestCase, \
- get_dataset
import sklearn.metrics
+from sklearn.linear_model import RidgeClassifier
+
+from autosklearn.pipeline.components.feature_preprocessing.truncatedSVD import (
+ TruncatedSVD,
+)
+from autosklearn.pipeline.util import (
+ PreprocessingTestCase,
+ _test_preprocessing,
+ get_dataset,
+)
class TruncatedSVDComponentTest(PreprocessingTestCase):
@@ -16,14 +21,19 @@ def test_default_configuration(self):
def test_default_configuration_classify(self):
for i in range(2):
- X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
- make_sparse=True)
+ X_train, Y_train, X_test, Y_test = get_dataset(
+ dataset="digits", make_sparse=True
+ )
configuration_space = TruncatedSVD.get_hyperparameter_search_space()
default = configuration_space.get_default_configuration()
preprocessor = TruncatedSVD(
random_state=1,
- **{hp_name: default[hp_name] for hp_name in default if default[hp_name] is not None}
- )
+ **{
+ hp_name: default[hp_name]
+ for hp_name in default
+ if default[hp_name] is not None
+ },
+ )
preprocessor.fit(X_train, Y_train)
X_train_trans = preprocessor.transform(X_train)
X_test_trans = preprocessor.transform(X_test)
@@ -37,6 +47,6 @@ def test_default_configuration_classify(self):
@unittest.skip("Truncated SVD returns np.float64.")
def test_preprocessing_dtype(self):
- super(TruncatedSVDComponentTest,
- self)._test_preprocessing_dtype(TruncatedSVD,
- test_sparse=False)
+ super(TruncatedSVDComponentTest, self)._test_preprocessing_dtype(
+ TruncatedSVD, test_sparse=False
+ )
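
For context on the component exercised above: TruncatedSVD is the PCA-like reduction that works directly on sparse input, and its output is a dense float64 array, which is also the stated reason the dtype test is skipped. A small standalone sketch with synthetic sparse data follows; the sizes and names are illustrative only.

import numpy as np
import scipy.sparse
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import RidgeClassifier

rng = np.random.RandomState(0)
X = scipy.sparse.random(200, 50, density=0.1, format="csr", random_state=rng)
y = rng.randint(0, 3, size=200)

# Reduce the sparse matrix, then classify on the dense reduced representation.
X_reduced = TruncatedSVD(n_components=10, random_state=1).fit_transform(X)
print(X_reduced.shape, X_reduced.dtype)  # (200, 10) float64
print(RidgeClassifier().fit(X_reduced, y).score(X_reduced, y))
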
diff --git a/test/test_pipeline/components/regression/test_adaboost.py b/test/test_pipeline/components/regression/test_adaboost.py
index c7f199d5ee..b62df4fd9b 100644
--- a/test/test_pipeline/components/regression/test_adaboost.py
+++ b/test/test_pipeline/components/regression/test_adaboost.py
@@ -1,7 +1,7 @@
import sklearn.ensemble
-from autosklearn.pipeline.components.regression.adaboost import \
- AdaboostRegressor
+from autosklearn.pipeline.components.regression.adaboost import AdaboostRegressor
+
from .test_base import BaseRegressionComponentTest
diff --git a/test/test_pipeline/components/regression/test_ard_regression.py b/test/test_pipeline/components/regression/test_ard_regression.py
index dac8d61349..829bf9b507 100644
--- a/test/test_pipeline/components/regression/test_ard_regression.py
+++ b/test/test_pipeline/components/regression/test_ard_regression.py
@@ -1,7 +1,7 @@
import sklearn.linear_model
-from autosklearn.pipeline.components.regression.ard_regression import \
- ARDRegression
+from autosklearn.pipeline.components.regression.ard_regression import ARDRegression
+
from .test_base import BaseRegressionComponentTest
diff --git a/test/test_pipeline/components/regression/test_base.py b/test/test_pipeline/components/regression/test_base.py
index 8ffc1d23fe..dcf7770332 100644
--- a/test/test_pipeline/components/regression/test_base.py
+++ b/test/test_pipeline/components/regression/test_base.py
@@ -1,19 +1,17 @@
-from typing import Type, Container
+from typing import Container, Type
import unittest
-import pytest
-
import numpy as np
+import pytest
import sklearn.metrics
-from autosklearn.pipeline.util import _test_regressor, _test_regressor_iterative_fit
-from autosklearn.pipeline.constants import SPARSE
+from autosklearn.pipeline.components.regression import RegressorChoice, _regressors
from autosklearn.pipeline.components.regression.libsvm_svr import LibSVM_SVR
+from autosklearn.pipeline.constants import SPARSE
+from autosklearn.pipeline.util import _test_regressor, _test_regressor_iterative_fit
-from autosklearn.pipeline.components.regression import _regressors, RegressorChoice
-
-from test.test_pipeline.ignored_warnings import regressor_warnings, ignore_warnings
+from test.test_pipeline.ignored_warnings import ignore_warnings, regressor_warnings
class BaseRegressionComponentTest(unittest.TestCase):
@@ -37,8 +35,7 @@ def test_default_boston(self):
with ignore_warnings(regressor_warnings):
predictions, targets, n_calls = _test_regressor(
- dataset="boston",
- Regressor=self.module
+ dataset="boston", Regressor=self.module
)
score = sklearn.metrics.r2_score(y_true=targets, y_pred=predictions)
@@ -70,14 +67,13 @@ def test_default_boston_iterative_fit(self):
if self.__class__ == BaseRegressionComponentTest:
return
- if not hasattr(self.module, 'iterative_fit'):
+ if not hasattr(self.module, "iterative_fit"):
return
for i in range(2):
with ignore_warnings(regressor_warnings):
predictions, targets, regressor = _test_regressor_iterative_fit(
- dataset="boston",
- Regressor=self.module
+ dataset="boston", Regressor=self.module
)
score = sklearn.metrics.r2_score(targets, predictions)
@@ -92,8 +88,8 @@ def test_default_boston_iterative_fit(self):
self.assertAlmostEqual(fixture, score, places)
if self.step_hyperparameter is not None:
- param_name = self.step_hyperparameter['name']
- default = self.step_hyperparameter['value']
+ param_name = self.step_hyperparameter["name"]
+ default = self.step_hyperparameter["value"]
value = getattr(regressor.estimator, param_name)
expected = self.res.get("boston_iterative_n_iter", default)
@@ -110,7 +106,7 @@ def test_default_boston_iterative_sparse_fit(self):
if self.__class__ == BaseRegressionComponentTest:
return
- if not hasattr(self.module, 'iterative_fit'):
+ if not hasattr(self.module, "iterative_fit"):
return
if SPARSE not in self.module.get_properties()["input"]:
@@ -119,15 +115,13 @@ def test_default_boston_iterative_sparse_fit(self):
for i in range(2):
with ignore_warnings(regressor_warnings):
predictions, targets, _ = _test_regressor_iterative_fit(
- dataset="boston",
- Regressor=self.module,
- sparse=True
+ dataset="boston", Regressor=self.module, sparse=True
)
- self.assertAlmostEqual(self.res["default_boston_iterative_sparse"],
- sklearn.metrics.r2_score(targets,
- predictions),
- places=self.res.get(
- "default_boston_iterative_sparse_places", 7))
+ self.assertAlmostEqual(
+ self.res["default_boston_iterative_sparse"],
+ sklearn.metrics.r2_score(targets, predictions),
+ places=self.res.get("default_boston_iterative_sparse_places", 7),
+ )
def test_default_boston_sparse(self):
@@ -140,16 +134,14 @@ def test_default_boston_sparse(self):
for i in range(2):
with ignore_warnings(regressor_warnings):
predictions, targets, _ = _test_regressor(
- dataset="boston",
- Regressor=self.module,
- sparse=True
+ dataset="boston", Regressor=self.module, sparse=True
)
- self.assertAlmostEqual(self.res["default_boston_sparse"],
- sklearn.metrics.r2_score(targets,
- predictions),
- places=self.res.get(
- "default_boston_sparse_places", 7))
+ self.assertAlmostEqual(
+ self.res["default_boston_sparse"],
+ sklearn.metrics.r2_score(targets, predictions),
+ places=self.res.get("default_boston_sparse_places", 7),
+ )
def test_default_diabetes(self):
@@ -159,15 +151,14 @@ def test_default_diabetes(self):
for i in range(2):
with ignore_warnings(regressor_warnings):
predictions, targets, n_calls = _test_regressor(
- dataset="diabetes",
- Regressor=self.module
+ dataset="diabetes", Regressor=self.module
)
- self.assertAlmostEqual(self.res["default_diabetes"],
- sklearn.metrics.r2_score(targets,
- predictions),
- places=self.res.get(
- "default_diabetes_places", 7))
+ self.assertAlmostEqual(
+ self.res["default_diabetes"],
+ sklearn.metrics.r2_score(targets, predictions),
+ places=self.res.get("default_diabetes_places", 7),
+ )
if self.res.get("diabetes_n_calls"):
self.assertEqual(self.res["diabetes_n_calls"], n_calls)
@@ -177,28 +168,27 @@ def test_default_diabetes_iterative_fit(self):
if self.__class__ == BaseRegressionComponentTest:
return
- if not hasattr(self.module, 'iterative_fit'):
+ if not hasattr(self.module, "iterative_fit"):
return
for i in range(2):
with ignore_warnings(regressor_warnings):
predictions, targets, _ = _test_regressor_iterative_fit(
- dataset="diabetes",
- Regressor=self.module
+ dataset="diabetes", Regressor=self.module
)
- self.assertAlmostEqual(self.res["default_diabetes_iterative"],
- sklearn.metrics.r2_score(targets,
- predictions),
- places=self.res.get(
- "default_diabetes_iterative_places", 7))
+ self.assertAlmostEqual(
+ self.res["default_diabetes_iterative"],
+ sklearn.metrics.r2_score(targets, predictions),
+ places=self.res.get("default_diabetes_iterative_places", 7),
+ )
def test_default_diabetes_iterative_sparse_fit(self):
if self.__class__ == BaseRegressionComponentTest:
return
- if not hasattr(self.module, 'iterative_fit'):
+ if not hasattr(self.module, "iterative_fit"):
return
if SPARSE not in self.module.get_properties()["input"]:
@@ -207,21 +197,21 @@ def test_default_diabetes_iterative_sparse_fit(self):
for i in range(2):
with ignore_warnings(regressor_warnings):
predictions, targets, regressor = _test_regressor_iterative_fit(
- dataset="diabetes",
- Regressor=self.module,
- sparse=True
+ dataset="diabetes", Regressor=self.module, sparse=True
)
- self.assertAlmostEqual(self.res["default_diabetes_iterative_sparse"],
- sklearn.metrics.r2_score(targets,
- predictions),
- places=self.res.get(
- "default_diabetes_iterative_sparse_places", 7))
+ self.assertAlmostEqual(
+ self.res["default_diabetes_iterative_sparse"],
+ sklearn.metrics.r2_score(targets, predictions),
+ places=self.res.get("default_diabetes_iterative_sparse_places", 7),
+ )
if self.step_hyperparameter is not None:
self.assertEqual(
- getattr(regressor.estimator, self.step_hyperparameter['name']),
- self.res.get("diabetes_iterative_n_iter", self.step_hyperparameter['value'])
+ getattr(regressor.estimator, self.step_hyperparameter["name"]),
+ self.res.get(
+ "diabetes_iterative_n_iter", self.step_hyperparameter["value"]
+ ),
)
def test_default_diabetes_sparse(self):
@@ -235,47 +225,73 @@ def test_default_diabetes_sparse(self):
for i in range(2):
with ignore_warnings(regressor_warnings):
predictions, targets, _ = _test_regressor(
- dataset="diabetes",
- Regressor=self.module,
- sparse=True
+ dataset="diabetes", Regressor=self.module, sparse=True
)
- self.assertAlmostEqual(self.res["default_diabetes_sparse"],
- sklearn.metrics.r2_score(targets,
- predictions),
- places=self.res.get(
- "default_diabetes_sparse_places", 7))
+ self.assertAlmostEqual(
+ self.res["default_diabetes_sparse"],
+ sklearn.metrics.r2_score(targets, predictions),
+ places=self.res.get("default_diabetes_sparse_places", 7),
+ )
def test_module_idempotent(self):
- """ Fitting twice with the same config gives the same model params.
+ """Fitting twice with the same config gives the same model params.
- This is only valid when the random_state passed is an int. If a
- RandomState object is passed then repeated calls to fit will have
- different results. See the section on "Controlling Randomness" in the
- sklearn docs.
+ This is only valid when the random_state passed is an int. If a
+ RandomState object is passed then repeated calls to fit will have
+ different results. See the section on "Controlling Randomness" in the
+ sklearn docs.
- https://scikit-learn.org/0.24/common_pitfalls.html#controlling-randomness
+ https://scikit-learn.org/0.24/common_pitfalls.html#controlling-randomness
"""
if self.__class__ == BaseRegressionComponentTest:
return
regressor_cls = self.module
- X = np.array([
- [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5],
- [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5],
- [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5],
- [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5],
- ])
- y = np.array([
- 1, 1, 1, 1,
- 1, 1, 1, 1,
- 1, 1, 1, 1,
- 1, 1, 1, 1,
- ])
+ X = np.array(
+ [
+ [0.5, 0.5],
+ [0.5, 0.5],
+ [0.5, 0.5],
+ [0.5, 0.5],
+ [0.5, 0.5],
+ [0.5, 0.5],
+ [0.5, 0.5],
+ [0.5, 0.5],
+ [0.5, 0.5],
+ [0.5, 0.5],
+ [0.5, 0.5],
+ [0.5, 0.5],
+ [0.5, 0.5],
+ [0.5, 0.5],
+ [0.5, 0.5],
+ [0.5, 0.5],
+ ]
+ )
+ y = np.array(
+ [
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ ]
+ )
# We ignore certain keys when comparing
- param_keys_ignored = ['base_estimator']
+ param_keys_ignored = ["base_estimator"]
# We use the default config + sampled ones
configuration_space = regressor_cls.get_hyperparameter_search_space()
@@ -292,14 +308,14 @@ def test_module_idempotent(self):
with ignore_warnings(regressor_warnings):
params_first = regressor.fit(X.copy(), y.copy()).estimator.get_params()
- if hasattr(regressor.estimator, 'random_state'):
+ if hasattr(regressor.estimator, "random_state"):
rs_1 = regressor.random_state
rs_estimator_1 = regressor.estimator.random_state
with ignore_warnings(regressor_warnings):
params_second = regressor.fit(X.copy(), y.copy()).estimator.get_params()
- if hasattr(regressor.estimator, 'random_state'):
+ if hasattr(regressor.estimator, "random_state"):
rs_2 = regressor.random_state
rs_estimator_2 = regressor.estimator.random_state
@@ -310,27 +326,27 @@ def test_module_idempotent(self):
del params[key]
# They should have equal parameters
- self.assertEqual(params_first, params_second,
- f"Failed with model args {model_args}")
- if (
- hasattr(regressor.estimator, 'random_state')
- and not isinstance(regressor, LibSVM_SVR)
+ self.assertEqual(
+ params_first, params_second, f"Failed with model args {model_args}"
+ )
+ if hasattr(regressor.estimator, "random_state") and not isinstance(
+ regressor, LibSVM_SVR
):
# sklearn.svm.SVR has it as an attribute but does not use it and
# defaults it to None, even if a value is passed in
- assert all([
- seed == random_state
- for random_state in [rs_1, rs_estimator_1, rs_2, rs_estimator_2]
- ])
+ assert all(
+ [
+ seed == random_state
+ for random_state in [rs_1, rs_estimator_1, rs_2, rs_estimator_2]
+ ]
+ )
@pytest.mark.parametrize("regressor", _regressors.values())
@pytest.mark.parametrize("X", [np.array([[1, 2, 3]] * 20)])
@pytest.mark.parametrize("y", [np.array([1] * 20)])
def test_fit_and_predict_with_1d_targets_as_1d(
- regressor: Type[RegressorChoice],
- X: np.ndarray,
- y: np.ndarray
+ regressor: Type[RegressorChoice], X: np.ndarray, y: np.ndarray
) -> None:
"""Test that all pipelines work with 1d target types
@@ -371,9 +387,7 @@ def test_fit_and_predict_with_1d_targets_as_1d(
@pytest.mark.parametrize("X", [np.array([[1, 2, 3]] * 20)])
@pytest.mark.parametrize("y", [np.array([[1]] * 20)])
def test_fit_and_predict_with_1d_targets_as_2d(
- regressor: Type[RegressorChoice],
- X: np.ndarray,
- y: np.ndarray
+ regressor: Type[RegressorChoice], X: np.ndarray, y: np.ndarray
) -> None:
"""Test that all pipelines work with 1d target types when they are wrapped as 2d
@@ -412,17 +426,18 @@ def test_fit_and_predict_with_1d_targets_as_2d(
assert len(predictions) == len(y)
-@pytest.mark.parametrize("regressor", [
- regressor
- for regressor in _regressors.values()
- if regressor.get_properties()['handles_multilabel']
-])
+@pytest.mark.parametrize(
+ "regressor",
+ [
+ regressor
+ for regressor in _regressors.values()
+ if regressor.get_properties()["handles_multilabel"]
+ ],
+)
@pytest.mark.parametrize("X", [np.array([[1, 2, 3]] * 20)])
@pytest.mark.parametrize("y", [np.array([[1, 1, 1]] * 20)])
def test_fit_and_predict_with_2d_targets(
- regressor: Type[RegressorChoice],
- X: np.ndarray,
- y: np.ndarray
+ regressor: Type[RegressorChoice], X: np.ndarray, y: np.ndarray
) -> None:
"""Test that all pipelines work with 2d target types
diff --git a/test/test_pipeline/components/regression/test_decision_tree.py b/test/test_pipeline/components/regression/test_decision_tree.py
index a5d2e53990..942b9db601 100644
--- a/test/test_pipeline/components/regression/test_decision_tree.py
+++ b/test/test_pipeline/components/regression/test_decision_tree.py
@@ -1,7 +1,7 @@
import sklearn.tree
-from autosklearn.pipeline.components.regression.decision_tree import \
- DecisionTree
+from autosklearn.pipeline.components.regression.decision_tree import DecisionTree
+
from .test_base import BaseRegressionComponentTest
diff --git a/test/test_pipeline/components/regression/test_extra_trees.py b/test/test_pipeline/components/regression/test_extra_trees.py
index 5d6f6d1acf..8d92fa30c8 100644
--- a/test/test_pipeline/components/regression/test_extra_trees.py
+++ b/test/test_pipeline/components/regression/test_extra_trees.py
@@ -1,7 +1,7 @@
import sklearn.ensemble
-from autosklearn.pipeline.components.regression.extra_trees import \
- ExtraTreesRegressor
+from autosklearn.pipeline.components.regression.extra_trees import ExtraTreesRegressor
+
from .test_base import BaseRegressionComponentTest
@@ -12,18 +12,18 @@ class ExtraTreesComponentTest(BaseRegressionComponentTest):
res = dict()
res["default_boston"] = 0.8539264243687228
res["boston_n_calls"] = 9
- res["default_boston_iterative"] = res['default_boston']
+ res["default_boston_iterative"] = res["default_boston"]
res["default_boston_sparse"] = 0.411211701806908
- res["default_boston_iterative_sparse"] = res['default_boston_sparse']
+ res["default_boston_iterative_sparse"] = res["default_boston_sparse"]
res["default_diabetes"] = 0.3885150255877827
res["diabetes_n_calls"] = 9
- res["default_diabetes_iterative"] = res['default_diabetes']
+ res["default_diabetes_iterative"] = res["default_diabetes"]
res["default_diabetes_sparse"] = 0.2422804139169642
- res["default_diabetes_iterative_sparse"] = res['default_diabetes_sparse']
+ res["default_diabetes_iterative_sparse"] = res["default_diabetes_sparse"]
sk_mod = sklearn.ensemble.ExtraTreesRegressor
module = ExtraTreesRegressor
step_hyperparameter = {
- 'name': 'n_estimators',
- 'value': module.get_max_iter(),
+ "name": "n_estimators",
+ "value": module.get_max_iter(),
}
diff --git a/test/test_pipeline/components/regression/test_gaussian_process.py b/test/test_pipeline/components/regression/test_gaussian_process.py
index d148d416df..0f766e22b1 100644
--- a/test/test_pipeline/components/regression/test_gaussian_process.py
+++ b/test/test_pipeline/components/regression/test_gaussian_process.py
@@ -1,7 +1,6 @@
import sklearn.gaussian_process
-from autosklearn.pipeline.components.regression.gaussian_process import \
- GaussianProcess
+from autosklearn.pipeline.components.regression.gaussian_process import GaussianProcess
from .test_base import BaseRegressionComponentTest
diff --git a/test/test_pipeline/components/regression/test_gradient_boosting.py b/test/test_pipeline/components/regression/test_gradient_boosting.py
index 9fcb2cd623..6412fd0598 100644
--- a/test/test_pipeline/components/regression/test_gradient_boosting.py
+++ b/test/test_pipeline/components/regression/test_gradient_boosting.py
@@ -1,7 +1,8 @@
import sklearn.ensemble
-from autosklearn.pipeline.components.regression.gradient_boosting import \
- GradientBoosting
+from autosklearn.pipeline.components.regression.gradient_boosting import (
+ GradientBoosting,
+)
from .test_base import BaseRegressionComponentTest
diff --git a/test/test_pipeline/components/regression/test_k_nearest_neighbors.py b/test/test_pipeline/components/regression/test_k_nearest_neighbors.py
index 40637c3ec8..19d0cf40f5 100644
--- a/test/test_pipeline/components/regression/test_k_nearest_neighbors.py
+++ b/test/test_pipeline/components/regression/test_k_nearest_neighbors.py
@@ -1,7 +1,9 @@
import sklearn.neighbors
-from autosklearn.pipeline.components.regression.k_nearest_neighbors import \
- KNearestNeighborsRegressor
+from autosklearn.pipeline.components.regression.k_nearest_neighbors import (
+ KNearestNeighborsRegressor,
+)
+
from .test_base import BaseRegressionComponentTest
diff --git a/test/test_pipeline/components/regression/test_liblinear_svr.py b/test/test_pipeline/components/regression/test_liblinear_svr.py
index 42b73bfba7..37b6552c9b 100644
--- a/test/test_pipeline/components/regression/test_liblinear_svr.py
+++ b/test/test_pipeline/components/regression/test_liblinear_svr.py
@@ -1,7 +1,7 @@
import sklearn.svm
-from autosklearn.pipeline.components.regression.liblinear_svr import \
- LibLinear_SVR
+from autosklearn.pipeline.components.regression.liblinear_svr import LibLinear_SVR
+
from .test_base import BaseRegressionComponentTest
diff --git a/test/test_pipeline/components/regression/test_mlp.py b/test/test_pipeline/components/regression/test_mlp.py
index c003037c76..9e2a92acac 100644
--- a/test/test_pipeline/components/regression/test_mlp.py
+++ b/test/test_pipeline/components/regression/test_mlp.py
@@ -64,6 +64,6 @@ class MLPComponentTest(BaseRegressionComponentTest):
sk_mod = sklearn.neural_network.MLPRegressor
module = MLPRegressor
step_hyperparameter = {
- 'name': 'n_iter_',
- 'value': module.get_max_iter(),
+ "name": "n_iter_",
+ "value": module.get_max_iter(),
}
diff --git a/test/test_pipeline/components/regression/test_random_forests.py b/test/test_pipeline/components/regression/test_random_forests.py
index ee6f342a8e..6e1634ff83 100644
--- a/test/test_pipeline/components/regression/test_random_forests.py
+++ b/test/test_pipeline/components/regression/test_random_forests.py
@@ -1,7 +1,7 @@
import sklearn.ensemble
-from autosklearn.pipeline.components.regression.random_forest import \
- RandomForest
+from autosklearn.pipeline.components.regression.random_forest import RandomForest
+
from .test_base import BaseRegressionComponentTest
@@ -11,18 +11,18 @@ class RandomForestComponentTest(BaseRegressionComponentTest):
res = dict()
res["default_boston"] = 0.8410063895401654
res["boston_n_calls"] = 9
- res["default_boston_iterative"] = res['default_boston']
+ res["default_boston_iterative"] = res["default_boston"]
res["default_boston_sparse"] = 0.4194462097407078
- res["default_boston_iterative_sparse"] = res['default_boston_sparse']
+ res["default_boston_iterative_sparse"] = res["default_boston_sparse"]
res["default_diabetes"] = 0.3496051170409269
res["diabetes_n_calls"] = 9
- res["default_diabetes_iterative"] = res['default_diabetes']
+ res["default_diabetes_iterative"] = res["default_diabetes"]
res["default_diabetes_sparse"] = 0.2383300978781976
- res["default_diabetes_iterative_sparse"] = res['default_diabetes_sparse']
+ res["default_diabetes_iterative_sparse"] = res["default_diabetes_sparse"]
sk_mod = sklearn.ensemble.RandomForestRegressor
module = RandomForest
step_hyperparameter = {
- 'name': 'n_estimators',
- 'value': module.get_max_iter(),
+ "name": "n_estimators",
+ "value": module.get_max_iter(),
}
diff --git a/test/test_pipeline/components/regression/test_sgd.py b/test/test_pipeline/components/regression/test_sgd.py
index df31b3e026..467f3519f1 100644
--- a/test/test_pipeline/components/regression/test_sgd.py
+++ b/test/test_pipeline/components/regression/test_sgd.py
@@ -1,6 +1,7 @@
import sklearn.linear_model
from autosklearn.pipeline.components.regression.sgd import SGD
+
from .test_base import BaseRegressionComponentTest
@@ -10,16 +11,16 @@ class SGDComponentTest(BaseRegressionComponentTest):
# Values are extremely bad because the invscaling does not drop the
# learning rate aggressively enough!
res = dict()
- res["default_boston"] = -1.1811672998629865e+28
+ res["default_boston"] = -1.1811672998629865e28
res["boston_n_calls"] = 6
- res["default_boston_iterative"] = res['default_boston']
- res["default_boston_sparse"] = -1.1518512489347601e+28
- res["default_boston_iterative_sparse"] = res['default_boston_sparse']
+ res["default_boston_iterative"] = res["default_boston"]
+ res["default_boston_sparse"] = -1.1518512489347601e28
+ res["default_boston_iterative_sparse"] = res["default_boston_sparse"]
res["default_diabetes"] = 0.27420813549185374
res["diabetes_n_calls"] = 10
- res["default_diabetes_iterative"] = res['default_diabetes']
+ res["default_diabetes_iterative"] = res["default_diabetes"]
res["default_diabetes_sparse"] = 0.034801785011824404
- res["default_diabetes_iterative_sparse"] = res['default_diabetes_sparse']
+ res["default_diabetes_iterative_sparse"] = res["default_diabetes_sparse"]
sk_mod = sklearn.linear_model.SGDRegressor
module = SGD
diff --git a/test/test_pipeline/components/regression/test_support_vector_regression.py b/test/test_pipeline/components/regression/test_support_vector_regression.py
index 57cde050ed..84bea51da6 100644
--- a/test/test_pipeline/components/regression/test_support_vector_regression.py
+++ b/test/test_pipeline/components/regression/test_support_vector_regression.py
@@ -1,6 +1,7 @@
import sklearn.linear_model
from autosklearn.pipeline.components.regression.libsvm_svr import LibSVM_SVR
+
from .test_base import BaseRegressionComponentTest
diff --git a/test/test_pipeline/components/test_base.py b/test/test_pipeline/components/test_base.py
index c53246cc77..1e6ddbbd14 100644
--- a/test/test_pipeline/components/test_base.py
+++ b/test/test_pipeline/components/test_base.py
@@ -2,20 +2,23 @@
import sys
import unittest
-from autosklearn.pipeline.components.base import find_components, \
- AutoSklearnClassificationAlgorithm
+from autosklearn.pipeline.components.base import (
+ AutoSklearnClassificationAlgorithm,
+ find_components,
+)
this_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(this_dir)
class TestBase(unittest.TestCase):
-
def test_find_components(self):
- c = find_components('dummy_components',
- os.path.join(this_dir, 'dummy_components'),
- AutoSklearnClassificationAlgorithm)
- print('COMPONENTS: %s' % repr(c))
+ c = find_components(
+ "dummy_components",
+ os.path.join(this_dir, "dummy_components"),
+ AutoSklearnClassificationAlgorithm,
+ )
+ print("COMPONENTS: %s" % repr(c))
self.assertEqual(len(c), 2)
- self.assertEqual(c['dummy_component_1'].__name__, 'DummyComponent1')
- self.assertEqual(c['dummy_component_2'].__name__, 'DummyComponent2')
+ self.assertEqual(c["dummy_component_1"].__name__, "DummyComponent1")
+ self.assertEqual(c["dummy_component_2"].__name__, "DummyComponent2")
diff --git a/test/test_pipeline/ignored_warnings.py b/test/test_pipeline/ignored_warnings.py
index 5b941281f9..715cacb6ba 100644
--- a/test/test_pipeline/ignored_warnings.py
+++ b/test/test_pipeline/ignored_warnings.py
@@ -1,106 +1,116 @@
-from contextlib import contextmanager
-from typing import List, Iterator, Tuple
+from typing import Iterator, List, Tuple, Type
import warnings
+from contextlib import contextmanager
from sklearn.exceptions import ConvergenceWarning
-
regressor_warnings = [
(
- UserWarning, ( # From QuantileTransformer
+ UserWarning,
+ ( # From QuantileTransformer
r"n_quantiles \(\d+\) is greater than the total number of samples \(\d+\)\."
r" n_quantiles is set to n_samples\."
- )
+ ),
),
(
- ConvergenceWarning, ( # From GaussianProcesses
+ ConvergenceWarning,
+ ( # From GaussianProcesses
r"The optimal value found for dimension \d+ of parameter \w+ is close"
r" to the specified (upper|lower) bound .*(Increasing|Decreasing) the bound"
r" and calling fit again may find a better value."
- )
+ ),
),
(
- UserWarning, ( # From FastICA
- r"n_components is too large: it will be set to \d+"
- )
+ UserWarning,
+ (r"n_components is too large: it will be set to \d+"), # From FastICA
),
(
- ConvergenceWarning, ( # From SGD
- r"Maximum number of iteration reached before convergence\. Consider increasing"
- r" max_iter to improve the fit\."
- )
+ ConvergenceWarning,
+ ( # From SGD
+ r"Maximum number of iteration reached before convergence\."
+ r" Consider increasing max_iter to improve the fit\."
+ ),
),
(
- ConvergenceWarning, ( # From MLP
+ ConvergenceWarning,
+ ( # From MLP
r"Stochastic Optimizer: Maximum iterations \(\d+\) reached and the"
r" optimization hasn't converged yet\."
- )
+ ),
),
]
classifier_warnings = [
(
- UserWarning, ( # From QuantileTransformer
+ UserWarning,
+ ( # From QuantileTransformer
r"n_quantiles \(\d+\) is greater than the total number of samples \(\d+\)\."
r" n_quantiles is set to n_samples\."
- )
+ ),
),
(
- UserWarning, ( # From FastICA
- r"n_components is too large: it will be set to \d+"
- )
-
+ UserWarning,
+ (r"n_components is too large: it will be set to \d+"), # From FastICA
),
(
- ConvergenceWarning, ( # From Liblinear
+ ConvergenceWarning,
+ ( # From Liblinear
r"Liblinear failed to converge, increase the number of iterations\."
- )
+ ),
),
(
- ConvergenceWarning, ( # From SGD
- r"Maximum number of iteration reached before convergence\. Consider increasing"
- r" max_iter to improve the fit\."
- )
+ ConvergenceWarning,
+ ( # From SGD
+ r"Maximum number of iteration reached before convergence\."
+            r" Consider increasing max_iter to improve the fit\."
+ ),
),
(
- ConvergenceWarning, ( # From MLP
+ ConvergenceWarning,
+ ( # From MLP
r"Stochastic Optimizer: Maximum iterations \(\d+\) reached and the"
r" optimization hasn't converged yet\."
- )
+ ),
),
(
- ConvergenceWarning, ( # From FastICA
+ ConvergenceWarning,
+ ( # From FastICA
r"FastICA did not converge\."
r" Consider increasing tolerance or the maximum number of iterations\."
- )
+ ),
),
(
- UserWarning, ( # From LDA (Linear Discriminant Analysis)
- r"Variables are collinear"
- )
+ UserWarning,
+ (r"Variables are collinear"), # From LDA (Linear Discriminant Analysis)
),
(
- UserWarning, (
+ UserWarning,
+ (
r"Clustering metrics expects discrete values but received continuous values"
r" for label, and multiclass values for target"
- )
- )
+ ),
+ ),
]
feature_preprocessing_warnings = [
(
- ConvergenceWarning, ( # From liblinear
+ ConvergenceWarning,
+ ( # From liblinear
r"Liblinear failed to converge, increase the number of iterations."
- )
+ ),
)
]
-ignored_warnings = regressor_warnings + classifier_warnings + feature_preprocessing_warnings
+ignored_warnings = (
+ regressor_warnings + classifier_warnings + feature_preprocessing_warnings
+)
@contextmanager
-def ignore_warnings(to_ignore: List[Tuple[Exception, str]] = ignored_warnings) -> Iterator[None]:
+def ignore_warnings(
+ to_ignore: List[Tuple[Type[Warning], str]] = ignored_warnings
+) -> Iterator[None]:
"""A context manager to ignore warnings
>>> with ignore_warnings(classifier_warnings):
@@ -113,5 +123,5 @@ def ignore_warnings(to_ignore: List[Tuple[Exception, str]] = ignored_warnings) -
"""
with warnings.catch_warnings():
for category, message in to_ignore:
- warnings.filterwarnings('ignore', category=category, message=message)
+ warnings.filterwarnings("ignore", category=category, message=message)
yield
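
One detail to keep in mind when editing the filter lists above: the message argument of warnings.filterwarnings is a regular expression matched against the start of the warning text, which is why the entries are written as regex fragments. Here is a small self-contained check of that behaviour; the warnings are emitted artificially for the example.

import warnings

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    warnings.filterwarnings(
        "ignore",
        category=UserWarning,
        message=r"n_components is too large: it will be set to \d+",
    )
    warnings.warn("n_components is too large: it will be set to 3", UserWarning)
    warnings.warn("an unrelated warning", UserWarning)

print([str(w.message) for w in caught])  # only the unrelated warning is recorded
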
diff --git a/test/test_pipeline/implementations/__init__.py b/test/test_pipeline/implementations/__init__.py
index 8f0ce6cb7c..92bf78f389 100644
--- a/test/test_pipeline/implementations/__init__.py
+++ b/test/test_pipeline/implementations/__init__.py
@@ -1 +1 @@
-__author__ = 'feurerm'
+__author__ = "feurerm"
diff --git a/test/test_pipeline/implementations/test_CategoryShift.py b/test/test_pipeline/implementations/test_CategoryShift.py
index 621d9b47cb..1b5e1451e6 100644
--- a/test/test_pipeline/implementations/test_CategoryShift.py
+++ b/test/test_pipeline/implementations/test_CategoryShift.py
@@ -1,4 +1,5 @@
import unittest
+
import numpy as np
import scipy.sparse
@@ -6,7 +7,6 @@
class CategoryShiftTest(unittest.TestCase):
-
def test_dense(self):
X = np.random.randint(0, 255, (3, 4))
Y = CategoryShift().fit_transform(X)
@@ -14,7 +14,8 @@ def test_dense(self):
def test_sparse(self):
X = scipy.sparse.csc_matrix(
- ([1, 2, 0, 4], ([0, 1, 2, 1], [3, 2, 1, 0])), shape=(3, 4))
+ ([1, 2, 0, 4], ([0, 1, 2, 1], [3, 2, 1, 0])), shape=(3, 4)
+ )
Y = CategoryShift().fit_transform(X)
X.data += 3
self.assertTrue((Y.todense() == X.todense()).all())
@@ -29,6 +30,6 @@ def test_negative(self):
CategoryShift().fit_transform(X)
def test_string(self):
- X = np.array([['a', 'b'], ['c', 'd']])
+ X = np.array([["a", "b"], ["c", "d"]])
with self.assertRaises(ValueError):
CategoryShift().fit_transform(X)
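
Taken together, the CategoryShift tests above pin down a simple contract: non-negative categorical codes are shifted up by a constant 3, negative values and string data are rejected, and for sparse input only the stored entries are shifted. A toy sketch of that contract follows; it is written for illustration and is not the auto-sklearn implementation.

import numpy as np
import scipy.sparse


def category_shift(X, offset: int = 3):
    data = X.data if scipy.sparse.issparse(X) else np.asarray(X)
    if data.dtype.kind not in "iuf":
        raise ValueError("only numeric category codes are supported")
    if (data < 0).any():
        raise ValueError("categories must be non-negative")
    if scipy.sparse.issparse(X):
        shifted = X.copy()
        shifted.data = shifted.data + offset  # shift only the stored entries
        return shifted
    return data + offset


print(category_shift(np.array([[0, 1], [4, 2]])))  # [[3 4] [7 5]]
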
diff --git a/test/test_pipeline/implementations/test_MinorityCoalescer.py b/test/test_pipeline/implementations/test_MinorityCoalescer.py
index 73cbf9049a..7bdca8f1aa 100644
--- a/test/test_pipeline/implementations/test_MinorityCoalescer.py
+++ b/test/test_pipeline/implementations/test_MinorityCoalescer.py
@@ -1,24 +1,25 @@
import unittest
-import numpy as np
+import numpy as np
import scipy.sparse
from autosklearn.pipeline.implementations.MinorityCoalescer import MinorityCoalescer
class MinorityCoalescerTest(unittest.TestCase):
-
@property
def X1(self):
# Generates an array with categories 3, 4, 5, 6, 7 and occurrences of 30%,
# 30%, 30%, 5% and 5% respectively
- X = np.vstack((
- np.ones((30, 10)) * 3,
- np.ones((30, 10)) * 4,
- np.ones((30, 10)) * 5,
- np.ones((5, 10)) * 6,
- np.ones((5, 10)) * 7,
- ))
+ X = np.vstack(
+ (
+ np.ones((30, 10)) * 3,
+ np.ones((30, 10)) * 4,
+ np.ones((30, 10)) * 5,
+ np.ones((5, 10)) * 6,
+ np.ones((5, 10)) * 7,
+ )
+ )
for col in range(X.shape[1]):
np.random.shuffle(X[:, col])
return X
@@ -27,13 +28,15 @@ def X1(self):
def X2(self):
# Generates an array with categories 3, 4, 5, 6, 7 and occurrences of 5%,
# 5%, 5%, 35% and 50% respectively
- X = np.vstack((
- np.ones((5, 10)) * 3,
- np.ones((5, 10)) * 4,
- np.ones((5, 10)) * 5,
- np.ones((35, 10)) * 6,
- np.ones((50, 10)) * 7,
- ))
+ X = np.vstack(
+ (
+ np.ones((5, 10)) * 3,
+ np.ones((5, 10)) * 4,
+ np.ones((5, 10)) * 5,
+ np.ones((35, 10)) * 6,
+ np.ones((50, 10)) * 7,
+ )
+ )
for col in range(X.shape[1]):
np.random.shuffle(X[:, col])
return X
@@ -48,7 +51,7 @@ def test_default(self):
def test_coalesce_10_percent(self):
X = self.X1
- Y = MinorityCoalescer(minimum_fraction=.1).fit_transform(X)
+ Y = MinorityCoalescer(minimum_fraction=0.1).fit_transform(X)
for col in range(Y.shape[1]):
hist = np.histogram(Y[:, col], bins=np.arange(1, 7))
np.testing.assert_array_almost_equal(hist[0], [10, 0, 30, 30, 30])
@@ -57,7 +60,7 @@ def test_coalesce_10_percent(self):
def test_coalesce_10_percent_sparse(self):
X = scipy.sparse.csc_matrix(self.X1)
- Y = MinorityCoalescer(minimum_fraction=.1).fit_transform(X)
+ Y = MinorityCoalescer(minimum_fraction=0.1).fit_transform(X)
# Assert no copies were made
self.assertEqual(id(X), id(Y))
Y = Y.todense()
@@ -75,7 +78,7 @@ def test_transform_after_fit(self):
X_fit = self.X1  # Here categories 3, 4, 5 occur above 10%
X_transf = self.X2  # Here it is the opposite, only categories 6 and 7 are above 10%
- mc = MinorityCoalescer(minimum_fraction=.1).fit(X_fit)
+ mc = MinorityCoalescer(minimum_fraction=0.1).fit(X_fit)
# transform() should coalesce categories as learned during fit.
# Category distribution in X_transf should be irrelevant.
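
All of the coalescing tests above check one idea: per column, categories whose relative frequency falls below minimum_fraction are collapsed into a single reserved bucket, and the expected histograms suggest that bucket is encoded as 1. A toy version of that idea follows; it is an illustration, not the MinorityCoalescer implementation.

import numpy as np


def coalesce_minorities(X, minimum_fraction=0.1):
    X = np.array(X, copy=True)
    for col in range(X.shape[1]):
        values, counts = np.unique(X[:, col], return_counts=True)
        rare = values[counts / X.shape[0] < minimum_fraction]
        X[np.isin(X[:, col], rare), col] = 1  # reserved bucket for rare categories
    return X


# 30% each of categories 3, 4, 5 and 5% each of 6 and 7, mirroring X1 above.
X = np.vstack(
    [np.full((n, 2), v) for v, n in [(3, 30), (4, 30), (5, 30), (6, 5), (7, 5)]]
)
print(np.unique(coalesce_minorities(X)[:, 0]))  # [1 3 4 5]; 6 and 7 were coalesced
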
diff --git a/test/test_pipeline/implementations/test_SparseOneHotEncoder.py b/test/test_pipeline/implementations/test_SparseOneHotEncoder.py
index 731533637b..91f1827c06 100644
--- a/test/test_pipeline/implementations/test_SparseOneHotEncoder.py
+++ b/test/test_pipeline/implementations/test_SparseOneHotEncoder.py
@@ -1,38 +1,37 @@
import unittest
import numpy as np
-
import scipy.sparse
-import sklearn.tree
import sklearn.datasets
import sklearn.model_selection
import sklearn.pipeline
+import sklearn.tree
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
-from autosklearn.pipeline.implementations.SparseOneHotEncoder import SparseOneHotEncoder
from autosklearn.pipeline.implementations.CategoryShift import CategoryShift
+from autosklearn.pipeline.implementations.SparseOneHotEncoder import SparseOneHotEncoder
-sparse1 = scipy.sparse.csc_matrix(([3, 2, 1, 1, 2, 3],
- ((1, 4, 5, 2, 3, 5),
- (0, 0, 0, 1, 1, 1))), shape=(6, 2))
-sparse1_1h = scipy.sparse.csc_matrix(([1, 1, 1, 1, 1, 1],
- ((5, 4, 1, 2, 3, 5),
- (0, 1, 2, 3, 4, 5))), shape=(6, 6))
+sparse1 = scipy.sparse.csc_matrix(
+ ([3, 2, 1, 1, 2, 3], ((1, 4, 5, 2, 3, 5), (0, 0, 0, 1, 1, 1))), shape=(6, 2)
+)
+sparse1_1h = scipy.sparse.csc_matrix(
+ ([1, 1, 1, 1, 1, 1], ((5, 4, 1, 2, 3, 5), (0, 1, 2, 3, 4, 5))), shape=(6, 6)
+)
-sparse2 = scipy.sparse.csc_matrix(([2, 1, 0, 0, 0, 0],
- ((1, 4, 5, 2, 3, 5),
- (0, 0, 0, 1, 1, 1))), shape=(6, 2))
-sparse2_1h = scipy.sparse.csc_matrix(([1, 1, 1, 1, 1, 1],
- ((5, 4, 1, 2, 3, 5),
- (0, 1, 2, 3, 3, 3))), shape=(6, 4))
+sparse2 = scipy.sparse.csc_matrix(
+ ([2, 1, 0, 0, 0, 0], ((1, 4, 5, 2, 3, 5), (0, 0, 0, 1, 1, 1))), shape=(6, 2)
+)
+sparse2_1h = scipy.sparse.csc_matrix(
+ ([1, 1, 1, 1, 1, 1], ((5, 4, 1, 2, 3, 5), (0, 1, 2, 3, 3, 3))), shape=(6, 4)
+)
-sparse2_csr = scipy.sparse.csr_matrix(([2, 1, 0, 0, 0, 0],
- ((1, 4, 5, 2, 3, 5),
- (0, 0, 0, 1, 1, 1))), shape=(6, 2))
-sparse2_csr_1h = scipy.sparse.csr_matrix(([1, 1, 1, 1, 1, 1],
- ((5, 4, 1, 2, 3, 5),
- (0, 1, 2, 3, 3, 3))), shape=(6, 4))
+sparse2_csr = scipy.sparse.csr_matrix(
+ ([2, 1, 0, 0, 0, 0], ((1, 4, 5, 2, 3, 5), (0, 0, 0, 1, 1, 1))), shape=(6, 2)
+)
+sparse2_csr_1h = scipy.sparse.csr_matrix(
+ ([1, 1, 1, 1, 1, 1], ((5, 4, 1, 2, 3, 5), (0, 1, 2, 3, 3, 3))), shape=(6, 4)
+)
class TestSparseOneHotEncoder(unittest.TestCase):
@@ -52,8 +51,7 @@ def _fit_then_transform(self, expected, input):
transformation = ohe.fit_transform(input)
self.assertIsInstance(transformation, scipy.sparse.csr_matrix)
np.testing.assert_array_almost_equal(
- expected.astype(float),
- transformation.todense()
+ expected.astype(float), transformation.todense()
)
self._check_arrays_equal(input, input_copy)
@@ -90,23 +88,26 @@ def test_transform_with_unknown_value(self):
self.assertEqual(3, np.sum(output))
def test_classification_workflow(self):
- X, y = sklearn.datasets.fetch_openml(data_id=24, as_frame=False, return_X_y=True)
+ X, y = sklearn.datasets.fetch_openml(
+ data_id=24, as_frame=False, return_X_y=True
+ )
print(type(X))
- X_train, X_test, y_train, y_test = \
- sklearn.model_selection.train_test_split(X, y, random_state=3,
- train_size=0.5,
- test_size=0.5)
+ X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
+ X, y, random_state=3, train_size=0.5, test_size=0.5
+ )
X_train = scipy.sparse.csc_matrix(X_train)
X_test = scipy.sparse.csc_matrix(X_test)
- pipeline = sklearn.pipeline.Pipeline((
- ('shift', CategoryShift()),
- ('imput', SimpleImputer(strategy='constant', fill_value=2)),
- ('ohe', SparseOneHotEncoder()),
- ('tree', DecisionTreeClassifier(random_state=1)),
- ))
+ pipeline = sklearn.pipeline.Pipeline(
+ (
+ ("shift", CategoryShift()),
+ ("imput", SimpleImputer(strategy="constant", fill_value=2)),
+ ("ohe", SparseOneHotEncoder()),
+ ("tree", DecisionTreeClassifier(random_state=1)),
+ )
+ )
pipeline.fit(X_train, y_train)
pred_train = pipeline.predict(X_train)
diff --git a/test/test_pipeline/implementations/test_util.py b/test/test_pipeline/implementations/test_util.py
index 06f2a1eb2f..58412e0b0c 100644
--- a/test/test_pipeline/implementations/test_util.py
+++ b/test/test_pipeline/implementations/test_util.py
@@ -7,19 +7,44 @@
class UtilTest(unittest.TestCase):
def test_softmax_binary(self):
- df = np.array([-40.00643897, 34.69754581, 23.71181359, -29.89724287,
- 27.06071791, -37.78334103, -40.15812461, 40.16139229,
- -27.85887801, 42.67404756, -36.89753589, -36.45148009,
- 54.68976306, 19.47886562, -49.99821027, -35.70205302,
- -40.59639267, 32.96343916, -39.23777841, -37.86535019,
- -33.10196906, 26.84144377, -36.8569686])
+ df = np.array(
+ [
+ -40.00643897,
+ 34.69754581,
+ 23.71181359,
+ -29.89724287,
+ 27.06071791,
+ -37.78334103,
+ -40.15812461,
+ 40.16139229,
+ -27.85887801,
+ 42.67404756,
+ -36.89753589,
+ -36.45148009,
+ 54.68976306,
+ 19.47886562,
+ -49.99821027,
+ -35.70205302,
+ -40.59639267,
+ 32.96343916,
+ -39.23777841,
+ -37.86535019,
+ -33.10196906,
+ 26.84144377,
+ -36.8569686,
+ ]
+ )
probas = softmax(df)
- expected = [[1., 0.] if d < 0. else [0., 1.] for d in df]
+ expected = [[1.0, 0.0] if d < 0.0 else [0.0, 1.0] for d in df]
np.testing.assert_array_almost_equal(expected, probas)
def test_softmax(self):
- df = np.array([[2.75021367e+10, -8.83772371e-01, -2.20516715e+27],
- [-2.10848072e+11, 2.35024444e-01, 5.20106536e+25]])
+ df = np.array(
+ [
+ [2.75021367e10, -8.83772371e-01, -2.20516715e27],
+ [-2.10848072e11, 2.35024444e-01, 5.20106536e25],
+ ]
+ )
# With a numerically unstable softmax, the output would be something
# like this:
# [[ 0. 0. nan]
@@ -30,6 +55,7 @@ def test_softmax(self):
df = np.array([[0.1, 0.6, 0.3], [0.2, 0.3, 0.5]])
probas = softmax(df)
- expected = np.array([[0.25838965, 0.42601251, 0.31559783],
- [0.28943311, 0.31987306, 0.39069383]])
+ expected = np.array(
+ [[0.25838965, 0.42601251, 0.31559783], [0.28943311, 0.31987306, 0.39069383]]
+ )
np.testing.assert_array_almost_equal(expected, probas)
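
The softmax test above deliberately feeds in magnitudes around 1e10 to 1e27, because a naive implementation overflows there. Below is a short sketch of the failure mode and of the usual max-subtraction fix; it is for illustration only and is not the project's softmax implementation.

import numpy as np


def naive_softmax(x):
    e = np.exp(x)  # overflows to inf for large entries (expect RuntimeWarnings)
    return e / e.sum(axis=1, keepdims=True)


def stable_softmax(x):
    e = np.exp(x - x.max(axis=1, keepdims=True))  # largest exponent becomes 0
    return e / e.sum(axis=1, keepdims=True)


df = np.array([[2.75021367e10, -8.83772371e-01, -2.20516715e27],
               [-2.10848072e11, 2.35024444e-01, 5.20106536e25]])
print(naive_softmax(df))   # rows contain nan, the failure mode described above
print(stable_softmax(df))  # [[1. 0. 0.] [0. 0. 1.]]
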
diff --git a/test/test_pipeline/test_base.py b/test/test_pipeline/test_base.py
index 0d40bca0d1..f1efed23b4 100644
--- a/test/test_pipeline/test_base.py
+++ b/test/test_pipeline/test_base.py
@@ -5,8 +5,8 @@
import autosklearn.pipeline.base
import autosklearn.pipeline.components.base
-import autosklearn.pipeline.components.feature_preprocessing
-import autosklearn.pipeline.components.classification
+import autosklearn.pipeline.components.classification as classification
+import autosklearn.pipeline.components.feature_preprocessing as feature_preprocessing
class BasePipelineMock(autosklearn.pipeline.base.BasePipeline):
@@ -17,42 +17,45 @@ def __init__(self):
class BaseTest(unittest.TestCase):
def test_get_hyperparameter_configuration_space_3choices(self):
cs = ConfigSpace.configuration_space.ConfigurationSpace()
- dataset_properties = {'target_type': 'classification'}
+ dataset_properties = {"target_type": "classification"}
exclude = {}
include = {}
- pipeline = [('p0',
- autosklearn.pipeline.components.feature_preprocessing
- .FeaturePreprocessorChoice(dataset_properties)),
- ('p1',
- autosklearn.pipeline.components.feature_preprocessing
- .FeaturePreprocessorChoice(dataset_properties)),
- ('c', autosklearn.pipeline.components.classification
- .ClassifierChoice(dataset_properties))]
+ pipeline = [
+ (
+ "p0",
+ feature_preprocessing.FeaturePreprocessorChoice(dataset_properties),
+ ),
+ (
+ "p1",
+ feature_preprocessing.FeaturePreprocessorChoice(dataset_properties),
+ ),
+ (
+ "c",
+ classification.ClassifierChoice(dataset_properties),
+ ),
+ ]
base = BasePipelineMock()
- cs = base._get_base_search_space(cs, dataset_properties,
- exclude, include, pipeline)
+ cs = base._get_base_search_space(
+ cs, dataset_properties, exclude, include, pipeline
+ )
- self.assertEqual(len(cs.get_hyperparameter("p0:__choice__").choices),
- 13)
- self.assertEqual(len(cs.get_hyperparameter("p1:__choice__").choices),
- 15)
+ self.assertEqual(len(cs.get_hyperparameter("p0:__choice__").choices), 13)
+ self.assertEqual(len(cs.get_hyperparameter("p1:__choice__").choices), 15)
# for clause in sorted([str(clause) for clause in cs.forbidden_clauses]):
# print(clause)
self.assertEqual(148, len(cs.forbidden_clauses))
cs = ConfigSpace.configuration_space.ConfigurationSpace()
- dataset_properties = {'target_type': 'classification', 'signed': True}
- include = {'c': ['multinomial_nb']}
- cs = base._get_base_search_space(cs, dataset_properties,
- exclude, include, pipeline)
- self.assertEqual(len(cs.get_hyperparameter("p0:__choice__").choices),
- 13)
- self.assertEqual(len(cs.get_hyperparameter("p1:__choice__").choices),
- 10)
- self.assertEqual(len(cs.get_hyperparameter("c:__choice__").choices),
- 1)
+ dataset_properties = {"target_type": "classification", "signed": True}
+ include = {"c": ["multinomial_nb"]}
+ cs = base._get_base_search_space(
+ cs, dataset_properties, exclude, include, pipeline
+ )
+ self.assertEqual(len(cs.get_hyperparameter("p0:__choice__").choices), 13)
+ self.assertEqual(len(cs.get_hyperparameter("p1:__choice__").choices), 10)
+ self.assertEqual(len(cs.get_hyperparameter("c:__choice__").choices), 1)
# Mostly combinations of p0 making the data unsigned and p1 not
# changing the values of the data points
# for clause in sorted([str(clause) for clause in cs.forbidden_clauses]):
@@ -60,42 +63,41 @@ def test_get_hyperparameter_configuration_space_3choices(self):
self.assertEqual(64, len(cs.forbidden_clauses))
cs = ConfigSpace.configuration_space.ConfigurationSpace()
- dataset_properties = {'target_type': 'classification', 'signed': True}
+ dataset_properties = {"target_type": "classification", "signed": True}
include = {}
- cs = base._get_base_search_space(cs, dataset_properties,
- exclude, include, pipeline)
- self.assertEqual(len(cs.get_hyperparameter("p0:__choice__").choices),
- 13)
- self.assertEqual(len(cs.get_hyperparameter("p1:__choice__").choices),
- 15)
- self.assertEqual(len(cs.get_hyperparameter("c:__choice__").choices),
- 16)
+ cs = base._get_base_search_space(
+ cs, dataset_properties, exclude, include, pipeline
+ )
+ self.assertEqual(len(cs.get_hyperparameter("p0:__choice__").choices), 13)
+ self.assertEqual(len(cs.get_hyperparameter("p1:__choice__").choices), 15)
+ self.assertEqual(len(cs.get_hyperparameter("c:__choice__").choices), 16)
# for clause in sorted([str(clause) for clause in cs.forbidden_clauses]):
# print(clause)
self.assertEqual(110, len(cs.forbidden_clauses))
cs = ConfigSpace.configuration_space.ConfigurationSpace()
- dataset_properties = {'target_type': 'classification', 'sparse': True}
- cs = base._get_base_search_space(cs, dataset_properties,
- exclude, include, pipeline)
- self.assertEqual(len(cs.get_hyperparameter("p0:__choice__").choices),
- 12)
- self.assertEqual(len(cs.get_hyperparameter("p1:__choice__").choices),
- 15)
+ dataset_properties = {"target_type": "classification", "sparse": True}
+ cs = base._get_base_search_space(
+ cs, dataset_properties, exclude, include, pipeline
+ )
+ self.assertEqual(len(cs.get_hyperparameter("p0:__choice__").choices), 12)
+ self.assertEqual(len(cs.get_hyperparameter("p1:__choice__").choices), 15)
# for clause in sorted([str(clause) for clause in cs.forbidden_clauses]):
# print(clause)
self.assertEqual(419, len(cs.forbidden_clauses))
cs = ConfigSpace.configuration_space.ConfigurationSpace()
- dataset_properties = {'target_type': 'classification',
- 'sparse': True, 'signed': True}
- cs = base._get_base_search_space(cs, dataset_properties,
- exclude, include, pipeline)
-
- self.assertEqual(len(cs.get_hyperparameter("p0:__choice__").choices),
- 12)
- self.assertEqual(len(cs.get_hyperparameter("p1:__choice__").choices),
- 15)
+ dataset_properties = {
+ "target_type": "classification",
+ "sparse": True,
+ "signed": True,
+ }
+ cs = base._get_base_search_space(
+ cs, dataset_properties, exclude, include, pipeline
+ )
+
+ self.assertEqual(len(cs.get_hyperparameter("p0:__choice__").choices), 12)
+ self.assertEqual(len(cs.get_hyperparameter("p1:__choice__").choices), 15)
# Data is guaranteed to be positive in cases like densifier,
# extra_trees_preproc, multinomial_nb -> fewer constraints
# for clause in sorted([str(clause) for clause in cs.forbidden_clauses]):
@@ -123,52 +125,71 @@ def test_init_params_handling(self):
for init_params, expected_init_params in [
({}, {}),
(None, None),
- ({'M:key': 'value'}, {'key': 'value'}),
+ ({"M:key": "value"}, {"key": "value"}),
]:
node = unittest.mock.Mock(
spec=autosklearn.pipeline.components.base.AutoSklearnComponent
)
node.get_hyperparameter_search_space.return_value = cs
- node.key = 'value'
- base.steps = [('M', node)]
- base.set_hyperparameters(cs.sample_configuration(), init_params=init_params)
- self.assertEqual(node.set_hyperparameters.call_args[1]['init_params'],
- expected_init_params)
+ node.key = "value"
+ base.steps = [("M", node)]
+ base.set_hyperparameters(
+ cs.sample_configuration(), init_params=init_params
+ )
+ self.assertEqual(
+ node.set_hyperparameters.call_args[1]["init_params"],
+ expected_init_params,
+ )
# Check for proper exception raising
node = unittest.mock.Mock(
spec=autosklearn.pipeline.components.base.AutoSklearnComponent
)
node.get_hyperparameter_search_space.return_value = cs
- base.steps = [('M', node)]
- with self.assertRaisesRegex(ValueError, "Unsupported argument to init_params"):
- base.set_hyperparameters(cs.sample_configuration(), init_params={'key': 'value'})
+ base.steps = [("M", node)]
+ with self.assertRaisesRegex(
+ ValueError, "Unsupported argument to init_params"
+ ):
+ base.set_hyperparameters(
+ cs.sample_configuration(), init_params={"key": "value"}
+ )
# An invalid node name is passed
- with self.assertRaisesRegex(ValueError, "The current node name specified via key"):
- base.set_hyperparameters(cs.sample_configuration(), init_params={'N:key': 'value'})
+ with self.assertRaisesRegex(
+ ValueError, "The current node name specified via key"
+ ):
+ base.set_hyperparameters(
+ cs.sample_configuration(), init_params={"N:key": "value"}
+ )
# The value was not properly set -- Here it happens because the
# object is a magic mock, calling the method doesn't set a new parameter
with self.assertRaisesRegex(ValueError, "Cannot properly set the pair"):
- base.set_hyperparameters(cs.sample_configuration(), init_params={'M:key': 'value'})
+ base.set_hyperparameters(
+ cs.sample_configuration(), init_params={"M:key": "value"}
+ )
def test_include_exclude_validation(self):
- """
- Makes sure include and exclude arguments are validated and raises expected exception
- on error
+        """Makes sure include and exclude arguments are validated and that the
+        expected exception is raised on error
"""
base = BasePipelineMock()
- dataset_properties = {'target_type': 'classification'}
+ dataset_properties = {"target_type": "classification"}
base.dataset_properties = dataset_properties
- base.steps = [('p0',
- autosklearn.pipeline.components.feature_preprocessing
- .FeaturePreprocessorChoice(dataset_properties)),
- ('p1',
- autosklearn.pipeline.components.feature_preprocessing
- .FeaturePreprocessorChoice(dataset_properties)),
- ('c', autosklearn.pipeline.components.classification
- .ClassifierChoice(dataset_properties))]
+ base.steps = [
+ (
+ "p0",
+ feature_preprocessing.FeaturePreprocessorChoice(dataset_properties),
+ ),
+ (
+ "p1",
+ feature_preprocessing.FeaturePreprocessorChoice(dataset_properties),
+ ),
+ (
+ "c",
+ classification.ClassifierChoice(dataset_properties),
+ ),
+ ]
def assert_value_error(include=None, exclude=None):
base.include = include
@@ -177,21 +198,21 @@ def assert_value_error(include=None, exclude=None):
base._validate_include_exclude_params()
# Same key in include and exclude argument
- assert_value_error(include={'c': ['adaboost']}, exclude={'c': ['sgd']})
+ assert_value_error(include={"c": ["adaboost"]}, exclude={"c": ["sgd"]})
# Invalid key in the exclude argument
- assert_value_error(exclude={'p2': ['pca']})
+ assert_value_error(exclude={"p2": ["pca"]})
# Invalid value type for the key in the include argument
- assert_value_error(include={'c': ('adaboost', 'sgd')}, exclude=None)
+ assert_value_error(include={"c": ("adaboost", "sgd")}, exclude=None)
# Empty list of the key in the include argument
- assert_value_error(include={'c': []})
+ assert_value_error(include={"c": []})
# Invalid component in the list value for the key in the include argument
- assert_value_error(include={'c': ['pca']})
+ assert_value_error(include={"c": ["pca"]})
# Case when all conditions passed for include and exclude
- base.include = {'c': ['adaboost', 'sgd']}
- base.exclude = {'p1': ['pca']}
+ base.include = {"c": ["adaboost", "sgd"]}
+ base.exclude = {"p1": ["pca"]}
self.assertIsNone(base._validate_include_exclude_params())
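
The init_params tests above rely on a node:parameter naming convention: keys prefixed with a step name are routed to that step, and anything that cannot be attributed to a known step is rejected. Here is a hypothetical helper sketching just that routing rule; the names and error message are illustrative, not auto-sklearn's.

def route_init_params(init_params, node_names):
    routed = {name: {} for name in node_names}
    for key, value in (init_params or {}).items():
        node, _, param = key.partition(":")
        if node not in routed or not param:
            raise ValueError(f"Unsupported argument to init_params: {key!r}")
        routed[node][param] = value
    return routed


print(route_init_params({"M:key": "value"}, ["M"]))  # {'M': {'key': 'value'}}
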
diff --git a/test/test_pipeline/test_classification.py b/test/test_pipeline/test_classification.py
index 49267b0fee..c197dd30fc 100644
--- a/test/test_pipeline/test_classification.py
+++ b/test/test_pipeline/test_classification.py
@@ -8,29 +8,37 @@
import unittest
import unittest.mock
-from joblib import Memory
import numpy as np
-
-from sklearn.base import clone
import sklearn.datasets
import sklearn.decomposition
-import sklearn.model_selection
import sklearn.ensemble
+import sklearn.model_selection
import sklearn.svm
-from sklearn.utils.validation import check_is_fitted
-
from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import CategoricalHyperparameter
+from joblib import Memory
+from sklearn.base import clone
+from sklearn.utils.validation import check_is_fitted
-from autosklearn.pipeline.classification import SimpleClassificationPipeline
-from autosklearn.pipeline.components.base import \
- AutoSklearnClassificationAlgorithm, AutoSklearnPreprocessingAlgorithm
-from autosklearn.pipeline.components.base import AutoSklearnComponent, AutoSklearnChoice, _addons
import autosklearn.pipeline.components.classification as classification_components
import autosklearn.pipeline.components.feature_preprocessing as preprocessing_components
+from autosklearn.pipeline.classification import SimpleClassificationPipeline
+from autosklearn.pipeline.components.base import (
+ AutoSklearnChoice,
+ AutoSklearnClassificationAlgorithm,
+ AutoSklearnComponent,
+ AutoSklearnPreprocessingAlgorithm,
+ _addons,
+)
+from autosklearn.pipeline.constants import (
+ DENSE,
+ INPUT,
+ PREDICTIONS,
+ SIGNED_DATA,
+ SPARSE,
+ UNSIGNED_DATA,
+)
from autosklearn.pipeline.util import get_dataset
-from autosklearn.pipeline.constants import \
- DENSE, SPARSE, UNSIGNED_DATA, PREDICTIONS, SIGNED_DATA, INPUT
from test.test_pipeline.ignored_warnings import classifier_warnings, ignore_warnings
@@ -38,16 +46,18 @@
class DummyClassifier(AutoSklearnClassificationAlgorithm):
@staticmethod
def get_properties(dataset_properties=None):
- return {'shortname': 'AB',
- 'name': 'AdaBoost Classifier',
- 'handles_regression': False,
- 'handles_classification': True,
- 'handles_multiclass': True,
- 'handles_multilabel': True,
- 'handles_multioutput': False,
- 'is_deterministic': True,
- 'input': (DENSE, SPARSE, UNSIGNED_DATA),
- 'output': (PREDICTIONS,)}
+ return {
+ "shortname": "AB",
+ "name": "AdaBoost Classifier",
+ "handles_regression": False,
+ "handles_classification": True,
+ "handles_multiclass": True,
+ "handles_multilabel": True,
+ "handles_multioutput": False,
+ "is_deterministic": True,
+ "input": (DENSE, SPARSE, UNSIGNED_DATA),
+ "output": (PREDICTIONS,),
+ }
@staticmethod
def get_hyperparameter_search_space(dataset_properties=None):
@@ -58,16 +68,18 @@ def get_hyperparameter_search_space(dataset_properties=None):
class DummyPreprocessor(AutoSklearnPreprocessingAlgorithm):
@staticmethod
def get_properties(dataset_properties=None):
- return {'shortname': 'AB',
- 'name': 'AdaBoost Classifier',
- 'handles_regression': False,
- 'handles_classification': True,
- 'handles_multiclass': True,
- 'handles_multilabel': True,
- 'handles_multioutput': False,
- 'is_deterministic': True,
- 'input': (DENSE, SPARSE, UNSIGNED_DATA),
- 'output': (INPUT,)}
+ return {
+ "shortname": "AB",
+ "name": "AdaBoost Classifier",
+ "handles_regression": False,
+ "handles_classification": True,
+ "handles_multiclass": True,
+ "handles_multilabel": True,
+ "handles_multioutput": False,
+ "is_deterministic": True,
+ "input": (DENSE, SPARSE, UNSIGNED_DATA),
+ "output": (INPUT,),
+ }
@staticmethod
def get_hyperparameter_search_space(dataset_properties=None):
@@ -81,16 +93,18 @@ def __init__(*args, **kwargs):
@staticmethod
def get_properties(dataset_properties=None):
- return {'shortname': 'AB',
- 'name': 'AdaBoost Classifier',
- 'handles_regression': False,
- 'handles_classification': True,
- 'handles_multiclass': True,
- 'handles_multilabel': True,
- 'handles_multioutput': False,
- 'is_deterministic': True,
- 'input': (DENSE, SPARSE, UNSIGNED_DATA),
- 'output': (INPUT,)}
+ return {
+ "shortname": "AB",
+ "name": "AdaBoost Classifier",
+ "handles_regression": False,
+ "handles_classification": True,
+ "handles_multiclass": True,
+ "handles_multilabel": True,
+ "handles_multioutput": False,
+ "is_deterministic": True,
+ "input": (DENSE, SPARSE, UNSIGNED_DATA),
+ "output": (INPUT,),
+ }
def fit(self, X, y):
raise ValueError("Make sure fit is called")
@@ -116,21 +130,21 @@ def test_io_dict(self):
if classifiers[c] == classification_components.ClassifierChoice:
continue
props = classifiers[c].get_properties()
- self.assertIn('input', props)
- self.assertIn('output', props)
- inp = props['input']
- output = props['output']
+ self.assertIn("input", props)
+ self.assertIn("output", props)
+ inp = props["input"]
+ output = props["output"]
self.assertIsInstance(inp, tuple)
self.assertIsInstance(output, tuple)
for i in inp:
self.assertIn(i, (SPARSE, DENSE, SIGNED_DATA, UNSIGNED_DATA))
self.assertEqual(output, (PREDICTIONS,))
- self.assertIn('handles_regression', props)
- self.assertFalse(props['handles_regression'])
- self.assertIn('handles_classification', props)
- self.assertIn('handles_multiclass', props)
- self.assertIn('handles_multilabel', props)
+ self.assertIn("handles_regression", props)
+ self.assertFalse(props["handles_regression"])
+ self.assertIn("handles_classification", props)
+ self.assertIn("handles_multiclass", props)
+ self.assertIn("handles_multilabel", props)
def test_find_classifiers(self):
"""Test that the classifier components can be found
@@ -143,9 +157,11 @@ def test_find_classifiers(self):
classifiers = classification_components._classifiers
self.assertGreaterEqual(len(classifiers), 2)
for key in classifiers:
- if hasattr(classifiers[key], 'get_components'):
+ if hasattr(classifiers[key], "get_components"):
continue
- self.assertIn(AutoSklearnClassificationAlgorithm, classifiers[key].__bases__)
+ self.assertIn(
+ AutoSklearnClassificationAlgorithm, classifiers[key].__bases__
+ )
def test_find_preprocessors(self):
"""Test that preproccesor components can be found
@@ -156,20 +172,23 @@ def test_find_preprocessors(self):
        * They inherit from AutoSklearnPreprocessingAlgorithm
"""
preprocessors = preprocessing_components._preprocessors
-        self.assertGreaterEqual(len(preprocessors),  1)
+        self.assertGreaterEqual(len(preprocessors), 1)
for key in preprocessors:
- if hasattr(preprocessors[key], 'get_components'):
+ if hasattr(preprocessors[key], "get_components"):
continue
- self.assertIn(AutoSklearnPreprocessingAlgorithm, preprocessors[key].__bases__)
+ self.assertIn(
+ AutoSklearnPreprocessingAlgorithm, preprocessors[key].__bases__
+ )
def test_default_configuration(self):
"""Test that seeded SimpleClassificaitonPipeline returns good results on iris
Expects
-------
- * The performance of configuration with fixed seed gets above 96% accuracy on iris
+ * The performance of configuration with fixed seed gets above 96% accuracy
+ on iris
"""
- X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris')
+ X_train, Y_train, X_test, Y_test = get_dataset(dataset="iris")
auto = SimpleClassificationPipeline(random_state=1)
@@ -190,11 +209,12 @@ def test_default_configuration_multilabel(self):
        * The performance of a random configuration gets above 96% on a multilabel
version of iris
"""
- X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris', make_multilabel=True)
+ X_train, Y_train, X_test, Y_test = get_dataset(
+ dataset="iris", make_multilabel=True
+ )
classifier = SimpleClassificationPipeline(
- dataset_properties={'multilabel': True},
- random_state=0
+ dataset_properties={"multilabel": True}, random_state=0
)
cs = classifier.get_hyperparameter_search_space()
@@ -218,14 +238,14 @@ def test_default_configuration_iterative_fit(self):
* Random forest pipeline can be fit iteratively
* Test that its number of estimators is equal to the iteration count
"""
- X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris')
+ X_train, Y_train, X_test, Y_test = get_dataset(dataset="iris")
classifier = SimpleClassificationPipeline(
include={
- 'classifier': ['random_forest'],
- 'feature_preprocessor': ['no_preprocessing']
+ "classifier": ["random_forest"],
+ "feature_preprocessor": ["no_preprocessing"],
},
- random_state=0
+ random_state=0,
)
classifier.fit_transformer(X_train, Y_train)
@@ -256,9 +276,7 @@ def test_multilabel(self):
* All configurations should fit, predict and predict_proba successfully
"""
cache = Memory(location=tempfile.gettempdir())
- cached_func = cache.cache(
- sklearn.datasets.make_multilabel_classification
- )
+ cached_func = cache.cache(sklearn.datasets.make_multilabel_classification)
X, Y = cached_func(
n_samples=150,
n_features=20,
@@ -269,14 +287,16 @@ def test_multilabel(self):
sparse=False,
return_indicator=True,
return_distributions=False,
- random_state=1
+ random_state=1,
)
data = {
- 'X_train': X[:100, :],
- 'Y_train': Y[:100, :],
- 'X_test': X[101:, :],
- 'Y_test': Y[101:, ]
+ "X_train": X[:100, :],
+ "Y_train": Y[:100, :],
+ "X_test": X[101:, :],
+ "Y_test": Y[
+ 101:,
+ ],
}
pipeline = SimpleClassificationPipeline(dataset_properties={"multilabel": True})
@@ -301,12 +321,14 @@ def test_configurations_signed_data(self):
-------
* All configurations should fit, predict and predict_proba successfully
"""
- dataset_properties = {'signed': True}
+ dataset_properties = {"signed": True}
cls = SimpleClassificationPipeline(dataset_properties=dataset_properties)
cs = cls.get_hyperparameter_search_space()
- self._test_configurations(configurations_space=cs, dataset_properties=dataset_properties)
+ self._test_configurations(
+ configurations_space=cs, dataset_properties=dataset_properties
+ )
def test_configurations_sparse(self):
"""Tests a non-seeded random set of configurations with sparse data
@@ -315,7 +337,7 @@ def test_configurations_sparse(self):
-------
* All configurations should fit, predict and predict_proba successfully
"""
- pipeline = SimpleClassificationPipeline(dataset_properties={'sparse': True})
+ pipeline = SimpleClassificationPipeline(dataset_properties={"sparse": True})
cs = pipeline.get_hyperparameter_search_space()
self._test_configurations(configurations_space=cs, make_sparse=True)
@@ -330,41 +352,89 @@ def test_configurations_categorical_data(self):
* All configurations should fit, predict and predict_proba successfully
"""
pipeline = SimpleClassificationPipeline(
- dataset_properties={'sparse': False},
+ dataset_properties={"sparse": False},
include={
- 'feature_preprocessor': ['no_preprocessing'],
- 'classifier': ['sgd', 'adaboost']
- }
+ "feature_preprocessor": ["no_preprocessing"],
+ "classifier": ["sgd", "adaboost"],
+ },
)
cs = pipeline.get_hyperparameter_search_space()
categorical_columns = [
- True, True, True, False, False, True, True, True, False, True, True, True, True,
- True, True, True, True, True, True, True, True, True, True, True, True, True,
- True, True, True, True, True, True, False, False, False, True, True, True
+ True,
+ True,
+ True,
+ False,
+ False,
+ True,
+ True,
+ True,
+ False,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ False,
+ False,
+ False,
+ True,
+ True,
+ True,
]
categorical = {
- i: 'categorical' if is_categorical else 'numerical'
+ i: "categorical" if is_categorical else "numerical"
for i, is_categorical in enumerate(categorical_columns)
}
here = os.path.dirname(__file__)
- dataset_path = os.path.join(here, "components", "data_preprocessing", "dataset.pkl")
+ dataset_path = os.path.join(
+ here, "components", "data_preprocessing", "dataset.pkl"
+ )
X = np.loadtxt(dataset_path)
y = X[:, -1].copy()
X = X[:, :-1]
- X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(X, y)
+ X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(
+ X, y
+ )
- data = {'X_train': X_train, 'Y_train': Y_train, 'X_test': X_test, 'Y_test': Y_test}
+ data = {
+ "X_train": X_train,
+ "Y_train": Y_train,
+ "X_test": X_test,
+ "Y_test": Y_test,
+ }
- init_params = {'data_preprocessor:feat_type': categorical}
+ init_params = {"data_preprocessor:feat_type": categorical}
- self._test_configurations(configurations_space=cs, dataset=data, init_params=init_params)
+ self._test_configurations(
+ configurations_space=cs, dataset=data, init_params=init_params
+ )
- @unittest.mock.patch('autosklearn.pipeline.components.data_preprocessing'
- '.DataPreprocessorChoice.set_hyperparameters')
+ @unittest.mock.patch(
+ "autosklearn.pipeline.components.data_preprocessing"
+ ".DataPreprocessorChoice.set_hyperparameters"
+ )
def test_categorical_passed_to_one_hot_encoder(self, ohe_mock):
"""Test that the feat_types arg is passed to the OneHotEncoder
@@ -379,36 +449,38 @@ def test_categorical_passed_to_one_hot_encoder(self, ohe_mock):
# Mock the _check_init_params_honored as there is no object created,
# _check_init_params_honored will fail as a datapreprocessor was never created
- with unittest.mock.patch('autosklearn.pipeline.classification.SimpleClassificationPipeline'
- '._check_init_params_honored'):
+ with unittest.mock.patch(
+ "autosklearn.pipeline.classification.SimpleClassificationPipeline"
+ "._check_init_params_honored"
+ ):
# Check through construction
- feat_types = {0: 'categorical', 1: 'numerical'}
+ feat_types = {0: "categorical", 1: "numerical"}
cls = SimpleClassificationPipeline(
- init_params={'data_preprocessor:feat_type': feat_types}
+ init_params={"data_preprocessor:feat_type": feat_types}
)
- init_args = ohe_mock.call_args[1]['init_params']
- self.assertEqual(init_args, {'feat_type': feat_types})
+ init_args = ohe_mock.call_args[1]["init_params"]
+ self.assertEqual(init_args, {"feat_type": feat_types})
# Check through `set_hyperparameters`
- feat_types = {0: 'categorical', 1: 'categorical', 2: 'numerical'}
+ feat_types = {0: "categorical", 1: "categorical", 2: "numerical"}
default = cls.get_hyperparameter_search_space().get_default_configuration()
cls.set_hyperparameters(
configuration=default,
- init_params={'data_preprocessor:feat_type': feat_types},
+ init_params={"data_preprocessor:feat_type": feat_types},
)
- init_args = ohe_mock.call_args[1]['init_params']
- self.assertEqual(init_args, {'feat_type': feat_types})
+ init_args = ohe_mock.call_args[1]["init_params"]
+ self.assertEqual(init_args, {"feat_type": feat_types})
def _test_configurations(
self,
configurations_space: ConfigurationSpace,
make_sparse: bool = False,
- dataset: Union[str, Dict[str, Any]] = 'digits',
+ dataset: Union[str, Dict[str, Any]] = "digits",
init_params: Dict[str, Any] = None,
dataset_properties: Dict[str, Any] = None,
n_samples: int = 10,
@@ -448,53 +520,55 @@ def _test_configurations(
config._populate_values()
# Restrict configurations which could take too long on travis-ci
- restrictions = {'classifier:passive_aggressive:n_iter': 5,
- 'classifier:sgd:n_iter': 5,
- 'classifier:adaboost:n_estimators': 50,
- 'classifier:adaboost:max_depth': 1,
- 'feature_preprocessor:kernel_pca:n_components': 10,
- 'feature_preprocessor:kitchen_sinks:n_components': 50,
- 'classifier:proj_logit:max_epochs': 1,
- 'classifier:libsvm_svc:degree': 2,
- 'regressor:libsvm_svr:degree': 2,
- 'feature_preprocessor:truncatedSVD:target_dim': 10,
- 'feature_preprocessor:polynomial:degree': 2,
- 'classifier:lda:n_components': 10,
- 'feature_preprocessor:nystroem_sampler:n_components': 50,
- 'feature_preprocessor:feature_agglomeration:n_clusters': 2,
- 'classifier:gradient_boosting:max_leaf_nodes': 64}
-
- config._values.update({
- param: value
- for param, value in restrictions.items()
- if param in config and config[param] is not None
- })
+ restrictions = {
+ "classifier:passive_aggressive:n_iter": 5,
+ "classifier:sgd:n_iter": 5,
+ "classifier:adaboost:n_estimators": 50,
+ "classifier:adaboost:max_depth": 1,
+ "feature_preprocessor:kernel_pca:n_components": 10,
+ "feature_preprocessor:kitchen_sinks:n_components": 50,
+ "classifier:proj_logit:max_epochs": 1,
+ "classifier:libsvm_svc:degree": 2,
+ "regressor:libsvm_svr:degree": 2,
+ "feature_preprocessor:truncatedSVD:target_dim": 10,
+ "feature_preprocessor:polynomial:degree": 2,
+ "classifier:lda:n_components": 10,
+ "feature_preprocessor:nystroem_sampler:n_components": 50,
+ "feature_preprocessor:feature_agglomeration:n_clusters": 2,
+ "classifier:gradient_boosting:max_leaf_nodes": 64,
+ }
+
+ config._values.update(
+ {
+ param: value
+ for param, value in restrictions.items()
+ if param in config and config[param] is not None
+ }
+ )
if isinstance(dataset, str):
X_train, Y_train, X_test, Y_test = get_dataset(
- dataset=dataset,
- make_sparse=make_sparse,
- add_NaNs=True
+ dataset=dataset, make_sparse=make_sparse, add_NaNs=True
)
else:
- X_train = dataset['X_train'].copy()
- Y_train = dataset['Y_train'].copy()
- X_test = dataset['X_test'].copy()
- dataset['Y_test'].copy()
+ X_train = dataset["X_train"].copy()
+ Y_train = dataset["Y_train"].copy()
+ X_test = dataset["X_test"].copy()
+ dataset["Y_test"].copy()
init_params_ = copy.deepcopy(init_params)
cls = SimpleClassificationPipeline(
- dataset_properties=dataset_properties,
- init_params=init_params_
+ dataset_properties=dataset_properties, init_params=init_params_
)
cls.set_hyperparameters(config, init_params=init_params_)
# First make sure that for this configuration, setting the parameters
# does not mistakenly set the estimator as fitted
for name, step in cls.named_steps.items():
- with self.assertRaisesRegex(sklearn.exceptions.NotFittedError,
- "instance is not fitted yet"):
+ with self.assertRaisesRegex(
+ sklearn.exceptions.NotFittedError, "instance is not fitted yet"
+ ):
check_is_fitted(step)
try:
@@ -526,15 +600,17 @@ def _test_configurations(
continue
elif "Numerical problems in QDA" in e.args[0]:
continue
- elif 'Bug in scikit-learn' in e.args[0]:
+ elif "Bug in scikit-learn" in e.args[0]:
continue
- elif 'The condensed distance matrix must contain only finite ' \
- 'values.' in e.args[0]:
+ elif (
+ "The condensed distance matrix must contain only finite "
+ "values." in e.args[0]
+ ):
continue
- elif 'Internal work array size computation failed' in e.args[0]:
+ elif "Internal work array size computation failed" in e.args[0]:
continue
# Assumed to be caused by knn with preprocessor fast_ica with whiten
- elif 'Input contains NaN, infinity or a value too large' in e.args[0]:
+ elif "Input contains NaN, infinity or a value too large" in e.args[0]:
continue
else:
e.args += (f"config={config}",)
@@ -581,14 +657,18 @@ def test_get_hyperparameter_search_space(self):
cs = pipeline.get_hyperparameter_search_space()
self.assertIsInstance(cs, ConfigurationSpace)
- rescale_param = 'data_preprocessor:feature_type:numerical_transformer:rescaling:__choice__'
+ rescale_param = (
+ "data_preprocessor:feature_type:numerical_transformer:rescaling:__choice__"
+ )
n_choices = len(cs.get_hyperparameter(rescale_param).choices)
self.assertEqual(n_choices, 7)
- n_classifiers = len(cs.get_hyperparameter('classifier:__choice__').choices)
+ n_classifiers = len(cs.get_hyperparameter("classifier:__choice__").choices)
self.assertEqual(n_classifiers, 16)
- n_preprocessors = len(cs.get_hyperparameter('feature_preprocessor:__choice__').choices)
+ n_preprocessors = len(
+ cs.get_hyperparameter("feature_preprocessor:__choice__").choices
+ )
self.assertEqual(n_preprocessors, 13)
hyperparameters = cs.get_hyperparameters()
@@ -610,69 +690,71 @@ def test_get_hyperparameter_search_space_include_exclude_models(self):
Expects
-------
- * Including a classifier choice has pipeline give back matching choice
- * Excluding a classifier choice means it won't show up in the hyperparameter space
+ * Including a classifier has pipeline give back matching choice
+ * Excluding a classifier means it won't show up in the hyperparameter space
* Including a feature preprocessor has pipeline give back matching choice
- * Excluding a feature preprocessor means it won't show up in the hyperparameter space
+ * Excluding a feature preprocessor means it won't show up in the
+ hyperparameter space
"""
# include a classifier choice
- pipeline = SimpleClassificationPipeline(include={'classifier': ['libsvm_svc']})
+ pipeline = SimpleClassificationPipeline(include={"classifier": ["libsvm_svc"]})
cs = pipeline.get_hyperparameter_search_space()
- expected = CategoricalHyperparameter('classifier:__choice__', ['libsvm_svc'])
- returned = cs.get_hyperparameter('classifier:__choice__')
+ expected = CategoricalHyperparameter("classifier:__choice__", ["libsvm_svc"])
+ returned = cs.get_hyperparameter("classifier:__choice__")
self.assertEqual(returned, expected)
# exclude a classifier choice
- pipeline = SimpleClassificationPipeline(exclude={'classifier': ['libsvm_svc']})
+ pipeline = SimpleClassificationPipeline(exclude={"classifier": ["libsvm_svc"]})
cs = pipeline.get_hyperparameter_search_space()
- self.assertNotIn('libsvm_svc', str(cs))
+ self.assertNotIn("libsvm_svc", str(cs))
# include a feature preprocessor
pipeline = SimpleClassificationPipeline(
- include={'feature_preprocessor': ['select_percentile_classification']}
+ include={"feature_preprocessor": ["select_percentile_classification"]}
)
cs = pipeline.get_hyperparameter_search_space()
- returned = cs.get_hyperparameter('feature_preprocessor:__choice__')
+ returned = cs.get_hyperparameter("feature_preprocessor:__choice__")
expected = CategoricalHyperparameter(
- 'feature_preprocessor:__choice__',
- ['select_percentile_classification']
+ "feature_preprocessor:__choice__", ["select_percentile_classification"]
)
self.assertEqual(returned, expected)
# exclude a feature preprocessor
pipeline = SimpleClassificationPipeline(
- exclude={'feature_preprocessor': ['select_percentile_classification']}
+ exclude={"feature_preprocessor": ["select_percentile_classification"]}
)
cs = pipeline.get_hyperparameter_search_space()
- self.assertNotIn('select_percentile_classification', str(cs))
+ self.assertNotIn("select_percentile_classification", str(cs))
- def test_get_hyperparameter_search_space_preprocessor_contradicts_default_classifier(self):
+ def test_get_hyperparameter_search_space_preprocessor_contradicts_default(
+ self,
+ ):
"""Test that the default classifier gets updated based on the legal feature
preprocessors that come before.
Expects
-------
- * With 'densifier' as only legal feature_preprocessor, 'qda' is default classifier
- * With 'nystroem_sampler' as only legal feature_preprocessor, 'sgd' is default classifier
+ * With 'densifier' as only legal feature_preprocessor, 'qda' is default
+ * With 'nystroem_sampler' as only legal feature_preprocessor, 'sgd' is default
"""
pipeline = SimpleClassificationPipeline(
- include={'feature_preprocessor': ['densifier']},
- dataset_properties={'sparse': True}
+ include={"feature_preprocessor": ["densifier"]},
+ dataset_properties={"sparse": True},
)
cs = pipeline.get_hyperparameter_search_space()
- default_choice = cs.get_hyperparameter('classifier:__choice__').default_value
- self.assertEqual(default_choice, 'qda')
+ default_choice = cs.get_hyperparameter("classifier:__choice__").default_value
+ self.assertEqual(default_choice, "qda")
pipeline = SimpleClassificationPipeline(
- include={'feature_preprocessor': ['nystroem_sampler']}
+ include={"feature_preprocessor": ["nystroem_sampler"]}
)
cs = pipeline.get_hyperparameter_search_space()
- default_choice = cs.get_hyperparameter('classifier:__choice__').default_value
- self.assertEqual(default_choice, 'sgd')
+ default_choice = cs.get_hyperparameter("classifier:__choice__").default_value
+ self.assertEqual(default_choice, "sgd")
def test_get_hyperparameter_search_space_only_forbidden_combinations(self):
"""Test that invalid pipeline configurations raise errors
@@ -686,43 +768,48 @@ def test_get_hyperparameter_search_space_only_forbidden_combinations(self):
with self.assertRaisesRegex(AssertionError, "No valid pipeline found."):
SimpleClassificationPipeline(
include={
- 'classifier': ['multinomial_nb'],
- 'feature_preprocessor': ['pca']
+ "classifier": ["multinomial_nb"],
+ "feature_preprocessor": ["pca"],
},
- dataset_properties={'sparse': True}
+ dataset_properties={"sparse": True},
)
- with self.assertRaisesRegex(ValueError, "Cannot find a legal default configuration."):
+ with self.assertRaisesRegex(
+ ValueError, "Cannot find a legal default configuration."
+ ):
SimpleClassificationPipeline(
include={
- 'classifier': ['liblinear_svc'],
- 'feature_preprocessor': ['densifier']
+ "classifier": ["liblinear_svc"],
+ "feature_preprocessor": ["densifier"],
},
- dataset_properties={'sparse': True}
+ dataset_properties={"sparse": True},
)
@unittest.skip("Wait until ConfigSpace is fixed.")
def test_get_hyperparameter_search_space_dataset_properties(self):
cs_mc = SimpleClassificationPipeline.get_hyperparameter_search_space(
- dataset_properties={'multiclass': True}
+ dataset_properties={"multiclass": True}
)
- self.assertNotIn('bernoulli_nb', str(cs_mc))
+ self.assertNotIn("bernoulli_nb", str(cs_mc))
cs_ml = SimpleClassificationPipeline.get_hyperparameter_search_space(
- dataset_properties={'multilabel': True})
- self.assertNotIn('k_nearest_neighbors', str(cs_ml))
- self.assertNotIn('liblinear', str(cs_ml))
- self.assertNotIn('libsvm_svc', str(cs_ml))
- self.assertNotIn('sgd', str(cs_ml))
+ dataset_properties={"multilabel": True}
+ )
+ self.assertNotIn("k_nearest_neighbors", str(cs_ml))
+ self.assertNotIn("liblinear", str(cs_ml))
+ self.assertNotIn("libsvm_svc", str(cs_ml))
+ self.assertNotIn("sgd", str(cs_ml))
cs_sp = SimpleClassificationPipeline.get_hyperparameter_search_space(
- dataset_properties={'sparse': True})
- self.assertIn('extra_trees', str(cs_sp))
- self.assertIn('gradient_boosting', str(cs_sp))
- self.assertIn('random_forest', str(cs_sp))
+ dataset_properties={"sparse": True}
+ )
+ self.assertIn("extra_trees", str(cs_sp))
+ self.assertIn("gradient_boosting", str(cs_sp))
+ self.assertIn("random_forest", str(cs_sp))
cs_mc_ml = SimpleClassificationPipeline.get_hyperparameter_search_space(
- dataset_properties={'multilabel': True, 'multiclass': True})
+ dataset_properties={"multilabel": True, "multiclass": True}
+ )
self.assertEqual(cs_ml, cs_mc_ml)
def test_predict_batched(self):
@@ -733,12 +820,13 @@ def test_predict_batched(self):
-------
* Should expect the output shape to match that of the digits dataset
* Should expect a fixed call count each test run
- * Should expect predict_proba with `batches` and predict_proba perform near identically
+ * Should expect predict_proba with `batches` and predict_proba
+ perform near identically
"""
- cls = SimpleClassificationPipeline(include={'classifier': ['sgd']})
+ cls = SimpleClassificationPipeline(include={"classifier": ["sgd"]})
# Multiclass
- X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
+ X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits")
with ignore_warnings(classifier_warnings):
cls.fit(X_train, Y_train)
@@ -764,15 +852,17 @@ def test_predict_batched_sparse(self):
-------
* Should expect the output shape to match that of the digits dataset
* Should expect a fixed call count each test run
- * Should expect predict_proba with `batches` and predict_proba perform near identically
+ * Should expect predict_proba with `batches` and predict_proba
+ perform near identically
"""
cls = SimpleClassificationPipeline(
- dataset_properties={'sparse': True},
- include={'classifier': ['sgd']}
+ dataset_properties={"sparse": True}, include={"classifier": ["sgd"]}
)
# Multiclass
- X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits', make_sparse=True)
+ X_train, Y_train, X_test, Y_test = get_dataset(
+ dataset="digits", make_sparse=True
+ )
with ignore_warnings(classifier_warnings):
cls.fit(X_train, Y_train)
@@ -797,11 +887,12 @@ def test_predict_proba_batched(self):
-------
* Should expect the output shape to match that of the digits dataset
* Should expect a fixed call count each test run
- * Should expect predict_proba with `batches` and predict_proba perform near identically
+ * Should expect predict_proba with `batches` and predict_proba
+ perform near identically
"""
# Multiclass
- cls = SimpleClassificationPipeline(include={'classifier': ['sgd']})
- X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
+ cls = SimpleClassificationPipeline(include={"classifier": ["sgd"]})
+ X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits")
with ignore_warnings(classifier_warnings):
cls.fit(X_train, Y_train)
@@ -820,10 +911,11 @@ def test_predict_proba_batched(self):
np.testing.assert_array_almost_equal(prediction_, prediction)
# Multilabel
- cls = SimpleClassificationPipeline(include={'classifier': ['lda']})
- X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
- Y_train = np.array(list([(list([1 if i != y else 0 for i in range(10)]))
- for y in Y_train]))
+ cls = SimpleClassificationPipeline(include={"classifier": ["lda"]})
+ X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits")
+ Y_train = np.array(
+ list([(list([1 if i != y else 0 for i in range(10)])) for y in Y_train])
+ )
with ignore_warnings(classifier_warnings):
cls.fit(X_train, Y_train)
@@ -849,15 +941,18 @@ def test_predict_proba_batched_sparse(self):
-------
* Should expect the output shape to match that of the digits dataset
* Should expect a fixed call count each test run
- * Should expect predict_proba with `batches` and predict_proba perform near identically
+ * Should expect predict_proba with `batches` and predict_proba
+ perform near identically
"""
cls = SimpleClassificationPipeline(
- dataset_properties={'sparse': True, 'multiclass': True},
- include={'classifier': ['sgd']}
+ dataset_properties={"sparse": True, "multiclass": True},
+ include={"classifier": ["sgd"]},
)
# Multiclass
- X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits', make_sparse=True)
+ X_train, Y_train, X_test, Y_test = get_dataset(
+ dataset="digits", make_sparse=True
+ )
X_test_ = X_test.copy()
with ignore_warnings(classifier_warnings):
@@ -877,10 +972,12 @@ def test_predict_proba_batched_sparse(self):
# Multilabel
cls = SimpleClassificationPipeline(
- dataset_properties={'sparse': True, 'multilabel': True},
- include={'classifier': ['lda']}
+ dataset_properties={"sparse": True, "multilabel": True},
+ include={"classifier": ["lda"]},
+ )
+ X_train, Y_train, X_test, Y_test = get_dataset(
+ dataset="digits", make_sparse=True
)
- X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits', make_sparse=True)
X_test_ = X_test.copy()
Y_train = np.array([[1 if i != y else 0 for i in range(10)] for y in Y_train])
@@ -909,7 +1006,7 @@ def test_pipeline_clonability(self):
        * The cloned object can be constructed from these params
* The reconstructed clone and the original have the same param values
"""
- X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris')
+ X_train, Y_train, X_test, Y_test = get_dataset(dataset="iris")
auto = SimpleClassificationPipeline()
@@ -952,18 +1049,24 @@ def test_add_classifier(self):
* There should be 1 component after adding a classifier
* The classifier should be in the search space of the Pipeline after being added
"""
- self.assertEqual(len(classification_components.additional_components.components), 0)
- self.assertEqual(len(_addons['classification'].components), 0)
+ self.assertEqual(
+ len(classification_components.additional_components.components), 0
+ )
+ self.assertEqual(len(_addons["classification"].components), 0)
classification_components.add_classifier(DummyClassifier)
- self.assertEqual(len(classification_components.additional_components.components), 1)
- self.assertEqual(len(_addons['classification'].components), 1)
+ self.assertEqual(
+ len(classification_components.additional_components.components), 1
+ )
+ self.assertEqual(len(_addons["classification"].components), 1)
cs = SimpleClassificationPipeline().get_hyperparameter_search_space()
- self.assertIn('DummyClassifier', str(cs))
+ self.assertIn("DummyClassifier", str(cs))
- del classification_components.additional_components.components['DummyClassifier']
+ del classification_components.additional_components.components[
+ "DummyClassifier"
+ ]
def test_add_preprocessor(self):
"""Test that preprocessors can be added
@@ -972,22 +1075,30 @@ def test_add_preprocessor(self):
-------
* There should be 0 components initially
* There should be 1 component after adding a preprocessor
- * The preprocessor should be in the search space of the Pipeline after being added
+        * The preprocessor is in the search space of the Pipeline after being added
"""
- self.assertEqual(len(preprocessing_components.additional_components.components), 0)
- self.assertEqual(len(_addons['feature_preprocessing'].components), 0)
+ self.assertEqual(
+ len(preprocessing_components.additional_components.components), 0
+ )
+ self.assertEqual(len(_addons["feature_preprocessing"].components), 0)
preprocessing_components.add_preprocessor(DummyPreprocessor)
- self.assertEqual(len(preprocessing_components.additional_components.components), 1)
- self.assertEqual(len(_addons['feature_preprocessing'].components), 1)
+ self.assertEqual(
+ len(preprocessing_components.additional_components.components), 1
+ )
+ self.assertEqual(len(_addons["feature_preprocessing"].components), 1)
cs = SimpleClassificationPipeline().get_hyperparameter_search_space()
- self.assertIn('DummyPreprocessor', str(cs))
+ self.assertIn("DummyPreprocessor", str(cs))
- del preprocessing_components.additional_components.components['DummyPreprocessor']
+ del preprocessing_components.additional_components.components[
+ "DummyPreprocessor"
+ ]
- def _test_set_hyperparameter_choice(self, expected_key, implementation, config_dict):
+ def _test_set_hyperparameter_choice(
+ self, expected_key, implementation, config_dict
+ ):
"""Given a configuration in config, this procedure makes sure that the given
implementation, which should be a Choice component, honors the type of the
object, and any hyperparameter associated to it
@@ -1001,14 +1112,16 @@ def _test_set_hyperparameter_choice(self, expected_key, implementation, config_d
# Are there further hyperparams?
# A choice component might have attribute requirements that we need to check
- expected_sub_key = expected_key.replace(':__choice__', ':') + implementation_type
+ expected_sub_key = (
+ expected_key.replace(":__choice__", ":") + implementation_type
+ )
expected_attributes = {}
- if 'data_preprocessor:__choice__' in expected_key:
+ if "data_preprocessor:__choice__" in expected_key:
# We have to check both the numerical and categorical
to_check = {
- 'numerical_transformer': implementation.choice.numer_ppl.named_steps,
- 'categorical_transformer': implementation.choice.categ_ppl.named_steps,
- 'text_transformer': implementation.choice.txt_ppl.named_steps,
+ "numerical_transformer": implementation.choice.numer_ppl.named_steps,
+ "categorical_transformer": implementation.choice.categ_ppl.named_steps,
+ "text_transformer": implementation.choice.txt_ppl.named_steps,
}
for data_type, pipeline in to_check.items():
@@ -1016,8 +1129,7 @@ def _test_set_hyperparameter_choice(self, expected_key, implementation, config_d
# If it is a Choice, make sure it is the correct one!
if isinstance(sub_step, AutoSklearnChoice):
key = "data_preprocessor:feature_type:{}:{}:__choice__".format(
- data_type,
- sub_name
+ data_type, sub_name
)
keys_checked.extend(
self._test_set_hyperparameter_choice(
@@ -1029,10 +1141,10 @@ def _test_set_hyperparameter_choice(self, expected_key, implementation, config_d
keys_checked.extend(
self._test_set_hyperparameter_component(
"data_preprocessor:feature_type:{}:{}".format(
- data_type,
- sub_name
+ data_type, sub_name
),
- sub_step, config_dict
+ sub_step,
+ config_dict,
)
)
else:
@@ -1041,7 +1153,7 @@ def _test_set_hyperparameter_choice(self, expected_key, implementation, config_d
for key, value in config_dict.items():
if key != expected_key and expected_sub_key in key:
- expected_attributes[key.split(':')[-1]] = value
+ expected_attributes[key.split(":")[-1]] = value
keys_checked.append(key)
if expected_attributes:
@@ -1053,7 +1165,9 @@ def _test_set_hyperparameter_choice(self, expected_key, implementation, config_d
return keys_checked
- def _test_set_hyperparameter_component(self, expected_key, implementation, config_dict):
+ def _test_set_hyperparameter_component(
+ self, expected_key, implementation, config_dict
+ ):
"""
Given a configuration in config, this procedure makes sure that
the given implementation, which should be a autosklearn component, honors
@@ -1067,15 +1181,14 @@ def _test_set_hyperparameter_component(self, expected_key, implementation, confi
for key, value in config_dict.items():
if expected_key in key:
keys_checked.append(key)
- key = key.replace(expected_key + ':', '')
- if ':' in key:
- raise ValueError("This utility should only be called with a "
- "matching string that produces leaf configurations, "
- "that is no further colons are expected, yet key={}"
- "".format(
- key
- )
- )
+ key = key.replace(expected_key + ":", "")
+ if ":" in key:
+ raise ValueError(
+ "This utility should only be called with a "
+ "matching string that produces leaf configurations, "
+ "that is no further colons are expected, yet key={}"
+ "".format(key)
+ )
expected_attributes[key] = value
# self.assertDictContainsSubset(expected_attributes, attributes)
# Cannot check the whole dictionary, just names, as some
@@ -1097,12 +1210,17 @@ def test_set_hyperparameters_honors_configuration(self):
"""
random_state = 1
all_combinations = list(itertools.product([True, False], repeat=4))
- for sparse, multilabel, signed, multiclass, in all_combinations:
+ for (
+ sparse,
+ multilabel,
+ signed,
+ multiclass,
+ ) in all_combinations:
dataset_properties = {
- 'sparse': sparse,
- 'multilabel': multilabel,
- 'multiclass': multiclass,
- 'signed': signed,
+ "sparse": sparse,
+ "multilabel": multilabel,
+ "multiclass": multiclass,
+ "signed": signed,
}
cls = SimpleClassificationPipeline(
random_state=random_state,
@@ -1121,36 +1239,37 @@ def test_set_hyperparameters_honors_configuration(self):
keys_checked = []
for name, step in cls.named_steps.items():
- if name == 'data_preprocessor':
+ if name == "data_preprocessor":
keys_checked.extend(
self._test_set_hyperparameter_choice(
- 'data_preprocessor:__choice__', step, config_dict
+ "data_preprocessor:__choice__", step, config_dict
)
)
self.assertEqual(step.random_state, random_state)
- elif name == 'balancing':
+ elif name == "balancing":
keys_checked.extend(
self._test_set_hyperparameter_component(
- 'balancing',
- step, config_dict
+ "balancing", step, config_dict
)
)
- elif name == 'feature_preprocessor':
+ elif name == "feature_preprocessor":
keys_checked.extend(
self._test_set_hyperparameter_choice(
- 'feature_preprocessor:__choice__', step, config_dict
+ "feature_preprocessor:__choice__", step, config_dict
)
)
self.assertEqual(step.random_state, random_state)
- elif name == 'classifier':
+ elif name == "classifier":
keys_checked.extend(
self._test_set_hyperparameter_choice(
- 'classifier:__choice__', step, config_dict
+ "classifier:__choice__", step, config_dict
)
)
self.assertEqual(step.random_state, random_state)
else:
- raise ValueError("Found another type of step! Need to update this check")
+ raise ValueError(
+ "Found another type of step! Need to update this check"
+ )
# Make sure we checked the whole configuration
self.assertSetEqual(set(config_dict.keys()), set(keys_checked))
@@ -1162,18 +1281,18 @@ def test_fit_instantiates_component(self):
# We reduce the search space as forbidden clauses prevent to instantiate
# the user defined preprocessor manually
- cls = SimpleClassificationPipeline(
- include={'classifier': ['random_forest']}
- )
+ cls = SimpleClassificationPipeline(include={"classifier": ["random_forest"]})
cs = cls.get_hyperparameter_search_space()
- self.assertIn('CrashPreprocessor', str(cs))
+ self.assertIn("CrashPreprocessor", str(cs))
config = cs.sample_configuration()
try:
- config['feature_preprocessor:__choice__'] = 'CrashPreprocessor'
+ config["feature_preprocessor:__choice__"] = "CrashPreprocessor"
except Exception as e:
# In case of failure clean up the components and print enough information
# to clean up with check in the future
- del preprocessing_components.additional_components.components['CrashPreprocessor']
+ del preprocessing_components.additional_components.components[
+ "CrashPreprocessor"
+ ]
self.fail("cs={} config={} Exception={}".format(cs, config, e))
cls.set_hyperparameters(config)
@@ -1182,7 +1301,9 @@ def test_fit_instantiates_component(self):
with ignore_warnings(classifier_warnings):
cls.fit(
X=np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]),
- y=np.array([1, 0, 1, 1])
+ y=np.array([1, 0, 1, 1]),
)
- del preprocessing_components.additional_components.components['CrashPreprocessor']
+ del preprocessing_components.additional_components.components[
+ "CrashPreprocessor"
+ ]
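
The test_add_classifier and test_add_preprocessor hunks above all follow the same register-then-clean-up pattern around a module-global component registry. A minimal sketch of that flow (illustrative only, not part of the patch; it assumes the DummyClassifier helper defined near the top of test_classification.py is importable from the test package):

import autosklearn.pipeline.components.classification as classification_components
from autosklearn.pipeline.classification import SimpleClassificationPipeline
from test.test_pipeline.test_classification import DummyClassifier  # helper defined above

classification_components.add_classifier(DummyClassifier)
try:
    # Once registered, the add-on appears in the pipeline's search space.
    cs = SimpleClassificationPipeline().get_hyperparameter_search_space()
    assert "DummyClassifier" in str(cs)
finally:
    # The registry is global, so remove the add-on again to avoid leaking
    # state into other tests.
    del classification_components.additional_components.components["DummyClassifier"]
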
diff --git a/test/test_pipeline/test_create_searchspace_util_classification.py b/test/test_pipeline/test_create_searchspace_util_classification.py
index 7bf1450979..a830430097 100644
--- a/test/test_pipeline/test_create_searchspace_util_classification.py
+++ b/test/test_pipeline/test_create_searchspace_util_classification.py
@@ -1,20 +1,23 @@
+import unittest
from collections import OrderedDict
-import unittest
import numpy
-
from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import CategoricalHyperparameter
-from autosklearn.pipeline.components.classification.liblinear_svc import LibLinear_SVC
+import autosklearn.pipeline.create_searchspace_util
from autosklearn.pipeline.components.classification.lda import LDA
-
+from autosklearn.pipeline.components.classification.liblinear_svc import LibLinear_SVC
+from autosklearn.pipeline.components.feature_preprocessing.no_preprocessing import (
+ NoPreprocessing,
+)
from autosklearn.pipeline.components.feature_preprocessing.pca import PCA
-from autosklearn.pipeline.components.feature_preprocessing.truncatedSVD import TruncatedSVD
-from autosklearn.pipeline.components.feature_preprocessing.no_preprocessing import NoPreprocessing
-from autosklearn.pipeline.components.feature_preprocessing.random_trees_embedding \
- import RandomTreesEmbedding
-import autosklearn.pipeline.create_searchspace_util
+from autosklearn.pipeline.components.feature_preprocessing.random_trees_embedding import ( # noqa: E501
+ RandomTreesEmbedding,
+)
+from autosklearn.pipeline.components.feature_preprocessing.truncatedSVD import (
+ TruncatedSVD,
+)
class TestCreateClassificationSearchspace(unittest.TestCase):
@@ -23,9 +26,9 @@ class TestCreateClassificationSearchspace(unittest.TestCase):
def test_get_match_array_sparse_and_dense(self):
# preproc is empty
preprocessors = OrderedDict()
- preprocessors['pca'] = PCA
+ preprocessors["pca"] = PCA
classifiers = OrderedDict()
- classifiers['lda'] = LDA
+ classifiers["lda"] = LDA
# Sparse + dense
class Preprocessors(object):
@@ -40,62 +43,69 @@ def get_available_components(self, *args, **kwargs):
# Dense
m = autosklearn.pipeline.create_searchspace_util.get_match_array(
- pipeline=((0, PCA), (1, LDA)), dataset_properties={'sparse': True})
+ pipeline=((0, PCA), (1, LDA)), dataset_properties={"sparse": True}
+ )
self.assertEqual(numpy.sum(m), 0)
m = autosklearn.pipeline.create_searchspace_util.get_match_array(
- pipeline=((0, PCA), (1, LDA)), dataset_properties={'sparse': False})
+ pipeline=((0, PCA), (1, LDA)), dataset_properties={"sparse": False}
+ )
self.assertEqual(m, [[1]])
# Sparse
- preprocessors['tSVD'] = TruncatedSVD
+ preprocessors["tSVD"] = TruncatedSVD
m = autosklearn.pipeline.create_searchspace_util.get_match_array(
- pipeline=((0, Preprocessors), (1, LDA)),
- dataset_properties={'sparse': True})
+ pipeline=((0, Preprocessors), (1, LDA)), dataset_properties={"sparse": True}
+ )
self.assertEqual(m[0], [0]) # pca
self.assertEqual(m[1], [1]) # svd
m = autosklearn.pipeline.create_searchspace_util.get_match_array(
pipeline=((0, Preprocessors), (1, LDA)),
- dataset_properties={'sparse': False})
+ dataset_properties={"sparse": False},
+ )
self.assertEqual(m[0], [1]) # pca
self.assertEqual(m[1], [0]) # svd
- preprocessors['none'] = NoPreprocessing
+ preprocessors["none"] = NoPreprocessing
m = autosklearn.pipeline.create_searchspace_util.get_match_array(
- pipeline=((0, Preprocessors), (1, LDA)),
- dataset_properties={'sparse': True})
+ pipeline=((0, Preprocessors), (1, LDA)), dataset_properties={"sparse": True}
+ )
self.assertEqual(m[0, :], [0]) # pca
self.assertEqual(m[1, :], [1]) # tsvd
self.assertEqual(m[2, :], [0]) # none
m = autosklearn.pipeline.create_searchspace_util.get_match_array(
pipeline=((0, Preprocessors), (1, LDA)),
- dataset_properties={'sparse': False})
+ dataset_properties={"sparse": False},
+ )
self.assertEqual(m[0, :], [1]) # pca
self.assertEqual(m[1, :], [0]) # tsvd
self.assertEqual(m[2, :], [1]) # none
- classifiers['libsvm'] = LibLinear_SVC
+ classifiers["libsvm"] = LibLinear_SVC
m = autosklearn.pipeline.create_searchspace_util.get_match_array(
pipeline=((0, Preprocessors), (1, Classifiers)),
- dataset_properties={'sparse': False})
+ dataset_properties={"sparse": False},
+ )
self.assertListEqual(list(m[0, :]), [1, 1]) # pca
self.assertListEqual(list(m[1, :]), [0, 0]) # tsvd
self.assertListEqual(list(m[2, :]), [1, 1]) # none
m = autosklearn.pipeline.create_searchspace_util.get_match_array(
pipeline=((0, Preprocessors), (1, Classifiers)),
- dataset_properties={'sparse': True})
+ dataset_properties={"sparse": True},
+ )
self.assertListEqual(list(m[0, :]), [0, 0]) # pca
self.assertListEqual(list(m[1, :]), [1, 1]) # tsvd
self.assertListEqual(list(m[2, :]), [0, 1]) # none
# Do fancy 3d stuff
- preprocessors['random_trees'] = RandomTreesEmbedding
+ preprocessors["random_trees"] = RandomTreesEmbedding
m = autosklearn.pipeline.create_searchspace_util.get_match_array(
pipeline=((0, Preprocessors), (1, Preprocessors), (2, Classifiers)),
- dataset_properties={'sparse': False})
+ dataset_properties={"sparse": False},
+ )
# PCA followed by truncated SVD is forbidden
self.assertEqual(list(m[0].flatten()), [1, 1, 0, 0, 1, 1, 0, 1])
# Truncated SVD is forbidden
@@ -112,28 +122,38 @@ def test_get_match_array_signed_unsigned_and_binary(self):
@unittest.skip("Not currently working.")
def test_add_forbidden(self):
m = numpy.ones([2, 3])
- preprocessors_list = ['pa', 'pb']
- classifier_list = ['ca', 'cb', 'cc']
+ preprocessors_list = ["pa", "pb"]
+ classifier_list = ["ca", "cb", "cc"]
cs = ConfigurationSpace()
- preprocessor = CategoricalHyperparameter(name='feature_preprocessor',
- choices=preprocessors_list)
- classifier = CategoricalHyperparameter(name='classifier',
- choices=classifier_list)
+ preprocessor = CategoricalHyperparameter(
+ name="feature_preprocessor", choices=preprocessors_list
+ )
+ classifier = CategoricalHyperparameter(
+ name="classifier", choices=classifier_list
+ )
cs.add_hyperparameter(preprocessor)
cs.add_hyperparameter(classifier)
new_cs = autosklearn.pipeline.create_searchspace_util.add_forbidden(
- conf_space=cs, node_0_list=preprocessors_list,
- node_1_list=classifier_list, matches=m,
- node_0_name='feature_preprocessor', node_1_name="classifier")
+ conf_space=cs,
+ node_0_list=preprocessors_list,
+ node_1_list=classifier_list,
+ matches=m,
+ node_0_name="feature_preprocessor",
+ node_1_name="classifier",
+ )
self.assertEqual(len(new_cs.forbidden_clauses), 0)
self.assertIsInstance(new_cs, ConfigurationSpace)
m[1, 1] = 0
new_cs = autosklearn.pipeline.create_searchspace_util.add_forbidden(
- conf_space=cs, node_0_list=preprocessors_list,
- node_1_list=classifier_list, matches=m,
- node_0_name='feature_preprocessor', node_1_name="classifier")
+ conf_space=cs,
+ node_0_list=preprocessors_list,
+ node_1_list=classifier_list,
+ matches=m,
+ node_0_name="feature_preprocessor",
+ node_1_name="classifier",
+ )
self.assertEqual(len(new_cs.forbidden_clauses), 1)
- self.assertEqual(new_cs.forbidden_clauses[0].components[0].value, 'cb')
- self.assertEqual(new_cs.forbidden_clauses[0].components[1].value, 'pb')
+ self.assertEqual(new_cs.forbidden_clauses[0].components[0].value, "cb")
+ self.assertEqual(new_cs.forbidden_clauses[0].components[1].value, "pb")
self.assertIsInstance(new_cs, ConfigurationSpace)
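
The reformatted get_match_array calls above build the compatibility matrix between adjacent pipeline steps; each entry records whether a preprocessor/estimator pairing can handle the given dataset properties. A minimal sketch mirroring the first two assertions (illustrative only, not part of the patch; same imports as the test):

import autosklearn.pipeline.create_searchspace_util
from autosklearn.pipeline.components.classification.lda import LDA
from autosklearn.pipeline.components.feature_preprocessing.pca import PCA

# PCA cannot handle sparse input, so a PCA -> LDA pipeline is ruled out entirely.
m = autosklearn.pipeline.create_searchspace_util.get_match_array(
    pipeline=((0, PCA), (1, LDA)), dataset_properties={"sparse": True}
)
assert m.sum() == 0

# On dense data the same pairing is allowed (a single matching entry).
m = autosklearn.pipeline.create_searchspace_util.get_match_array(
    pipeline=((0, PCA), (1, LDA)), dataset_properties={"sparse": False}
)
assert m.sum() == 1
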
diff --git a/test/test_pipeline/test_regression.py b/test/test_pipeline/test_regression.py
index fccf59af67..501b73ec5d 100644
--- a/test/test_pipeline/test_regression.py
+++ b/test/test_pipeline/test_regression.py
@@ -5,28 +5,36 @@
import unittest
import unittest.mock
-from joblib import Memory
import numpy as np
import sklearn.datasets
import sklearn.decomposition
-from sklearn.base import clone
import sklearn.ensemble
import sklearn.svm
-from sklearn.utils.validation import check_is_fitted
-
from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import CategoricalHyperparameter
+from joblib import Memory
+from sklearn.base import clone
+from sklearn.utils.validation import check_is_fitted
-from autosklearn.pipeline.regression import SimpleRegressionPipeline
-from autosklearn.pipeline.components.base import \
- AutoSklearnPreprocessingAlgorithm, AutoSklearnRegressionAlgorithm
-import autosklearn.pipeline.components.regression as regression_components
-from autosklearn.pipeline.components.base import AutoSklearnComponent, AutoSklearnChoice
import autosklearn.pipeline.components.feature_preprocessing as preprocessing_components
+import autosklearn.pipeline.components.regression as regression_components
+from autosklearn.pipeline.components.base import (
+ AutoSklearnChoice,
+ AutoSklearnComponent,
+ AutoSklearnPreprocessingAlgorithm,
+ AutoSklearnRegressionAlgorithm,
+)
+from autosklearn.pipeline.constants import (
+ DENSE,
+ PREDICTIONS,
+ SIGNED_DATA,
+ SPARSE,
+ UNSIGNED_DATA,
+)
+from autosklearn.pipeline.regression import SimpleRegressionPipeline
from autosklearn.pipeline.util import get_dataset
-from autosklearn.pipeline.constants import SPARSE, DENSE, SIGNED_DATA, UNSIGNED_DATA, PREDICTIONS
-from test.test_pipeline.ignored_warnings import regressor_warnings, ignore_warnings
+from test.test_pipeline.ignored_warnings import ignore_warnings, regressor_warnings
class SimpleRegressionPipelineTest(unittest.TestCase):
@@ -38,41 +46,43 @@ def test_io_dict(self):
if regressors[r] == regression_components.RegressorChoice:
continue
props = regressors[r].get_properties()
- self.assertIn('input', props)
- self.assertIn('output', props)
- inp = props['input']
- output = props['output']
+ self.assertIn("input", props)
+ self.assertIn("output", props)
+ inp = props["input"]
+ output = props["output"]
self.assertIsInstance(inp, tuple)
self.assertIsInstance(output, tuple)
for i in inp:
self.assertIn(i, (SPARSE, DENSE, SIGNED_DATA, UNSIGNED_DATA))
self.assertEqual(output, (PREDICTIONS,))
- self.assertIn('handles_regression', props)
- self.assertTrue(props['handles_regression'])
- self.assertIn('handles_classification', props)
- self.assertIn('handles_multiclass', props)
- self.assertIn('handles_multilabel', props)
- self.assertIn('handles_multioutput', props)
- self.assertFalse(props['handles_classification'])
- self.assertFalse(props['handles_multiclass'])
- self.assertFalse(props['handles_multilabel'])
+ self.assertIn("handles_regression", props)
+ self.assertTrue(props["handles_regression"])
+ self.assertIn("handles_classification", props)
+ self.assertIn("handles_multiclass", props)
+ self.assertIn("handles_multilabel", props)
+ self.assertIn("handles_multioutput", props)
+ self.assertFalse(props["handles_classification"])
+ self.assertFalse(props["handles_multiclass"])
+ self.assertFalse(props["handles_multilabel"])
def test_find_regressors(self):
regressors = regression_components._regressors
self.assertGreaterEqual(len(regressors), 1)
for key in regressors:
- if hasattr(regressors[key], 'get_components'):
+ if hasattr(regressors[key], "get_components"):
continue
self.assertIn(AutoSklearnRegressionAlgorithm, regressors[key].__bases__)
def test_find_preprocessors(self):
preprocessors = preprocessing_components._preprocessors
-        self.assertGreaterEqual(len(preprocessors),  1)
+        self.assertGreaterEqual(len(preprocessors), 1)
for key in preprocessors:
- if hasattr(preprocessors[key], 'get_components'):
+ if hasattr(preprocessors[key], "get_components"):
continue
- self.assertIn(AutoSklearnPreprocessingAlgorithm, preprocessors[key].__bases__)
+ self.assertIn(
+ AutoSklearnPreprocessingAlgorithm, preprocessors[key].__bases__
+ )
def test_configurations(self):
cs = SimpleRegressionPipeline().get_hyperparameter_search_space()
@@ -80,27 +90,28 @@ def test_configurations(self):
self._test_configurations(cs)
def test_configurations_signed_data(self):
- dataset_properties = {'signed': True}
- cs = SimpleRegressionPipeline(dataset_properties=dataset_properties).\
- get_hyperparameter_search_space()
+ dataset_properties = {"signed": True}
+ cs = SimpleRegressionPipeline(
+ dataset_properties=dataset_properties
+ ).get_hyperparameter_search_space()
- self._test_configurations(configurations_space=cs,
- dataset_properties=dataset_properties)
+ self._test_configurations(
+ configurations_space=cs, dataset_properties=dataset_properties
+ )
def test_configurations_sparse(self):
- dataset_properties = {'sparse': True}
+ dataset_properties = {"sparse": True}
cs = SimpleRegressionPipeline(
dataset_properties=dataset_properties
).get_hyperparameter_search_space()
- self._test_configurations(cs, make_sparse=True,
- dataset_properties=dataset_properties)
+ self._test_configurations(
+ cs, make_sparse=True, dataset_properties=dataset_properties
+ )
def test_multioutput(self):
cache = Memory(location=tempfile.gettempdir())
- cached_func = cache.cache(
- sklearn.datasets.make_regression
- )
+ cached_func = cache.cache(sklearn.datasets.make_regression)
X, Y = cached_func(
n_samples=250,
n_features=20,
@@ -112,24 +123,33 @@ def test_multioutput(self):
noise=0.3,
shuffle=True,
coef=False,
- random_state=1
+ random_state=1,
)
X_train = X[:200, :]
Y_train = Y[:200, :]
X_test = X[200:, :]
Y_test = Y[200:, :]
- data = {'X_train': X_train, 'Y_train': Y_train,
- 'X_test': X_test, 'Y_test': Y_test}
+ data = {
+ "X_train": X_train,
+ "Y_train": Y_train,
+ "X_test": X_test,
+ "Y_test": Y_test,
+ }
- dataset_properties = {'multioutput': True}
+ dataset_properties = {"multioutput": True}
pipeline = SimpleRegressionPipeline(dataset_properties=dataset_properties)
cs = pipeline.get_hyperparameter_search_space()
self._test_configurations(cs, data=data, dataset_properties=dataset_properties)
- def _test_configurations(self, configurations_space, make_sparse=False,
- data=None, dataset_properties=None):
+ def _test_configurations(
+ self,
+ configurations_space,
+ make_sparse=False,
+ data=None,
+ dataset_properties=None,
+ ):
# Use a limit of ~4GiB
limit = 3072 * 1024 * 1024
resource.setrlimit(resource.RLIMIT_AS, (limit, limit))
@@ -141,42 +161,48 @@ def _test_configurations(self, configurations_space, make_sparse=False,
config._populate_values()
# Restrict configurations which could take too long on travis-ci
- restrictions = {'regressor:adaboost:n_estimators': 50,
- 'regressor:adaboost:max_depth': 1,
- 'feature_preprocessor:kernel_pca:n_components': 10,
- 'feature_preprocessor:kitchen_sinks:n_components': 50,
- 'regressor:libsvm_svc:degree': 2,
- 'regressor:libsvm_svr:degree': 2,
- 'regressor:libsvm_svr:C': 1.,
- 'feature_preprocessor:truncatedSVD:target_dim': 10,
- 'feature_preprocessor:polynomial:degree': 2,
- 'regressor:lda:n_components': 10}
+ restrictions = {
+ "regressor:adaboost:n_estimators": 50,
+ "regressor:adaboost:max_depth": 1,
+ "feature_preprocessor:kernel_pca:n_components": 10,
+ "feature_preprocessor:kitchen_sinks:n_components": 50,
+ "regressor:libsvm_svc:degree": 2,
+ "regressor:libsvm_svr:degree": 2,
+ "regressor:libsvm_svr:C": 1.0,
+ "feature_preprocessor:truncatedSVD:target_dim": 10,
+ "feature_preprocessor:polynomial:degree": 2,
+ "regressor:lda:n_components": 10,
+ }
for restrict_parameter in restrictions:
restrict_to = restrictions[restrict_parameter]
- if restrict_parameter in config and config[restrict_parameter] is not None:
+ if (
+ restrict_parameter in config
+ and config[restrict_parameter] is not None
+ ):
config._values[restrict_parameter] = restrict_to
if data is None:
X_train, Y_train, X_test, Y_test = get_dataset(
- dataset='boston', make_sparse=make_sparse, add_NaNs=True)
+ dataset="boston", make_sparse=make_sparse, add_NaNs=True
+ )
else:
- X_train = data['X_train'].copy()
- Y_train = data['Y_train'].copy()
- X_test = data['X_test'].copy()
- data['Y_test'].copy()
+ X_train = data["X_train"].copy()
+ Y_train = data["Y_train"].copy()
+ X_test = data["X_test"].copy()
+ data["Y_test"].copy()
cls = SimpleRegressionPipeline(
- random_state=1,
- dataset_properties=dataset_properties
+ random_state=1, dataset_properties=dataset_properties
)
cls.set_hyperparameters(config)
# First make sure that for this configuration, setting the parameters
# does not mistakenly set the estimator as fitted
for name, step in cls.named_steps.items():
- with self.assertRaisesRegex(sklearn.exceptions.NotFittedError,
- "instance is not fitted yet"):
+ with self.assertRaisesRegex(
+ sklearn.exceptions.NotFittedError, "instance is not fitted yet"
+ ):
check_is_fitted(step)
try:
@@ -190,9 +216,9 @@ def _test_configurations(self, configurations_space, make_sparse=False,
for name, step in cls.named_steps.items():
check_is_fitted(step)
except sklearn.exceptions.NotFittedError:
- self.fail("config={} raised NotFittedError unexpectedly!".format(
- config
- ))
+ self.fail(
+ "config={} raised NotFittedError unexpectedly!".format(config)
+ )
cls.predict(X_test)
except MemoryError:
@@ -200,8 +226,7 @@ def _test_configurations(self, configurations_space, make_sparse=False,
except np.linalg.LinAlgError:
continue
except ValueError as e:
- if "Floating-point under-/overflow occurred at epoch" in \
- e.args[0]:
+ if "Floating-point under-/overflow occurred at epoch" in e.args[0]:
continue
elif "removed all features" in e.args[0]:
continue
@@ -209,13 +234,17 @@ def _test_configurations(self, configurations_space, make_sparse=False,
continue
elif "Numerical problems in QDA" in e.args[0]:
continue
- elif 'Bug in scikit-learn' in e.args[0]:
+ elif "Bug in scikit-learn" in e.args[0]:
continue
- elif 'The condensed distance matrix must contain only finite ' \
- 'values.' in e.args[0]:
+ elif (
+ "The condensed distance matrix must contain only finite "
+ "values." in e.args[0]
+ ):
continue
- elif "zero-size array to reduction operation maximum which has no " \
- "identity" in e.args[0]:
+ elif (
+ "zero-size array to reduction operation maximum which has no "
+ "identity" in e.args[0]
+ ):
continue
else:
e.args += (f"config={config}",)
@@ -244,7 +273,10 @@ def _test_configurations(self, configurations_space, make_sparse=False,
raise e
except Exception as e:
- if "Multiple input features cannot have the same target value" in e.args[0]:
+ if (
+ "Multiple input features cannot have the same target value"
+ in e.args[0]
+ ):
continue
else:
e.args += (f"config={config}",)
@@ -252,7 +284,7 @@ def _test_configurations(self, configurations_space, make_sparse=False,
def test_default_configuration(self):
for i in range(2):
- X_train, Y_train, X_test, Y_test = get_dataset(dataset='diabetes')
+ X_train, Y_train, X_test, Y_test = get_dataset(dataset="diabetes")
auto = SimpleRegressionPipeline(random_state=1)
auto = auto.fit(X_train, Y_train)
predictions = auto.predict(copy.deepcopy(X_test))
@@ -266,16 +298,15 @@ def test_default_configuration_iterative_fit(self):
regressor = SimpleRegressionPipeline(
random_state=1,
include={
- 'regressor': ['random_forest'],
- 'feature_preprocessor': ['no_preprocessing']
- }
+ "regressor": ["random_forest"],
+ "feature_preprocessor": ["no_preprocessing"],
+ },
)
- X_train, Y_train, X_test, Y_test = get_dataset(dataset='boston')
+ X_train, Y_train, X_test, Y_test = get_dataset(dataset="boston")
regressor.fit_transformer(X_train, Y_train)
for i in range(1, 11):
regressor.iterative_fit(X_train, Y_train)
- self.assertEqual(regressor.steps[-1][-1].choice.estimator.n_estimators,
- i)
+ self.assertEqual(regressor.steps[-1][-1].choice.estimator.n_estimators, i)
def test_repr(self):
representation = repr(SimpleRegressionPipeline())
@@ -293,56 +324,50 @@ def test_get_hyperparameter_search_space(self):
self.assertEqual(len(forbiddens), 35)
def test_get_hyperparameter_search_space_include_exclude_models(self):
- regressor = SimpleRegressionPipeline(
- include={'regressor': ['random_forest']}
- )
+ regressor = SimpleRegressionPipeline(include={"regressor": ["random_forest"]})
cs = regressor.get_hyperparameter_search_space()
self.assertEqual(
- cs.get_hyperparameter('regressor:__choice__'),
- CategoricalHyperparameter('regressor:__choice__', ['random_forest']),
+ cs.get_hyperparameter("regressor:__choice__"),
+ CategoricalHyperparameter("regressor:__choice__", ["random_forest"]),
)
# TODO add this test when more than one regressor is present
- regressor = SimpleRegressionPipeline(
- exclude={'regressor': ['random_forest']}
- )
+ regressor = SimpleRegressionPipeline(exclude={"regressor": ["random_forest"]})
cs = regressor.get_hyperparameter_search_space()
- self.assertNotIn('random_forest', str(cs))
+ self.assertNotIn("random_forest", str(cs))
- regressor = SimpleRegressionPipeline(
- include={'feature_preprocessor': ['pca']}
- )
+ regressor = SimpleRegressionPipeline(include={"feature_preprocessor": ["pca"]})
cs = regressor.get_hyperparameter_search_space()
- self.assertEqual(cs.get_hyperparameter(
- 'feature_preprocessor:__choice__'),
- CategoricalHyperparameter('feature_preprocessor:__choice__', ['pca']))
+ self.assertEqual(
+ cs.get_hyperparameter("feature_preprocessor:__choice__"),
+ CategoricalHyperparameter("feature_preprocessor:__choice__", ["pca"]),
+ )
regressor = SimpleRegressionPipeline(
- exclude={'feature_preprocessor': ['no_preprocessing']}
+ exclude={"feature_preprocessor": ["no_preprocessing"]}
)
cs = regressor.get_hyperparameter_search_space()
- self.assertNotIn('no_preprocessing', str(cs))
+ self.assertNotIn("no_preprocessing", str(cs))
- def test_get_hyperparameter_search_space_preprocessor_contradicts_default_classifier(
- self
+ def test_get_hyperparameter_search_space_preprocessor_contradicts_default(
+ self,
):
regressor = SimpleRegressionPipeline(
- include={'feature_preprocessor': ['densifier']},
- dataset_properties={'sparse': True}
+ include={"feature_preprocessor": ["densifier"]},
+ dataset_properties={"sparse": True},
)
cs = regressor.get_hyperparameter_search_space()
self.assertEqual(
- cs.get_hyperparameter('regressor:__choice__').default_value,
- 'gradient_boosting'
+ cs.get_hyperparameter("regressor:__choice__").default_value,
+ "gradient_boosting",
)
regressor = SimpleRegressionPipeline(
- include={'feature_preprocessor': ['nystroem_sampler']}
+ include={"feature_preprocessor": ["nystroem_sampler"]}
)
cs = regressor.get_hyperparameter_search_space()
self.assertEqual(
- cs.get_hyperparameter('regressor:__choice__').default_value,
- 'sgd'
+ cs.get_hyperparameter("regressor:__choice__").default_value, "sgd"
)
def test_get_hyperparameter_search_space_only_forbidden_combinations(self):
@@ -351,9 +376,9 @@ def test_get_hyperparameter_search_space_only_forbidden_combinations(self):
"Cannot find a legal default configuration.",
SimpleRegressionPipeline,
include={
- 'regressor': ['random_forest'],
- 'feature_preprocessor': ['kitchen_sinks']
- }
+ "regressor": ["random_forest"],
+ "feature_preprocessor": ["kitchen_sinks"],
+ },
)
# It must also be caught that no classifiers which can handle sparse
@@ -363,14 +388,16 @@ def test_get_hyperparameter_search_space_only_forbidden_combinations(self):
"Cannot find a legal default configuration",
SimpleRegressionPipeline,
include={
- 'regressor': ['extra_trees'],
- 'feature_preprocessor': ['densifier']
+ "regressor": ["extra_trees"],
+ "feature_preprocessor": ["densifier"],
},
- dataset_properties={'sparse': True}
+ dataset_properties={"sparse": True},
)
- @unittest.skip("test_get_hyperparameter_search_space_dataset_properties" +
- " Not yet Implemented")
+ @unittest.skip(
+ "test_get_hyperparameter_search_space_dataset_properties"
+ + " Not yet Implemented"
+ )
def test_get_hyperparameter_search_space_dataset_properties(self):
# TODO: We do not have any dataset properties for regression, so this
# test is somewhat stupid
@@ -403,16 +430,14 @@ def test_get_hyperparameter_search_space_dataset_properties(self):
"""
def test_predict_batched(self):
- include = {'regressor': ['decision_tree']}
+ include = {"regressor": ["decision_tree"]}
cs = SimpleRegressionPipeline(include=include).get_hyperparameter_search_space()
default = cs.get_default_configuration()
regressor = SimpleRegressionPipeline(
- config=default,
- random_state=1,
- include=include
+ config=default, random_state=1, include=include
)
- X_train, Y_train, X_test, Y_test = get_dataset(dataset='boston')
+ X_train, Y_train, X_test, Y_test = get_dataset(dataset="boston")
regressor.fit(X_train, Y_train)
X_test_ = X_test.copy()
prediction_ = regressor.predict(X_test_)
@@ -424,12 +449,11 @@ def test_predict_batched(self):
np.testing.assert_array_almost_equal(prediction_, prediction)
def test_predict_batched_sparse(self):
- dataset_properties = {'sparse': True}
- include = {'regressor': ['decision_tree']}
+ dataset_properties = {"sparse": True}
+ include = {"regressor": ["decision_tree"]}
cs = SimpleRegressionPipeline(
- dataset_properties=dataset_properties,
- include=include
+ dataset_properties=dataset_properties, include=include
).get_hyperparameter_search_space()
default = cs.get_default_configuration()
@@ -437,11 +461,12 @@ def test_predict_batched_sparse(self):
config=default,
random_state=1,
dataset_properties=dataset_properties,
- include=include
+ include=include,
)
- X_train, Y_train, X_test, Y_test = get_dataset(dataset='boston',
- make_sparse=True)
+ X_train, Y_train, X_test, Y_test = get_dataset(
+ dataset="boston", make_sparse=True
+ )
regressor.fit(X_train, Y_train)
X_test_ = X_test.copy()
prediction_ = regressor.predict(X_test_)
@@ -465,7 +490,7 @@ def test_validate_input_Y(self):
raise NotImplementedError()
def test_pipeline_clonability(self):
- X_train, Y_train, X_test, Y_test = get_dataset(dataset='boston')
+ X_train, Y_train, X_test, Y_test = get_dataset(dataset="boston")
auto = SimpleRegressionPipeline(random_state=1)
auto = auto.fit(X_train, Y_train)
auto_clone = clone(auto)
@@ -494,7 +519,9 @@ def test_set_params(self):
def test_get_params(self):
pass
- def _test_set_hyperparameter_choice(self, expected_key, implementation, config_dict):
+ def _test_set_hyperparameter_choice(
+ self, expected_key, implementation, config_dict
+ ):
"""
Given a configuration in config, this procedure makes sure that
the given implementation, which should be a Choice component, honors
@@ -507,14 +534,16 @@ def _test_set_hyperparameter_choice(self, expected_key, implementation, config_d
# Are there further hyperparams?
# A choice component might have attribute requirements that we need to check
- expected_sub_key = expected_key.replace(':__choice__', ':') + implementation_type
+ expected_sub_key = (
+ expected_key.replace(":__choice__", ":") + implementation_type
+ )
expected_attributes = {}
- if 'data_preprocessor:__choice__' in expected_key:
+ if "data_preprocessor:__choice__" in expected_key:
# We have to check both the numerical and categorical
to_check = {
- 'numerical_transformer': implementation.choice.numer_ppl.named_steps,
- 'categorical_transformer': implementation.choice.categ_ppl.named_steps,
- 'text_transformer': implementation.choice.txt_ppl.named_steps,
+ "numerical_transformer": implementation.choice.numer_ppl.named_steps,
+ "categorical_transformer": implementation.choice.categ_ppl.named_steps,
+ "text_transformer": implementation.choice.txt_ppl.named_steps,
}
for data_type, pipeline in to_check.items():
@@ -522,8 +551,7 @@ def _test_set_hyperparameter_choice(self, expected_key, implementation, config_d
# If it is a Choice, make sure it is the correct one!
if isinstance(sub_step, AutoSklearnChoice):
key = "data_preprocessor:feature_type:{}:{}:__choice__".format(
- data_type,
- sub_name
+ data_type, sub_name
)
keys_checked.extend(
self._test_set_hyperparameter_choice(
@@ -535,10 +563,10 @@ def _test_set_hyperparameter_choice(self, expected_key, implementation, config_d
keys_checked.extend(
self._test_set_hyperparameter_component(
"data_preprocessor:feature_type:{}:{}".format(
- data_type,
- sub_name
+ data_type, sub_name
),
- sub_step, config_dict
+ sub_step,
+ config_dict,
)
)
else:
@@ -547,7 +575,7 @@ def _test_set_hyperparameter_choice(self, expected_key, implementation, config_d
else:
for key, value in config_dict.items():
if key != expected_key and expected_sub_key in key:
- expected_attributes[key.split(':')[-1]] = value
+ expected_attributes[key.split(":")[-1]] = value
keys_checked.append(key)
if expected_attributes:
attributes = vars(implementation.choice)
@@ -557,7 +585,9 @@ def _test_set_hyperparameter_choice(self, expected_key, implementation, config_d
self.assertIn(expected_attribute, attributes.keys())
return keys_checked
- def _test_set_hyperparameter_component(self, expected_key, implementation, config_dict):
+ def _test_set_hyperparameter_component(
+ self, expected_key, implementation, config_dict
+ ):
"""
Given a configuration in config, this procedure makes sure that
the given implementation, which should be a autosklearn component, honors
@@ -569,15 +599,14 @@ def _test_set_hyperparameter_component(self, expected_key, implementation, confi
for key, value in config_dict.items():
if expected_key in key:
keys_checked.append(key)
- key = key.replace(expected_key + ':', '')
- if ':' in key:
- raise ValueError("This utility should only be called with a "
- "matching string that produces leaf configurations, "
- "that is no further colons are expected, yet key={}"
- "".format(
- key
- )
- )
+ key = key.replace(expected_key + ":", "")
+ if ":" in key:
+ raise ValueError(
+ "This utility should only be called with a "
+ "matching string that produces leaf configurations, "
+ "that is no further colons are expected, yet key={}"
+ "".format(key)
+ )
expected_attributes[key] = value
# Cannot check the whole dictionary, just names, as some
# classes map the text hyperparameter directly to a function!
@@ -598,12 +627,17 @@ def test_set_hyperparameters_honors_configuration(self):
"""
all_combinations = list(itertools.product([True, False], repeat=4))
- for sparse, multilabel, signed, multiclass, in all_combinations:
+ for (
+ sparse,
+ multilabel,
+ signed,
+ multiclass,
+ ) in all_combinations:
dataset_properties = {
- 'sparse': sparse,
- 'multilabel': multilabel,
- 'multiclass': multiclass,
- 'signed': signed,
+ "sparse": sparse,
+ "multilabel": multilabel,
+ "multiclass": multiclass,
+ "signed": signed,
}
random_state = 1
auto = SimpleRegressionPipeline(
@@ -623,31 +657,32 @@ def test_set_hyperparameters_honors_configuration(self):
keys_checked = []
for name, step in auto.named_steps.items():
- if name == 'data_preprocessor':
+ if name == "data_preprocessor":
keys_checked.extend(
self._test_set_hyperparameter_choice(
- 'data_preprocessor:__choice__', step, config_dict
+ "data_preprocessor:__choice__", step, config_dict
)
)
self.assertEqual(step.random_state, random_state)
- elif name == 'feature_preprocessor':
+ elif name == "feature_preprocessor":
keys_checked.extend(
self._test_set_hyperparameter_choice(
- 'feature_preprocessor:__choice__', step, config_dict
+ "feature_preprocessor:__choice__", step, config_dict
)
)
self.assertEqual(step.random_state, random_state)
- elif name == 'regressor':
+ elif name == "regressor":
keys_checked.extend(
self._test_set_hyperparameter_choice(
- 'regressor:__choice__', step, config_dict
+ "regressor:__choice__", step, config_dict
)
)
self.assertEqual(step.random_state, random_state)
else:
- raise ValueError("Found another type of step! Need to update this check"
- " {}. ".format(name)
- )
+ raise ValueError(
+ "Found another type of step! Need to update this check"
+ " {}. ".format(name)
+ )
# Make sure we checked the whole configuration
self.assertSetEqual(set(config_dict.keys()), set(keys_checked))
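The hunks in this patch all follow the same mechanical pattern: imports re-sorted, quote characters normalized to double quotes, and long calls exploded one argument per line. A condensed before/after sketch, assuming an auto-formatter with black-style defaults (88-character lines, quote normalization, "magic trailing comma"); the formatter is not named in the patch itself:

    # before: single quotes, one long call
    regressor = SimpleRegressionPipeline(include={'regressor': ['random_forest']}, dataset_properties={'sparse': True})

    # after: double quotes; the call no longer fits on one line, so it is
    # exploded, and the trailing comma keeps it exploded on future runs
    regressor = SimpleRegressionPipeline(
        include={"regressor": ["random_forest"]},
        dataset_properties={"sparse": True},
    )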
diff --git a/test/test_scripts/test_metadata_generation.py b/test/test_scripts/test_metadata_generation.py
index 6cc4fad38d..6c6ba70ef5 100644
--- a/test/test_scripts/test_metadata_generation.py
+++ b/test/test_scripts/test_metadata_generation.py
@@ -13,27 +13,29 @@
class TestMetadataGeneration(unittest.TestCase):
-
def setUp(self):
- self.working_directory = '/tmp/autosklearn-unittest-tmp-dir-%s-%d-%d' % (
- socket.gethostname(), os.getpid(), random.randint(0, 1000000))
+ self.working_directory = "/tmp/autosklearn-unittest-tmp-dir-%s-%d-%d" % (
+ socket.gethostname(),
+ os.getpid(),
+ random.randint(0, 1000000),
+ )
def print_files(self):
- print('Existing files:')
+ print("Existing files:")
for dirpath, dirnames, filenames in os.walk(self.working_directory):
print(dirpath, dirnames, filenames)
def test_metadata_generation(self):
regression_task_id = 360029
- regression_dataset_name = 'SWD'.lower()
+ regression_dataset_name = "SWD".lower()
classification_task_id = 245
- classification_dataset_name = 'breast-w'.lower()
+ classification_dataset_name = "breast-w".lower()
current_directory = __file__
- scripts_directory = os.path.abspath(os.path.join(current_directory,
- '..', '..', '..',
- 'scripts'))
+ scripts_directory = os.path.abspath(
+ os.path.join(current_directory, "..", "..", "..", "scripts")
+ )
# 1. create working directory
try:
@@ -44,214 +46,293 @@ def test_metadata_generation(self):
# 2. should be done by the person running the unit tests!
# 3. create configuration commands
- script_filename = os.path.join(scripts_directory, '01_create_commands.py')
- cmd = 'python3 %s --working-directory %s --test' % (script_filename, self.working_directory)
- rval = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ script_filename = os.path.join(scripts_directory, "01_create_commands.py")
+ cmd = "python3 %s --working-directory %s --test" % (
+ script_filename,
+ self.working_directory,
+ )
+ rval = subprocess.run(
+ cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+ )
self.assertEqual(rval.returncode, 0, msg=str(rval))
# 4. run one of the commands to get some data
- commands_output_file = os.path.join(self.working_directory, 'metadata_commands.txt')
+ commands_output_file = os.path.join(
+ self.working_directory, "metadata_commands.txt"
+ )
self.assertTrue(os.path.exists(commands_output_file))
with open(commands_output_file) as fh:
- cmds = fh.read().split('\n')
- # 6 regression, 7 classification (roc_auc + task 258 is illegal), 1 empty line
- self.assertEqual(len(cmds), 18, msg='\n'.join(cmds))
+ cmds = fh.read().split("\n")
+ # 6 regression, 7 classification (roc_auc + task 258 is illegal),
+ # 1 empty line
+ self.assertEqual(len(cmds), 18, msg="\n".join(cmds))
for task_id, dataset_name, task_type, metric in (
(
classification_task_id,
classification_dataset_name,
- 'classification',
- 'balanced_accuracy',
+ "classification",
+ "balanced_accuracy",
),
- (regression_task_id, regression_dataset_name, 'regression', 'r2')
+ (regression_task_id, regression_dataset_name, "regression", "r2"),
):
cmd = None
with open(commands_output_file) as fh:
while True:
cmd = fh.readline()
- if 'task-id %d' % task_id in cmd and metric in cmd:
+ if "task-id %d" % task_id in cmd and metric in cmd:
break
if cmd is None:
- self.fail('Did not find a command for task_id %s and metric %s in %s'
- % (task_id, metric, cmds))
+ self.fail(
+ "Did not find a command for task_id %s and metric %s in %s"
+ % (task_id, metric, cmds)
+ )
- self.assertIn('time-limit 86400', cmd)
- self.assertIn('per-run-time-limit 1800', cmd)
- cmd = cmd.replace('time-limit 86400', 'time-limit 60').replace(
- 'per-run-time-limit 1800', 'per-run-time-limit 5')
+ self.assertIn("time-limit 86400", cmd)
+ self.assertIn("per-run-time-limit 1800", cmd)
+ cmd = cmd.replace("time-limit 86400", "time-limit 60").replace(
+ "per-run-time-limit 1800", "per-run-time-limit 5"
+ )
# This tells the script to use the same memory limit for testing as
# for training. In production, it would use twice as much!
- cmd = cmd.replace('-s 1', '-s 1 --unittest')
- print('COMMAND: %s' % cmd)
- rval = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE,
- stderr=subprocess.PIPE)
- print('STDOUT: %s' % repr(rval.stdout), flush=True)
- print('STDERR: %s' % repr(rval.stderr), flush=True)
+ cmd = cmd.replace("-s 1", "-s 1 --unittest")
+ print("COMMAND: %s" % cmd)
+ rval = subprocess.run(
+ cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+ )
+ print("STDOUT: %s" % repr(rval.stdout), flush=True)
+ print("STDERR: %s" % repr(rval.stderr), flush=True)
self.print_files()
- expected_output_directory = os.path.join(self.working_directory,
- 'configuration',
- task_type,
- str(task_id), metric,
- 'auto-sklearn-output')
- self.assertTrue(os.path.exists(expected_output_directory),
- msg=expected_output_directory)
- smac_log = os.path.join(expected_output_directory, 'AutoML(1):%s.log' % dataset_name)
+ expected_output_directory = os.path.join(
+ self.working_directory,
+ "configuration",
+ task_type,
+ str(task_id),
+ metric,
+ "auto-sklearn-output",
+ )
+ self.assertTrue(
+ os.path.exists(expected_output_directory), msg=expected_output_directory
+ )
+ smac_log = os.path.join(
+ expected_output_directory, "AutoML(1):%s.log" % dataset_name
+ )
with open(smac_log) as fh:
smac_output = fh.read()
- self.assertEqual(rval.returncode, 0, msg=str(rval) + '\n' + smac_output)
- expected_validation_output = os.path.join(expected_output_directory, '..',
- 'validation_trajectory_1.json')
+ self.assertEqual(rval.returncode, 0, msg=str(rval) + "\n" + smac_output)
+ expected_validation_output = os.path.join(
+ expected_output_directory, "..", "validation_trajectory_1.json"
+ )
self.assertTrue(os.path.exists(expected_validation_output))
- trajectory = os.path.join(expected_output_directory,
- 'smac3-output', 'run_1', 'trajectory.json')
+ trajectory = os.path.join(
+ expected_output_directory, "smac3-output", "run_1", "trajectory.json"
+ )
with open(expected_validation_output) as fh_validation:
with open(trajectory) as fh_trajectory:
traj = json.load(fh_trajectory)
valid_traj = json.load(fh_validation)
- print('Validation trajectory:')
+ print("Validation trajectory:")
print(valid_traj)
self.assertGreater(len(traj), 2, msg=str(valid_traj))
self.assertEqual(len(traj), len(valid_traj), msg=str(valid_traj))
for entry in valid_traj:
- if task_type == 'classification':
+ if task_type == "classification":
for metric in CLASSIFICATION_METRICS:
# This is a multilabel metric
- if metric in ('precision_samples', 'recall_samples', 'f1_samples'):
+ if metric in (
+ "precision_samples",
+ "recall_samples",
+ "f1_samples",
+ ):
continue
self.assertIn(metric, entry[-1])
self.assertIsInstance(entry[-1][metric], float)
- self.assertTrue(np.isfinite(entry[-1][metric]),
- (metric, str(entry[-1][metric])))
+ self.assertTrue(
+ np.isfinite(entry[-1][metric]),
+ (metric, str(entry[-1][metric])),
+ )
else:
for metric in REGRESSION_METRICS:
self.assertIn(metric, entry[-1])
self.assertIsInstance(entry[-1][metric], float)
- self.assertTrue(np.isfinite(entry[-1][metric]),
- (metric, str(entry[-1][metric])))
+ self.assertTrue(
+ np.isfinite(entry[-1][metric]),
+ (metric, str(entry[-1][metric])),
+ )
# 5. Get the test performance of these configurations
- script_filename = os.path.join(scripts_directory, '02_retrieve_metadata.py')
- cmd = 'python3 %s --working-directory %s ' % (script_filename, self.working_directory)
- print('COMMAND: %s' % cmd)
- rval = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE,
- stderr=subprocess.PIPE)
- print('STDOUT: %s' % repr(rval.stdout), flush=True)
- print('STDERR: %s' % repr(rval.stderr), flush=True)
+ script_filename = os.path.join(scripts_directory, "02_retrieve_metadata.py")
+ cmd = "python3 %s --working-directory %s " % (
+ script_filename,
+ self.working_directory,
+ )
+ print("COMMAND: %s" % cmd)
+ rval = subprocess.run(
+ cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+ )
+ print("STDOUT: %s" % repr(rval.stdout), flush=True)
+ print("STDERR: %s" % repr(rval.stderr), flush=True)
self.assertEqual(rval.returncode, 0, msg=str(rval))
- for file in ['algorithm_runs.arff', 'configurations.csv', 'description.results.txt']:
- for metric in ['accuracy', 'balanced_accuracy', 'log_loss', 'roc_auc']:
+ for file in [
+ "algorithm_runs.arff",
+ "configurations.csv",
+ "description.results.txt",
+ ]:
+ for metric in ["accuracy", "balanced_accuracy", "log_loss", "roc_auc"]:
path = os.path.join(
self.working_directory,
- 'configuration_results',
- '%s_binary.classification_dense' % metric,
+ "configuration_results",
+ "%s_binary.classification_dense" % metric,
file,
)
self.assertTrue(os.path.exists(path), msg=path)
- for file in ['algorithm_runs.arff', 'configurations.csv', 'description.results.txt']:
- for metric in ['r2', 'mean_squared_error']:
+ for file in [
+ "algorithm_runs.arff",
+ "configurations.csv",
+ "description.results.txt",
+ ]:
+ for metric in ["r2", "mean_squared_error"]:
path = os.path.join(
self.working_directory,
- 'configuration_results',
- '%s_regression_dense' % metric,
+ "configuration_results",
+ "%s_regression_dense" % metric,
file,
)
self.assertTrue(os.path.exists(path), msg=path)
# 6. Calculate metafeatures
- script_filename = os.path.join(scripts_directory, '03_calculate_metafeatures.py')
- cmd = (
- 'python3 %s --working-directory %s --test-mode '
- % (script_filename, self.working_directory)
+ script_filename = os.path.join(
+ scripts_directory, "03_calculate_metafeatures.py"
+ )
+ cmd = "python3 %s --working-directory %s --test-mode " % (
+ script_filename,
+ self.working_directory,
+ )
+ rval = subprocess.run(
+ cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
- rval = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE,
- stderr=subprocess.PIPE)
self.assertEqual(rval.returncode, 0, msg=str(rval))
- for task_type in ('classification', 'regression'):
- for file in ['calculation_times.csv', 'description.features.txt',
- 'feature_costs.arff', 'feature_runstatus.arff',
- 'feature_values.arff']:
+ for task_type in ("classification", "regression"):
+ for file in [
+ "calculation_times.csv",
+ "description.features.txt",
+ "feature_costs.arff",
+ "feature_runstatus.arff",
+ "feature_values.arff",
+ ]:
self.assertTrue(
- os.path.exists(os.path.join(
- self.working_directory,
- 'metafeatures',
- task_type,
- file)
+ os.path.exists(
+ os.path.join(
+ self.working_directory, "metafeatures", task_type, file
+ )
)
)
with open(
os.path.join(
- self.working_directory, 'metafeatures', 'regression', 'feature_values.arff'
+ self.working_directory,
+ "metafeatures",
+ "regression",
+ "feature_values.arff",
)
) as fh:
- metafeatures_arff = fh.read().split('\n')
+ metafeatures_arff = fh.read().split("\n")
contains_regression_id = False
for line in metafeatures_arff:
- if line.startswith('fri_c4_500_25,'):
+ if line.startswith("fri_c4_500_25,"):
contains_regression_id = True
self.assertTrue(contains_regression_id, msg=metafeatures_arff)
with open(
- os.path.join(
- self.working_directory, 'metafeatures', 'classification', 'feature_values.arff'
- )
+ os.path.join(
+ self.working_directory,
+ "metafeatures",
+ "classification",
+ "feature_values.arff",
+ )
) as fh:
- metafeatures_arff = fh.read().split('\n')
+ metafeatures_arff = fh.read().split("\n")
contains_classification_id = False
for line in metafeatures_arff:
- if line.startswith('anneal,'):
+ if line.startswith("anneal,"):
contains_classification_id = True
self.assertTrue(contains_classification_id, msg=metafeatures_arff)
# 7. Create aslib files
- script_filename = os.path.join(scripts_directory, '04_create_aslib_files.py')
- cmd = 'python3 %s --working-directory %s ' % (
- script_filename, self.working_directory)
- rval = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE,
- stderr=subprocess.PIPE)
+ script_filename = os.path.join(scripts_directory, "04_create_aslib_files.py")
+ cmd = "python3 %s --working-directory %s " % (
+ script_filename,
+ self.working_directory,
+ )
+ rval = subprocess.run(
+ cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+ )
self.assertEqual(rval.returncode, 0, msg=str(rval))
for metric_, combination in (
- (metric, '%s_binary.classification_dense' % metric),
- (metric, '%s_regression_dense' % metric),
+ (metric, "%s_binary.classification_dense" % metric),
+ (metric, "%s_regression_dense" % metric),
):
if task_type not in combination:
continue
- for file in ['algorithm_runs.arff', 'configurations.csv',
- 'description.txt', 'feature_costs.arff',
- 'feature_runstatus.arff', 'feature_values.arff',
- 'readme.txt']:
+ for file in [
+ "algorithm_runs.arff",
+ "configurations.csv",
+ "description.txt",
+ "feature_costs.arff",
+ "feature_runstatus.arff",
+ "feature_values.arff",
+ "readme.txt",
+ ]:
expected_path = os.path.join(
- self.working_directory, 'metadata', combination, file,
+ self.working_directory,
+ "metadata",
+ combination,
+ file,
)
self.assertTrue(os.path.exists(expected_path), msg=expected_path)
- with open(os.path.join(self.working_directory,
- 'metadata',
- combination,
- 'algorithm_runs.arff')) as fh:
+ with open(
+ os.path.join(
+ self.working_directory,
+ "metadata",
+ combination,
+ "algorithm_runs.arff",
+ )
+ ) as fh:
algorithm_runs = arff.load(fh)
- self.assertEqual(algorithm_runs['attributes'],
- [('instance_id', 'STRING'),
- ('repetition', 'NUMERIC'),
- ('algorithm', 'STRING'),
- (metric_, 'NUMERIC'),
- ('runstatus',
- ['ok', 'timeout', 'memout', 'not_applicable',
- 'crash', 'other'])])
- self.assertEqual(len(algorithm_runs['data']), 1)
- self.assertEqual(len(algorithm_runs['data'][0]), 5)
- self.assertLess(algorithm_runs['data'][0][3], 0.9)
- self.assertEqual(algorithm_runs['data'][0][4], 'ok')
+ self.assertEqual(
+ algorithm_runs["attributes"],
+ [
+ ("instance_id", "STRING"),
+ ("repetition", "NUMERIC"),
+ ("algorithm", "STRING"),
+ (metric_, "NUMERIC"),
+ (
+ "runstatus",
+ [
+ "ok",
+ "timeout",
+ "memout",
+ "not_applicable",
+ "crash",
+ "other",
+ ],
+ ),
+ ],
+ )
+ self.assertEqual(len(algorithm_runs["data"]), 1)
+ self.assertEqual(len(algorithm_runs["data"][0]), 5)
+ self.assertLess(algorithm_runs["data"][0][3], 0.9)
+ self.assertEqual(algorithm_runs["data"][0][4], "ok")
def tearDown(self):
for i in range(5):
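The metadata-generation test above shells out to each pipeline script and only inspects the return code and the captured output. A minimal, self-contained sketch of that subprocess pattern; the command here is a placeholder, not one of the real scripts:

    import subprocess

    cmd = "python3 -c 'print(42)'"  # placeholder for e.g. a 01_create_commands.py invocation
    rval = subprocess.run(
        cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )
    assert rval.returncode == 0, rval.stderr.decode()
    assert rval.stdout.decode().strip() == "42"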
diff --git a/test/test_util/__init__.py b/test/test_util/__init__.py
index cc3cd7becd..e298f0f075 100644
--- a/test/test_util/__init__.py
+++ b/test/test_util/__init__.py
@@ -1,2 +1,2 @@
# -*- encoding: utf-8 -*-
-__author__ = 'feurerm'
+__author__ = "feurerm"
diff --git a/test/test_util/test_StopWatch.py b/test/test_util/test_StopWatch.py
index 14038c6820..d45ecbf55d 100644
--- a/test/test_util/test_StopWatch.py
+++ b/test/test_util/test_StopWatch.py
@@ -22,8 +22,8 @@ def test_stopwatch_overhead(self):
cpu_start = time.process_time()
watch = StopWatch()
for i in range(1, 1000):
- watch.start_task('task_%d' % i)
- watch.stop_task('task_%d' % i)
+ watch.start_task("task_%d" % i)
+ watch.stop_task("task_%d" % i)
cpu_stop = time.process_time()
stop = time.time()
dur = stop - start
@@ -36,6 +36,6 @@ def test_stopwatch_overhead(self):
self.assertLess(watch.cpu_sum(), 2 * watch.wall_sum())
-if __name__ == '__main__':
+if __name__ == "__main__":
# import sys;sys.argv = ['', 'Test.testName']
unittest.main()
diff --git a/test/test_util/test_backend.py b/test/test_util/test_backend.py
index a029aef4bb..0673370b97 100644
--- a/test/test_util/test_backend.py
+++ b/test/test_util/test_backend.py
@@ -7,48 +7,48 @@
class BackendModelsTest(unittest.TestCase):
-
class BackendStub(Backend):
-
def __init__(self):
self.__class__ = Backend
def setUp(self):
self.backend = self.BackendStub()
- self.backend.internals_directory = '/'
+ self.backend.internals_directory = "/"
- @unittest.mock.patch('pickle.load')
- @unittest.mock.patch('os.path.exists')
+ @unittest.mock.patch("pickle.load")
+ @unittest.mock.patch("os.path.exists")
def test_load_model_by_seed_and_id(self, exists_mock, pickleLoadMock):
exists_mock.return_value = False
- open_mock = unittest.mock.mock_open(read_data='Data')
+ open_mock = unittest.mock.mock_open(read_data="Data")
with unittest.mock.patch(
- 'autosklearn.automl_common.common.utils.backend.open',
+ "autosklearn.automl_common.common.utils.backend.open",
open_mock,
create=True,
):
seed = 13
idx = 17
budget = 50.0
- expected_model = self._setup_load_model_mocks(open_mock,
- pickleLoadMock,
- seed, idx, budget)
+ expected_model = self._setup_load_model_mocks(
+ open_mock, pickleLoadMock, seed, idx, budget
+ )
actual_model = self.backend.load_model_by_seed_and_id_and_budget(
- seed, idx, budget)
+ seed, idx, budget
+ )
self.assertEqual(expected_model, actual_model)
- @unittest.mock.patch('pickle.load')
- @unittest.mock.patch.object(builtins, 'open')
- @unittest.mock.patch('os.path.exists')
+ @unittest.mock.patch("pickle.load")
+ @unittest.mock.patch.object(builtins, "open")
+ @unittest.mock.patch("os.path.exists")
def test_loads_models_by_identifiers(self, exists_mock, openMock, pickleLoadMock):
exists_mock.return_value = True
seed = 13
idx = 17
budget = 50.0
expected_model = self._setup_load_model_mocks(
- openMock, pickleLoadMock, seed, idx, budget)
+ openMock, pickleLoadMock, seed, idx, budget
+ )
expected_dict = {(seed, idx, budget): expected_model}
actual_dict = self.backend.load_models_by_identifiers([(seed, idx, budget)])
@@ -57,15 +57,25 @@ def test_loads_models_by_identifiers(self, exists_mock, openMock, pickleLoadMock
self.assertDictEqual(expected_dict, actual_dict)
def _setup_load_model_mocks(self, openMock, pickleLoadMock, seed, idx, budget):
- model_path = '/runs/%s_%s_%s/%s.%s.%s.model' % (seed, idx, budget, seed, idx, budget)
- file_handler = 'file_handler'
- expected_model = 'model'
+ model_path = "/runs/%s_%s_%s/%s.%s.%s.model" % (
+ seed,
+ idx,
+ budget,
+ seed,
+ idx,
+ budget,
+ )
+ file_handler = "file_handler"
+ expected_model = "model"
fileMock = unittest.mock.MagicMock()
fileMock.__enter__.return_value = file_handler
- openMock.side_effect = \
- lambda path, flag: fileMock if path == model_path and flag == 'rb' else None
- pickleLoadMock.side_effect = lambda fh: expected_model if fh == file_handler else None
+ openMock.side_effect = (
+ lambda path, flag: fileMock if path == model_path and flag == "rb" else None
+ )
+ pickleLoadMock.side_effect = (
+ lambda fh: expected_model if fh == file_handler else None
+ )
return expected_model
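The backend test above routes the patched open and pickle.load through side_effect lambdas so that only the expected path and flag produce the stubbed objects. A tiny standalone sketch of that mocking pattern; the path and return value below are illustrative, not the backend's real layout:

    import unittest.mock

    open_mock = unittest.mock.MagicMock()
    file_handler = "file_handler"
    # Only the expected (path, flag) combination yields the stub handle.
    open_mock.side_effect = (
        lambda path, flag: file_handler
        if (path, flag) == ("/runs/x.model", "rb")
        else None
    )
    assert open_mock("/runs/x.model", "rb") == "file_handler"
    assert open_mock("/runs/x.model", "w") is None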
diff --git a/test/test_util/test_common.py b/test/test_util/test_common.py
index 740608969d..33fa4cee31 100644
--- a/test/test_util/test_common.py
+++ b/test/test_util/test_common.py
@@ -18,5 +18,5 @@ def test_check_pid(self):
self.assertFalse(exists)
-if __name__ == '__main__':
+if __name__ == "__main__":
unittest.main()
diff --git a/test/test_util/test_data.py b/test/test_util/test_data.py
index 87548b413f..2bceac804a 100644
--- a/test/test_util/test_data.py
+++ b/test/test_util/test_data.py
@@ -1,29 +1,33 @@
-from typing import Any, List, Dict, Union
-from itertools import chain
-import warnings
+from typing import Any, Dict, List, Union
-import pytest
+import warnings
+from itertools import chain
import numpy as np
import pandas as pd
+import pytest
import sklearn.datasets
from scipy.sparse import csr_matrix, spmatrix
from autosklearn.constants import (
- BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION, MULTILABEL_CLASSIFICATION,
- REGRESSION, MULTIOUTPUT_REGRESSION, CLASSIFICATION_TASKS, REGRESSION_TASKS
+ BINARY_CLASSIFICATION,
+ CLASSIFICATION_TASKS,
+ MULTICLASS_CLASSIFICATION,
+ MULTILABEL_CLASSIFICATION,
+ MULTIOUTPUT_REGRESSION,
+ REGRESSION,
+ REGRESSION_TASKS,
)
from autosklearn.util.data import (
- subsample,
+ default_dataset_compression_arg,
reduce_dataset_size_if_too_large,
reduce_precision,
reduction_mapping,
+ subsample,
supported_precision_reductions,
validate_dataset_compression_arg,
- default_dataset_compression_arg
)
-
parametrize = pytest.mark.parametrize
@@ -68,11 +72,14 @@ def test_validate_dataset_compression_arg_returns_with_memory_allocation(
assert validate_arg["methods"] == expected_methods
-@parametrize("methods", [
- ["precision"],
- ["precision", "subsample"],
- ["precision", "precision", "subsample"]
-])
+@parametrize(
+ "methods",
+ [
+ ["precision"],
+ ["precision", "subsample"],
+ ["precision", "precision", "subsample"],
+ ],
+)
def test_validate_dataset_compression_arg_returns_with_same_methods(
methods: List[str],
):
@@ -125,17 +132,14 @@ def test_validate_dataset_compression_arg_raises_error_with_bad_key(key: str):
-------
* Should raise a ValueError
"""
- bad_arg = {
- **default_dataset_compression_arg,
- key: 1337
- }
+ bad_arg = {**default_dataset_compression_arg, key: 1337}
with pytest.raises(ValueError, match=r"Unknown key"):
validate_dataset_compression_arg(bad_arg, memory_limit=10)
@parametrize("memory_allocation", ["hello", {}, [1, 2, 3]])
def test_validate_dataset_compression_arg_raises_error_with_bad_memory_allocation_type(
- memory_allocation: Any
+ memory_allocation: Any,
):
"""
Parameters
@@ -148,13 +152,15 @@ def test_validate_dataset_compression_arg_raises_error_with_bad_memory_allocatio
* Should raise a ValueError
"""
bad_arg = {"memory_allocation": memory_allocation}
- with pytest.raises(ValueError, match=r"key 'memory_allocation' must be an `int` or `float`"):
+ with pytest.raises(
+ ValueError, match=r"key 'memory_allocation' must be an `int` or `float`"
+ ):
validate_dataset_compression_arg(bad_arg, memory_limit=10)
@parametrize("memory_allocation", [-0.5, 0.0, 1.0, 1.5])
def test_validate_dataset_compression_arg_raises_error_with_bad_memory_allocation_float(
- memory_allocation: float
+ memory_allocation: float,
):
"""
Parameters
@@ -168,16 +174,17 @@ def test_validate_dataset_compression_arg_raises_error_with_bad_memory_allocatio
"""
bad_arg = {"memory_allocation": memory_allocation}
- with pytest.raises(ValueError, match=r"key 'memory_allocation' if float must be in \(0, 1\)"):
+ with pytest.raises(
+ ValueError, match=r"key 'memory_allocation' if float must be in \(0, 1\)"
+ ):
validate_dataset_compression_arg(bad_arg, memory_limit=10)
-@parametrize("memory_allocation, memory_limit", [
- (0, 10), (10, 10), (-20, 10), (20, 10)
-])
+@parametrize(
+ "memory_allocation, memory_limit", [(0, 10), (10, 10), (-20, 10), (20, 10)]
+)
def test_validate_dataset_compression_arg_raises_error_with_bad_memory_allocation_int(
- memory_allocation: int,
- memory_limit: int
+ memory_allocation: int, memory_limit: int
):
"""
Parameters
@@ -193,12 +200,16 @@ def test_validate_dataset_compression_arg_raises_error_with_bad_memory_allocatio
* Should raise a ValueError
"""
bad_arg = {"memory_allocation": memory_allocation}
- with pytest.raises(ValueError, match=r"key 'memory_allocation' if int must be in \(0,"):
+ with pytest.raises(
+ ValueError, match=r"key 'memory_allocation' if int must be in \(0,"
+ ):
validate_dataset_compression_arg(bad_arg, memory_limit=memory_limit)
@parametrize("methods", [10, {"hello", "world"}, []])
-def test_validate_dataset_compression_arg_raises_error_with_bad_methods_type(methods: Any):
+def test_validate_dataset_compression_arg_raises_error_with_bad_methods_type(
+ methods: Any,
+):
"""
Parameters
----------
@@ -214,12 +225,17 @@ def test_validate_dataset_compression_arg_raises_error_with_bad_methods_type(met
validate_dataset_compression_arg(bad_arg, memory_limit=10)
-@parametrize("methods", [
- ["bad", "worse"],
- ["precision", "kind_of_bad"],
- ["still_bad", "precision", "subsample"]
-])
-def test_validate_dataset_compression_arg_raises_error_with_bad_methods_entries(methods: Any):
+@parametrize(
+ "methods",
+ [
+ ["bad", "worse"],
+ ["precision", "kind_of_bad"],
+ ["still_bad", "precision", "subsample"],
+ ],
+)
+def test_validate_dataset_compression_arg_raises_error_with_bad_methods_entries(
+ methods: Any,
+):
"""
Parameters
----------
@@ -235,11 +251,16 @@ def test_validate_dataset_compression_arg_raises_error_with_bad_methods_entries(
validate_dataset_compression_arg(bad_arg, memory_limit=10)
-@parametrize("y", [
- np.asarray(9999 * [0] + 1 * [1]),
- np.asarray(4999 * [1] + 4999 * [2] + 1 * [3] + 1 * [4]),
- np.asarray(4999 * [[0, 1, 1]] + 4999 * [[1, 1, 0]] + 1 * [[1, 0, 1]] + 1 * [[0, 0, 0]])
-])
+@parametrize(
+ "y",
+ [
+ np.asarray(9999 * [0] + 1 * [1]),
+ np.asarray(4999 * [1] + 4999 * [2] + 1 * [3] + 1 * [4]),
+ np.asarray(
+ 4999 * [[0, 1, 1]] + 4999 * [[1, 1, 0]] + 1 * [[1, 0, 1]] + 1 * [[0, 0, 0]]
+ ),
+ ],
+)
@parametrize("random_state", list(range(5)))
def test_subsample_classification_unique_labels_stay_in_training_set(y, random_state):
n_samples = len(y)
@@ -253,32 +274,37 @@ def test_subsample_classification_unique_labels_stay_in_training_set(y, random_s
with warnings.catch_warnings():
warnings.simplefilter("ignore")
X_sampled, y_sampled = subsample(
- X, y,
+ X,
+ y,
random_state=random_state,
sample_size=sample_size,
- is_classification=True
+ is_classification=True,
)
assert X_sampled.dtype == X.dtype and y_sampled.dtype == y.dtype
assert len(y_sampled) == sample_size
- assert all(label in y_sampled for label in unique_labels), \
- f"sampled unique = {np.unique(y_sampled)}, original unique = {unique_labels}"
+ assert all(
+ label in y_sampled for label in unique_labels
+ ), f"sampled unique = {np.unique(y_sampled)}, original unique = {unique_labels}"
@parametrize("X", [np.asarray([[1, 1, 1]] * 30)])
@parametrize("x_type", [list, np.ndarray, csr_matrix, pd.DataFrame])
-@parametrize("y, task", [
- (np.asarray([0] * 15 + [1] * 15), BINARY_CLASSIFICATION),
- (np.asarray([0] * 10 + [1] * 10 + [2] * 10), MULTICLASS_CLASSIFICATION),
- (np.asarray([[1, 0, 1]] * 30), MULTILABEL_CLASSIFICATION),
- (np.asarray([1.0] * 30), REGRESSION),
- (np.asarray([[1.0, 1.0, 1.0]] * 30), MULTIOUTPUT_REGRESSION),
-])
+@parametrize(
+ "y, task",
+ [
+ (np.asarray([0] * 15 + [1] * 15), BINARY_CLASSIFICATION),
+ (np.asarray([0] * 10 + [1] * 10 + [2] * 10), MULTICLASS_CLASSIFICATION),
+ (np.asarray([[1, 0, 1]] * 30), MULTILABEL_CLASSIFICATION),
+ (np.asarray([1.0] * 30), REGRESSION),
+ (np.asarray([[1.0, 1.0, 1.0]] * 30), MULTIOUTPUT_REGRESSION),
+ ],
+)
@parametrize("y_type", [list, np.ndarray, pd.DataFrame, pd.Series])
@parametrize("random_state", [0])
@parametrize("sample_size", [0.25, 0.5, 5, 10])
def test_subsample_validity(X, x_type, y, y_type, random_state, sample_size, task):
- """ Asserts the validity of the function with all valid types
+ """Asserts the validity of the function with all valid types
We want to make sure that `subsample` works correctly with all the types listed
as x_type and y_type.
@@ -289,10 +315,10 @@ def test_subsample_validity(X, x_type, y, y_type, random_state, sample_size, tas
"""
assert len(X) == len(y) # Make sure our test data is correct
- if (
- y_type == pd.Series
- and task in [MULTILABEL_CLASSIFICATION, MULTIOUTPUT_REGRESSION]
- ):
+ if y_type == pd.Series and task in [
+ MULTILABEL_CLASSIFICATION,
+ MULTIOUTPUT_REGRESSION,
+ ]:
# We can't have a pd.Series with multiple values as it's 1 dimensional
pytest.skip("Can't have pd.Series as y when task is n-dimensional")
@@ -312,10 +338,11 @@ def convert(arr, objtype):
with warnings.catch_warnings():
warnings.simplefilter("ignore")
X_sampled, y_sampled = subsample(
- X, y,
+ X,
+ y,
random_state=random_state,
sample_size=sample_size,
- is_classification=task in CLASSIFICATION_TASKS
+ is_classification=task in CLASSIFICATION_TASKS,
)
# Function to get the type of an obj
@@ -359,9 +386,11 @@ def size(obj):
assert size(X_sampled) == sample_size
-@parametrize('X', [np.asarray([[0, 0, 1]] * 10)])
-@parametrize('dtype', supported_precision_reductions + [np.dtype('float32'), np.dtype('float64')])
-@parametrize('x_type', [np.ndarray, csr_matrix])
+@parametrize("X", [np.asarray([[0, 0, 1]] * 10)])
+@parametrize(
+ "dtype", supported_precision_reductions + [np.dtype("float32"), np.dtype("float64")]
+)
+@parametrize("x_type", [np.ndarray, csr_matrix])
def test_reduce_precision_correctly_reduces_precision(X, dtype, x_type):
X = X.astype(dtype)
if x_type == csr_matrix:
@@ -376,13 +405,13 @@ def test_reduce_precision_correctly_reduces_precision(X, dtype, x_type):
expected: Dict[type, type] = {
np.float32: np.float32,
np.float64: np.float32,
- np.dtype('float32'): np.float32,
- np.dtype('float64'): np.float32
+ np.dtype("float32"): np.float32,
+ np.dtype("float64"): np.float32,
}
- if hasattr(np, 'float96'):
+ if hasattr(np, "float96"):
expected[np.float96] = np.float64
- if hasattr(np, 'float128'):
+ if hasattr(np, "float128"):
expected[np.float128] = np.float64
assert precision == expected[dtype]
@@ -394,28 +423,40 @@ def test_reduce_precision_correctly_reduces_precision(X, dtype, x_type):
assert type(X) == type(X_reduced)
-@parametrize('X', [np.asarray([0, 0, 1]) * 10])
-@parametrize('dtype', [np.int32, np.int64, np.complex128])
+@parametrize("X", [np.asarray([0, 0, 1]) * 10])
+@parametrize("dtype", [np.int32, np.int64, np.complex128])
def test_reduce_precision_with_unsupported_dtypes(X, dtype):
X = X.astype(dtype)
with pytest.raises(ValueError) as err:
reduce_precision(X)
- expected = f"X.dtype = {X.dtype} not equal to any supported {supported_precision_reductions}"
+ expected = (
+ f"X.dtype = {X.dtype} not equal to any supported "
+ f"{supported_precision_reductions}"
+ )
+
assert err.value.args[0] == expected
-@parametrize("X", [
- np.ones((100000, 10), dtype=np.float64) # Make it big for reductions to take place
-])
+@parametrize(
+ "X",
+ [
+ np.ones(
+ (100000, 10), dtype=np.float64
+ ) # Make it big for reductions to take place
+ ],
+)
@parametrize("x_type", [csr_matrix, np.ndarray])
@parametrize("dtype", supported_precision_reductions)
-@parametrize('y, is_classification', [
- (np.ones((100000,)), True),
- (np.ones((100000,)), False),
-])
-@parametrize('memory_allocation', [0.1, 1/5.2, 1/8, 1])
-@parametrize('operations', [['precision'], ['subsample'], ['precision', 'subsample']])
+@parametrize(
+ "y, is_classification",
+ [
+ (np.ones((100000,)), True),
+ (np.ones((100000,)), False),
+ ],
+)
+@parametrize("memory_allocation", [0.1, 1 / 5.2, 1 / 8, 1])
+@parametrize("operations", [["precision"], ["subsample"], ["precision", "subsample"]])
def test_reduce_dataset_reduces_size_and_precision(
X, x_type, dtype, y, is_classification, memory_allocation, operations
):
@@ -444,13 +485,13 @@ def bytes(arr):
return arr.nbytes if isinstance(arr, np.ndarray) else arr.data.nbytes
# If we expect some precision reduction unless at float32 already
- if 'precision' in operations and dtype != np.float32:
+ if "precision" in operations and dtype != np.float32:
expected = reduction_mapping[X.dtype]
assert X_out.dtype == expected
assert bytes(X_out) < bytes(X)
# If we expect some subsampling
- if 'subsample' in operations:
+ if "subsample" in operations:
assert X_out.shape[0] < X.shape[0]
assert y_out.shape[0] < y.shape[0]
assert bytes(X_out) < bytes(X)
@@ -464,10 +505,10 @@ def test_reduce_dataset_invalid_dtype_for_precision_reduction():
reduce_dataset_size_if_too_large(
X=X,
y=X,
- operations=['precision'],
+ operations=["precision"],
memory_limit=1,
memory_allocation=0.1,
- is_classification=False
+ is_classification=False,
)
expected_err = f"Unsupported type `{X.dtype}` for precision reduction"
@@ -485,7 +526,7 @@ def test_reduce_dataset_invalid_operations():
operations=[invalid_op],
memory_limit=1,
memory_allocation=0.1,
- is_classification=False
+ is_classification=False,
)
expected_err = f"Unknown operation `{invalid_op}`"
@@ -504,13 +545,15 @@ def test_reduce_dataset_invalid_memory_allocation_float(memory_allocation: float
-------
* Should raise a ValueError
"""
- with pytest.raises(ValueError, match=r"memory_allocation if float must be in \(0, 1\)"):
+ with pytest.raises(
+ ValueError, match=r"memory_allocation if float must be in \(0, 1\)"
+ ):
reduce_dataset_size_if_too_large(
X=np.empty(1),
y=np.empty(1),
memory_limit=100,
is_classification=True,
- memory_allocation=memory_allocation
+ memory_allocation=memory_allocation,
)
@@ -526,17 +569,19 @@ def test_reduce_dataset_invalid_memory_allocation_int(memory_allocation: int):
-------
* Should raise a ValueError
"""
- with pytest.raises(ValueError, match=r"memory_allocation if int must be in \(0, memory_limit"):
+ with pytest.raises(
+ ValueError, match=r"memory_allocation if int must be in \(0, memory_limit"
+ ):
reduce_dataset_size_if_too_large(
X=np.empty(1),
y=np.empty(1),
is_classification=True,
memory_limit=100,
- memory_allocation=memory_allocation
+ memory_allocation=memory_allocation,
)
-@parametrize("memory_allocation", ["100", {'a': 1}, [100]])
+@parametrize("memory_allocation", ["100", {"a": 1}, [100]])
def test_reduce_dataset_invalid_memory_allocation_type(memory_allocation: Any):
"""
Parameters
@@ -554,25 +599,30 @@ def test_reduce_dataset_invalid_memory_allocation_type(memory_allocation: Any):
y=np.empty(1),
memory_limit=100,
is_classification=True,
- memory_allocation=memory_allocation
+ memory_allocation=memory_allocation,
)
@pytest.mark.parametrize(
- 'memory_limit,precision,task',
+ "memory_limit,precision,task",
[
(memory_limit, precision, task)
for task in chain(CLASSIFICATION_TASKS, REGRESSION_TASKS)
for precision in (float, np.float32, np.float64, np.float128)
for memory_limit in (1, 100)
- ]
+ ],
)
def test_reduce_dataset_subsampling_explicit_values(memory_limit, precision, task):
random_state = 0
fixture = {
BINARY_CLASSIFICATION: {
1: {float: 2621, np.float32: 2621, np.float64: 2621, np.float128: 1310},
- 100: {float: 12000, np.float32: 12000, np.float64: 12000, np.float128: 12000},
+ 100: {
+ float: 12000,
+ np.float32: 12000,
+ np.float64: 12000,
+ np.float128: 12000,
+ },
},
MULTICLASS_CLASSIFICATION: {
1: {float: 409, np.float32: 409, np.float64: 409, np.float128: 204},
@@ -589,7 +639,7 @@ def test_reduce_dataset_subsampling_explicit_values(memory_limit, precision, tas
MULTIOUTPUT_REGRESSION: {
1: {float: 1310, np.float32: 1310, np.float64: 1310, np.float128: 655},
100: {float: 5000, np.float32: 5000, np.float64: 5000, np.float128: 5000},
- }
+ },
}
# Create the task and data
@@ -620,12 +670,13 @@ def test_reduce_dataset_subsampling_explicit_values(memory_limit, precision, tas
with warnings.catch_warnings():
warnings.simplefilter("ignore")
X_new, y_new = reduce_dataset_size_if_too_large(
- X=X, y=y,
+ X=X,
+ y=y,
random_state=random_state,
memory_limit=memory_limit,
is_classification=task in CLASSIFICATION_TASKS,
- operations=['precision', 'subsample'],
- memory_allocation=0.1
+ operations=["precision", "subsample"],
+ memory_allocation=0.1,
)
# Assert the new number of samples
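test_data.py leans heavily on stacked @parametrize decorators; stacking them runs a test once per element of the cross-product of all parameter lists, which is why a handful of decorators above fan out into hundreds of collected cases. A minimal sketch of that behaviour, with generic names unrelated to autosklearn.util.data:

    import pytest

    @pytest.mark.parametrize("container", [list, tuple])
    @pytest.mark.parametrize("n", [1, 2, 3])
    def test_cross_product(container, n):
        # collected 2 * 3 = 6 times, once per (container, n) combination
        assert len(container(range(n))) == n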
diff --git a/test/test_util/test_dependencies.py b/test/test_util/test_dependencies.py
index 53b2285750..1c59dad51b 100644
--- a/test/test_util/test_dependencies.py
+++ b/test/test_util/test_dependencies.py
@@ -1,30 +1,31 @@
-import unittest
-import pkg_resources
import re
-
-from unittest.mock import patch, Mock
+import unittest
+from unittest.mock import Mock, patch
import numpy as np
+import pkg_resources
-from autosklearn.util.dependencies import verify_packages, MissingPackageError, \
- IncorrectPackageVersionError
+from autosklearn.util.dependencies import (
+ IncorrectPackageVersionError,
+ MissingPackageError,
+ verify_packages,
+)
-@patch('pkg_resources.get_distribution')
+@patch("pkg_resources.get_distribution")
class VerifyPackagesTests(unittest.TestCase):
-
def test_existing_package(self, getDistributionMock):
- requirement = 'package'
+ requirement = "package"
distribution_mock = unittest.mock.Mock()
getDistributionMock.return_value = distribution_mock
- distribution_mock.version = '1.0.0'
+ distribution_mock.version = "1.0.0"
verify_packages(requirement)
- getDistributionMock.assert_called_once_with('package')
+ getDistributionMock.assert_called_once_with("package")
def test_missing_package(self, getDistributionMock):
- requirement = 'package'
+ requirement = "package"
getDistributionMock.side_effect = pkg_resources.DistributionNotFound()
@@ -35,7 +36,7 @@ def test_missing_package(self, getDistributionMock):
requirement,
)
- @patch('importlib.import_module')
+ @patch("importlib.import_module")
def test_package_can_only_be_imported(self, import_mock, getDistributionMock):
getDistributionMock.side_effect = pkg_resources.DistributionNotFound()
@@ -43,60 +44,64 @@ def test_package_can_only_be_imported(self, import_mock, getDistributionMock):
package.__version__ = np.__version__
import_mock.return_value = package
- verify_packages('numpy')
+ verify_packages("numpy")
def test_correct_package_versions(self, getDistributionMock):
- requirement = 'package==0.1.2\n' \
- 'package>0.1\n' \
- 'package>=0.1'
+ requirement = "package==0.1.2\n" "package>0.1\n" "package>=0.1"
moduleMock = Mock()
- moduleMock.version = '0.1.2'
+ moduleMock.version = "0.1.2"
getDistributionMock.return_value = moduleMock
verify_packages(requirement)
- getDistributionMock.assert_called_with('package')
+ getDistributionMock.assert_called_with("package")
self.assertEqual(3, len(getDistributionMock.call_args_list))
def test_wrong_package_version(self, getDistributionMock):
- requirement = 'package>0.1.2'
+ requirement = "package>0.1.2"
moduleMock = Mock()
- moduleMock.version = '0.1.2'
+ moduleMock.version = "0.1.2"
getDistributionMock.return_value = moduleMock
self.assertRaisesRegex(
IncorrectPackageVersionError,
- re.escape("found 'package' version 0.1.2 but requires package version >0.1.2"),
+ re.escape(
+ "found 'package' version 0.1.2 but requires package version >0.1.2"
+ ),
verify_packages,
requirement,
- )
+ )
def test_outdated_requirement(self, getDistributionMock):
- requirement = 'package>=0.1'
+ requirement = "package>=0.1"
moduleMock = Mock()
- moduleMock.version = '0.0.9'
+ moduleMock.version = "0.0.9"
getDistributionMock.return_value = moduleMock
self.assertRaisesRegex(
IncorrectPackageVersionError,
- re.escape("found 'package' version 0.0.9 but requires package version >=0.1"),
+ re.escape(
+ "found 'package' version 0.0.9 but requires package version >=0.1"
+ ),
verify_packages,
requirement,
- )
+ )
def test_too_fresh_requirement(self, getDistributionMock):
- requirement = 'package==0.1.2'
+ requirement = "package==0.1.2"
moduleMock = Mock()
- moduleMock.version = '0.1.3'
+ moduleMock.version = "0.1.3"
getDistributionMock.return_value = moduleMock
self.assertRaisesRegex(
IncorrectPackageVersionError,
- re.escape("found 'package' version 0.1.3 but requires package version ==0.1.2"),
+ re.escape(
+ "found 'package' version 0.1.3 but requires package version ==0.1.2"
+ ),
verify_packages,
requirement,
- )
+ )
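The dependency tests above mock pkg_resources.get_distribution and assert that verify_packages rejects versions outside the requirement specifier. The comparison semantics come from pkg_resources requirements; a small self-contained check of those semantics, independent of verify_packages:

    import pkg_resources

    req = pkg_resources.Requirement.parse("package>0.1.2")
    assert "0.1.3" in req      # 0.1.3 satisfies >0.1.2
    assert "0.1.2" not in req  # the boundary version does not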
diff --git a/test/test_util/test_logging.py b/test/test_util/test_logging.py
index 568593c7c8..d824aecc02 100644
--- a/test/test_util/test_logging.py
+++ b/test/test_util/test_logging.py
@@ -1,47 +1,46 @@
-import os
-import unittest
import logging
import logging.config
+import os
import tempfile
-import yaml
+import unittest
+import yaml
from autosklearn.util import logging_
class LoggingTest(unittest.TestCase):
-
def test_setup_logger(self):
# Test that setup_logger function correctly configures the logger
# according to the given dictionary, and uses the default
# logging.yaml file if logging_config is not specified.
- with open(os.path.join(os.path.dirname(__file__), 'example_config.yaml'), 'r') as fh:
+ with open(
+ os.path.join(os.path.dirname(__file__), "example_config.yaml"), "r"
+ ) as fh:
example_config = yaml.safe_load(fh)
# Configure logger with example_config.yaml.
- logging_.setup_logger(logging_config=example_config,
- output_dir=tempfile.gettempdir())
+ logging_.setup_logger(
+ logging_config=example_config, output_dir=tempfile.gettempdir()
+ )
# example_config sets the root logger's level to CRITICAL,
# which corresponds to 50.
self.assertEqual(logging.getLogger().getEffectiveLevel(), 50)
# This time use the default configuration.
- logging_.setup_logger(logging_config=None,
- output_dir=tempfile.gettempdir())
+ logging_.setup_logger(logging_config=None, output_dir=tempfile.gettempdir())
# default config sets the root logger's level to DEBUG,
# which corresponds to 10.
self.assertEqual(logging.getLogger().getEffectiveLevel(), 10)
# Make sure we log to the desired directory
- logging_.setup_logger(output_dir=os.path.dirname(__file__),
- filename='test.log'
- )
+ logging_.setup_logger(output_dir=os.path.dirname(__file__), filename="test.log")
logger = logging.getLogger()
- logger.info('test_setup_logger')
+ logger.info("test_setup_logger")
- with open(os.path.join(os.path.dirname(__file__), 'test.log')) as fh:
- self.assertIn('test_setup_logger', ''.join(fh.readlines()))
- os.remove(os.path.join(os.path.dirname(__file__), 'test.log'))
+ with open(os.path.join(os.path.dirname(__file__), "test.log")) as fh:
+ self.assertIn("test_setup_logger", "".join(fh.readlines()))
+ os.remove(os.path.join(os.path.dirname(__file__), "test.log"))
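The level assertions in test_logging.py compare against raw numeric values; those numbers are simply the stdlib's standard logging level constants:

    import logging

    # CRITICAL and DEBUG are the two levels asserted above (50 and 10).
    assert logging.CRITICAL == 50
    assert logging.DEBUG == 10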
diff --git a/test/test_util/test_single_thread_client.py b/test/test_util/test_single_thread_client.py
index 34fe7736fe..770ff9f04a 100644
--- a/test/test_util/test_single_thread_client.py
+++ b/test/test_util/test_single_thread_client.py
@@ -1,8 +1,6 @@
import dask.distributed
-
-from distributed.utils_test import inc
-
import pytest
+from distributed.utils_test import inc
from autosklearn.util.single_thread_client import SingleThreadedClient
diff --git a/test/test_util/test_trials_callback.py b/test/test_util/test_trials_callback.py
index 3cda8ea204..d1bfe6b748 100644
--- a/test/test_util/test_trials_callback.py
+++ b/test/test_util/test_trials_callback.py
@@ -13,56 +13,62 @@
class AutoMLTrialsCallBack(IncorporateRunResultCallback):
-
def __init__(self, fname):
self.trials_num = 1
self.fname = fname
with open(fname, "w") as fp:
- fp.write("TrialNo, "
- "StartTime, "
- "EndTime, "
- "Status, "
- "TrainLoss, "
- "ValidLoss, "
- "TestLoss, "
- "Classifier")
+ fp.write(
+ "TrialNo, "
+ "StartTime, "
+ "EndTime, "
+ "Status, "
+ "TrainLoss, "
+ "ValidLoss, "
+ "TestLoss, "
+ "Classifier"
+ )
def __call__(
- self, smbo: 'SMBO',
- run_info: RunInfo,
- result: RunValue,
- time_left: float,
+ self,
+ smbo: "SMBO",
+ run_info: RunInfo,
+ result: RunValue,
+ time_left: float,
) -> None:
train_loss, valid_loss, test_loss = None, None, None
trial_start_time = result.starttime
trial_end_time = result.endtime
trial_status = result.status.name
if trial_status == StatusType.SUCCESS.name:
- train_loss = result.additional_info.get('train_loss')
+ train_loss = result.additional_info.get("train_loss")
valid_loss = result.cost
- test_loss = result.additional_info.get('test_loss')
- trial_classifier = run_info.config.get_dictionary()['classifier:__choice__']
+ test_loss = result.additional_info.get("test_loss")
+ trial_classifier = run_info.config.get_dictionary()["classifier:__choice__"]
with open(self.fname, "a+") as fp:
- fp.write(f"\n {self.trials_num}, {trial_start_time}, {trial_end_time}, {trial_status}, "
- f"{train_loss}, {valid_loss}, {test_loss}, {trial_classifier}")
+ fp.write(
+ f"\n {self.trials_num}, {trial_start_time}, {trial_end_time},"
+ f" {trial_status}, {train_loss}, {valid_loss}, {test_loss},"
+ f" {trial_classifier}"
+ )
self.trials_num += 1
class VerifyTrialsCallBack(unittest.TestCase):
-
def test_trials_callback_execution(self):
trials_summary_fname = os.path.join(tempfile.gettempdir(), "trials.csv")
- X_train, Y_train, X_test, Y_test = putil.get_dataset('breast_cancer')
- cls = AutoSklearnClassifier(time_left_for_this_task=30,
- initial_configurations_via_metalearning=0,
- per_run_time_limit=10,
- memory_limit=1024,
- delete_tmp_folder_after_terminate=False,
- n_jobs=1,
- include={'feature_preprocessor': ['pca'],
- 'classifier': ['sgd']},
- get_trials_callback=AutoMLTrialsCallBack(trials_summary_fname)
- )
+ X_train, Y_train, X_test, Y_test = putil.get_dataset("breast_cancer")
+ cls = AutoSklearnClassifier(
+ time_left_for_this_task=30,
+ initial_configurations_via_metalearning=0,
+ per_run_time_limit=10,
+ memory_limit=1024,
+ delete_tmp_folder_after_terminate=False,
+ n_jobs=1,
+ include={"feature_preprocessor": ["pca"], "classifier": ["sgd"]},
+ get_trials_callback=AutoMLTrialsCallBack(trials_summary_fname),
+ )
cls.fit(X_train, Y_train, X_test, Y_test)
trials = pd.read_csv(trials_summary_fname)
- assert trials.shape[0] > 0, f"Auto-Sklearn explored {trials.shape[0] - 1} trials"
+ assert (
+ trials.shape[0] > 0
+ ), f"Auto-Sklearn explored {trials.shape[0] - 1} trials"
diff --git a/testcommand.sh b/testcommand.sh
deleted file mode 100644
index 00c8fe8321..0000000000
--- a/testcommand.sh
+++ /dev/null
@@ -1,2 +0,0 @@
-#!/usr/bin/env bash
-pytest -n 3 --durations=20 --timeout=300 --dist load --timeout-method=thread --fulltrace -v $1