Skip to content
Merged
147 changes: 81 additions & 66 deletions autosklearn/experimental/askl2.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,67 +18,6 @@
from autosklearn.classification import AutoSklearnClassifier
from autosklearn.metrics import Scorer, accuracy, balanced_accuracy, log_loss, roc_auc

# Metrics for which selector training data is looked up next to this module.
metrics = (balanced_accuracy, roc_auc, log_loss)
# Metric name -> path of the pickled selector cached for that metric.
selector_files = {}
this_directory = pathlib.Path(__file__).resolve().parent

# Import-time side effect: make sure a fitted selector is cached on disk for
# every metric above, training and pickling it when the cache file is missing.
for metric in metrics:
    training_data_file = this_directory / metric.name / "askl2_training_data.json"

    # Read the file once; the same text is both parsed and hashed.
    with open(training_data_file) as fh:
        training_data_text = fh.read()
    training_data = json.loads(training_data_text)
    training_data_digest = hashlib.md5(training_data_text.encode("utf8"))
    training_data_hash = training_data_digest.hexdigest()[:10]

    # The cache file name encodes both library versions, the metric and a hash
    # of the training data, so a change in any of them maps to a new file.
    selector_filename = "askl2_selector_%s_%s_%s_%s.pkl" % (
        autosklearn.__version__,
        sklearn.__version__,
        metric.name,
        training_data_hash,
    )

    # Prefer XDG_CACHE_HOME; fall back to the user's home directory.
    selector_directory = os.environ.get("XDG_CACHE_HOME")
    if selector_directory is None:
        selector_directory = pathlib.Path.home()
    selector_directory = (
        pathlib.Path(selector_directory).joinpath("auto-sklearn").expanduser()
    )
    selector_files[metric.name] = selector_directory / selector_filename

    metafeatures = pd.DataFrame(training_data["metafeatures"])
    # NOTE: rebound on every iteration, so after the loop this holds the
    # strategies of the last metric in `metrics`.
    strategies = training_data["strategies"]
    y_values = pd.DataFrame(
        training_data["y_values"], columns=strategies, index=metafeatures.index
    )
    minima_for_methods = training_data["minima_for_methods"]
    maxima_for_methods = training_data["maxima_for_methods"]
    default_strategies = training_data["tie_break_order"]

    if not selector_files[metric.name].exists():
        selector = autosklearn.experimental.selector.OVORF(
            configuration=training_data["configuration"],
            random_state=np.random.RandomState(1),
            n_estimators=500,
            tie_break_order=default_strategies,
        )
        selector = autosklearn.experimental.selector.FallbackWrapper(
            selector, default_strategies
        )
        selector.fit(
            X=metafeatures,
            y=y_values,
            minima=minima_for_methods,
            maxima=maxima_for_methods,
        )

        # Creating the cache directory is part of the guarded section so that
        # an unwritable parent directory also produces the hint below.
        try:
            selector_files[metric.name].parent.mkdir(exist_ok=True, parents=True)
            with open(selector_files[metric.name], "wb") as fh:
                pickle.dump(selector, fh)
        except OSError:
            print(
                "AutoSklearn2Classifier needs to create a selector file under "
                "the user's home directory or XDG_CACHE_HOME. Nevertheless "
                "the path {} is not writable.".format(selector_files[metric.name])
            )
            # Bare raise keeps the original exception type and traceback.
            raise


class SmacObjectCallback:
def __init__(self, portfolio):
Expand Down Expand Up @@ -339,6 +278,7 @@ def __init__(
"classifier": include_estimators,
"feature_preprocessor": include_preprocessors,
}
self.train_selectors(selected_metric=metric)
super().__init__(
time_left_for_this_task=time_left_for_this_task,
per_run_time_limit=per_run_time_limit,
Expand Down Expand Up @@ -367,6 +307,81 @@ def __init__(
allow_string_features=allow_string_features,
)

def train_selectors(self, selected_metric=None):
    """Ensure a fitted selector is cached on disk for the needed metric(s).

    For every metric handled, the matching ``askl2_training_data.json`` is
    loaded from the directory of this module, the path of the cached selector
    is stored in ``self.selector_files`` and, if that file does not exist
    yet, a selector is fitted and pickled to it.

    Sets ``self.metrics``, ``self.selector_files``, ``self.this_directory``
    and ``self.strategies`` (the latter from the last metric processed).

    Parameters
    ----------
    selected_metric : Scorer, optional
        Metric the classifier was configured with. ``None`` prepares the
        selectors of all metrics in ``self.metrics``. A metric contained in
        ``self.metrics`` prepares only that selector. Any other metric
        prepares the ``balanced_accuracy`` selector, which is the one
        ``fit`` falls back to for such metrics.

    Raises
    ------
    OSError
        Re-raised after printing a hint when the cache directory or the
        selector file cannot be created.
    """
    # NOTE(review): a reviewer asked to rename this attribute to
    # ``selector_metrics``; it is kept as ``metrics`` here because ``fit``
    # and the tests read ``self.metrics``.
    self.metrics = (balanced_accuracy, roc_auc, log_loss)
    self.selector_files = {}
    self.this_directory = pathlib.Path(__file__).resolve().parent

    if selected_metric is None:
        metric_list = self.metrics
    elif selected_metric in self.metrics:
        metric_list = [selected_metric]
    else:
        # The metric is not one of ``self.metrics``; prepare the selector
        # that ``fit`` looks up for such metrics instead of searching for
        # training data under this metric's name.
        metric_list = [balanced_accuracy]

    for metric in metric_list:
        training_data_file = (
            self.this_directory / metric.name / "askl2_training_data.json"
        )

        # Read the file once; the same text is both parsed and hashed.
        with open(training_data_file) as fh:
            training_data_text = fh.read()
        training_data = json.loads(training_data_text)
        training_data_digest = hashlib.md5(training_data_text.encode("utf8"))
        training_data_hash = training_data_digest.hexdigest()[:10]

        # The cache file name encodes both library versions, the metric and
        # a hash of the training data.
        selector_filename = "askl2_selector_%s_%s_%s_%s.pkl" % (
            autosklearn.__version__,
            sklearn.__version__,
            metric.name,
            training_data_hash,
        )

        # Prefer XDG_CACHE_HOME; fall back to the user's home directory.
        selector_directory = os.environ.get("XDG_CACHE_HOME")
        if selector_directory is None:
            selector_directory = pathlib.Path.home()
        selector_directory = (
            pathlib.Path(selector_directory).joinpath("auto-sklearn").expanduser()
        )
        selector_file = selector_directory / selector_filename
        self.selector_files[metric.name] = selector_file

        metafeatures = pd.DataFrame(training_data["metafeatures"])
        self.strategies = training_data["strategies"]
        y_values = pd.DataFrame(
            training_data["y_values"],
            columns=self.strategies,
            index=metafeatures.index,
        )
        minima_for_methods = training_data["minima_for_methods"]
        maxima_for_methods = training_data["maxima_for_methods"]
        default_strategies = training_data["tie_break_order"]

        if selector_file.exists():
            continue

        selector = autosklearn.experimental.selector.OVORF(
            configuration=training_data["configuration"],
            random_state=np.random.RandomState(1),
            n_estimators=500,
            tie_break_order=default_strategies,
        )
        selector = autosklearn.experimental.selector.FallbackWrapper(
            selector, default_strategies
        )
        selector.fit(
            X=metafeatures,
            y=y_values,
            minima=minima_for_methods,
            maxima=maxima_for_methods,
        )

        # Creating the cache directory is part of the guarded section so
        # that an unwritable parent directory also produces the hint below.
        try:
            selector_file.parent.mkdir(exist_ok=True, parents=True)
            with open(selector_file, "wb") as fh:
                pickle.dump(selector, fh)
        except OSError:
            print(
                "AutoSklearn2Classifier needs to create a selector file under "
                "the user's home directory or XDG_CACHE_HOME. Nevertheless "
                "the path {} is not writable.".format(selector_file)
            )
            # Bare raise keeps the original exception type and traceback.
            raise

def fit(
self,
X,
Expand Down Expand Up @@ -408,20 +423,20 @@ def fit(
else:
self.metric = log_loss

if self.metric in metrics:
if self.metric in self.metrics:
metric_name = self.metric.name
selector_file = selector_files[metric_name]
selector_file = self.selector_files[metric_name]
else:
metric_name = "balanced_accuracy"
selector_file = selector_files[metric_name]
selector_file = self.selector_files[metric_name]
with open(selector_file, "rb") as fh:
selector = pickle.load(fh)

metafeatures = pd.DataFrame(
{dataset_name: [X.shape[1], X.shape[0]]}
).transpose()
selection = np.argmax(selector.predict(metafeatures))
automl_policy = strategies[selection]
automl_policy = self.strategies[selection]

setting = {
"RF_None_holdout_iterative_es_if": {
Expand Down Expand Up @@ -471,7 +486,7 @@ def fit(
resampling_strategy_kwargs = None

portfolio_file = (
this_directory
self.this_directory
/ metric_name
/ "askl2_portfolios"
/ ("%s.json" % automl_policy)
Expand Down
20 changes: 12 additions & 8 deletions test/test_estimators/test_estimators.py
Original file line number Diff line number Diff line change
Expand Up @@ -1113,27 +1113,31 @@ def test_check_estimator_signature(class_):
],
)
def test_selector_file_askl2_can_be_created(selector_path):
    """Check where AutoSklearn2Classifier places its selector cache files.

    ``os.environ.get`` is patched to return ``selector_path`` for every
    lookup, which is how the XDG_CACHE_HOME value is injected. For a path
    that is set but not writable, constructing the classifier must raise
    ``PermissionError``; otherwise the selector files must exist under the
    expected directory.
    """
    with unittest.mock.patch("os.environ.get") as mock_foo:
        mock_foo.return_value = selector_path

        if selector_path is not None and not os.access(selector_path, os.W_OK):
            with pytest.raises(PermissionError):
                importlib.reload(autosklearn.experimental.askl2)
                AutoSklearn2Classifier(
                    time_left_for_this_task=60,
                    delete_tmp_folder_after_terminate=False,
                )

        else:
            importlib.reload(autosklearn.experimental.askl2)
            automl = AutoSklearn2Classifier(
                time_left_for_this_task=60, delete_tmp_folder_after_terminate=False
            )

            # Selector bookkeeping lives on the estimator instance, so the
            # assertions read it from ``automl`` rather than from the module.
            for metric in automl.metrics:
                assert os.path.exists(automl.selector_files[metric.name])
                if selector_path is None or not os.access(selector_path, os.W_OK):
                    # We default to home in worst case
                    assert os.path.expanduser("~") in str(
                        automl.selector_files[metric.name]
                    )
                else:
                    # a dir provided via XDG_CACHE_HOME
                    assert selector_path in str(automl.selector_files[metric.name])

    # Re import it at the end so we do not affect other test
    importlib.reload(autosklearn.experimental.askl2)

Expand Down