Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
151 changes: 85 additions & 66 deletions autosklearn/experimental/askl2.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,66 +18,7 @@
from autosklearn.classification import AutoSklearnClassifier
from autosklearn.metrics import Scorer, accuracy, balanced_accuracy, log_loss, roc_auc

metrics = (balanced_accuracy, roc_auc, log_loss)
selector_files = {}
this_directory = pathlib.Path(__file__).resolve().parent
for metric in metrics:
training_data_file = this_directory / metric.name / "askl2_training_data.json"
with open(training_data_file) as fh:
training_data = json.load(fh)
fh.seek(0)
m = hashlib.md5()
m.update(fh.read().encode("utf8"))
training_data_hash = m.hexdigest()[:10]
selector_filename = "askl2_selector_%s_%s_%s_%s.pkl" % (
autosklearn.__version__,
sklearn.__version__,
metric.name,
training_data_hash,
)
selector_directory = os.environ.get("XDG_CACHE_HOME")
if selector_directory is None:
selector_directory = pathlib.Path.home()
selector_directory = (
pathlib.Path(selector_directory).joinpath("auto-sklearn").expanduser()
)
selector_files[metric.name] = selector_directory / selector_filename
metafeatures = pd.DataFrame(training_data["metafeatures"])
strategies = training_data["strategies"]
y_values = pd.DataFrame(
training_data["y_values"], columns=strategies, index=metafeatures.index
)
minima_for_methods = training_data["minima_for_methods"]
maxima_for_methods = training_data["maxima_for_methods"]
default_strategies = training_data["tie_break_order"]
if not selector_files[metric.name].exists():
selector = autosklearn.experimental.selector.OVORF(
configuration=training_data["configuration"],
random_state=np.random.RandomState(1),
n_estimators=500,
tie_break_order=default_strategies,
)
selector = autosklearn.experimental.selector.FallbackWrapper(
selector, default_strategies
)
selector.fit(
X=metafeatures,
y=y_values,
minima=minima_for_methods,
maxima=maxima_for_methods,
)
selector_files[metric.name].parent.mkdir(exist_ok=True, parents=True)

try:
with open(selector_files[metric.name], "wb") as fh:
pickle.dump(selector, fh)
except Exception as e:
print(
"AutoSklearn2Classifier needs to create a selector file under "
"the user's home directory or XDG_CACHE_HOME. Nevertheless "
"the path {} is not writable.".format(selector_files[metric.name])
)
raise e
selector_metrics = (balanced_accuracy, roc_auc, log_loss)


class SmacObjectCallback:
Expand Down Expand Up @@ -325,7 +266,7 @@ def __init__(
Not all keys returned by scikit-learn are supported yet.

""" # noqa (links are too long)

self.required_training = False # Boolean to indicate if selectors were trained.
include_estimators = [
"extra_trees",
"passive_aggressive",
Expand All @@ -339,6 +280,7 @@ def __init__(
"classifier": include_estimators,
"feature_preprocessor": include_preprocessors,
}
self.train_selectors(selected_metric=metric)
super().__init__(
time_left_for_this_task=time_left_for_this_task,
per_run_time_limit=per_run_time_limit,
Expand Down Expand Up @@ -367,6 +309,83 @@ def __init__(
allow_string_features=allow_string_features,
)

def train_selectors(self, selected_metric=None):
    """Make sure a pickled strategy selector exists on disk for each metric.

    For every metric handled, the bundled ``askl2_training_data.json`` is
    read and a cache file name is derived from the auto-sklearn version,
    the scikit-learn version, the metric name and a hash of the training
    data. If that file does not exist yet, a selector is fitted on the
    training data and pickled to it, and ``self.required_training`` is set
    to ``True``.

    Sets ``self.selector_metrics``, ``self.selector_files``,
    ``self.this_directory`` and ``self.strategies``.

    Parameters
    ----------
    selected_metric : Scorer, optional
        Metric to prepare a selector for. If ``None``, selectors for all of
        ``self.selector_metrics`` are prepared. A metric that is not one of
        ``self.selector_metrics`` has no bundled training data; in that case
        the ``balanced_accuracy`` selector is prepared, which is the
        selector ``fit`` falls back to for such metrics.

    Raises
    ------
    Exception
        Any error raised while writing the selector file is re-raised
        after a hint about the cache directory has been printed.
    """
    self.selector_metrics = (balanced_accuracy, roc_auc, log_loss)
    self.selector_files = {}
    self.this_directory = pathlib.Path(__file__).resolve().parent

    if selected_metric is None:
        metric_list = self.selector_metrics
    elif selected_metric in self.selector_metrics:
        metric_list = [selected_metric]
    else:
        # Training data only ships for the selector metrics. ``fit`` uses
        # the balanced_accuracy selector for every other metric, so that
        # is the one that has to exist.
        metric_list = [balanced_accuracy]

    # The cache location does not depend on the metric: resolve it once.
    selector_directory = os.environ.get("XDG_CACHE_HOME")
    if selector_directory is None:
        selector_directory = pathlib.Path.home()
    selector_directory = (
        pathlib.Path(selector_directory).joinpath("auto-sklearn").expanduser()
    )

    for metric in metric_list:
        training_data_file = (
            self.this_directory / metric.name / "askl2_training_data.json"
        )

        # Read the file once; the same text feeds the JSON parser and the
        # hash that ties the cached selector to this exact training data.
        with open(training_data_file) as fh:
            raw_training_data = fh.read()
        training_data = json.loads(raw_training_data)
        training_data_hash = hashlib.md5(
            raw_training_data.encode("utf8")
        ).hexdigest()[:10]

        selector_filename = "askl2_selector_%s_%s_%s_%s.pkl" % (
            autosklearn.__version__,
            sklearn.__version__,
            metric.name,
            training_data_hash,
        )
        selector_file = selector_directory / selector_filename
        self.selector_files[metric.name] = selector_file

        # NOTE: reassigned for every metric, so the value from the last
        # metric processed is the one that remains on the instance.
        self.strategies = training_data["strategies"]

        if selector_file.exists():
            # A selector for this exact configuration is already cached.
            continue

        self.required_training = True

        metafeatures = pd.DataFrame(training_data["metafeatures"])
        y_values = pd.DataFrame(
            training_data["y_values"],
            columns=self.strategies,
            index=metafeatures.index,
        )
        default_strategies = training_data["tie_break_order"]

        selector = autosklearn.experimental.selector.OVORF(
            configuration=training_data["configuration"],
            random_state=np.random.RandomState(1),
            n_estimators=500,
            tie_break_order=default_strategies,
        )
        selector = autosklearn.experimental.selector.FallbackWrapper(
            selector, default_strategies
        )
        selector.fit(
            X=metafeatures,
            y=y_values,
            minima=training_data["minima_for_methods"],
            maxima=training_data["maxima_for_methods"],
        )

        selector_file.parent.mkdir(exist_ok=True, parents=True)

        try:
            with open(selector_file, "wb") as fh:
                pickle.dump(selector, fh)
        except Exception:
            print(
                "AutoSklearn2Classifier needs to create a selector file under "
                "the user's home directory or XDG_CACHE_HOME. Nevertheless "
                "the path {} is not writable.".format(selector_file)
            )
            # Bare raise keeps the original traceback intact.
            raise

def fit(
self,
X,
Expand Down Expand Up @@ -408,20 +427,20 @@ def fit(
else:
self.metric = log_loss

if self.metric in metrics:
if self.metric in self.selector_metrics:
metric_name = self.metric.name
selector_file = selector_files[metric_name]
selector_file = self.selector_files[metric_name]
else:
metric_name = "balanced_accuracy"
selector_file = selector_files[metric_name]
selector_file = self.selector_files[metric_name]
with open(selector_file, "rb") as fh:
selector = pickle.load(fh)

metafeatures = pd.DataFrame(
{dataset_name: [X.shape[1], X.shape[0]]}
).transpose()
selection = np.argmax(selector.predict(metafeatures))
automl_policy = strategies[selection]
automl_policy = self.strategies[selection]

setting = {
"RF_None_holdout_iterative_es_if": {
Expand Down Expand Up @@ -471,7 +490,7 @@ def fit(
resampling_strategy_kwargs = None

portfolio_file = (
this_directory
self.this_directory
/ metric_name
/ "askl2_portfolios"
/ ("%s.json" % automl_policy)
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
76 changes: 68 additions & 8 deletions test/test_estimators/test_estimators.py
Original file line number Diff line number Diff line change
Expand Up @@ -1113,31 +1113,91 @@ def test_check_estimator_signature(class_):
],
)
def test_selector_file_askl2_can_be_created(selector_path):

with unittest.mock.patch("os.environ.get") as mock_foo:
mock_foo.return_value = selector_path
if selector_path is not None and not os.access(selector_path, os.W_OK):
with pytest.raises(PermissionError):
importlib.reload(autosklearn.experimental.askl2)
automl = AutoSklearn2Classifier(
time_left_for_this_task=60, delete_tmp_folder_after_terminate=False
)

else:
importlib.reload(autosklearn.experimental.askl2)
for metric in autosklearn.experimental.askl2.metrics:
assert os.path.exists(
autosklearn.experimental.askl2.selector_files[metric.name]
)
automl = AutoSklearn2Classifier(
time_left_for_this_task=60, delete_tmp_folder_after_terminate=False
)
for metric in automl.selector_metrics:
assert os.path.exists(automl.selector_files[metric.name])
if selector_path is None or not os.access(selector_path, os.W_OK):
# We default to home in worst case
assert os.path.expanduser("~") in str(
autosklearn.experimental.askl2.selector_files[metric.name]
automl.selector_files[metric.name]
)
else:
# a dir provided via XDG_CACHE_HOME
assert selector_path in str(
autosklearn.experimental.askl2.selector_files[metric.name]
)
assert selector_path in str(automl.selector_files[metric.name])
# Re import it at the end so we do not affect other test
importlib.reload(autosklearn.experimental.askl2)


@pytest.mark.parametrize(
    "metric",
    list(autosklearn.experimental.askl2.selector_metrics),
)
def test_askl2_fits_selector_for_given_metrics_at_init(tmp_path, metric):
    """A selector is trained once for the given metric and then reused.

    The first classifier is created against an empty cache directory and
    must therefore train and store exactly one selector. A second classifier
    with the same metric must find that file and skip training.
    """
    # The cache directory must start out empty so the first instance trains.
    assert tmp_path.is_dir()
    assert len(list(tmp_path.iterdir())) == 0
    temp_dir = str(tmp_path)

    with unittest.mock.patch("os.environ.get") as mock_environ_get:
        # Redirect the XDG_CACHE_HOME lookup to the temporary directory.
        mock_environ_get.return_value = temp_dir

        automl = AutoSklearn2Classifier(
            time_left_for_this_task=60,
            delete_tmp_folder_after_terminate=False,
            metric=metric,
        )
        # Only the selector for the requested metric should have been created.
        assert len(automl.selector_files) == 1
        assert os.path.exists(str(automl.selector_files[metric.name]))
        # Nothing was cached beforehand, so training must have happened.
        assert automl.required_training

        # A second object with the same metric must reuse the cached
        # selector instead of training it again.
        automl_1 = AutoSklearn2Classifier(
            time_left_for_this_task=60,
            delete_tmp_folder_after_terminate=False,
            metric=metric,
        )
        assert len(automl_1.selector_files) == 1
        assert os.path.exists(str(automl_1.selector_files[metric.name]))
        # Both instances must resolve to the very same cache file.
        assert automl_1.selector_files[metric.name] == (
            automl.selector_files[metric.name]
        )
        assert not automl_1.required_training


def test_askl2_fit_when_no_metric_specified(tmp_path):
    """Without a metric, one selector file per selector metric is created."""
    # The cache directory handed to the classifier has to start out empty.
    assert tmp_path.is_dir()
    assert not list(tmp_path.iterdir())
    cache_dir = str(tmp_path)

    # Every environment lookup, including XDG_CACHE_HOME, yields the
    # temporary directory while the classifier is constructed.
    with unittest.mock.patch("os.environ.get", return_value=cache_dir):
        classifier = AutoSklearn2Classifier(
            delete_tmp_folder_after_terminate=False,
            time_left_for_this_task=60,
        )
        created_files = classifier.selector_files
        assert len(created_files) == 3
        for selector_metric in classifier.selector_metrics:
            selector_path = str(created_files[selector_metric.name])
            assert os.path.exists(selector_path)


def test_check_askl2_same_arguments_as_askl() -> None:
"""Check the asklearn2 has the same args as asklearn1

Expand Down