diff --git a/.codecov.yml b/.codecov.yml
index 5fb5a76bb5..fcf754bf7d 100755
--- a/.codecov.yml
+++ b/.codecov.yml
@@ -1,42 +1,46 @@
 #see https://github.com/codecov/support/wiki/Codecov-Yaml
 codecov:
-  notify:
-    require_ci_to_pass: yes
+  require_ci_to_pass: yes
 coverage:
-  precision: 2 # 2 = xx.xx%, 0 = xx%
-  round: nearest # how coverage is rounded: down/up/nearest
-  range: 10...90 # custom range of coverage colors from red -> yellow -> green
+
+  # 2 = xx.xx%, 0 = xx%
+  precision: 2
+
+  # https://docs.codecov.com/docs/commit-status
   status:
-    # https://codecov.readme.io/v1.0/docs/commit-status
+
+    # We want our total main project to always remain above 87% coverage, a
+    # drop of 0.20% is allowed. It should fail if coverage couldn't be uploaded
+    # or if the CI fails otherwise
     project:
       default:
-        against: auto
-        target: 70% # specify the target coverage for each commit status
-        threshold: 50% # allow this little decrease on project
-        # https://github.com/codecov/support/wiki/Filtering-Branches
-        # branches: master
+        target: 87%
+        threshold: 0.20%
+        if_not_found: failure
         if_ci_failed: error
-    # https://github.com/codecov/support/wiki/Patch-Status
+
+    # The code changed by a PR should have 90% coverage. This is different from the
+    # overall number shown above.
+    # This encourages small PRs as they are easier to test.
     patch:
       default:
-        against: auto
-        target: 30% # specify the target "X%" coverage to hit
-        threshold: 50% # allow this much decrease on patch
-        changes: false
+        target: 90%
+        if_not_found: failure
+        if_ci_failed: error
+# We upload additional information on branching with pytest-cov `--cov-branch`
+# This information can be used by codecov.com to improve its analysis of the code
 parsers:
   gcov:
     branch_detection:
       conditional: true
       loop: true
+      method: true
       macro: false
-      method: false
-  javascript:
-    enable_partials: false
+
 comment:
-  layout: header, diff
+  layout: diff, reach
+  behavior: default
   require_changes: false
-  behavior: default # update if exists else create new
-  branches: *
\ No newline at end of file
diff --git a/.flake8 b/.flake8
index 704b67af15..f3a26a3b56 100644
--- a/.flake8
+++ b/.flake8
@@ -1,7 +1,12 @@
 [flake8]
-max-line-length = 100
 show-source = True
-application-import-names = autosklearn
-exclude =
+max-line-length = 88
+extend-exclude = venv
+    .venv
     build
+extend-ignore =
+    # No whitespace before ':' in [x : y]
+    E203
+    # No lambdas — too strict
+    E731
diff --git a/.github/workflows/benchmarking-files/regressions-util.py b/.github/workflows/benchmarking-files/regressions-util.py
index c18635ca43..5128d0bbbe 100644
--- a/.github/workflows/benchmarking-files/regressions-util.py
+++ b/.github/workflows/benchmarking-files/regressions-util.py
@@ -7,34 +7,36 @@
 import numpy as np
 import pandas as pd
-CLASSIFICATION_METRICS = ['acc', 'auc', 'balacc', 'logloss']
-REGRESSION_METRICS = ['mae', 'r2', 'rmse']
+CLASSIFICATION_METRICS = ["acc", "auc", "balacc", "logloss"]
+REGRESSION_METRICS = ["mae", "r2", "rmse"]
 METRICS = CLASSIFICATION_METRICS + REGRESSION_METRICS
+
 def _get_mean_results_across_folds(df) -> pd.DataFrame:
-    """ Returns a dataframe with the task, id, metric and the mean values
-    across folds
+    """Returns a dataframe with the task, id, metric and the mean values
+    across folds
-    [idx: 'task', 'id', 'metric', ... mean metrics across folds ...]
+    [idx: 'task', 'id', 'metric', ... mean metrics across folds ...]
""" # Get the information about id and metric, only need info from first fold # [idx: task, id, metric] - df_info = df[df['fold'] == 0][['task', 'id', 'metric']].set_index('task') + df_info = df[df["fold"] == 0][["task", "id", "metric"]].set_index("task") # [idx: task, ... mean metrics across folds ...] available_metrics = list(set(METRICS).intersection(set(df.columns))) - df_means = df[['task'] + available_metrics].groupby(['task']).mean() + df_means = df[["task"] + available_metrics].groupby(["task"]).mean() return df_info.join(df_means) + def generate_framework_def( user_dir: str, username: str, branch: str, commit: str, # Not used in this setup but perhaps in a different one ): - """ Creates a framework definition to run an autosklearn repo. + """Creates a framework definition to run an autosklearn repo. Technically we only use the commit to pull the targeted version but for naming consistency, we need to know the branch too. @@ -61,40 +63,44 @@ def generate_framework_def( or #8b474a437ce980bd0909db59141b40d56f6d5688 """ - assert len(commit) == 41 and commit[0] == '#' or len(commit) == 40, \ - "Not a commit hash" + assert ( + len(commit) == 41 and commit[0] == "#" or len(commit) == 40 + ), "Not a commit hash" # automlbenchmark requires the '#' to identify it's a commit rather than # a branch being targeted - if commit[0] != '#': - commit = '#' + commit + if commit[0] != "#": + commit = "#" + commit # Tried commit and ssh repo but was getting errors with ssh # Tried commit and https but getting issues with commit ref # Using branch and https version = branch - repo = f'https://github.com/{username}/auto-sklearn.git' + repo = f"https://github.com/{username}/auto-sklearn.git" # Create the framework file - lines = '\n'.join([ - f"---", - f"autosklearn_targeted:", - f" extends: autosklearn", - f" version: '{version}'", - f" repo: '{repo}'" - ]) - - filepath = os.path.join(user_dir, 'frameworks.yaml') - with open(filepath, 'w') as f: + lines = "\n".join( + [ + f"---", + f"autosklearn_targeted:", + f" extends: autosklearn", + f" version: '{version}'", + f" repo: '{repo}'", + ] + ) + + filepath = os.path.join(user_dir, "frameworks.yaml") + with open(filepath, "w") as f: f.writelines(lines) + def create_comparison( baseline_csv_classification: str, baseline_csv_regression: str, targeted_csv_classification: str, targeted_csv_regression: str, ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: - """ Creates a csv with comparisons between the baseline and results. + """Creates a csv with comparisons between the baseline and results. Scores are averaged across folds for a given task. 
@@ -143,28 +149,33 @@ def create_comparison( df_targeted_means = _get_mean_results_across_folds(df_targeted) # Find the set intersection of tasks they have in common - common_tasks = set(df_baseline_means.index).intersection(set(df_targeted_means.index)) + common_tasks = set(df_baseline_means.index).intersection( + set(df_targeted_means.index) + ) # Find the set of metrics that are comparable baseline_metrics = set(METRICS).intersection(set(df_baseline_means.columns)) common_metrics = baseline_metrics.intersection(set(df_targeted_means.columns)) # Calculate the differences for in common tasks, across all available metrics - df_differences = df_targeted_means.loc[common_tasks][common_metrics] \ + df_differences = ( + df_targeted_means.loc[common_tasks][common_metrics] - df_baseline_means.loc[common_tasks][common_metrics] + ) # Get the metric used for training and the dataset id of common tasks - df_info = df_baseline_means.loc[common_tasks][['id', 'metric']] + df_info = df_baseline_means.loc[common_tasks][["id", "metric"]] # Join together the info and the differences return df_baseline_means, df_targeted_means, df_info.join(df_differences) + def create_comparisons_markdown( baseline_means_csv: str, targeted_means_csv: str, compared_means_csv: str, ) -> str: - """ Creates markdown that can be posted to Github that shows + """Creates markdown that can be posted to Github that shows a comparison between the baseline and the targeted branch. Parameters @@ -186,36 +197,40 @@ def create_comparisons_markdown( """ # Create colours and func to create the markdown for it colours = { - 'Worse': ['353536', '800000', 'bd0000', 'ff0000'], - 'Better': ['353536', '306300', '51a800', '6fe600'], - 'Good': '6fe600', - 'Bad': 'ff0000', - 'Neutral': '353536', - 'NaN': '52544f', + "Worse": ["353536", "800000", "bd0000", "ff0000"], + "Better": ["353536", "306300", "51a800", "6fe600"], + "Good": "6fe600", + "Bad": "ff0000", + "Neutral": "353536", + "NaN": "52544f", } + def colour(kind, scale=None): c = colours[kind] if scale is None else colours[kind][scale] - return f'![#{c}](https://via.placeholder.com/15/{c}/000000?text=+)' + return f"![#{c}](https://via.placeholder.com/15/{c}/000000?text=+)" # Metrics, whether positive is better and the tolerances between each # Neutral, kind of good/bad, very good/bad etc... 
metric_tolerances = { - 'acc': { 'positive_is_better': True ,'tol': [0.001, 0.01, 0.2] }, - 'auc':{ 'positive_is_better': True ,'tol': [0.001, 0.01, 0.2] }, - 'balacc': { 'positive_is_better': True ,'tol': [0.001, 0.01, 0.2] }, - 'logloss': { 'positive_is_better': False ,'tol': [0.009, 0.01, 0.2] }, - 'mae': { 'positive_is_better': False ,'tol': [0.001, 0.01, 0.2] }, - 'r2': { 'positive_is_better': True ,'tol': [0.001, 0.01, 0.2] }, - 'rmse': { 'positive_is_better': False ,'tol': [0.001, 0.01, 0.2] }, + "acc": {"positive_is_better": True, "tol": [0.001, 0.01, 0.2]}, + "auc": {"positive_is_better": True, "tol": [0.001, 0.01, 0.2]}, + "balacc": {"positive_is_better": True, "tol": [0.001, 0.01, 0.2]}, + "logloss": {"positive_is_better": False, "tol": [0.009, 0.01, 0.2]}, + "mae": {"positive_is_better": False, "tol": [0.001, 0.01, 0.2]}, + "r2": {"positive_is_better": True, "tol": [0.001, 0.01, 0.2]}, + "rmse": {"positive_is_better": False, "tol": [0.001, 0.01, 0.2]}, } + def is_good(score, metric): return ( - score > 0 and metric_tolerances[metric]['positive_is_better'] - or score < 0 and not metric_tolerances[metric]['positive_is_better'] + score > 0 + and metric_tolerances[metric]["positive_is_better"] + or score < 0 + and not metric_tolerances[metric]["positive_is_better"] ) def is_neutral(diff, baseline_score, metric): - tolerance = metric_tolerances[metric]['tol'][0] + tolerance = metric_tolerances[metric]["tol"][0] if baseline_score == 0: baseline_score = 1e-10 prc_diff = diff / baseline_score @@ -223,18 +238,18 @@ def is_neutral(diff, baseline_score, metric): def tolerance_colour(baseline_value, comparison_value, metric): if np.isnan(baseline_value) or np.isnan(comparison_value): - return colour('NaN') + return colour("NaN") if baseline_value == 0: baseline_value = 1e-10 prc_diff = comparison_value / baseline_value - tolerances = metric_tolerances[metric]['tol'] - if metric_tolerances[metric]['positive_is_better']: - diff_color = 'Better' if prc_diff > 0 else 'Worse' + tolerances = metric_tolerances[metric]["tol"] + if metric_tolerances[metric]["positive_is_better"]: + diff_color = "Better" if prc_diff > 0 else "Worse" else: - diff_color = 'Better' if prc_diff < 0 else 'Worse' + diff_color = "Better" if prc_diff < 0 else "Worse" if abs(prc_diff) < tolerances[0]: return colour(diff_color, 0) @@ -245,23 +260,24 @@ def tolerance_colour(baseline_value, comparison_value, metric): else: return colour(diff_color, 3) - legend = { - 'B': 'Baseline', - 'T': 'Target Version', - '**Bold**': 'Training Metric', - '/': 'Missing Value', - '---': 'Missing Task' + "B": "Baseline", + "T": "Target Version", + "**Bold**": "Training Metric", + "/": "Missing Value", + "---": "Missing Task", } - legend.update({ - key: colour(key) - for key in set(colours.keys()) - set(['Worse', 'Better', 'Good', 'Bad']) - }) + legend.update( + { + key: colour(key) + for key in set(colours.keys()) - set(["Worse", "Better", "Good", "Bad"]) + } + ) # Worse and better are handled seperatly - compared = pd.read_csv(compared_means_csv, index_col='task') - baseline = pd.read_csv(baseline_means_csv, index_col='task') - targeted = pd.read_csv(targeted_means_csv, index_col='task') + compared = pd.read_csv(compared_means_csv, index_col="task") + baseline = pd.read_csv(baseline_means_csv, index_col="task") + targeted = pd.read_csv(targeted_means_csv, index_col="task") # Some things to keep track of for the textual summary n_performed_equally = 0 @@ -269,9 +285,9 @@ def tolerance_colour(baseline_value, comparison_value, metric): 
n_performed_worse = 0 n_could_not_compare = 0 - headers = ['task', 'metric'] + METRICS - table_header = '|'.join(headers) - seperator = '|'.join(len(headers) * ['---']) + headers = ["task", "metric"] + METRICS + table_header = "|".join(headers) + seperator = "|".join(len(headers) * ["---"]) lines = [table_header, seperator] @@ -279,13 +295,13 @@ def tolerance_colour(baseline_value, comparison_value, metric): # The chosen metric name and the csv column differ with neg_logloss and # logloss - training_metric = baseline.loc[task]['metric'] + training_metric = baseline.loc[task]["metric"] if training_metric == "neg_logloss": training_metric = "logloss" # The baseline has tasks we can't compare with if task not in compared.index: - line = '|'.join([task, training_metric] + len(METRICS) * ['---']) + line = "|".join([task, training_metric] + len(METRICS) * ["---"]) lines.append(line) # We can compare for a given tasks @@ -299,34 +315,23 @@ def tolerance_colour(baseline_value, comparison_value, metric): # If the metric does not exist for either, do fill it in as # missing - if ( - metric not in baseline.columns - and metric not in compared.columns - ): + if metric not in baseline.columns and metric not in compared.columns: n_could_not_compare += 1 - entry = '/' + entry = "/" # If the metric exists in the baseline but not in the comparison - elif ( - metric in baseline.columns - and not metric in compared.columns - ): + elif metric in baseline.columns and not metric in compared.columns: n_could_not_compare += 1 - entry = '
'.join([ - f' B : {baseline.loc[task][metric]:.3f}', - f' T : /' - ]) + entry = "
".join( + [f" B : {baseline.loc[task][metric]:.3f}", f" T : /"] + ) # If the metric exists in the comparison but not in the baseline - elif ( - metric in compared.columns - and not metric in baseline.columns - ): + elif metric in compared.columns and not metric in baseline.columns: n_could_not_compare += 1 - entry = '
'.join([ - f' B : /', - f' T : {targeted.loc[task][metric]:.3f}' - ]) + entry = "
".join( + [f" B : /", f" T : {targeted.loc[task][metric]:.3f}"] + ) # The metric must exist in both else: @@ -339,37 +344,43 @@ def tolerance_colour(baseline_value, comparison_value, metric): else: n_performed_worse += 1 - diff_colour = tolerance_colour(baseline_score, - compared_score, - metric) - entry = '
'.join([ - f' B : {baseline.loc[task][metric]:.3f}', - f' T : {targeted.loc[task][metric]:.3f}', - f'{diff_colour}: {compared.loc[task][metric]:.3f}' - ]) + diff_colour = tolerance_colour( + baseline_score, compared_score, metric + ) + entry = "
".join( + [ + f" B : {baseline.loc[task][metric]:.3f}", + f" T : {targeted.loc[task][metric]:.3f}", + f"{diff_colour}: {compared.loc[task][metric]:.3f}", + ] + ) # Make the training metric entry bold if metric == training_metric: - entry = '' + entry + '' + entry = "" + entry + "" entries.append(entry) # Create the line - line = '|'.join([task, training_metric] + entries) + line = "|".join([task, training_metric] + entries) lines.append(line) # Create the legend line score_scale = { - 'worse': "".join(colour('Worse', scale) for scale in range(len(colours['Worse']) - 1, 0, -1)), - 'better': "".join(colour('Better', scale) for scale in range(len(colours['Better']))) + "worse": "".join( + colour("Worse", scale) for scale in range(len(colours["Worse"]) - 1, 0, -1) + ), + "better": "".join( + colour("Better", scale) for scale in range(len(colours["Better"])) + ), } score_scale = f'worse {score_scale["worse"] + score_scale["better"]} better' - legend_str = '\t\t\t||\t\t'.join([score_scale] + [ - f'{key} - {text}' for key, text in legend.items() - ]) + legend_str = "\t\t\t||\t\t".join( + [score_scale] + [f"{key} - {text}" for key, text in legend.items()] + ) - lines.append('') + lines.append("") lines.append(legend_str) # Create a textual summary to go at the top @@ -377,7 +388,6 @@ def tolerance_colour(baseline_value, comparison_value, metric): compared_tasks = list(compared.index) non_compared_tasks = list(set(baseline.index) - set(compared_tasks)) - # Populate info about each metric per_metric_info = {} for metric in compared_metrics: @@ -387,36 +397,37 @@ def tolerance_colour(baseline_value, comparison_value, metric): item_colour = "" if is_neutral(compared_average, baseline_average, metric): - item_colour = colour('Neutral') + item_colour = colour("Neutral") elif is_good(compared_average, metric): - item_colour = colour('Good') + item_colour = colour("Good") else: - item_colour = colour('Bad') + item_colour = colour("Bad") per_metric_info[metric] = { - 'average': compared_average, - 'n_compared': n_compared, - 'colour': item_colour + "average": compared_average, + "n_compared": n_compared, + "colour": item_colour, } - summary = '\n'.join([ - f"# Results", - f"Overall the targeted versions performance across {len(compared_tasks)} task(s) and {len(compared_metrics)} metric(s)", - f"", - f"* Equally on {n_performed_equally} comparisons", - f"* Better on {n_performed_better} comparisons", - f"* Worse on {n_performed_worse} comparisons", - f"", - f"There were {len(non_compared_tasks)} task(s) that could not be compared.", - f"", - f"The average change for each metric is:" - f"" - ] + [ + summary = "\n".join( + [ + f"# Results", + f"Overall the targeted versions performance across {len(compared_tasks)} task(s) and {len(compared_metrics)} metric(s)", + f"", + f"* Equally on {n_performed_equally} comparisons", + f"* Better on {n_performed_better} comparisons", + f"* Worse on {n_performed_worse} comparisons", + f"", + f"There were {len(non_compared_tasks)} task(s) that could not be compared.", + f"", + f"The average change for each metric is:" f"", + ] + + [ f"* {metric}: {info['colour']} {info['average']:.4f} across {info['n_compared']} task(s)" for metric, info in per_metric_info.items() ] ) - return '\n'.join([summary] + [""] + lines) + return "\n".join([summary] + [""] + lines) if __name__ == "__main__": @@ -424,30 +435,30 @@ def tolerance_colour(baseline_value, comparison_value, metric): # Generates a framework definition for automlbenchmark so that we can target # auto-sklearn versions that are 
not our own - parser.add_argument('--generate-framework-def', action='store_true') - parser.add_argument('--user-dir', type=str) - parser.add_argument('--owner', type=str) - parser.add_argument('--branch', type=str) - parser.add_argument('--commit', type=str) + parser.add_argument("--generate-framework-def", action="store_true") + parser.add_argument("--user-dir", type=str) + parser.add_argument("--owner", type=str) + parser.add_argument("--branch", type=str) + parser.add_argument("--commit", type=str) # For comparing results generated by automlbenchmark for: # -> baseline results generated [--baseline-csv] # -> targeted results generated [--target-csv] - # by automlbenchmark and the target branch 'results' generated - parser.add_argument('--compare-results', action='store_true') - parser.add_argument('--baseline-csv-classification', type=str) - parser.add_argument('--baseline-csv-regression', type=str) - parser.add_argument('--targeted-csv-classification', type=str) - parser.add_argument('--targeted-csv-regression', type=str) - parser.add_argument('--baseline-means-to', type=str) - parser.add_argument('--targeted-means-to', type=str) - parser.add_argument('--compared-means-to', type=str) + # by automlbenchmark and the target branch 'results' generated + parser.add_argument("--compare-results", action="store_true") + parser.add_argument("--baseline-csv-classification", type=str) + parser.add_argument("--baseline-csv-regression", type=str) + parser.add_argument("--targeted-csv-classification", type=str) + parser.add_argument("--targeted-csv-regression", type=str) + parser.add_argument("--baseline-means-to", type=str) + parser.add_argument("--targeted-means-to", type=str) + parser.add_argument("--compared-means-to", type=str) # For generating markdown that can be posted to github that shows the results - parser.add_argument('--generate-markdown', action='store_true') - parser.add_argument('--compared-means-csv', type=str) - parser.add_argument('--baseline-means-csv', type=str) - parser.add_argument('--targeted-means-csv', type=str) + parser.add_argument("--generate-markdown", action="store_true") + parser.add_argument("--compared-means-csv", type=str) + parser.add_argument("--baseline-means-csv", type=str) + parser.add_argument("--targeted-means-csv", type=str) args = parser.parse_args() @@ -459,11 +470,17 @@ def tolerance_colour(baseline_value, comparison_value, metric): elif args.compare_results: - assert all([ - args.baseline_csv_classification, args.baseline_csv_regression, - args.targeted_csv_classification, args.baseline_csv_regression, - args.baseline_means_to, args.targeted_means_to, args.compared_means_to - ]) + assert all( + [ + args.baseline_csv_classification, + args.baseline_csv_regression, + args.targeted_csv_classification, + args.baseline_csv_regression, + args.baseline_means_to, + args.targeted_means_to, + args.compared_means_to, + ] + ) baseline_means, targeted_means, compared_means = create_comparison( baseline_csv_classification=args.baseline_csv_classification, @@ -480,9 +497,9 @@ def tolerance_colour(baseline_value, comparison_value, metric): df.to_csv(path) elif args.generate_markdown: - assert all([ - args.baseline_means_csv, args.targeted_means_csv, args.compared_means_csv - ]) + assert all( + [args.baseline_means_csv, args.targeted_means_csv, args.compared_means_csv] + ) comparisons_table = create_comparisons_markdown( baseline_means_csv=args.baseline_means_csv, diff --git a/.github/workflows/black_checker.yml b/.github/workflows/black_checker.yml deleted file mode 
100644 index fac1723682..0000000000 --- a/.github/workflows/black_checker.yml +++ /dev/null @@ -1,47 +0,0 @@ -name: black-format-check - -on: - # Manually triggerable in github - workflow_dispatch: - - # When a push occurs on either of these branches - push: - branches: - - master - - development - - # When a push occurs on a PR that targets these branches - pull_request: - branches: - - master - - development - -env: - #If STRICT is set to true, it will fail on black check fail - STRICT: false - -jobs: - - black-format-check: - runs-on: ubuntu-latest - steps: - - - name: Checkout - uses: actions/checkout@v2 - with: - submodules: recursive - - - name: Setup Python 3.7 - uses: actions/setup-python@v2 - with: - python-version: "3.7" - - - name: Install black - run: | - pip install black - - - name: Run Black Check - run: | - black --check --diff --line-length 100 ./autosklearn || ! $STRICT - black --check --diff --line-length 100 ./test || ! $STRICT - black --check --diff --line-length 100 ./examples|| ! $STRICT diff --git a/.github/workflows/isort_checker.yml b/.github/workflows/isort_checker.yml deleted file mode 100644 index 4f1f03f5a8..0000000000 --- a/.github/workflows/isort_checker.yml +++ /dev/null @@ -1,45 +0,0 @@ -name: isort-check - -on: - # Manually triggerable in github - workflow_dispatch: - - # When a push occurs on either of these branches - push: - branches: - - master - - development - - # When a push occurs on a PR that targets these branches - pull_request: - branches: - - master - - development - -env: - #If STRICT is set to true, it will fail on isort check fail - STRICT: false - -jobs: - - isort-format-check: - runs-on: ubuntu-latest - steps: - - - name: Checkout - uses: actions/checkout@v2 - with: - submodules: recursive - - - name: Setup Python 3.7 - uses: actions/setup-python@v2 - with: - python-version: "3.7" - - - name: Install isort - run: | - pip install isort - - - name: Run isort Check - run: | - isort --check-only autosklearn || ! $STRICT diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 91b5bbdf54..4c8a59dc4e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,57 +1,62 @@ +# If you see me, please update my `rev` field using the provided links +# Click the repo and update to latest tags. 
+# If things break on update, raise an issue repos: + + - repo: https://github.com/pycqa/isort + rev: 5.10.1 + hooks: + - id: isort + name: isort imports autosklearn + files: autosklearn/.* + args: [--check] + + - id: isort + name: isort imports test + files: test/.* + args: [--check] + + - repo: https://github.com/ambv/black + rev: 22.1.0 + hooks: + - id: black + name: black formatter autosklearn + files: autosklearn/.* + args: [--check] + + - id: black + name: black formatter test + files: test/.* + args: [--check] + + - id: black + name: black formatter examples + files: examples/.* + args: [--check] + + # This is disabled as most modules fail this + - repo: https://github.com/pycqa/pydocstyle + rev: 6.1.1 + hooks: + - id: pydocstyle + files: DISABLED # autosklearn/.* + always_run: false + additional_dependencies: ["toml"] # Needed to parse pyproject.toml + - repo: https://github.com/pre-commit/mirrors-mypy - rev: v0.761 + rev: v0.930 hooks: - id: mypy - args: [--show-error-codes] - name: mypy auto-sklearn-ensembles - files: autosklearn/ensembles - - id: mypy - args: [--show-error-codes] - name: mypy auto-sklearn-metrics - files: autosklearn/metrics - - id: mypy - args: [--show-error-codes] - name: mypy auto-sklearn-data - files: autosklearn/data - - id: mypy - args: [--show-error-codes] - name: mypy auto-sklearn-util - files: autosklearn/util - - id: mypy - args: [--show-error-codes] - name: mypy auto-sklearn-evaluation - files: autosklearn/evaluation - - id: mypy - args: [--show-error-codes] - name: mypy auto-sklearn-datapreprocessing - files: autosklearn/pipeline/components/data_preprocessing/ + name: mypy auto-sklearn + files: autosklearn/.* + - repo: https://gitlab.com/pycqa/flake8 - rev: 3.8.3 + rev: 4.0.1 hooks: - id: flake8 - name: flake8 auto-sklearn + name: flake8 autosklearn files: autosklearn/.* + - id: flake8 - name: flake8 file-order-data - files: autosklearn/data - additional_dependencies: - - flake8-import-order - - id: flake8 - name: flake8 file-order-ensemble - files: autosklearn/ensembles - additional_dependencies: - - flake8-import-order - - id: flake8 - name: flake8 file-order-metrics - files: autosklearn/metrics - additional_dependencies: - - flake8-import-order - - id: flake8 - name: flake8 file-order-util - files: autosklearn/util - additional_dependencies: - - flake8-import-order - - id: flake8 - name: flake8 autosklearn-test + name: flake8 test files: test/.* diff --git a/Makefile b/Makefile index b00a2392ec..495b1c2f24 100644 --- a/Makefile +++ b/Makefile @@ -1,32 +1,122 @@ -# simple makefile to simplify repetitive build env management tasks under posix +# NOTE: Used on linux, limited support outside of Linux +# +# A simple makefile to help with small tasks related to development of autosklearn +# These have been configured to only really run short tasks. Longer form tasks +# are usually completed in github actions. 
+ +.PHONY: help install-dev check format pre-commit clean clean-doc clean-build build doc links examples publish test + +help: + @echo "Makefile autosklearn" + @echo "* install-dev to install all dev requirements and install pre-commit" + @echo "* check to check the source code for issues" + @echo "* format to format the code with black and isort" + @echo "* pre-commit to run the pre-commit check" + @echo "* clean to clean the dist and doc build files" + @echo "* build to build a dist" + @echo "* doc to generate and view the html files" + @echo "* linkcheck to check the documentation links" + @echo "* examples to run and generate the examples" + @echo "* publish to help publish the current branch to pypi" + @echo "* test to run the tests" PYTHON ?= python CYTHON ?= cython PYTEST ?= python -m pytest CTAGS ?= ctags +PIP ?= python -m pip +MAKE ?= make +BLACK ?= black +ISORT ?= isort +PYDOCSTYLE ?= pydocstyle +MYPY ?= mypy +PRECOMMIT ?= pre-commit +FLAKE8 ?= flake8 + +DIR := ${CURDIR} +DIST := ${CURDIR}/dist +DOCDIR := ${DIR}/doc +INDEX_HTML := file://${DOCDIR}/html/build/index.html + +install-dev: + $(PIP) install -e ".[test,examples,docs]" + pre-commit install + +check-black: + $(BLACK) autosklearn examples test --check || : + +check-isort: + $(ISORT) autosklearn test --check || : + +check-pydocstyle: + $(PYDOCSTYLE) autosklearn || : + +check-mypy: + $(MYPY) autosklearn || : + +check-flake8: + $(FLAKE8) autosklearn || : + $(FLAKE8) test || : -all: clean inplace test +# pydocstyle does not have easy ignore rules, instead, we include as they are covered +check: check-black check-isort check-mypy check-flake8 # check-pydocstyle -clean: +pre-commit: + $(PRECOMMIT) run --all-files + +format-black: + $(BLACK) autosklearn/.* + $(BLACK) test/.* + $(BLACK) examples/.* + +format-isort: + $(ISORT) autosklearn + $(ISORT) test + + +format: format-black format-isort + +clean-doc: + $(MAKE) -C ${DOCDIR} clean + +clean-build: $(PYTHON) setup.py clean - rm -rf dist + rm -rf ${DIST} -in: inplace # just a shortcut -inplace: - $(PYTHON) setup.py build_ext -i +# Clean up any builds in ./dist as well as doc +clean: clean-doc clean-build + +# Build a distribution in ./dist +build: + $(PYTHON) setup.py bdist doc: - cd ./doc - make html - cd .. 
+ $(MAKE) -C ${DOCDIR} html-noexamples + @echo + @echo "View docs at:" + @echo ${INDEX_HTML} + +links: + $(MAKE) -C ${DOCDIR} linkcheck -test-code: in - $(PYTEST) -s -v test -test-doc: - $(PYTEST) -s -v doc/*.rst +examples: + $(MAKE) -C ${DOCDIR} html + @echo + @echo "View docs at:" + @echo ${INDEX_HTML} -test-coverage: - rm -rf coverage .coverage - $(PYTEST) -s -v --with-coverage test +# Publish to testpypi +# Will echo the commands to actually publish to be run to publish to actual PyPi +# This is done to prevent accidental publishing but provide the same conveniences +publish: clean-build build + $(PIP) install twine + $(PYTHON) -m twine upload --repository testpypi ${DIST}/* + @echo + @echo "Test with the following line:" + @echo "pip install --index-url https://test.pypi.org/simple/ auto-sklearn" + @echo + @echo "Once you have decided it works, publish to actual pypi with" + @echo "python -m twine upload dist/*" -test: test-code test-sphinxext test-doc +test: + $(PYTEST) test diff --git a/autosklearn/__init__.py b/autosklearn/__init__.py index dae47a1089..2bff637af8 100644 --- a/autosklearn/__init__.py +++ b/autosklearn/__init__.py @@ -1,27 +1,26 @@ # -*- encoding: utf-8 -*- import os -import pkg_resources import sys -from autosklearn.util import dependencies -from autosklearn.__version__ import __version__ # noqa (imported but unused) +import pkg_resources +from autosklearn.__version__ import __version__ # noqa (imported but unused) +from autosklearn.util import dependencies -requirements = pkg_resources.resource_string('autosklearn', 'requirements.txt') -requirements = requirements.decode('utf-8') +requirements = pkg_resources.resource_string("autosklearn", "requirements.txt") +requirements = requirements.decode("utf-8") dependencies.verify_packages(requirements) -if os.name != 'posix': +if os.name != "posix": raise ValueError( - 'Detected unsupported operating system: %s. Please check ' - 'the compability information of auto-sklearn: https://automl.github.io' - '/auto-sklearn/stable/installation.html#windows-osx-compability' % - sys.platform + "Detected unsupported operating system: %s. Please check " + "the compability information of auto-sklearn: https://automl.github.io" + "/auto-sklearn/stable/installation.html#windows-osx-compability" % sys.platform ) if sys.version_info < (3, 6): raise ValueError( - 'Unsupported python version %s found. Auto-sklearn requires Python ' - '3.6 or higher.' % sys.version_info + "Unsupported python version %s found. Auto-sklearn requires Python " + "3.6 or higher." 
% sys.version_info ) diff --git a/autosklearn/automl.py b/autosklearn/automl.py index f8057d130b..cee7c492f3 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -1,90 +1,102 @@ # -*- encoding: utf-8 -*- +from typing import Any, Dict, List, Mapping, Optional, Tuple, Union, cast + import copy -import distro import io import json -import platform import logging.handlers import multiprocessing import os +import platform import sys +import tempfile import time -from typing import Any, Dict, Mapping, Optional, List, Tuple, Union, cast -import uuid import unittest.mock -import tempfile +import uuid -from ConfigSpace.configuration_space import Configuration -from ConfigSpace.read_and_write import json as cs_json import dask import dask.distributed +import distro +import joblib import numpy as np import numpy.ma as ma import pandas as pd import pkg_resources import scipy.stats -from sklearn.base import BaseEstimator -from sklearn.ensemble import VotingRegressor -from sklearn.model_selection._split import _RepeatedSplits, \ - BaseShuffleSplit, BaseCrossValidator -from smac.runhistory.runhistory import RunInfo, RunValue -from smac.tae import StatusType -from smac.stats.stats import Stats -import joblib import sklearn.utils +from ConfigSpace.configuration_space import Configuration +from ConfigSpace.read_and_write import json as cs_json from scipy.sparse import spmatrix +from sklearn.base import BaseEstimator +from sklearn.dummy import DummyClassifier, DummyRegressor +from sklearn.ensemble import VotingRegressor +from sklearn.metrics._classification import type_of_target +from sklearn.model_selection._split import ( + BaseCrossValidator, + BaseShuffleSplit, + _RepeatedSplits, +) from sklearn.utils import check_random_state from sklearn.utils.validation import check_is_fitted -from sklearn.metrics._classification import type_of_target -from sklearn.dummy import DummyClassifier, DummyRegressor +from smac.runhistory.runhistory import RunInfo, RunValue +from smac.stats.stats import Stats +from smac.tae import StatusType from autosklearn.automl_common.common.utils.backend import Backend, create - -from autosklearn.metrics import Scorer, default_metric_for_task -from autosklearn.data.xy_data_manager import XYDataManager +from autosklearn.constants import ( + BINARY_CLASSIFICATION, + CLASSIFICATION_TASKS, + MULTICLASS_CLASSIFICATION, + MULTILABEL_CLASSIFICATION, + MULTIOUTPUT_REGRESSION, + REGRESSION, + REGRESSION_TASKS, +) from autosklearn.data.validation import ( - convert_if_sparse, - InputValidator, SUPPORTED_FEAT_TYPES, SUPPORTED_TARGET_TYPES, + InputValidator, + convert_if_sparse, ) +from autosklearn.data.xy_data_manager import XYDataManager +from autosklearn.ensemble_builder import EnsembleBuilderManager +from autosklearn.ensembles.singlebest_ensemble import SingleBest from autosklearn.evaluation import ExecuteTaFuncWithQueue, get_cost_of_crash from autosklearn.evaluation.abstract_evaluator import _fit_and_suppress_warnings from autosklearn.evaluation.train_evaluator import TrainEvaluator, _fit_with_budget -from autosklearn.metrics import calculate_metric +from autosklearn.metrics import Scorer, calculate_metric, default_metric_for_task +from autosklearn.pipeline.base import BasePipeline +from autosklearn.pipeline.components.classification import ClassifierChoice +from autosklearn.pipeline.components.data_preprocessing.categorical_encoding import ( + OHEChoice, +) +from autosklearn.pipeline.components.data_preprocessing.minority_coalescense import ( + CoalescenseChoice, +) +from 
autosklearn.pipeline.components.data_preprocessing.rescaling import RescalingChoice +from autosklearn.pipeline.components.feature_preprocessing import ( + FeaturePreprocessorChoice, +) +from autosklearn.pipeline.components.regression import RegressorChoice +from autosklearn.smbo import AutoMLSMBO +from autosklearn.util import RE_PATTERN, pipeline from autosklearn.util.data import ( + DatasetCompressionSpec, + default_dataset_compression_arg, reduce_dataset_size_if_too_large, supported_precision_reductions, validate_dataset_compression_arg, - default_dataset_compression_arg, - DatasetCompressionSpec, ) -from autosklearn.util.stopwatch import StopWatch from autosklearn.util.logging_ import ( + PicklableClientLogger, + get_named_client_logger, setup_logger, start_log_server, - get_named_client_logger, warnings_to, - PicklableClientLogger, ) -from autosklearn.util import pipeline, RE_PATTERN from autosklearn.util.parallel import preload_modules -from autosklearn.ensemble_builder import EnsembleBuilderManager -from autosklearn.ensembles.singlebest_ensemble import SingleBest -from autosklearn.smbo import AutoMLSMBO -from autosklearn.constants import MULTILABEL_CLASSIFICATION, MULTICLASS_CLASSIFICATION, \ - REGRESSION_TASKS, REGRESSION, BINARY_CLASSIFICATION, MULTIOUTPUT_REGRESSION, \ - CLASSIFICATION_TASKS -from autosklearn.pipeline.base import BasePipeline -from autosklearn.pipeline.components.classification import ClassifierChoice -from autosklearn.pipeline.components.regression import RegressorChoice -from autosklearn.pipeline.components.feature_preprocessing import FeaturePreprocessorChoice -from autosklearn.pipeline.components.data_preprocessing.categorical_encoding import OHEChoice -from autosklearn.pipeline.components.data_preprocessing.minority_coalescense import ( - CoalescenseChoice -) -from autosklearn.pipeline.components.data_preprocessing.rescaling import RescalingChoice from autosklearn.util.single_thread_client import SingleThreadedClient +from autosklearn.util.stopwatch import StopWatch def _model_predict( @@ -94,7 +106,7 @@ def _model_predict( batch_size: Optional[int] = None, logger: Optional[PicklableClientLogger] = None, ) -> np.ndarray: - """ Generates the predictions from a model. + """Generates the predictions from a model. This is seperated out into a seperate function to allow for multiprocessing and perform parallel predictions. @@ -149,24 +161,25 @@ def _model_predict( else: predict_func = model.predict - if batch_size is not None and hasattr(model, 'batch_size'): + if batch_size is not None and hasattr(model, "batch_size"): prediction = predict_func(X_, batch_size=batch_size) else: prediction = predict_func(X_) # Check that probability values lie between 0 and 1. if task in CLASSIFICATION_TASKS: - assert (prediction >= 0).all() and (prediction <= 1).all(), \ - f"For {model}, prediction probability not within [0, 1]!" + assert (prediction >= 0).all() and ( + prediction <= 1 + ).all(), f"For {model}, prediction probability not within [0, 1]!" 
- assert prediction.shape[0] == X_.shape[0], \ - f"Prediction shape {model} is {prediction.shape} while X_.shape is {X_.shape}" + assert ( + prediction.shape[0] == X_.shape[0] + ), f"Prediction shape {model} is {prediction.shape} while X_.shape is {X_.shape}" return prediction class AutoML(BaseEstimator): - def __init__( self, time_left_for_this_task, @@ -183,7 +196,7 @@ def __init__( debug_mode=False, include=None, exclude=None, - resampling_strategy='holdout-iterative-fit', + resampling_strategy="holdout-iterative-fit", resampling_strategy_arguments=None, n_jobs=None, dask_client: Optional[dask.distributed.Client] = None, @@ -195,7 +208,7 @@ def __init__( metric=None, scoring_functions=None, get_trials_callback=None, - dataset_compression: Union[bool, Mapping[str, Any]] = True + dataset_compression: Union[bool, Mapping[str, Any]] = True, ): super(AutoML, self).__init__() self.configuration_space = None @@ -205,8 +218,9 @@ def __init__( # self._tmp_dir = tmp_dir self._time_for_task = time_left_for_this_task self._per_run_time_limit = per_run_time_limit - self._initial_configurations_via_metalearning = \ + self._initial_configurations_via_metalearning = ( initial_configurations_via_metalearning + ) self._ensemble_size = ensemble_size self._ensemble_nbest = ensemble_nbest self._max_models_on_disc = max_models_on_disc @@ -217,9 +231,14 @@ def __init__( self._include = include self._exclude = exclude self._resampling_strategy = resampling_strategy - self._scoring_functions = scoring_functions if scoring_functions is not None else [] - self._resampling_strategy_arguments = resampling_strategy_arguments \ - if resampling_strategy_arguments is not None else {} + self._scoring_functions = ( + scoring_functions if scoring_functions is not None else [] + ) + self._resampling_strategy_arguments = ( + resampling_strategy_arguments + if resampling_strategy_arguments is not None + else {} + ) self._n_jobs = n_jobs self._dask_client = dask_client @@ -227,15 +246,24 @@ def __init__( self._disable_evaluator_output = disable_evaluator_output # Check arguments prior to doing anything! if not isinstance(self._disable_evaluator_output, (bool, List)): - raise ValueError('disable_evaluator_output must be of type bool ' - 'or list.') + raise ValueError( + "disable_evaluator_output must be of type bool " "or list." + ) if isinstance(self._disable_evaluator_output, List): - allowed_elements = ['model', 'cv_model', 'y_optimization', 'y_test', 'y_valid'] + allowed_elements = [ + "model", + "cv_model", + "y_optimization", + "y_test", + "y_valid", + ] for element in self._disable_evaluator_output: if element not in allowed_elements: - raise ValueError("List member '%s' for argument " - "'disable_evaluator_output' must be one " - "of " + str(allowed_elements)) + raise ValueError( + "List member '%s' for argument " + "'disable_evaluator_output' must be one " + "of " + str(allowed_elements) + ) self._get_smac_object_callback = get_smac_object_callback self._get_trials_callback = get_trials_callback self._smac_scenario_args = smac_scenario_args @@ -280,17 +308,21 @@ def __init__( # examples. 
Nevertheless, multi-process runs # have spawn as requirement to reduce the # possibility of a deadlock - self._multiprocessing_context = 'forkserver' + self._multiprocessing_context = "forkserver" if self._n_jobs == 1 and self._dask_client is None: - self._multiprocessing_context = 'fork' + self._multiprocessing_context = "fork" self._dask_client = SingleThreadedClient() if not isinstance(self._time_for_task, int): - raise ValueError("time_left_for_this_task not of type integer, " - "but %s" % str(type(self._time_for_task))) + raise ValueError( + "time_left_for_this_task not of type integer, " + "but %s" % str(type(self._time_for_task)) + ) if not isinstance(self._per_run_time_limit, int): - raise ValueError("per_run_time_limit not of type integer, but %s" % - str(type(self._per_run_time_limit))) + raise ValueError( + "per_run_time_limit not of type integer, but %s" + % str(type(self._per_run_time_limit)) + ) # By default try to use the TCP logging port or get a new port self._logger_port = logging.handlers.DEFAULT_TCP_LOGGING_PORT @@ -331,7 +363,7 @@ def _create_dask_client(self): def _close_dask_client(self): if ( - hasattr(self, '_is_dask_client_internally_created') + hasattr(self, "_is_dask_client_internally_created") and self._is_dask_client_internally_created and self._dask_client ): @@ -343,13 +375,13 @@ def _close_dask_client(self): del self._is_dask_client_internally_created def _get_logger(self, name): - logger_name = 'AutoML(%d):%s' % (self._seed, name) + logger_name = "AutoML(%d):%s" % (self._seed, name) # Setup the configuration for the logger # This is gonna be honored by the server # Which is created below setup_logger( - filename='%s.log' % str(logger_name), + filename="%s.log" % str(logger_name), logging_config=self.logging_config, output_dir=self._backend.temporary_directory, ) @@ -363,17 +395,17 @@ def _get_logger(self, name): context = multiprocessing.get_context(self._multiprocessing_context) preload_modules(context) self.stop_logging_server = context.Event() - port = context.Value('l') # be safe by using a long + port = context.Value("l") # be safe by using a long port.value = -1 self.logging_server = context.Process( target=start_log_server, kwargs=dict( - host='localhost', + host="localhost", logname=logger_name, event=self.stop_logging_server, port=port, - filename='%s.log' % str(logger_name), + filename="%s.log" % str(logger_name), logging_config=self.logging_config, output_dir=self._backend.temporary_directory, ), @@ -392,12 +424,12 @@ def _get_logger(self, name): return get_named_client_logger( name=logger_name, - host='localhost', + host="localhost", port=self._logger_port, ) def _clean_logger(self): - if not hasattr(self, 'stop_logging_server') or self.stop_logging_server is None: + if not hasattr(self, "stop_logging_server") or self.stop_logging_server is None: return # Clean up the logger @@ -423,20 +455,19 @@ def _stop_task(watcher, task_name): watcher.stop_task(task_name) @staticmethod - def _print_load_time(basename, time_left_for_this_task, - time_for_load_data, logger): + def _print_load_time(basename, time_left_for_this_task, time_for_load_data, logger): - time_left_after_reading = max( - 0, time_left_for_this_task - time_for_load_data) - logger.info('Remaining time after reading %s %5.2f sec' % - (basename, time_left_after_reading)) + time_left_after_reading = max(0, time_left_for_this_task - time_for_load_data) + logger.info( + "Remaining time after reading %s %5.2f sec" + % (basename, time_left_after_reading) + ) return time_for_load_data def 
_do_dummy_prediction(self, datamanager: XYDataManager, num_run: int) -> int: # When using partial-cv it makes no sense to do dummy predictions - if self._resampling_strategy in ['partial-cv', - 'partial-cv-iterative-fit']: + if self._resampling_strategy in ["partial-cv", "partial-cv-iterative-fit"]: return num_run self._logger.info("Starting to create dummy predictions.") @@ -451,53 +482,47 @@ def _do_dummy_prediction(self, datamanager: XYDataManager, num_run: int) -> int: # already be generated here! stats = Stats(scenario_mock) stats.start_timing() - ta = ExecuteTaFuncWithQueue(backend=self._backend, - autosklearn_seed=self._seed, - resampling_strategy=self._resampling_strategy, - initial_num_run=num_run, - stats=stats, - metric=self._metric, - memory_limit=memory_limit, - disable_file_output=self._disable_evaluator_output, - abort_on_first_run_crash=False, - cost_for_crash=get_cost_of_crash(self._metric), - port=self._logger_port, - pynisher_context=self._multiprocessing_context, - **self._resampling_strategy_arguments) - - status, cost, runtime, additional_info = ta.run(num_run, cutoff=self._time_for_task) + ta = ExecuteTaFuncWithQueue( + backend=self._backend, + autosklearn_seed=self._seed, + resampling_strategy=self._resampling_strategy, + initial_num_run=num_run, + stats=stats, + metric=self._metric, + memory_limit=memory_limit, + disable_file_output=self._disable_evaluator_output, + abort_on_first_run_crash=False, + cost_for_crash=get_cost_of_crash(self._metric), + port=self._logger_port, + pynisher_context=self._multiprocessing_context, + **self._resampling_strategy_arguments, + ) + + status, cost, runtime, additional_info = ta.run( + num_run, cutoff=self._time_for_task + ) if status == StatusType.SUCCESS: self._logger.info("Finished creating dummy predictions.") + + # Fail if dummy prediction fails. else: - if additional_info.get('exitcode') == -6: - self._logger.error( - "Dummy prediction failed with run state %s. " - "The error suggests that the provided memory limits were too tight. Please " - "increase the 'memory_limit' and try again. If this does not solve your " - "problem, please open an issue and paste the additional output. " - "Additional output: %s.", - str(status), str(additional_info), + if additional_info.get("exitcode") == -6: + msg = ( + f"Dummy prediction failed with run state {status}." + " The error suggests that the provided memory limits are too tight." + " Please increase the 'memory_limit' and try again. If this does" + " not solve your problem, please open an issue and paste the" + f" additional output. Additional output: {additional_info}" ) - # Fail if dummy prediction fails. - raise ValueError( - "Dummy prediction failed with run state %s. " - "The error suggests that the provided memory limits were too tight. Please " - "increase the 'memory_limit' and try again. If this does not solve your " - "problem, please open an issue and paste the additional output. " - "Additional output: %s." % - (str(status), str(additional_info)), - ) - else: - self._logger.error( - "Dummy prediction failed with run state %s and additional output: %s.", - str(status), str(additional_info), - ) - # Fail if dummy prediction fails. - raise ValueError( - "Dummy prediction failed with run state %s and additional output: %s." 
- % (str(status), str(additional_info)) + msg = ( + f" Dummy prediction failed with run state {status} and" + f" additional output: {additional_info}.", ) + + self._logger.error(msg) + raise ValueError(msg) + return num_run @classmethod @@ -620,8 +645,9 @@ def fit( if task is None: y_task = type_of_target(y) if not self._supports_task_type(y_task): - raise ValueError(f"{self.__class__.__name__} does not support" - f" task {y_task}") + raise ValueError( + f"{self.__class__.__name__} does not support" f" task {y_task}" + ) self._task = self._task_type_id(y_task) else: self._task = task @@ -675,10 +701,9 @@ def fit( memory_allocation = self._dataset_compression["memory_allocation"] # Remove precision reduction if we can't perform it - if ( - X.dtype not in supported_precision_reductions - and "precision" in cast(List[str], methods) # Removable with TypedDict - ): + if X.dtype not in supported_precision_reductions and "precision" in cast( + List[str], methods + ): # Removable with TypedDict methods = [method for method in methods if method != "precision"] with warnings_to(self._logger): @@ -689,13 +714,15 @@ def fit( is_classification=is_classification, random_state=self._seed, operations=methods, - memory_allocation=memory_allocation + memory_allocation=memory_allocation, ) # Check the re-sampling strategy try: self._check_resampling_strategy( - X=X, y=y, task=self._task, + X=X, + y=y, + task=self._task, ) except Exception as e: self._fit_cleanup() @@ -710,16 +737,16 @@ def fit( # It can be provided in the constructor, or automatically # defined in the estimator fit call if self._metric is None: - raise ValueError('No metric given.') + raise ValueError("No metric given.") if not isinstance(self._metric, Scorer): - raise ValueError('Metric must be instance of ' - 'autosklearn.metrics.Scorer.') + raise ValueError( + "Metric must be instance of " "autosklearn.metrics.Scorer." 
+ ) # If no dask client was provided, we create one, so that we can # start a ensemble process in parallel to smbo optimize - if ( - self._dask_client is None and - (self._ensemble_size > 0 or self._n_jobs is not None and self._n_jobs > 1) + if self._dask_client is None and ( + self._ensemble_size > 0 or self._n_jobs is not None and self._n_jobs > 1 ): self._create_dask_client() else: @@ -732,78 +759,95 @@ def fit( self._feat_type = self.InputValidator.feature_validator.feat_type # Produce debug information to the logfile - self._logger.debug('Starting to print environment information') - self._logger.debug(' Python version: %s', sys.version.split('\n')) + self._logger.debug("Starting to print environment information") + self._logger.debug(" Python version: %s", sys.version.split("\n")) try: - self._logger.debug(f'\tDistribution: {distro.id()}-{distro.version()}-{distro.name()}') + self._logger.debug( + f"\tDistribution: {distro.id()}-{distro.version()}-{distro.name()}" + ) except AttributeError: pass - self._logger.debug(' System: %s', platform.system()) - self._logger.debug(' Machine: %s', platform.machine()) - self._logger.debug(' Platform: %s', platform.platform()) + self._logger.debug(" System: %s", platform.system()) + self._logger.debug(" Machine: %s", platform.machine()) + self._logger.debug(" Platform: %s", platform.platform()) # UNAME appears to leak sensible information # self._logger.debug(' uname: %s', platform.uname()) - self._logger.debug(' Version: %s', platform.version()) - self._logger.debug(' Mac version: %s', platform.mac_ver()) - requirements = pkg_resources.resource_string('autosklearn', 'requirements.txt') - requirements = requirements.decode('utf-8') - requirements = [requirement for requirement in requirements.split('\n')] + self._logger.debug(" Version: %s", platform.version()) + self._logger.debug(" Mac version: %s", platform.mac_ver()) + requirements = pkg_resources.resource_string("autosklearn", "requirements.txt") + requirements = requirements.decode("utf-8") + requirements = [requirement for requirement in requirements.split("\n")] for requirement in requirements: if not requirement: continue match = RE_PATTERN.match(requirement) if match: - name = match.group('name') + name = match.group("name") module_dist = pkg_resources.get_distribution(name) - self._logger.debug(' %s', module_dist) + self._logger.debug(" %s", module_dist) else: - raise ValueError('Unable to read requirement: %s' % requirement) - self._logger.debug('Done printing environment information') - self._logger.debug('Starting to print arguments to auto-sklearn') - self._logger.debug(' tmp_folder: %s', self._backend.context._temporary_directory) - self._logger.debug(' time_left_for_this_task: %f', self._time_for_task) - self._logger.debug(' per_run_time_limit: %f', self._per_run_time_limit) + raise ValueError("Unable to read requirement: %s" % requirement) + self._logger.debug("Done printing environment information") + self._logger.debug("Starting to print arguments to auto-sklearn") + self._logger.debug( + " tmp_folder: %s", self._backend.context._temporary_directory + ) + self._logger.debug(" time_left_for_this_task: %f", self._time_for_task) + self._logger.debug(" per_run_time_limit: %f", self._per_run_time_limit) self._logger.debug( - ' initial_configurations_via_metalearning: %d', + " initial_configurations_via_metalearning: %d", self._initial_configurations_via_metalearning, ) - self._logger.debug(' ensemble_size: %d', self._ensemble_size) - self._logger.debug(' ensemble_nbest: %f', 
self._ensemble_nbest) - self._logger.debug(' max_models_on_disc: %s', str(self._max_models_on_disc)) - self._logger.debug(' seed: %d', self._seed) - self._logger.debug(' memory_limit: %s', str(self._memory_limit)) - self._logger.debug(' metadata_directory: %s', self._metadata_directory) - self._logger.debug(' debug_mode: %s', self._debug_mode) - self._logger.debug(' include: %s', str(self._include)) - self._logger.debug(' exclude: %s', str(self._exclude)) - self._logger.debug(' resampling_strategy: %s', str(self._resampling_strategy)) - self._logger.debug(' resampling_strategy_arguments: %s', - str(self._resampling_strategy_arguments)) - self._logger.debug(' n_jobs: %s', str(self._n_jobs)) - self._logger.debug(' multiprocessing_context: %s', str(self._multiprocessing_context)) - self._logger.debug(' dask_client: %s', str(self._dask_client)) - self._logger.debug(' precision: %s', str(self.precision)) - self._logger.debug(' disable_evaluator_output: %s', str(self._disable_evaluator_output)) - self._logger.debug(' get_smac_objective_callback: %s', str(self._get_smac_object_callback)) - self._logger.debug(' smac_scenario_args: %s', str(self._smac_scenario_args)) - self._logger.debug(' logging_config: %s', str(self.logging_config)) - self._logger.debug(' metric: %s', str(self._metric)) - self._logger.debug('Done printing arguments to auto-sklearn') - self._logger.debug('Starting to print available components') + self._logger.debug(" ensemble_size: %d", self._ensemble_size) + self._logger.debug(" ensemble_nbest: %f", self._ensemble_nbest) + self._logger.debug(" max_models_on_disc: %s", str(self._max_models_on_disc)) + self._logger.debug(" seed: %d", self._seed) + self._logger.debug(" memory_limit: %s", str(self._memory_limit)) + self._logger.debug(" metadata_directory: %s", self._metadata_directory) + self._logger.debug(" debug_mode: %s", self._debug_mode) + self._logger.debug(" include: %s", str(self._include)) + self._logger.debug(" exclude: %s", str(self._exclude)) + self._logger.debug(" resampling_strategy: %s", str(self._resampling_strategy)) + self._logger.debug( + " resampling_strategy_arguments: %s", + str(self._resampling_strategy_arguments), + ) + self._logger.debug(" n_jobs: %s", str(self._n_jobs)) + self._logger.debug( + " multiprocessing_context: %s", str(self._multiprocessing_context) + ) + self._logger.debug(" dask_client: %s", str(self._dask_client)) + self._logger.debug(" precision: %s", str(self.precision)) + self._logger.debug( + " disable_evaluator_output: %s", str(self._disable_evaluator_output) + ) + self._logger.debug( + " get_smac_objective_callback: %s", str(self._get_smac_object_callback) + ) + self._logger.debug(" smac_scenario_args: %s", str(self._smac_scenario_args)) + self._logger.debug(" logging_config: %s", str(self.logging_config)) + self._logger.debug(" metric: %s", str(self._metric)) + self._logger.debug("Done printing arguments to auto-sklearn") + self._logger.debug("Starting to print available components") for choice in ( - ClassifierChoice, RegressorChoice, FeaturePreprocessorChoice, - OHEChoice, RescalingChoice, CoalescenseChoice, + ClassifierChoice, + RegressorChoice, + FeaturePreprocessorChoice, + OHEChoice, + RescalingChoice, + CoalescenseChoice, ): self._logger.debug( - '%s: %s', + "%s: %s", choice.__name__, choice.get_components(), ) - self._logger.debug('Done printing available components') + self._logger.debug("Done printing available components") datamanager = XYDataManager( - X, y, + X, + y, X_test=X_test, y_test=y_test, task=self._task, @@ -812,7 
+856,7 @@ def fit( ) self._backend._make_internals_directory() - self._label_num = datamanager.info['label_num'] + self._label_num = datamanager.info["label_num"] # == Pickle the data manager to speed up loading self._backend.save_datamanager(datamanager) @@ -824,7 +868,8 @@ def fit( self._dataset_name, self._time_for_task, time_for_load_data, - self._logger) + self._logger, + ) # = Create a searchspace # Do this before One Hot Encoding to make sure that it creates a @@ -852,7 +897,7 @@ def fit( # Do this before calculating the meta-features to make sure that the # dummy predictions are actually included in the ensemble even if # calculating the meta-features takes very long - ensemble_task_name = 'runEnsemble' + ensemble_task_name = "runEnsemble" self._stopwatch.start_task(ensemble_task_name) elapsed_time = self._stopwatch.wall_elapsed(self._dataset_name) time_left_for_ensembles = max(0, self._time_for_task - elapsed_time) @@ -861,15 +906,19 @@ def fit( # Fit only raises error when ensemble_size is not zero but # time_left_for_ensembles is zero. if self._ensemble_size > 0: - raise ValueError("Not starting ensemble builder because there " - "is no time left. Try increasing the value " - "of time_left_for_this_task.") + raise ValueError( + "Not starting ensemble builder because there " + "is no time left. Try increasing the value " + "of time_left_for_this_task." + ) elif self._ensemble_size <= 0: - self._logger.info('Not starting ensemble builder because ' - 'ensemble size is <= 0.') + self._logger.info( + "Not starting ensemble builder because " "ensemble size is <= 0." + ) else: self._logger.info( - 'Start Ensemble with %5.2fsec time left' % time_left_for_ensembles) + "Start Ensemble with %5.2fsec time left" % time_left_for_ensembles + ) proc_ensemble = EnsembleBuilderManager( start_time=time.time(), @@ -900,26 +949,26 @@ def fit( pass # => RUN SMAC - smac_task_name = 'runSMAC' + smac_task_name = "runSMAC" self._stopwatch.start_task(smac_task_name) elapsed_time = self._stopwatch.wall_elapsed(self._dataset_name) time_left_for_smac = max(0, self._time_for_task - elapsed_time) if self._logger: - self._logger.info( - 'Start SMAC with %5.2fsec time left' % time_left_for_smac) + self._logger.info("Start SMAC with %5.2fsec time left" % time_left_for_smac) if time_left_for_smac <= 0: - self._logger.warning("Not starting SMAC because there is no time " - "left.") + self._logger.warning("Not starting SMAC because there is no time " "left.") _proc_smac = None self._budget_type = None else: - if self._per_run_time_limit is None or \ - self._per_run_time_limit > time_left_for_smac: + if ( + self._per_run_time_limit is None + or self._per_run_time_limit > time_left_for_smac + ): self._logger.warning( - 'Time limit for a single run is higher than total time ' - 'limit. Capping the limit for a single run to the total ' - 'time given to SMAC (%f)' % time_left_for_smac + "Time limit for a single run is higher than total time " + "limit. 
Capping the limit for a single run to the total " + "time given to SMAC (%f)" % time_left_for_smac ) per_run_time_limit = time_left_for_smac else: @@ -928,7 +977,7 @@ def fit( # Make sure that at least 2 models are created for the ensemble process num_models = time_left_for_smac // per_run_time_limit if num_models < 2: - per_run_time_limit = time_left_for_smac//2 + per_run_time_limit = time_left_for_smac // 2 self._logger.warning( "Capping the per_run_time_limit to {} to have " "time for a least 2 models in each process.".format( @@ -964,19 +1013,24 @@ def fit( port=self._logger_port, pynisher_context=self._multiprocessing_context, ensemble_callback=proc_ensemble, - trials_callback=self._get_trials_callback + trials_callback=self._get_trials_callback, ) try: - self.runhistory_, self.trajectory_, self._budget_type = \ - _proc_smac.run_smbo() + ( + self.runhistory_, + self.trajectory_, + self._budget_type, + ) = _proc_smac.run_smbo() trajectory_filename = os.path.join( self._backend.get_smac_output_directory_for_run(self._seed), - 'trajectory.json') - saveable_trajectory = \ - [list(entry[:2]) + [entry[2].get_dictionary()] + list(entry[3:]) - for entry in self.trajectory_] - with open(trajectory_filename, 'w') as fh: + "trajectory.json", + ) + saveable_trajectory = [ + list(entry[:2]) + [entry[2].get_dictionary()] + list(entry[3:]) + for entry in self.trajectory_ + ] + with open(trajectory_filename, "w") as fh: json.dump(saveable_trajectory, fh) except Exception as e: self._logger.exception(e) @@ -989,9 +1043,11 @@ def fit( self.ensemble_performance_history = list(proc_ensemble.history) if len(proc_ensemble.futures) > 0: - # Now we need to wait for the future to return as it cannot be cancelled while it - # is running: https://stackoverflow.com/a/49203129 - self._logger.info("Ensemble script still running, waiting for it to finish.") + # Now we need to wait for the future to return as it cannot be cancelled + # while it is running: https://stackoverflow.com/a/49203129 + self._logger.info( + "Ensemble script still running, waiting for it to finish." 
+ ) result = proc_ensemble.futures.pop().result() if result: ensemble_history, _, _, _, _ = result @@ -1001,7 +1057,10 @@ def fit( # save the ensemble performance history file if len(self.ensemble_performance_history) > 0: pd.DataFrame(self.ensemble_performance_history).to_json( - os.path.join(self._backend.internals_directory, 'ensemble_history.json')) + os.path.join( + self._backend.internals_directory, "ensemble_history.json" + ) + ) if load_models: self._logger.info("Loading models...") @@ -1046,40 +1105,58 @@ def _check_resampling_strategy( """ is_split_object = isinstance( self._resampling_strategy, - (BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit) + (BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit), ) - if self._resampling_strategy not in [ - 'holdout', - 'holdout-iterative-fit', - 'cv', - 'cv-iterative-fit', - 'partial-cv', - 'partial-cv-iterative-fit', - ] and not is_split_object: - raise ValueError('Illegal resampling strategy: %s' % self._resampling_strategy) + if ( + self._resampling_strategy + not in [ + "holdout", + "holdout-iterative-fit", + "cv", + "cv-iterative-fit", + "partial-cv", + "partial-cv-iterative-fit", + ] + and not is_split_object + ): + raise ValueError( + "Illegal resampling strategy: %s" % self._resampling_strategy + ) elif is_split_object: TrainEvaluator.check_splitter_resampling_strategy( - X=X, y=y, task=task, - groups=self._resampling_strategy_arguments.get('groups', None), + X=X, + y=y, + task=task, + groups=self._resampling_strategy_arguments.get("groups", None), resampling_strategy=self._resampling_strategy, ) - elif self._resampling_strategy in [ - 'partial-cv', - 'partial-cv-iterative-fit', - ] and self._ensemble_size != 0: - raise ValueError("Resampling strategy %s cannot be used " - "together with ensembles." % self._resampling_strategy) - - elif self._resampling_strategy in [ - 'partial-cv', - 'cv', - 'cv-iterative-fit', - 'partial-cv-iterative-fit', - ] and 'folds' not in self._resampling_strategy_arguments: - self._resampling_strategy_arguments['folds'] = 5 + elif ( + self._resampling_strategy + in [ + "partial-cv", + "partial-cv-iterative-fit", + ] + and self._ensemble_size != 0 + ): + raise ValueError( + "Resampling strategy %s cannot be used " + "together with ensembles." % self._resampling_strategy + ) + + elif ( + self._resampling_strategy + in [ + "partial-cv", + "cv", + "cv-iterative-fit", + "partial-cv-iterative-fit", + ] + and "folds" not in self._resampling_strategy_arguments + ): + self._resampling_strategy_arguments["folds"] = 5 return @@ -1089,8 +1166,10 @@ def refit(self, X, y): # Make sure input data is valid if self.InputValidator is None or not self.InputValidator._is_fitted: - raise ValueError("refit() is only supported after calling fit. Kindly call first " - "the estimator fit() method.") + raise ValueError( + "refit() is only supported after calling fit. Kindly call first " + "the estimator fit() method." 
+ ) X, y = self.InputValidator.transform(X, y) if self.models_ is None or len(self.models_) == 0 or self.ensemble_ is None: @@ -1142,7 +1221,7 @@ def fit_pipeline( X: SUPPORTED_FEAT_TYPES, y: Union[SUPPORTED_TARGET_TYPES, spmatrix], is_classification: bool, - config: Union[Configuration, Dict[str, Union[str, float, int]]], + config: Union[Configuration, Dict[str, Union[str, float, int]]], task: Optional[int] = None, dataset_name: Optional[str] = None, X_test: Optional[SUPPORTED_FEAT_TYPES] = None, @@ -1150,7 +1229,7 @@ def fit_pipeline( feat_type: Optional[List[str]] = None, **kwargs: Dict, ) -> Tuple[Optional[BasePipeline], RunInfo, RunValue]: - """ Fits and individual pipeline configuration and returns + """Fits and individual pipeline configuration and returns the result to the user. The Estimator constraints are honored, for example the resampling @@ -1170,8 +1249,8 @@ def fit_pipeline( y_test: array-like If provided, the testing performance will be tracked on this labels config: Union[Configuration, Dict[str, Union[str, float, int]]] - A configuration object used to define the pipeline steps. If a dictionary is passed, - a configuration is created based on this dictionary. + A configuration object used to define the pipeline steps. + If a dict is passed, a configuration is created based on this dict. dataset_name: Optional[str] A string to tag and identify the Auto-Sklearn run is_classification: bool @@ -1202,8 +1281,9 @@ def fit_pipeline( if task is None: y_task = type_of_target(y) if not self._supports_task_type(y_task): - raise ValueError(f"{self.__class__.__name__} does not support" - f" task {y_task}") + raise ValueError( + f"{self.__class__.__name__} does not support" f" task {y_task}" + ) self._task = self._task_type_id(y_task) else: self._task = task @@ -1217,12 +1297,16 @@ def fit_pipeline( # dataset if self.configuration_space is None: self.configuration_space = self.fit( - X=X, y=y, - dataset_name=dataset_name if dataset_name is not None else self._dataset_name, + X=X, + y=y, + dataset_name=dataset_name + if dataset_name is not None + else self._dataset_name, X_test=X_test, y_test=y_test, feat_type=feat_type, - only_return_configuration_space=True) + only_return_configuration_space=True, + ) # We do not want to overwrite existing runs self.num_run += 1 @@ -1231,25 +1315,25 @@ def fit_pipeline( config.config_id = self.num_run # Prepare missing components to the TAE function call - if 'include' not in kwargs: - kwargs['include'] = self._include - if 'exclude' not in kwargs: - kwargs['exclude'] = self._exclude - if 'memory_limit' not in kwargs: - kwargs['memory_limit'] = self._memory_limit - if 'resampling_strategy' not in kwargs: - kwargs['resampling_strategy'] = self._resampling_strategy - if 'metric' not in kwargs: - kwargs['metric'] = self._metric - if 'disable_file_output' not in kwargs: - kwargs['disable_file_output'] = self._disable_evaluator_output - if 'pynisher_context' not in kwargs: - kwargs['pynisher_context'] = self._multiprocessing_context - if 'stats' not in kwargs: + if "include" not in kwargs: + kwargs["include"] = self._include + if "exclude" not in kwargs: + kwargs["exclude"] = self._exclude + if "memory_limit" not in kwargs: + kwargs["memory_limit"] = self._memory_limit + if "resampling_strategy" not in kwargs: + kwargs["resampling_strategy"] = self._resampling_strategy + if "metric" not in kwargs: + kwargs["metric"] = self._metric + if "disable_file_output" not in kwargs: + kwargs["disable_file_output"] = self._disable_evaluator_output + if 
"pynisher_context" not in kwargs: + kwargs["pynisher_context"] = self._multiprocessing_context + if "stats" not in kwargs: scenario_mock = unittest.mock.Mock() scenario_mock.wallclock_limit = self._time_for_task - kwargs['stats'] = Stats(scenario_mock) - kwargs['stats'].start_timing() + kwargs["stats"] = Stats(scenario_mock) + kwargs["stats"].start_timing() # Fit a pipeline, which will be stored on disk # which we can later load via the backend @@ -1257,10 +1341,10 @@ def fit_pipeline( backend=self._backend, autosklearn_seed=self._seed, abort_on_first_run_crash=False, - cost_for_crash=get_cost_of_crash(kwargs['metric']), + cost_for_crash=get_cost_of_crash(kwargs["metric"]), port=self._logger_port, **kwargs, - **self._resampling_strategy_arguments + **self._resampling_strategy_arguments, ) run_info, run_value = ta.run_wrapper( @@ -1269,16 +1353,16 @@ def fit_pipeline( instance=None, instance_specific=None, seed=self._seed, - cutoff=kwargs.pop('cutoff', self._per_run_time_limit), + cutoff=kwargs.pop("cutoff", self._per_run_time_limit), capped=False, ) ) pipeline = None - if kwargs['disable_file_output'] or kwargs['resampling_strategy'] == 'test': + if kwargs["disable_file_output"] or kwargs["resampling_strategy"] == "test": self._logger.warning("File output is disabled. No pipeline can returned") elif run_value.status == StatusType.SUCCESS: - if kwargs['resampling_strategy'] in ('cv', 'cv-iterative-fit'): + if kwargs["resampling_strategy"] in ("cv", "cv-iterative-fit"): load_function = self._backend.load_cv_model_by_seed_and_id_and_budget else: load_function = self._backend.load_model_by_seed_and_id_and_budget @@ -1309,13 +1393,14 @@ def predict(self, X, batch_size=None, n_jobs=1): processes. """ if ( - self._resampling_strategy not in ( - 'holdout', 'holdout-iterative-fit', 'cv', 'cv-iterative-fit') + self._resampling_strategy + not in ("holdout", "holdout-iterative-fit", "cv", "cv-iterative-fit") and not self._can_predict ): raise NotImplementedError( - 'Predict is currently not implemented for resampling ' - 'strategy %s, please call refit().' % self._resampling_strategy) + "Predict is currently not implemented for resampling " + "strategy %s, please call refit()." % self._resampling_strategy + ) if self.models_ is None or len(self.models_) == 0 or self.ensemble_ is None: self._load_models() @@ -1324,13 +1409,17 @@ def predict(self, X, batch_size=None, n_jobs=1): # In such cases, raise error because predict and predict_proba cannot # be called. if self.ensemble_ is None: - raise ValueError("Predict and predict_proba can only be called " - "if 'ensemble_size != 0'") + raise ValueError( + "Predict and predict_proba can only be called " + "if 'ensemble_size != 0'" + ) # Make sure that input is valid if self.InputValidator is None or not self.InputValidator._is_fitted: - raise ValueError("predict() can only be called after performing fit(). Kindly call " - "the estimator fit() method first.") + raise ValueError( + "predict() can only be called after performing fit(). Kindly call " + "the estimator fit() method first." + ) X = self.InputValidator.feature_validator.transform(X) # Parallelize predictions across models with n_jobs processes. 
@@ -1352,24 +1441,26 @@ def predict(self, X, batch_size=None, n_jobs=1): check_is_fitted(list(self.cv_models_.values())[0]) models = self.cv_models_ except sklearn.exceptions.NotFittedError: - raise ValueError('Found no fitted models!') + raise ValueError("Found no fitted models!") all_predictions = joblib.Parallel(n_jobs=n_jobs)( joblib.delayed(_model_predict)( - model=models[identifier], - X=X, - task=self._task, - batch_size=batch_size + model=models[identifier], X=X, task=self._task, batch_size=batch_size ) for identifier in self.ensemble_.get_selected_model_identifiers() ) if len(all_predictions) == 0: - raise ValueError('Something went wrong generating the predictions. ' - 'The ensemble should consist of the following ' - 'models: %s, the following models were loaded: ' - '%s' % (str(list(self.ensemble_indices_.keys())), - str(list(self.models_.keys())))) + raise ValueError( + "Something went wrong generating the predictions. " + "The ensemble should consist of the following " + "models: %s, the following models were loaded: " + "%s" + % ( + str(list(self.ensemble_indices_.keys())), + str(list(self.models_.keys())), + ) + ) predictions = self.ensemble_.predict(all_predictions) @@ -1381,23 +1472,33 @@ def predict(self, X, batch_size=None, n_jobs=1): return predictions - def fit_ensemble(self, y, task=None, precision=32, - dataset_name=None, ensemble_nbest=None, - ensemble_size=None): + def fit_ensemble( + self, + y, + task=None, + precision=32, + dataset_name=None, + ensemble_nbest=None, + ensemble_size=None, + ): # AutoSklearn does not handle sparse y for now y = convert_if_sparse(y) - if self._resampling_strategy in ['partial-cv', 'partial-cv-iterative-fit']: - raise ValueError('Cannot call fit_ensemble with resampling ' - 'strategy %s.' % self._resampling_strategy) + if self._resampling_strategy in ["partial-cv", "partial-cv-iterative-fit"]: + raise ValueError( + "Cannot call fit_ensemble with resampling " + "strategy %s." % self._resampling_strategy + ) if self._logger is None: self._logger = self._get_logger(dataset_name) # Make sure that input is valid if self.InputValidator is None or not self.InputValidator._is_fitted: - raise ValueError("fit_ensemble() can only be called after fit. Please call the " - "estimator fit() method prior to fit_ensemble().") + raise ValueError( + "fit_ensemble() can only be called after fit. Please call the " + "estimator fit() method prior to fit_ensemble()." + ) y = self.InputValidator.target_validator.transform(y) # Create a client if needed @@ -1432,8 +1533,10 @@ def fit_ensemble(self, y, task=None, precision=32, future = manager.futures.pop() result = future.result() if result is None: - raise ValueError("Error building the ensemble - please check the log file and command " - "line output for error messages.") + raise ValueError( + "Error building the ensemble - please check the log file and command " + "line output for error messages." 
+ ) self.ensemble_performance_history, _, _, _, _ = result self._load_models() @@ -1450,29 +1553,34 @@ def _load_models(self): if self.ensemble_: identifiers = self.ensemble_.get_selected_model_identifiers() self.models_ = self._backend.load_models_by_identifiers(identifiers) - if self._resampling_strategy in ('cv', 'cv-iterative-fit'): - self.cv_models_ = self._backend.load_cv_models_by_identifiers(identifiers) + if self._resampling_strategy in ("cv", "cv-iterative-fit"): + self.cv_models_ = self._backend.load_cv_models_by_identifiers( + identifiers + ) else: self.cv_models_ = None + if len(self.models_) == 0 and self._resampling_strategy not in [ + "partial-cv", + "partial-cv-iterative-fit", + ]: + raise ValueError("No models fitted!") if ( - len(self.models_) == 0 and - self._resampling_strategy not in ['partial-cv', 'partial-cv-iterative-fit'] - ): - raise ValueError('No models fitted!') - if ( - self._resampling_strategy in ['cv', 'cv-iterative-fit'] + self._resampling_strategy in ["cv", "cv-iterative-fit"] and len(self.cv_models_) == 0 ): - raise ValueError('No models fitted!') + raise ValueError("No models fitted!") - elif self._disable_evaluator_output is False or \ - (isinstance(self._disable_evaluator_output, List) and - 'model' not in self._disable_evaluator_output): + elif self._disable_evaluator_output is False or ( + isinstance(self._disable_evaluator_output, List) + and "model" not in self._disable_evaluator_output + ): model_names = self._backend.list_all_models(self._seed) - if len(model_names) == 0 and self._resampling_strategy not in \ - ['partial-cv', 'partial-cv-iterative-fit']: - raise ValueError('No models fitted!') + if len(model_names) == 0 and self._resampling_strategy not in [ + "partial-cv", + "partial-cv-iterative-fit", + ]: + raise ValueError("No models fitted!") self.models_ = [] @@ -1522,8 +1630,10 @@ def score(self, X, y): # Make sure that input is valid if self.InputValidator is None or not self.InputValidator._is_fitted: - raise ValueError("score() is only supported after calling fit. Kindly call first " - "the estimator fit() method.") + raise ValueError( + "score() is only supported after calling fit. Kindly call first " + "the estimator fit() method." 
+ ) y = self.InputValidator.target_validator.transform(y) # Encode the prediction using the input validator @@ -1534,10 +1644,12 @@ def score(self, X, y): # same representation domain prediction = self.InputValidator.target_validator.transform(prediction) - return calculate_metric(solution=y, - prediction=prediction, - task_type=self._task, - metric=self._metric, ) + return calculate_metric( + solution=y, + prediction=prediction, + task_type=self._task, + metric=self._metric, + ) def _get_runhistory_models_performance(self): metric = self._metric @@ -1549,20 +1661,24 @@ def _get_runhistory_models_performance(self): continue # Alternatively, it is possible to also obtain the start time with # ``run_value.starttime`` - endtime = pd.Timestamp(time.strftime('%Y-%m-%d %H:%M:%S', - time.localtime(run_value.endtime))) + endtime = pd.Timestamp( + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(run_value.endtime)) + ) val_score = metric._optimum - (metric._sign * run_value.cost) - train_score = metric._optimum - (metric._sign * run_value.additional_info['train_loss']) + train_score = metric._optimum - ( + metric._sign * run_value.additional_info["train_loss"] + ) scores = { - 'Timestamp': endtime, - 'single_best_optimization_score': val_score, - 'single_best_train_score': train_score, + "Timestamp": endtime, + "single_best_optimization_score": val_score, + "single_best_train_score": train_score, } # Append test-scores, if data for test_loss are available. # This is the case, if X_test and y_test where provided. - if 'test_loss' in run_value.additional_info: - test_score = metric._optimum - (metric._sign * - run_value.additional_info['test_loss']) + if "test_loss" in run_value.additional_info: + test_score = metric._optimum - ( + metric._sign * run_value.additional_info["test_loss"] + ) scores["single_best_test_score"] = test_score performance_list.append(scores) @@ -1571,13 +1687,17 @@ def _get_runhistory_models_performance(self): @property def performance_over_time_(self): individual_performance_frame = self._get_runhistory_models_performance() - best_values = pd.Series({'single_best_optimization_score': -np.inf, - 'single_best_test_score': -np.inf, - 'single_best_train_score': -np.inf}) + best_values = pd.Series( + { + "single_best_optimization_score": -np.inf, + "single_best_test_score": -np.inf, + "single_best_train_score": -np.inf, + } + ) for idx in individual_performance_frame.index: if ( - individual_performance_frame.loc[idx, 'single_best_optimization_score'] - > best_values['single_best_optimization_score'] + individual_performance_frame.loc[idx, "single_best_optimization_score"] + > best_values["single_best_optimization_score"] ): best_values = individual_performance_frame.loc[idx] individual_performance_frame.loc[idx] = best_values @@ -1586,21 +1706,27 @@ def performance_over_time_(self): if self._ensemble_size != 0: ensemble_performance_frame = pd.DataFrame(self.ensemble_performance_history) - best_values = pd.Series({'ensemble_optimization_score': -np.inf, - 'ensemble_test_score': -np.inf}) + best_values = pd.Series( + {"ensemble_optimization_score": -np.inf, "ensemble_test_score": -np.inf} + ) for idx in ensemble_performance_frame.index: if ( - ensemble_performance_frame.loc[idx, 'ensemble_optimization_score'] - > best_values['ensemble_optimization_score'] + ensemble_performance_frame.loc[idx, "ensemble_optimization_score"] + > best_values["ensemble_optimization_score"] ): best_values = ensemble_performance_frame.loc[idx] ensemble_performance_frame.loc[idx] = best_values - 
performance_over_time = pd.merge( - ensemble_performance_frame, - individual_performance_frame, - on="Timestamp", how='outer' - ).sort_values('Timestamp').fillna(method='ffill') + performance_over_time = ( + pd.merge( + ensemble_performance_frame, + individual_performance_frame, + on="Timestamp", + how="outer", + ) + .sort_values("Timestamp") + .fillna(method="ffill") + ) return performance_over_time @@ -1623,8 +1749,8 @@ def cv_results_(self): # TODO: add those arguments # TODO remove this restriction! - if self._resampling_strategy in ['partial-cv', 'partial-cv-iterative-fit']: - raise ValueError('Cannot call cv_results when using partial-cv!') + if self._resampling_strategy in ["partial-cv", "partial-cv-iterative-fit"]: + raise ValueError("Cannot call cv_results when using partial-cv!") parameter_dictionaries = dict() masks = dict() @@ -1659,19 +1785,19 @@ def cv_results_(self): s = run_value.status if s == StatusType.SUCCESS: - status.append('Success') + status.append("Success") elif s == StatusType.DONOTADVANCE: - status.append('Success (but do not advance to higher budget)') + status.append("Success (but do not advance to higher budget)") elif s == StatusType.TIMEOUT: - status.append('Timeout') + status.append("Timeout") elif s == StatusType.CRASHED: - status.append('Crash') + status.append("Crash") elif s == StatusType.ABORT: - status.append('Abort') + status.append("Abort") elif s == StatusType.MEMOUT: - status.append('Memout') - # TODO remove StatusType.RUNNING at some point in the future when the new SMAC 0.13.2 - # is the new minimum required version! + status.append("Memout") + # TODO remove StatusType.RUNNING at some point in the future when the new + # SMAC 0.13.2 is the new minimum required version! elif s in (StatusType.STOP, StatusType.RUNNING): continue else: @@ -1679,7 +1805,9 @@ def cv_results_(self): param_dict = config.get_dictionary() params.append(param_dict) - mean_test_score.append(self._metric._optimum - (self._metric._sign * run_value.cost)) + mean_test_score.append( + self._metric._optimum - (self._metric._sign * run_value.cost) + ) mean_fit_time.append(run_value.time) budgets.append(run_key.budget) @@ -1705,69 +1833,79 @@ def cv_results_(self): metric_dict[metric.name].append(metric_value) metric_mask[metric.name].append(mask_value) - results['mean_test_score'] = np.array(mean_test_score) + results["mean_test_score"] = np.array(mean_test_score) for name in metric_name: - masked_array = ma.MaskedArray(metric_dict[name], - metric_mask[name]) - results['metric_%s' % name] = masked_array + masked_array = ma.MaskedArray(metric_dict[name], metric_mask[name]) + results["metric_%s" % name] = masked_array - results['mean_fit_time'] = np.array(mean_fit_time) - results['params'] = params - rank_order = -1 * self._metric._sign * results['mean_test_score'] - results['rank_test_scores'] = scipy.stats.rankdata(rank_order, method='min') - results['status'] = status - results['budgets'] = budgets + results["mean_fit_time"] = np.array(mean_fit_time) + results["params"] = params + rank_order = -1 * self._metric._sign * results["mean_test_score"] + results["rank_test_scores"] = scipy.stats.rankdata(rank_order, method="min") + results["status"] = status + results["budgets"] = budgets for hp_name in hp_names: - masked_array = ma.MaskedArray(parameter_dictionaries[hp_name], - masks[hp_name]) - results['param_%s' % hp_name] = masked_array + masked_array = ma.MaskedArray( + parameter_dictionaries[hp_name], masks[hp_name] + ) + results["param_%s" % hp_name] = masked_array return 
results def sprint_statistics(self): cv_results = self.cv_results_ sio = io.StringIO() - sio.write('auto-sklearn results:\n') - sio.write(' Dataset name: %s\n' % self._dataset_name) - sio.write(' Metric: %s\n' % self._metric) - idx_success = np.where(np.array( - [status in ['Success', 'Success (but do not advance to higher budget)'] - for status in cv_results['status']] - ))[0] + sio.write("auto-sklearn results:\n") + sio.write(" Dataset name: %s\n" % self._dataset_name) + sio.write(" Metric: %s\n" % self._metric) + idx_success = np.where( + np.array( + [ + status + in ["Success", "Success (but do not advance to higher budget)"] + for status in cv_results["status"] + ] + ) + )[0] if len(idx_success) > 0: if not self._metric._optimum: - idx_best_run = np.argmin(cv_results['mean_test_score'][idx_success]) + idx_best_run = np.argmin(cv_results["mean_test_score"][idx_success]) else: - idx_best_run = np.argmax(cv_results['mean_test_score'][idx_success]) - best_score = cv_results['mean_test_score'][idx_success][idx_best_run] - sio.write(' Best validation score: %f\n' % best_score) - num_runs = len(cv_results['status']) - sio.write(' Number of target algorithm runs: %d\n' % num_runs) - num_success = sum([ - s in ['Success', 'Success (but do not advance to higher budget)'] - for s in cv_results['status'] - ]) - sio.write(' Number of successful target algorithm runs: %d\n' % num_success) - num_crash = sum([s == 'Crash' for s in cv_results['status']]) - sio.write(' Number of crashed target algorithm runs: %d\n' % num_crash) - num_timeout = sum([s == 'Timeout' for s in cv_results['status']]) - sio.write(' Number of target algorithms that exceeded the time ' - 'limit: %d\n' % num_timeout) - num_memout = sum([s == 'Memout' for s in cv_results['status']]) - sio.write(' Number of target algorithms that exceeded the memory ' - 'limit: %d\n' % num_memout) + idx_best_run = np.argmax(cv_results["mean_test_score"][idx_success]) + best_score = cv_results["mean_test_score"][idx_success][idx_best_run] + sio.write(" Best validation score: %f\n" % best_score) + num_runs = len(cv_results["status"]) + sio.write(" Number of target algorithm runs: %d\n" % num_runs) + num_success = sum( + [ + s in ["Success", "Success (but do not advance to higher budget)"] + for s in cv_results["status"] + ] + ) + sio.write(" Number of successful target algorithm runs: %d\n" % num_success) + num_crash = sum([s == "Crash" for s in cv_results["status"]]) + sio.write(" Number of crashed target algorithm runs: %d\n" % num_crash) + num_timeout = sum([s == "Timeout" for s in cv_results["status"]]) + sio.write( + " Number of target algorithms that exceeded the time " + "limit: %d\n" % num_timeout + ) + num_memout = sum([s == "Memout" for s in cv_results["status"]]) + sio.write( + " Number of target algorithms that exceeded the memory " + "limit: %d\n" % num_memout + ) return sio.getvalue() def get_models_with_weights(self): - if self.models_ is None or len(self.models_) == 0 or \ - self.ensemble_ is None: + if self.models_ is None or len(self.models_) == 0 or self.ensemble_ is None: self._load_models() return self.ensemble_.get_models_with_weights(self.models_) def show_models(self) -> Dict[int, Any]: - """ Returns a dictionary containing dictionaries of ensemble models. + """Returns a dictionary containing dictionaries of ensemble models. Each model in the ensemble can be accessed by giving its ``model_id`` as key. 
@@ -1821,11 +1959,12 @@ def show_models(self) -> Dict[int, Any]: 'rank': 2, 'cost': 0.4550418898836528, 'ensemble_weight': 0.3, - 'data_preprocessor': , + 'data_preprocessor': , 'feature_preprocessor': , 'regressor': , - 'sklearn_regressor': ARDRegression(alpha_1=0.0003701926442639788,...) - }... + 'sklearn_regressor': ARDRegression(alpha_1=0.027,...) + }, + ... } Returns @@ -1833,8 +1972,7 @@ def show_models(self) -> Dict[int, Any]: Dict(int, Any) : dictionary of length = number of models in the ensemble A dictionary of models in the ensemble, where ``model_id`` is the key. - """ - + """ # noqa: E501 ensemble_dict = {} def has_key(rv, key): @@ -1842,26 +1980,25 @@ def has_key(rv, key): table_dict = {} for rkey, rval in self.runhistory_.data.items(): - if has_key(rval, 'num_run'): - model_id = rval.additional_info['num_run'] - table_dict[model_id] = { - 'model_id': model_id, - 'cost': rval.cost - } + if has_key(rval, "num_run"): + model_id = rval.additional_info["num_run"] + table_dict[model_id] = {"model_id": model_id, "cost": rval.cost} # Checking if the dictionary is empty if not table_dict: - raise RuntimeError('No model found. Try increasing \'time_left_for_this_task\'.') + raise RuntimeError( + "No model found. Try increasing 'time_left_for_this_task'." + ) for i, weight in enumerate(self.ensemble_.weights_): (_, model_id, _) = self.ensemble_.identifiers_[i] - table_dict[model_id]['ensemble_weight'] = weight + table_dict[model_id]["ensemble_weight"] = weight - table = pd.DataFrame.from_dict(table_dict, orient='index') - table.sort_values(by='cost', inplace=True) + table = pd.DataFrame.from_dict(table_dict, orient="index") + table.sort_values(by="cost", inplace=True) - # Checking which resampling strategy is chosen and selecting the appropriate models - is_cv = (self._resampling_strategy == "cv") + # Check which resampling strategy is chosen and selecting the appropriate models + is_cv = self._resampling_strategy == "cv" models = self.cv_models_ if is_cv else self.models_ rank = 1 # Initializing rank for the first model @@ -1869,23 +2006,23 @@ def has_key(rv, key): model_dict = {} # Declaring model dictionary # Inserting model_id, rank, cost and ensemble weight - model_dict['model_id'] = table.loc[model_id]['model_id'].astype(int) - model_dict['rank'] = rank - model_dict['cost'] = table.loc[model_id]['cost'] - model_dict['ensemble_weight'] = table.loc[model_id]['ensemble_weight'] + model_dict["model_id"] = table.loc[model_id]["model_id"].astype(int) + model_dict["rank"] = rank + model_dict["cost"] = table.loc[model_id]["cost"] + model_dict["ensemble_weight"] = table.loc[model_id]["ensemble_weight"] rank += 1 # Incrementing rank by 1 for the next model # The steps in the models pipeline are as follows: # 'data_preprocessor': DataPreprocessor, # 'balancing': Balancing, # 'feature_preprocessor': FeaturePreprocessorChoice, - # 'classifier'/'regressor': ClassifierChoice/RegressorChoice (autosklearn wrapped model) + # 'classifier'/'regressor': ClassifierChoice/RegressorChoice (wrapped model) # For 'cv' (cross validation) strategy if is_cv: # Voting model created by cross validation cv_voting_ensemble = model - model_dict['voting_model'] = cv_voting_ensemble + model_dict["voting_model"] = cv_voting_ensemble # List of models, each trained on one cv fold cv_models = [] @@ -1894,9 +2031,11 @@ def has_key(rv, key): # Adding sklearn model to the model dictionary model_type, autosklearn_wrapped_model = cv_model.steps[-1] - estimator[f'sklearn_{model_type}'] = 
autosklearn_wrapped_model.choice.estimator + estimator[ + f"sklearn_{model_type}" + ] = autosklearn_wrapped_model.choice.estimator cv_models.append(estimator) - model_dict['estimators'] = cv_models + model_dict["estimators"] = cv_models # For any other strategy else: @@ -1905,7 +2044,9 @@ def has_key(rv, key): # Adding sklearn model to the model dictionary model_type, autosklearn_wrapped_model = model.steps[-1] - model_dict[f'sklearn_{model_type}'] = autosklearn_wrapped_model.choice.estimator + model_dict[ + f"sklearn_{model_type}" + ] = autosklearn_wrapped_model.choice.estimator # Insterting model_dict in the ensemble dictionary ensemble_dict[model_id] = model_dict @@ -1920,21 +2061,20 @@ def _create_search_space( include: Optional[Dict[str, List[str]]] = None, exclude: Optional[Dict[str, List[str]]] = None, ): - task_name = 'CreateConfigSpace' + task_name = "CreateConfigSpace" self._stopwatch.start_task(task_name) - configspace_path = os.path.join(tmp_dir, 'space.json') + configspace_path = os.path.join(tmp_dir, "space.json") configuration_space = pipeline.get_configuration_space( datamanager.info, include=include, exclude=exclude, ) configuration_space = self.configuration_space_created_hook( - datamanager, configuration_space) + datamanager, configuration_space + ) backend.write_txt_file( - configspace_path, - cs_json.write(configuration_space), - 'Configuration space' + configspace_path, cs_json.write(configuration_space), "Configuration space" ) self._stopwatch.stop_task(task_name) @@ -1960,9 +2100,9 @@ def __del__(self): class AutoMLClassifier(AutoML): _task_mapping = { - 'multilabel-indicator': MULTILABEL_CLASSIFICATION, - 'multiclass': MULTICLASS_CLASSIFICATION, - 'binary': BINARY_CLASSIFICATION, + "multilabel-indicator": MULTILABEL_CLASSIFICATION, + "multiclass": MULTICLASS_CLASSIFICATION, + "binary": BINARY_CLASSIFICATION, } @classmethod @@ -1985,7 +2125,8 @@ def fit( load_models: bool = True, ): return super().fit( - X, y, + X, + y, X_test=X_test, y_test=y_test, feat_type=feat_type, @@ -1999,7 +2140,7 @@ def fit_pipeline( self, X: SUPPORTED_FEAT_TYPES, y: Union[SUPPORTED_TARGET_TYPES, spmatrix], - config: Union[Configuration, Dict[str, Union[str, float, int]]], + config: Union[Configuration, Dict[str, Union[str, float, int]]], dataset_name: Optional[str] = None, X_test: Optional[SUPPORTED_FEAT_TYPES] = None, y_test: Optional[Union[SUPPORTED_TARGET_TYPES, spmatrix]] = None, @@ -2007,8 +2148,10 @@ def fit_pipeline( **kwargs, ) -> Tuple[Optional[BasePipeline], RunInfo, RunValue]: return super().fit_pipeline( - X=X, y=y, - X_test=X_test, y_test=y_test, + X=X, + y=y, + X_test=X_test, + y_test=y_test, dataset_name=dataset_name, config=config, is_classification=True, @@ -2017,12 +2160,15 @@ def fit_pipeline( ) def predict(self, X, batch_size=None, n_jobs=1): - predicted_probabilities = super().predict(X, batch_size=batch_size, - n_jobs=n_jobs) + predicted_probabilities = super().predict( + X, batch_size=batch_size, n_jobs=n_jobs + ) if self.InputValidator is None or not self.InputValidator._is_fitted: - raise ValueError("predict() is only supported after calling fit. Kindly call first " - "the estimator fit() method.") + raise ValueError( + "predict() is only supported after calling fit. Kindly call first " + "the estimator fit() method." 
+ ) if self.InputValidator.target_validator.is_single_column_target(): predicted_indexes = np.argmax(predicted_probabilities, axis=1) else: @@ -2037,9 +2183,9 @@ def predict_proba(self, X, batch_size=None, n_jobs=1): class AutoMLRegressor(AutoML): _task_mapping = { - 'continuous-multioutput': MULTIOUTPUT_REGRESSION, - 'continuous': REGRESSION, - 'multiclass': REGRESSION, + "continuous-multioutput": MULTIOUTPUT_REGRESSION, + "continuous": REGRESSION, + "multiclass": REGRESSION, } @classmethod @@ -2062,7 +2208,8 @@ def fit( load_models: bool = True, ): return super().fit( - X, y, + X, + y, X_test=X_test, y_test=y_test, feat_type=feat_type, @@ -2076,7 +2223,7 @@ def fit_pipeline( self, X: SUPPORTED_FEAT_TYPES, y: Union[SUPPORTED_TARGET_TYPES, spmatrix], - config: Union[Configuration, Dict[str, Union[str, float, int]]], + config: Union[Configuration, Dict[str, Union[str, float, int]]], dataset_name: Optional[str] = None, X_test: Optional[SUPPORTED_FEAT_TYPES] = None, y_test: Optional[Union[SUPPORTED_TARGET_TYPES, spmatrix]] = None, @@ -2084,8 +2231,10 @@ def fit_pipeline( **kwargs: Dict, ) -> Tuple[Optional[BasePipeline], RunInfo, RunValue]: return super().fit_pipeline( - X=X, y=y, - X_test=X_test, y_test=y_test, + X=X, + y=y, + X_test=X_test, + y_test=y_test, config=config, feat_type=feat_type, dataset_name=dataset_name, diff --git a/autosklearn/constants.py b/autosklearn/constants.py index 60a025999e..2db372925c 100644 --- a/autosklearn/constants.py +++ b/autosklearn/constants.py @@ -7,21 +7,26 @@ MULTIOUTPUT_REGRESSION = 5 REGRESSION_TASKS = [REGRESSION, MULTIOUTPUT_REGRESSION] -CLASSIFICATION_TASKS = [BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION, - MULTILABEL_CLASSIFICATION] +CLASSIFICATION_TASKS = [ + BINARY_CLASSIFICATION, + MULTICLASS_CLASSIFICATION, + MULTILABEL_CLASSIFICATION, +] TASK_TYPES = REGRESSION_TASKS + CLASSIFICATION_TASKS -TASK_TYPES_TO_STRING = \ - {BINARY_CLASSIFICATION: 'binary.classification', - MULTICLASS_CLASSIFICATION: 'multiclass.classification', - MULTILABEL_CLASSIFICATION: 'multilabel.classification', - REGRESSION: 'regression', - MULTIOUTPUT_REGRESSION: 'multioutput.regression'} +TASK_TYPES_TO_STRING = { + BINARY_CLASSIFICATION: "binary.classification", + MULTICLASS_CLASSIFICATION: "multiclass.classification", + MULTILABEL_CLASSIFICATION: "multilabel.classification", + REGRESSION: "regression", + MULTIOUTPUT_REGRESSION: "multioutput.regression", +} -STRING_TO_TASK_TYPES = \ - {'binary.classification': BINARY_CLASSIFICATION, - 'multiclass.classification': MULTICLASS_CLASSIFICATION, - 'multilabel.classification': MULTILABEL_CLASSIFICATION, - 'regression': REGRESSION, - 'multioutput.regression': MULTIOUTPUT_REGRESSION} +STRING_TO_TASK_TYPES = { + "binary.classification": BINARY_CLASSIFICATION, + "multiclass.classification": MULTICLASS_CLASSIFICATION, + "multilabel.classification": MULTILABEL_CLASSIFICATION, + "regression": REGRESSION, + "multioutput.regression": MULTIOUTPUT_REGRESSION, +} diff --git a/autosklearn/data/abstract_data_manager.py b/autosklearn/data/abstract_data_manager.py index 739e707334..0837d59ad0 100644 --- a/autosklearn/data/abstract_data_manager.py +++ b/autosklearn/data/abstract_data_manager.py @@ -2,14 +2,14 @@ from typing import Any, Dict, Union import numpy as np - import scipy.sparse -from autosklearn.pipeline.components.data_preprocessing.feature_type \ - import FeatTypeSplit +from autosklearn.pipeline.components.data_preprocessing.feature_type import ( + FeatTypeSplit, +) -class AbstractDataManager(): +class AbstractDataManager: 
__metaclass__ = abc.ABCMeta def __init__(self, name: str): @@ -47,22 +47,26 @@ def encoder(self, value: FeatTypeSplit) -> FeatTypeSplit: self._encoder = value def __repr__(self) -> str: - return 'DataManager : ' + self.name + return "DataManager : " + self.name def __str__(self) -> str: - val = 'DataManager : ' + self.name + '\ninfo:\n' + val = "DataManager : " + self.name + "\ninfo:\n" for item in self.info: - val = val + '\t' + item + ' = ' + str(self.info[item]) + '\n' - val = val + 'data:\n' + val = val + "\t" + item + " = " + str(self.info[item]) + "\n" + val = val + "data:\n" for subset in self.data: - val = val + '\t%s = %s %s %s\n' % (subset, type(self.data[subset]), - str(self.data[subset].shape), - str(self.data[subset].dtype)) + val = val + "\t%s = %s %s %s\n" % ( + subset, + type(self.data[subset]), + str(self.data[subset].shape), + str(self.data[subset].dtype), + ) if isinstance(self.data[subset], scipy.sparse.spmatrix): - val = val + '\tdensity: %f\n' % \ - (float(len(self.data[subset].data)) / - self.data[subset].shape[0] / - self.data[subset].shape[1]) - val = val + 'feat_type:\t' + str(self.feat_type) + '\n' + val = val + "\tdensity: %f\n" % ( + float(len(self.data[subset].data)) + / self.data[subset].shape[0] + / self.data[subset].shape[1] + ) + val = val + "feat_type:\t" + str(self.feat_type) + "\n" return val diff --git a/autosklearn/data/feature_validator.py b/autosklearn/data/feature_validator.py index 1a21249775..0b7ae8e8b1 100644 --- a/autosklearn/data/feature_validator.py +++ b/autosklearn/data/feature_validator.py @@ -1,19 +1,16 @@ -import logging from typing import Dict, List, Optional, Tuple, Union, cast -import numpy as np +import logging +import numpy as np import pandas as pd from pandas.api.types import is_numeric_dtype, is_sparse - from scipy.sparse import csr_matrix, spmatrix - from sklearn.base import BaseEstimator from sklearn.exceptions import NotFittedError from autosklearn.util.logging_ import PickableLoggerAdapter - SUPPORTED_FEAT_TYPES = Union[List, pd.DataFrame, np.ndarray, spmatrix] @@ -26,34 +23,37 @@ class FeatureValidator(BaseEstimator): Attributes ---------- - feat_type: Optional[List[str]] - In case the dataset is not a pandas DataFrame: - + If provided, this list indicates which columns should be treated as categorical - it is internally transformed into a dictionary that indicates a mapping from - column index to categorical/numerical - + If not provided, by default all columns are treated as numerical - If the input dataset is of type pandas dataframe, this argument - must be none, as the column type will be inferred from the pandas dtypes. - - data_type: - Class name of the data type provided during fit. + feat_type: Optional[List[str]] = None + In case the dataset is not a pandas DataFrame: + + If provided, this list indicates which columns should be treated as + categorical it is internally transformed into a dictionary that + indicates a mapping from column index to categorical/numerical. + + If not provided, by default all columns are treated as numerical + + If the input dataset is of type pandas dataframe, this argument + must be none, as the column type will be inferred from the pandas dtypes. + + data_type: + Class name of the data type provided during fit. 
""" - def __init__(self, - feat_type: Optional[List[str]] = None, - logger: Optional[PickableLoggerAdapter] = None, - ) -> None: + + def __init__( + self, + feat_type: Optional[List[str]] = None, + logger: Optional[PickableLoggerAdapter] = None, + ) -> None: # If a dataframe was provided, we populate # this attribute with a mapping from column to {numerical | categorical} - self.feat_type: Optional[ - Dict[Union[str, int], str] - ] = None + self.feat_type: Optional[Dict[Union[str, int], str]] = None if feat_type is not None: if isinstance(feat_type, dict): self.feat_type = feat_type elif not isinstance(feat_type, List): - raise ValueError("Auto-Sklearn expects a list of categorical/" - "numerical feature types, yet a" - " {} was provided".format(type(feat_type))) + raise ValueError( + "Auto-Sklearn expects a list of categorical/" + "numerical feature types, yet a" + " {} was provided".format(type(feat_type)) + ) else: # Convert to a dictionary which will be passed to the ColumnTransformer @@ -72,7 +72,7 @@ def fit( self, X_train: SUPPORTED_FEAT_TYPES, X_test: Optional[SUPPORTED_FEAT_TYPES] = None, - ) -> 'FeatureValidator': + ) -> "FeatureValidator": """ Validates input data to Auto-Sklearn. The supported data types are List, numpy arrays and pandas DataFrames. @@ -86,7 +86,6 @@ def fit( X_test: Optional[SUPPORTED_FEAT_TYPES] A hold out set of data used for checking """ - # If a list was provided, it will be converted to pandas if isinstance(X_train, List): X_train, X_test = self.list_to_dataframe(X_train, X_test) @@ -96,47 +95,56 @@ def fit( # Handle categorical feature identification for the pipeline if hasattr(X_train, "iloc"): if self.feat_type is not None: - raise ValueError("When providing a DataFrame to Auto-Sklearn, we extract " - "the feature types from the DataFrame.dtypes. That is, " - "providing the option feat_type to the fit method is not " - "supported when using a Dataframe. Please make sure that the " - "type of each column in your DataFrame is properly set. " - "More details about having the correct data type in your " - "DataFrame can be seen in " - "https://pandas.pydata.org/pandas-docs/stable/reference" - "/api/pandas.DataFrame.astype.html") + raise ValueError( + "When providing a DataFrame to Auto-Sklearn, we extract " + "the feature types from the DataFrame.dtypes. That is, " + "providing the option feat_type to the fit method is not " + "supported when using a Dataframe. Please make sure that the " + "type of each column in your DataFrame is properly set. " + "More details about having the correct data type in your " + "DataFrame can be seen in " + "https://pandas.pydata.org/pandas-docs/stable/reference" + "/api/pandas.DataFrame.astype.html" + ) else: self.feat_type = self.get_feat_type_from_columns(X_train) else: # Numpy array was provided if self.feat_type is None: # Assume numerical columns if a numpy array has no feature types - self.feat_type = {i: 'numerical' for i in range(np.shape(X_train)[1])} + self.feat_type = {i: "numerical" for i in range(np.shape(X_train)[1])} else: # Check The feat type provided if len(self.feat_type) != np.shape(X_train)[1]: - raise ValueError('Array feat_type does not have same number of ' - 'variables as X has features. %d vs %d.' % - (len(self.feat_type), np.shape(X_train)[1])) + raise ValueError( + "Array feat_type does not have same number of " + "variables as X has features. %d vs %d." 
+ % (len(self.feat_type), np.shape(X_train)[1]) + ) if not all([isinstance(f, str) for f in self.feat_type.values()]): - raise ValueError("feat_type must only contain strings: {}".format( - list(self.feat_type.values()), - )) + raise ValueError( + "feat_type must only contain strings: {}".format( + list(self.feat_type.values()), + ) + ) for ft in self.feat_type.values(): - if ft.lower() not in ['categorical', 'numerical', 'string']: - raise ValueError('Only `Categorical`, `Numerical` and `String` are ' - 'valid feature types') + if ft.lower() not in ["categorical", "numerical", "string"]: + raise ValueError( + "Only `Categorical`, `Numerical` and `String` are " + "valid feature types" + ) if X_test is not None: self._check_data(X_test) if np.shape(X_train)[1] != np.shape(X_test)[1]: - raise ValueError("The feature dimensionality of the train and test " - "data does not match train({}) != test({})".format( - np.shape(X_train)[1], - np.shape(X_test)[1] - )) + raise ValueError( + "The feature dimensionality of the train and test " + "data does not match train({}) != test({})".format( + np.shape(X_train)[1], np.shape(X_test)[1] + ) + ) self._is_fitted = True @@ -162,7 +170,9 @@ def transform( The transformed array """ if not self._is_fitted: - raise NotFittedError("Cannot call transform on a validator that is not fitted") + raise NotFittedError( + "Cannot call transform on a validator that is not fitted" + ) # If a list was provided, it will be converted to pandas if isinstance(X, List): @@ -177,9 +187,11 @@ def transform( # Not all sparse format support index sorting if isinstance(X_transformed, spmatrix): if not isinstance(X_transformed, csr_matrix): - self.logger.warning(f"Sparse data provided is of type {type(X_transformed)} " - "yet Auto-Sklearn only support csr_matrix. Auto-sklearn " - "will convert the provided data to the csr_matrix format.") + self.logger.warning( + f"Sparse data provided is of type {type(X_transformed)} " + "yet Auto-Sklearn only support csr_matrix. Auto-sklearn " + "will convert the provided data to the csr_matrix format." 
+ ) X_transformed = X_transformed.tocsr(copy=False) X_transformed.sort_indices() @@ -195,45 +207,42 @@ def _check_data( Parameters ---------- - X: SUPPORTED_FEAT_TYPES - A set of features that are going to be validated (type and dimensionality - checks) and a encoder fitted in the case the data needs encoding + X: SUPPORTED_FEAT_TYPES + A set of features that are going to be validated (type and dimensionality) + and a encoder fitted in the case the data needs encoding """ - # We consider columns that are all nan in a pandas frame as category - if hasattr(X, 'columns'): + if hasattr(X, "columns"): for column in cast(pd.DataFrame, X).columns: if X[column].isna().all(): - X[column] = X[column].astype('category') + X[column] = X[column].astype("category") - if not isinstance(X, (np.ndarray, pd.DataFrame)) and not isinstance(X, spmatrix): - raise ValueError("Auto-sklearn only supports Numpy arrays, Pandas DataFrames," - " scipy sparse and Python Lists, yet, the provided input is" - " of type {}".format( - type(X) - )) + if not isinstance(X, (np.ndarray, pd.DataFrame)) and not isinstance( + X, spmatrix + ): + raise ValueError( + "Auto-sklearn only supports Numpy arrays, Pandas DataFrames," + " scipy sparse and Python Lists, yet, the provided input is" + " of type {}".format(type(X)) + ) if self.data_type is None: self.data_type = type(X) if self.data_type != type(X): - self.logger.warning("Auto-sklearn previously received features of type %s " - "yet the current features have type %s. Changing the dtype " - "of inputs to an estimator might cause problems" % ( - str(self.data_type), - str(type(X)), - ), - ) + self.logger.warning( + f"Auto-sklearn previously received features of type {self.data_type} " + f"yet the current features have type {type(X)}. Changing the dtype " + "of inputs to an estimator might cause problems" + ) # Do not support category/string numpy data. Only numbers if hasattr(X, "dtype"): if not np.issubdtype(X.dtype.type, np.number): # type: ignore[union-attr] raise ValueError( - "When providing a numpy array to Auto-sklearn, the only valid " - "dtypes are numerical ones. The provided data type {} is not supported." - "".format( - X.dtype.type, # type: ignore[union-attr] - ) + "When providing a numpy array to Auto-sklearn, the only valid" + f" dtypes are numerical ones. The provided data type {X.dtype.type}" + " is not supported." ) # Then for Pandas, we do not support Nan in categorical columns @@ -247,12 +256,14 @@ def _check_data( # To support list, we need to support object inference. # In extreme cases, the train column might be all integer, # and the test column might be float. - self.logger.warning("Changing the dtype of the features after fit() is " - "not recommended. Fit() method was called with " - "{} whereas the new features have {} as type".format( - self.dtypes, - dtypes, - )) + self.logger.warning( + "Changing the dtype of the features after fit() is " + "not recommended. 
Fit() method was called with "
+                    "{} whereas the new features have {} as type".format(
+                        self.dtypes,
+                        dtypes,
+                    )
+                )
         else:
             self.dtypes = dtypes
 
@@ -266,53 +277,52 @@ def get_feat_type_from_columns(
 
         Parameters
         ----------
-        X: pd.DataFrame
-            A set of features that are going to be validated (type and dimensionality
-            checks) and a encoder fitted in the case the data needs encoding
+        X: pd.DataFrame
+            A set of features that are going to be validated (type and dimensionality
+            checks) and an encoder fitted in the case the data needs encoding
+
         Returns
         -------
-        feat_type:
-            dictionary with column to feature type mapping
+        feat_type:
+            dictionary with column to feature type mapping
         """
-        # Also, register the feature types for the estimator
         feat_type = {}
 
         # Make sure each column is a valid type
         for i, column in enumerate(X.columns):
             if is_sparse(X[column]):
-                raise ValueError("Auto-sklearn does not yet support sparse pandas Series."
-                                 f" Please convert {column} to a dense format.")
-            elif X[column].dtype.name in ['category', 'bool']:
-                feat_type[column] = 'categorical'
+                raise ValueError(
+                    "Auto-sklearn does not yet support sparse pandas Series."
+                    f" Please convert {column} to a dense format."
+                )
+            elif X[column].dtype.name in ["category", "bool"]:
+                feat_type[column] = "categorical"
             elif X[column].dtype.name == "string":
-                feat_type[column] = 'string'
+                feat_type[column] = "string"
             # Move away from np.issubdtype as it causes
             # TypeError: data type not understood in certain pandas types
             elif not is_numeric_dtype(X[column]):
-                if X[column].dtype.name == 'object':
+                if X[column].dtype.name == "object":
                     raise ValueError(
-                        "Input Column {} has invalid type object. "
+                        f"Input Column {column} has invalid type object. "
                         "Cast it to a valid dtype before using it in Auto-Sklearn. "
                         "Valid types are numerical, categorical or boolean. "
                         "You can cast it to a valid dtype using "
                        "pandas.Series.astype ."
                         "If working with string objects, the following "
                         "tutorial illustrates how to work with text data: "
-                        "https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html".format(  # noqa: E501
-                            column,
-                        )
+                        "https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html"  # noqa: E501
                     )
                 elif pd.core.dtypes.common.is_datetime_or_timedelta_dtype(
                     X[column].dtype
                 ):
                     raise ValueError(
-                        "Auto-sklearn does not support time and/or date datatype as given "
-                        "in column {}. Please convert the time information to a numerical value "
-                        "first. One example on how to do this can be found on "
-                        "https://stats.stackexchange.com/questions/311494/".format(
-                            column,
-                        )
+                        "Auto-sklearn does not support time and/or date datatype as "
+                        f"given in column {column}. Please convert the time "
+                        "information to a numerical value first. One example on how to"
+                        " do this can be found on "
+                        "https://stats.stackexchange.com/questions/311494/"
                     )
                 else:
                     raise ValueError(
@@ -325,7 +335,7 @@ def get_feat_type_from_columns(
                         )
                     )
             else:
-                feat_type[column] = 'numerical'
+                feat_type[column] = "numerical"
         return feat_type
 
     def list_to_dataframe(
@@ -334,31 +344,32 @@ def list_to_dataframe(
         X_test: Optional[SUPPORTED_FEAT_TYPES] = None,
     ) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
         """
-        Converts a list to a pandas DataFrame. In this process, column types are inferred.
+        Converts a list to a DataFrame. In this process, column types are inferred.
If test data is provided, we proactively match it to train data Parameters ---------- - X_train: SUPPORTED_FEAT_TYPES - A set of features that are going to be validated (type and dimensionality - checks) and a encoder fitted in the case the data needs encoding - X_test: Optional[SUPPORTED_FEAT_TYPES] - A hold out set of data used for checking + X_train: SUPPORTED_FEAT_TYPES + A set of features that are going to be validated (type and dimensionality + checks) and a encoder fitted in the case the data needs encoding + + X_test: Optional[SUPPORTED_FEAT_TYPES] + A hold out set of data used for checking + Returns ------- - pd.DataFrame: - transformed train data from list to pandas DataFrame - pd.DataFrame: - transformed test data from list to pandas DataFrame + Union[pd.DataFrame, pd.DataFrame]: + transformed (train, test) data from list to pandas DataFrame """ - # If a list was provided, it will be converted to pandas X_train = pd.DataFrame(data=X_train).convert_dtypes() # Store the dtypes and use in case of re-fit if len(self.dtypes) == 0: - self.dtypes = {col: X_train[col].dtype.name.lower() for col in X_train.columns} + self.dtypes = { + col: X_train[col].dtype.name.lower() for col in X_train.columns + } else: for col in X_train.columns: # Try to convert to the original dtype used to fit the validator @@ -367,25 +378,31 @@ def list_to_dataframe( try: X_train[col] = X_train[col].astype(self.dtypes[col]) except Exception as e: - self.logger.warning(f"Failed to format column {col} as {self.dtypes[col]}: {e}") + self.logger.warning( + f"Failed to format column {col} as {self.dtypes[col]}: {e}" + ) self.dtypes[col] = X_train[col].dtype.name.lower() - self.logger.warning("The provided feature types to autosklearn are of type list." - "Features have been interpreted as: {}".format( - [(col, t) for col, t in zip(X_train.columns, X_train.dtypes)] - )) + self.logger.warning( + "The provided feature types to autosklearn are of type list." + "Features have been interpreted as: {}".format( + [(col, t) for col, t in zip(X_train.columns, X_train.dtypes)] + ) + ) if X_test is not None: if not isinstance(X_test, List): - self.logger.warning("Train features are a list while the provided test data" - "is {}. X_test will be casted as DataFrame.".format( - type(X_test) - )) + self.logger.warning( + "Train features are a list while the provided test data" + "is {}. 
X_test will be casted as DataFrame.".format(type(X_test)) + ) X_test = pd.DataFrame(data=X_test) for col in X_test.columns: try: X_test[col] = X_test[col].astype(self.dtypes[col]) except Exception as e: - self.logger.warning(f"Failed to format column {col} as {self.dtypes[col]}: {e}") + self.logger.warning( + f"Failed to format column {col} as {self.dtypes[col]}: {e}" + ) self.dtypes[col] = X_test[col].dtype.name.lower() return X_train, X_test diff --git a/autosklearn/data/target_validator.py b/autosklearn/data/target_validator.py index 9f6d2e74b5..030a40b9b0 100644 --- a/autosklearn/data/target_validator.py +++ b/autosklearn/data/target_validator.py @@ -1,16 +1,14 @@ +from typing import List, Optional, Type, Union, cast + import logging import warnings -from typing import List, Optional, Type, Union, cast import numpy as np - import pandas as pd +import sklearn.utils from pandas.api.types import is_numeric_dtype from pandas.core.dtypes.base import ExtensionDtype - from scipy.sparse import spmatrix - -import sklearn.utils from sklearn.base import BaseEstimator from sklearn.exceptions import NotFittedError from sklearn.preprocessing import OrdinalEncoder @@ -18,29 +16,34 @@ from autosklearn.util.logging_ import PickableLoggerAdapter - SUPPORTED_TARGET_TYPES = Union[List, pd.Series, pd.DataFrame, np.ndarray, spmatrix] class TargetValidator(BaseEstimator): - """ - A class to pre-process targets. It validates the data provided during fit (to make sure - it matches Sklearn expectation) as well as encoding the targets in case of classification + """A class to pre-process targets. + + It validates the data provided during fit to make sure it matches Sklearn + expectation as well as encoding the targets in case of classification + Attributes ---------- - is_classification: bool - A bool that indicates if the validator should operate in classification mode. - During classification, the targets are encoded. - encoder: Optional[BaseEstimator] - Host a encoder object if the data requires transformation (for example, - if provided a categorical column in a pandas DataFrame) - enc_columns: List[str] - List of columns that where encoded + is_classification: bool + A bool that indicates if the validator should operate in classification mode. + During classification, the targets are encoded. + + encoder: Optional[BaseEstimator] + Host a encoder object if the data requires transformation (for example, if + provided a categorical column in a pandas DataFrame). + + enc_columns: List[str] + List of columns that where encoded """ - def __init__(self, - is_classification: bool = False, - logger: Optional[PickableLoggerAdapter] = None, - ) -> None: + + def __init__( + self, + is_classification: bool = False, + logger: Optional[PickableLoggerAdapter] = None, + ) -> None: self.is_classification = is_classification self.data_type = None # type: Optional[type] @@ -66,7 +69,7 @@ def fit( self, y_train: Union[List, np.ndarray, pd.Series, pd.DataFrame], y_test: Optional[Union[List, np.ndarray, pd.Series, pd.DataFrame]] = None, - ) -> 'TargetValidator': + ) -> "TargetValidator": """ Validates and fit a categorical encoder (if needed) to the targets The supported data types are List, numpy arrays and pandas DataFrames. 
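For orientation before the next hunks of target_validator.py, a minimal usage sketch of the fit/transform/inverse_transform round trip that this class implements (toy labels; the import path and the exact encoded values shown in the comments are assumptions for illustration, not taken from this diff):

import pandas as pd

from autosklearn.data.target_validator import TargetValidator

# Classification mode: fit() validates the labels and fits an OrdinalEncoder on them
y_train = pd.Series(["cat", "dog", "dog", "cat"])
validator = TargetValidator(is_classification=True)
validator.fit(y_train)

encoded = validator.transform(y_train)           # numeric codes, e.g. array([0., 1., 1., 0.])
restored = validator.inverse_transform(encoded)  # back to the original string labels
print(validator.classes_)                        # unique classes seen during fitting, e.g. ['cat' 'dog']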
@@ -87,31 +90,40 @@ def fit( self._check_data(y_test) if len(shape) != len(np.shape(y_test)) or ( - len(shape) > 1 and (shape[1] != np.shape(y_test)[1])): - raise ValueError("The dimensionality of the train and test" - " targets do not match" - f" train {np.shape(y_train)}" - f" != test {np.shape(y_test)}") + len(shape) > 1 and (shape[1] != np.shape(y_test)[1]) + ): + raise ValueError( + "The dimensionality of the train and test" + " targets do not match" + f" train {np.shape(y_train)}" + f" != test {np.shape(y_test)}" + ) if isinstance(y_train, pd.DataFrame): if not isinstance(y_test, pd.DataFrame): y_test = pd.DataFrame(y_test) if y_train.columns.tolist() != y_test.columns.tolist(): - raise ValueError("Train and test targets must both have the" - f" same columns, yet y={y_train.columns}" - f" and y_test={y_test.columns}") + raise ValueError( + "Train and test targets must both have the" + f" same columns, yet y={y_train.columns}" + f" and y_test={y_test.columns}" + ) if list(y_train.dtypes) != list(y_test.dtypes): - raise ValueError("Train and test targets must both have the same dtypes") + raise ValueError( + "Train and test targets must both have the same dtypes" + ) if self.out_dimensionality is None: self.out_dimensionality = 1 if len(shape) == 1 else shape[1] else: _n_outputs = 1 if len(shape) == 1 else shape[1] if self.out_dimensionality != _n_outputs: - raise ValueError('Number of outputs changed from %d to %d!' % - (self.out_dimensionality, _n_outputs)) + raise ValueError( + "Number of outputs changed from %d to %d!" + % (self.out_dimensionality, _n_outputs) + ) # Fit on the training data self._fit(y_train, y_test) @@ -124,7 +136,7 @@ def _fit( self, y_train: Union[List, np.ndarray, pd.Series, pd.DataFrame], y_test: Optional[Union[List, np.ndarray, pd.Series, pd.DataFrame]] = None, - ) -> 'TargetValidator': + ) -> "TargetValidator": """ If dealing with classification, this utility encodes the targets. @@ -139,7 +151,7 @@ def _fit( y_test: Optional[SUPPORTED_TARGET_TYPES] A holdout set of labels """ - if not self.is_classification or self.type_of_target == 'multilabel-indicator': + if not self.is_classification or self.type_of_target == "multilabel-indicator": # Only fit an encoder for classification tasks # Also, encoding multilabel indicator data makes the data multiclass # Let the user employ a MultiLabelBinarizer if needed @@ -150,16 +162,17 @@ def _fit( shape = np.shape(y_train) ndim = len(shape) if ndim > 1 and shape[1] != 1: - # We should not reach this if statement as we check for type of targets before - raise ValueError("Multi-dimensional classification is not yet" - " supported. Encoding multidimensional data" - " converts multiple columns to a 1 dimensional encoding." - f" Data involved = {shape}/{self.type_of_target}") + # We should not reach this if statement, we check for type of targets before + raise ValueError( + "Multi-dimensional classification is not yet" + " supported. Encoding multidimensional data" + " converts multiple columns to a 1 dimensional encoding." 
+ f" Data involved = {shape}/{self.type_of_target}" + ) # Creat the encoder self.encoder = OrdinalEncoder( - handle_unknown='use_encoded_value', - unknown_value=-1 + handle_unknown="use_encoded_value", unknown_value=-1 ) # Clear typing to just numpy arrays and pandas @@ -172,10 +185,12 @@ def _fit( # inverse_transform to try corretly restore it's dtype if isinstance(y, pd.Series): if isinstance(y.dtype, ExtensionDtype): - warnings.warn("Fitting transformer with a pandas series which" - f" has the dtype {y.dtype}. Inverse transform" - " may not be able preserve dtype when converting" - " to np.ndarray") + warnings.warn( + "Fitting transformer with a pandas series which" + f" has the dtype {y.dtype}. Inverse transform" + " may not be able preserve dtype when converting" + " to np.ndarray" + ) if is_numeric_dtype(y.dtype): self.dtype = y.dtype elif isinstance(y, pd.DataFrame): @@ -187,7 +202,7 @@ def _fit( # Merge y_test and y_train for encoding if y_test is not None: - if isinstance(y, (pd.Series, pd.DataFrame)): + if isinstance(y, (pd.Series, pd.DataFrame)): if isinstance(y, pd.Series): y_test = pd.Series(y_test) else: @@ -257,7 +272,7 @@ def transform( y_transformed = sklearn.utils.check_array( y_transformed, force_all_finite=True, - accept_sparse='csr', + accept_sparse="csr", ensure_2d=False, ) @@ -271,7 +286,7 @@ def inverse_transform( self, y: Union[List, pd.Series, pd.DataFrame, np.ndarray], ) -> np.ndarray: - """ Revert any encoding transformation done on a target array. + """Revert any encoding transformation done on a target array. Parameters ---------- @@ -318,9 +333,7 @@ def inverse_transform( return y_inv def is_single_column_target(self) -> bool: - """ - Output is encoded with a single column encoding - """ + """Output is encoded with a single column encoding""" return self.out_dimensionality == 1 def _check_data( @@ -332,38 +345,41 @@ def _check_data( Parameters ---------- - y: Union[np.ndarray, pd.DataFrame, pd.Series] - A set of features whose dimensionality and data type is going to be checked + y: Union[np.ndarray, pd.DataFrame, pd.Series] + A set of features whose dimensionality and data type is going to be checked """ - if not isinstance( - y, (np.ndarray, pd.DataFrame, List, pd.Series)) and not isinstance(y, spmatrix): - raise ValueError("Auto-sklearn only supports Numpy arrays, Pandas DataFrames," - " pd.Series, sparse data and Python Lists as targets, yet, " - "the provided input is of type {}".format( - type(y) - )) + y, (np.ndarray, pd.DataFrame, List, pd.Series) + ) and not isinstance(y, spmatrix): + raise ValueError( + "Auto-sklearn only supports Numpy arrays, Pandas DataFrames," + " pd.Series, sparse data and Python Lists as targets, yet, " + "the provided input is of type {}".format(type(y)) + ) if isinstance(y, spmatrix) and not np.issubdtype(y.dtype.type, np.number): - raise ValueError("When providing a sparse matrix as targets, the only supported " - "values are numerical. Please consider using a dense" - " instead." - ) + raise ValueError( + "When providing a sparse matrix as targets, the only supported " + "values are numerical. Please consider using a dense" + " instead." + ) if self.data_type is None: self.data_type = type(y) if self.data_type != type(y): - self.logger.warning("Auto-sklearn previously received targets of type %s " - "yet the current features have type %s. 
Changing the dtype " - "of inputs to an estimator might cause problems" % ( - str(self.data_type), - str(type(y)), - ), - ) + self.logger.warning( + "Auto-sklearn previously received targets of type %s " + "yet the current features have type %s. Changing the dtype " + "of inputs to an estimator might cause problems" + % ( + str(self.data_type), + str(type(y)), + ), + ) # No Nan is supported has_nan_values = False - if hasattr(y, 'iloc'): + if hasattr(y, "iloc"): has_nan_values = cast(pd.DataFrame, y).isnull().values.any() if isinstance(y, spmatrix): @@ -374,34 +390,39 @@ def _check_data( # but NaN, are not equal to themselves: has_nan_values = not np.array_equal(y, y) if has_nan_values: - raise ValueError("Target values cannot contain missing/NaN values. " - "This is not supported by scikit-learn. " - ) + raise ValueError( + "Target values cannot contain missing/NaN values. " + "This is not supported by scikit-learn. " + ) # Pandas Series is not supported for multi-label indicator # This format checks are done by type of target try: self.type_of_target = type_of_target(y) except Exception as e: - raise ValueError("The provided data could not be interpreted by Sklearn. " - "While determining the type of the targets via type_of_target " - "run into exception: {}.".format(e)) - - supported_output_types = ('binary', - 'continuous', - 'continuous-multioutput', - 'multiclass', - 'multilabel-indicator', - # Notice unknown/multiclass-multioutput are not supported - # This can only happen during testing only as estimators - # should filter out unsupported types. - ) + raise ValueError( + "The provided data could not be interpreted by Sklearn. " + "While determining the type of the targets via type_of_target " + "run into exception: {}.".format(e) + ) + + supported_output_types = ( + "binary", + "continuous", + "continuous-multioutput", + "multiclass", + "multilabel-indicator", + # Notice unknown/multiclass-multioutput are not supported + # This can only happen during testing only as estimators + # should filter out unsupported types. + ) if self.type_of_target not in supported_output_types: - raise ValueError("Provided targets are not supported by Auto-Sklearn. " - "Provided type is {} whereas supported types are {}.".format( - self.type_of_target, - supported_output_types - )) + raise ValueError( + "Provided targets are not supported by Auto-Sklearn. " + "Provided type is {} whereas supported types are {}.".format( + self.type_of_target, supported_output_types + ) + ) @property def classes_(self) -> np.ndarray: @@ -410,10 +431,11 @@ def classes_(self) -> np.ndarray: which consist of a ndarray of shape (n_classes,) where n_classes are the number of classes seen while fitting a encoder to the targets. 
+ Returns ------- - classes_: np.ndarray - The unique classes seen during encoding of a classifier + classes_: np.ndarray + The unique classes seen during encoding of a classifier """ if self.encoder is None: return np.array([]) diff --git a/autosklearn/data/validation.py b/autosklearn/data/validation.py index d06082258d..89aaca85c0 100644 --- a/autosklearn/data/validation.py +++ b/autosklearn/data/validation.py @@ -1,23 +1,21 @@ # -*- encoding: utf-8 -*- -import logging from typing import List, Optional, Tuple, Union -import numpy as np +import logging +import numpy as np import pandas as pd - from scipy.sparse import spmatrix - from sklearn.base import BaseEstimator from sklearn.exceptions import NotFittedError -from autosklearn.data.feature_validator import FeatureValidator, SUPPORTED_FEAT_TYPES +from autosklearn.data.feature_validator import SUPPORTED_FEAT_TYPES, FeatureValidator from autosklearn.data.target_validator import SUPPORTED_TARGET_TYPES, TargetValidator from autosklearn.util.logging_ import get_named_client_logger def convert_if_sparse( - y: SUPPORTED_TARGET_TYPES + y: SUPPORTED_TARGET_TYPES, ) -> Union[np.ndarray, List, pd.DataFrame, pd.Series]: """If the labels `y` are sparse, it will convert it to its dense representation @@ -51,26 +49,32 @@ class InputValidator(BaseEstimator): This class also perform checks for data integrity and flags the user via informative errors. + Attributes ---------- - feat_type: Optional[List[str]] - In case the dataset is not a pandas DataFrame: - + If provided, this list indicates which columns should be treated as categorical - it is internally transformed into a dictionary that indicates a mapping from - column index to categorical/numerical - + If not provided, by default all columns are treated as numerical - If the input dataset is of type pandas dataframe, this argument - must be none, as the column type will be inferred from the pandas dtypes. - is_classification: bool - For classification task, this flag indicates that the target data - should be encoded - feature_validator: FeatureValidator - A FeatureValidator instance used to validate and encode feature columns to match - sklearn expectations on the data - target_validator: TargetValidator - A TargetValidator instance used to validate and encode (in case of classification) - the target values + feat_type: Optional[List[str]] = None + In case the dataset is not a pandas DataFrame: + + If provided, this list indicates which columns should be treated as + categorical it is internally transformed into a dictionary that + indicates a mapping from column index to categorical/numerical. + + If not provided, by default all columns are treated as numerical + + If the input dataset is of type pandas dataframe, this argument + must be none, as the column type will be inferred from the pandas dtypes. 
+ + is_classification: bool + For classification task, this flag indicates that the target data + should be encoded + + feature_validator: FeatureValidator + A FeatureValidator instance used to validate and encode feature columns to match + sklearn expectations on the data + + target_validator: TargetValidator + A TargetValidator instance used for classification to validate and encode the + target values """ + def __init__( self, feat_type: Optional[List[str]] = None, @@ -82,16 +86,18 @@ def __init__( self.logger_port = logger_port if self.logger_port is not None: self.logger = get_named_client_logger( - name='Validation', + name="Validation", port=self.logger_port, ) else: - self.logger = logging.getLogger('Validation') - - self.feature_validator = FeatureValidator(feat_type=self.feat_type, - logger=self.logger) - self.target_validator = TargetValidator(is_classification=self.is_classification, - logger=self.logger) + self.logger = logging.getLogger("Validation") + + self.feature_validator = FeatureValidator( + feat_type=self.feat_type, logger=self.logger + ) + self.target_validator = TargetValidator( + is_classification=self.is_classification, logger=self.logger + ) self._is_fitted = False def fit( @@ -106,45 +112,55 @@ def fit( a encoder for targets in the case of classification. Specifically: For features: - + Valid data types are enforced (List, np.ndarray, pd.DataFrame, pd.Series, scipy - sparse) as well as dimensionality checks - + If the provided data is a pandas DataFrame with categorical/boolean/int columns, - such columns will be encoded using an Ordinal Encoder + Valid data types are enforced (List, np.ndarray, pd.DataFrame, pd.Series, scipy + sparse) as well as dimensionality checks + + If the provided data is a pandas DataFrame with categorical/boolean/int columns, + such columns will be encoded using an Ordinal Encoder + For targets: - + Checks for dimensionality as well as missing values are performed. - + If performing a classification task, the data is going to be encoded + * Checks for dimensionality as well as missing values are performed. + * If performing a classification task, the data is going to be encoded Parameters ---------- - X_train: SUPPORTED_FEAT_TYPES - A set of features that are going to be validated (type and dimensionality - checks). If this data contains categorical columns, an encoder is going to - be instantiated and trained with this data. - y_train: SUPPORTED_TARGET_TYPES - A set of targets that are going to be encoded if the task is for classification - X_test: Optional[SUPPORTED_FEAT_TYPES] - A hold out set of features used for checking - y_test: SUPPORTED_TARGET_TYPES - A hold out set of targets used for checking. Additionally, if the current task - is a classification task, this y_test categories are also going to be used to - fit a pre-processing encoding (to prevent errors on unseen classes). + X_train: SUPPORTED_FEAT_TYPES + A set of features that are going to be validated (type and dimensionality + checks). If this data contains categorical columns, an encoder is going to + be instantiated and trained with this data. + + y_train: SUPPORTED_TARGET_TYPES + A set of targets to encoded if the task is for classification. + + X_test: Optional[SUPPORTED_FEAT_TYPES] + A hold out set of features used for checking + + y_test: SUPPORTED_TARGET_TYPES + A hold out set of targets used for checking. 
Additionally, if the current + task is a classification task, this y_test categories are also going to be + used to fit a pre-processing encoding (to prevent errors on unseen classes). + Returns ------- - self + self """ # Check that the data is valid if np.shape(X_train)[0] != np.shape(y_train)[0]: - raise ValueError("Inconsistent number of train datapoints for features and targets," - " {} for features and {} for targets".format( - np.shape(X_train)[0], - np.shape(y_train)[0], - )) + raise ValueError( + "Inconsistent number of train datapoints for features and targets," + " {} for features and {} for targets".format( + np.shape(X_train)[0], + np.shape(y_train)[0], + ) + ) if X_test is not None and np.shape(X_test)[0] != np.shape(y_test)[0]: - raise ValueError("Inconsistent number of test datapoints for features and targets," - " {} for features and {} for targets".format( - np.shape(X_test)[0], - np.shape(y_test)[0], - )) + raise ValueError( + "Inconsistent number of test datapoints for features and targets," + " {} for features and {} for targets".format( + np.shape(X_test)[0], + np.shape(y_test)[0], + ) + ) self.feature_validator.fit(X_train, X_test) self.target_validator.fit(y_train, y_test) @@ -175,7 +191,9 @@ def transform( The transformed targets array """ if not self._is_fitted: - raise NotFittedError("Cannot call transform on a validator that is not fitted") + raise NotFittedError( + "Cannot call transform on a validator that is not fitted" + ) X_transformed = self.feature_validator.transform(X) if y is not None: diff --git a/autosklearn/data/xy_data_manager.py b/autosklearn/data/xy_data_manager.py index 4c539157ee..d8cd467214 100644 --- a/autosklearn/data/xy_data_manager.py +++ b/autosklearn/data/xy_data_manager.py @@ -2,9 +2,7 @@ from typing import Dict, Optional, Union, cast import numpy as np - import pandas as pd - from scipy import sparse from autosklearn.constants import ( @@ -15,14 +13,10 @@ REGRESSION, ) from autosklearn.data.abstract_data_manager import AbstractDataManager -from autosklearn.data.validation import ( - SUPPORTED_FEAT_TYPES, - SUPPORTED_TARGET_TYPES, -) +from autosklearn.data.validation import SUPPORTED_FEAT_TYPES, SUPPORTED_TARGET_TYPES class XYDataManager(AbstractDataManager): - def __init__( self, X: SUPPORTED_FEAT_TYPES, @@ -31,52 +25,59 @@ def __init__( y_test: Optional[SUPPORTED_TARGET_TYPES], task: int, feat_type: Dict[Union[str, int], str], - dataset_name: str + dataset_name: str, ): super(XYDataManager, self).__init__(dataset_name) - self.info['task'] = task + self.info["task"] = task if sparse.issparse(X): - self.info['is_sparse'] = 1 - self.info['has_missing'] = np.all(np.isfinite(cast(sparse.csr_matrix, X).data)) + self.info["is_sparse"] = 1 + self.info["has_missing"] = np.all( + np.isfinite(cast(sparse.csr_matrix, X).data) + ) else: - self.info['is_sparse'] = 0 - if hasattr(X, 'iloc'): - self.info['has_missing'] = cast(pd.DataFrame, X).isnull().values.any() + self.info["is_sparse"] = 0 + if hasattr(X, "iloc"): + self.info["has_missing"] = cast(pd.DataFrame, X).isnull().values.any() else: - self.info['has_missing'] = np.all(np.isfinite(X)) + self.info["has_missing"] = np.all(np.isfinite(X)) label_num = { REGRESSION: 1, BINARY_CLASSIFICATION: 2, MULTIOUTPUT_REGRESSION: np.shape(y)[-1], MULTICLASS_CLASSIFICATION: len(np.unique(y)), - MULTILABEL_CLASSIFICATION: np.shape(y)[-1] + MULTILABEL_CLASSIFICATION: np.shape(y)[-1], } - self.info['label_num'] = label_num[task] + self.info["label_num"] = label_num[task] - self.data['X_train'] = X - 
self.data['Y_train'] = y + self.data["X_train"] = X + self.data["Y_train"] = y if X_test is not None: - self.data['X_test'] = X_test + self.data["X_test"] = X_test if y_test is not None: - self.data['Y_test'] = y_test + self.data["Y_test"] = y_test if isinstance(feat_type, dict): self.feat_type = feat_type else: - raise ValueError("Unsupported feat_type provided. We expect the user to " - "provide a Dict[str, str] mapping from column to categorical/ " - "numerical.") + raise ValueError( + "Unsupported feat_type provided. We expect the user to " + "provide a Dict[str, str] mapping from column to categorical/ " + "numerical." + ) # TODO: try to guess task type! if len(np.shape(y)) > 2: - raise ValueError('y must not have more than two dimensions, ' - 'but has %d.' % len(np.shape(y))) + raise ValueError( + "y must not have more than two dimensions, " + "but has %d." % len(np.shape(y)) + ) if np.shape(X)[0] != np.shape(y)[0]: - raise ValueError('X and y must have the same number of ' - 'datapoints, but have %d and %d.' % (np.shape(X)[0], - np.shape(y)[0])) + raise ValueError( + "X and y must have the same number of " + "datapoints, but have %d and %d." % (np.shape(X)[0], np.shape(y)[0]) + ) diff --git a/autosklearn/ensemble_builder.py b/autosklearn/ensemble_builder.py index e337726b0e..3707ce84c9 100644 --- a/autosklearn/ensemble_builder.py +++ b/autosklearn/ensemble_builder.py @@ -1,21 +1,21 @@ # -*- encoding: utf-8 -*- +from typing import List, Optional, Tuple, Union + import glob import gzip -import math -import numbers import logging.handlers +import math import multiprocessing +import numbers import os import pickle import re import shutil import time import traceback -from typing import List, Optional, Tuple, Union import zlib import dask.distributed - import numpy as np import pandas as pd import pynisher @@ -24,12 +24,13 @@ from smac.runhistory.runhistory import RunInfo, RunValue from smac.tae.base import StatusType +from autosklearn.automl_common.common.ensemble_building.abstract_ensemble import ( # noqa: E501 + AbstractEnsemble, +) from autosklearn.automl_common.common.utils.backend import Backend -from autosklearn.automl_common.common.ensemble_building.abstract_ensemble import AbstractEnsemble - from autosklearn.constants import BINARY_CLASSIFICATION -from autosklearn.metrics import calculate_score, calculate_loss, Scorer from autosklearn.ensembles.ensemble_selection import EnsembleSelection +from autosklearn.metrics import Scorer, calculate_loss, calculate_score from autosklearn.util.logging_ import get_named_client_logger from autosklearn.util.parallel import preload_modules @@ -37,7 +38,7 @@ Y_VALID = 1 Y_TEST = 2 -MODEL_FN_RE = r'_([0-9]*)_([0-9]*)_([0-9]{1,3}\.[0-9]*)\.npy' +MODEL_FN_RE = r"_([0-9]*)_([0-9]*)_([0-9]{1,3}\.[0-9]*)\.npy" class EnsembleBuilderManager(IncorporateRunResultCallback): @@ -59,62 +60,78 @@ def __init__( ensemble_memory_limit: Optional[int], random_state: Union[int, np.random.RandomState], logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, - pynisher_context: str = 'fork', + pynisher_context: str = "fork", ): - """ SMAC callback to handle ensemble building + """SMAC callback to handle ensemble building Parameters ---------- start_time: int - the time when this job was started, to account for any latency in job allocation + the time when this job was started, to account for any latency in job + allocation. + time_left_for_ensemble: int - How much time is left for the task. 
Job should finish within this allocated time + How much time is left for the task. Job should finish within this + allocated time + backend: util.backend.Backend backend to write and read files + dataset_name: str name of dataset + task_type: int type of ML task + metric: str name of metric to compute the loss of the given predictions + ensemble_size: int - maximal size of ensemble (passed to autosklearn.ensemble.ensemble_selection) + maximal size of ensemble + ensemble_nbest: int/float if int: consider only the n best prediction if float: consider only this fraction of the best models Both wrt to validation predictions If performance_range_threshold > 0, might return less models + max_models_on_disc: int Defines the maximum number of models that are kept in the disc. - If int, it must be greater or equal than 1, and dictates the max number of - models to keep. - If float, it will be interpreted as the max megabytes allowed of disc space. That - is, if the number of ensemble candidates require more disc space than this float - value, the worst models will be deleted to keep within this budget. - Models and predictions of the worst-performing models will be deleted then. - If None, the feature is disabled. - It defines an upper bound on the models that can be used in the ensemble. + + If int, it must be greater or equal than 1, and dictates the max + number of models to keep. + + If float, it will be interpreted as the max megabytes allowed of + disc space. That is, if the number of ensemble candidates require more + disc space than this float value, the worst models will be deleted to + keep within this budget. Models and predictions of the worst-performing + models will be deleted then. + + If None, the feature is disabled. It defines an upper bound on the + models that can be used in the ensemble. + seed: int random seed + max_iterations: int maximal number of iterations to run this script (default None --> deactivated) + precision: [16,32,64,128] precision of floats to read the predictions + memory_limit: Optional[int] memory limit in mb. If ``None``, no memory limit is enforced. + read_at_most: int read at most n new prediction files in each iteration + logger_port: int port that receives logging records + pynisher_context: str The multiprocessing context for pynisher. One of spawn/fork/forkserver. - Returns - ------- - List[Tuple[int, float, float, float]]: - A list with the performance history of this ensemble, of the form - [[pandas_timestamp, train_performance, val_performance, test_performance], ...] 
""" self.start_time = start_time self.time_left_for_ensembles = time_left_for_ensembles @@ -148,26 +165,31 @@ def __init__( def __call__( self, - smbo: 'SMBO', + smbo: "SMBO", run_info: RunInfo, result: RunValue, time_left: float, ): + """ + Returns + ------- + List[Tuple[int, float, float, float]]: + A list with the performance history of this ensemble, of the form + [(pandas_timestamp, train_performance, val_performance, test_performance)] + """ if result.status in (StatusType.STOP, StatusType.ABORT) or smbo._stop: return self.build_ensemble(smbo.tae_runner.client) def build_ensemble( - self, - dask_client: dask.distributed.Client, - unit_test: bool = False + self, dask_client: dask.distributed.Client, unit_test: bool = False ) -> None: # The second criteria is elapsed time elapsed_time = time.time() - self.start_time logger = get_named_client_logger( - name='EnsembleBuilder', + name="EnsembleBuilder", port=self.logger_port, ) @@ -181,10 +203,8 @@ def build_ensemble( return if self.max_iterations is not None and self.max_iterations <= self.iteration: logger.info( - "Terminate ensemble building because of max iterations: {} of {}".format( - self.max_iterations, - self.iteration - ) + "Terminate ensemble building because of max iterations:" + f" {self.max_iterations} of {self.iteration}" ) return @@ -193,11 +213,13 @@ def build_ensemble( result = self.futures.pop().result() if result: ensemble_history, self.ensemble_nbest, _, _, _ = result - logger.debug("iteration={} @ elapsed_time={} has history={}".format( - self.iteration, - elapsed_time, - ensemble_history, - )) + logger.debug( + "iteration={} @ elapsed_time={} has history={}".format( + self.iteration, + elapsed_time, + ensemble_history, + ) + ) self.history.extend(ensemble_history) # Only submit new jobs if the previous ensemble job finished @@ -215,28 +237,30 @@ def build_ensemble( # see it in the dask diagnostic dashboard # Notice that the forked ensemble_builder_process will # wait for the below function to be done - self.futures.append(dask_client.submit( - fit_and_return_ensemble, - backend=self.backend, - dataset_name=self.dataset_name, - task_type=self.task, - metric=self.metric, - ensemble_size=self.ensemble_size, - ensemble_nbest=self.ensemble_nbest, - max_models_on_disc=self.max_models_on_disc, - seed=self.seed, - precision=self.precision, - memory_limit=self.ensemble_memory_limit, - read_at_most=self.read_at_most, - random_state=self.random_state, - end_at=self.start_time + self.time_left_for_ensembles, - iteration=self.iteration, - return_predictions=False, - priority=100, - pynisher_context=self.pynisher_context, - logger_port=self.logger_port, - unit_test=unit_test, - )) + self.futures.append( + dask_client.submit( + fit_and_return_ensemble, + backend=self.backend, + dataset_name=self.dataset_name, + task_type=self.task, + metric=self.metric, + ensemble_size=self.ensemble_size, + ensemble_nbest=self.ensemble_nbest, + max_models_on_disc=self.max_models_on_disc, + seed=self.seed, + precision=self.precision, + memory_limit=self.ensemble_memory_limit, + read_at_most=self.read_at_most, + random_state=self.random_state, + end_at=self.start_time + self.time_left_for_ensembles, + iteration=self.iteration, + return_predictions=False, + priority=100, + pynisher_context=self.pynisher_context, + logger_port=self.logger_port, + unit_test=unit_test, + ) + ) logger.info( "{}/{} Started Ensemble builder job at {} for iteration {}.".format( @@ -276,11 +300,11 @@ def fit_and_return_ensemble( memory_limit: Optional[int] = None, 
random_state: Optional[Union[int, np.random.RandomState]] = None, ) -> Tuple[ - List[Tuple[int, float, float, float]], - int, - Optional[np.ndarray], - Optional[np.ndarray], - Optional[np.ndarray], + List[Tuple[int, float, float, float]], + int, + Optional[np.ndarray], + Optional[np.ndarray], + Optional[np.ndarray], ]: """ @@ -291,60 +315,79 @@ def fit_and_return_ensemble( ---------- backend: util.backend.Backend backend to write and read files + dataset_name: str name of dataset + metric: str name of metric to compute the loss of the given predictions + task_type: int type of ML task + ensemble_size: int maximal size of ensemble (passed to autosklearn.ensemble.ensemble_selection) + ensemble_nbest: int/float if int: consider only the n best prediction if float: consider only this fraction of the best models Both wrt to validation predictions If performance_range_threshold > 0, might return less models + max_models_on_disc: int Defines the maximum number of models that are kept in the disc. + If int, it must be greater or equal than 1, and dictates the max number of models to keep. - If float, it will be interpreted as the max megabytes allowed of disc space. That - is, if the number of ensemble candidates require more disc space than this float - value, the worst models will be deleted to keep within this budget. - Models and predictions of the worst-performing models will be deleted then. + + If float, it will be interpreted as the max megabytes allowed of disc space. + That is, if the number of ensemble candidates require more disc space than + this float value, the worst models will be deleted to keep within this + budget. Models and predictions of the worst-performing models will be + deleted then. + If None, the feature is disabled. It defines an upper bound on the models that can be used in the ensemble. + seed: int random seed + precision: [16,32,64,128] precision of floats to read the predictions + read_at_most: int read at most n new prediction files in each iteration + end_at: float - At what time the job must finish. Needs to be the endtime and not the time left - because we do not know when dask schedules the job. + At what time the job must finish. Needs to be the endtime and not the + time left because we do not know when dask schedules the job. + iteration: int The current iteration + pynisher_context: str Context to use for multiprocessing, can be either fork, spawn or forkserver. + logger_port: int = DEFAULT_TCP_LOGGING_PORT The port where the logging server is listening to. + unit_test: bool = False - Turn on unit testing mode. This currently makes fit_ensemble raise a MemoryError. - Having this is very bad coding style, but I did not find a way to make - unittest.mock work through the pynisher with all spawn contexts. If you know a - better solution, please let us know by opening an issue. + Turn on unit testing mode. This currently makes fit_ensemble raise a + MemoryError. Having this is very bad coding style, but I did not find a way + to make unittest.mock work through the pynisher with all spawn contexts. + If you know a better solution, please let us know by opening an issue. + memory_limit: Optional[int] = None memory limit in mb. If ``None``, no memory limit is enforced. + random_state: Optional[int | RandomState] = None A random state used for the ensemble selection process. 
Returns ------- - List[Tuple[int, float, float, float]] - A list with the performance history of this ensemble, of the form - [[pandas_timestamp, train_performance, val_performance, test_performance], ...] - + List[Tuple[int, float, float, float]] + A list with the performance history of this ensemble, of the form + [(pandas_timestamp, train_performance, val_performance, test_performance)] """ result = EnsembleBuilder( backend=backend, @@ -390,58 +433,58 @@ def __init__( unit_test: bool = False, ): """ - Constructor - - Parameters - ---------- - backend: util.backend.Backend - backend to write and read files - dataset_name: str - name of dataset - task_type: int - type of ML task - metric: str - name of metric to compute the loss of the given predictions - ensemble_size: int = 10 - maximal size of ensemble (passed to autosklearn.ensemble.ensemble_selection) - ensemble_nbest: int | float = 100 - if int: consider only the n best prediction - if float: consider only this fraction of the best models - Both with respect to the validation predictions - If performance_range_threshold > 0, might return less models - max_models_on_disc: int = 100 - Defines the maximum number of models that are kept in the disc. - If int, it must be greater or equal than 1, and dictates the max number of - models to keep. - If float, it will be interpreted as the max megabytes allowed of disc space. That - is, if the number of ensemble candidates require more disc space than this float - value, the worst models will be deleted to keep within this budget. - Models and predictions of the worst-performing models will be deleted then. - If None, the feature is disabled. - It defines an upper bound on the models that can be used in the ensemble. - performance_range_threshold: float = 0 - Keep only models that are better than: - dummy + (best - dummy)*performance_range_threshold - E.g dummy=2, best=4, thresh=0.5 --> only consider models with loss > 3 - Will at most return the minimum between ensemble_nbest models, - and max_models_on_disc. Might return less - seed: int = 1 - random seed that is used as part of the filename - precision: int in [16,32,64,128] = 32 - precision of floats to read the predictions - memory_limit: Optional[int] = 1024 - memory limit in mb. If ``None``, no memory limit is enforced. - read_at_most: int = 5 - read at most n new prediction files in each iteration - logger_port: int = DEFAULT_TCP_LOGGING_PORT - port that receives logging records - random_state: Optional[int | RandomState] = None - An int or RandomState object used for generating the ensemble. - unit_test: bool = False - Turn on unit testing mode. This currently makes fit_ensemble raise a MemoryError. - Having this is very bad coding style, but I did not find a way to make - unittest.mock work through the pynisher with all spawn contexts. If you know a - better solution, please let us know by opening an issue. 
+ Constructor + + Parameters + ---------- + backend: util.backend.Backend + backend to write and read files + dataset_name: str + name of dataset + task_type: int + type of ML task + metric: str + name of metric to compute the loss of the given predictions + ensemble_size: int = 10 + maximal size of ensemble (passed to autosklearn.ensemble.ensemble_selection) + ensemble_nbest: int | float = 100 + if int: consider only the n best prediction + if float: consider only this fraction of the best models + Both with respect to the validation predictions + If performance_range_threshold > 0, might return less models + max_models_on_disc: int = 100 + Defines the maximum number of models that are kept in the disc. + If int, it must be greater or equal than 1, and dictates the max number of + models to keep. + If float, it will be interpreted as the max megabytes allowed of disc space. + That is, if the number of ensemble candidates require more disc space than + this float value, the worst models are deleted to keep within this budget. + Models and predictions of the worst-performing models will be deleted then. + If None, the feature is disabled. + It defines an upper bound on the models that can be used in the ensemble. + performance_range_threshold: float = 0 + Keep only models that are better than: + dummy + (best - dummy)*performance_range_threshold + E.g dummy=2, best=4, thresh=0.5 --> only consider models with loss > 3 + Will at most return the minimum between ensemble_nbest models, + and max_models_on_disc. Might return less + seed: int = 1 + random seed that is used as part of the filename + precision: int in [16,32,64,128] = 32 + precision of floats to read the predictions + memory_limit: Optional[int] = 1024 + memory limit in mb. If ``None``, no memory limit is enforced. + read_at_most: int = 5 + read at most n new prediction files in each iteration + logger_port: int = DEFAULT_TCP_LOGGING_PORT + port that receives logging records + random_state: Optional[int | RandomState] = None + An int or RandomState object used for generating the ensemble. + unit_test: bool = False + Turn on unit testing mode. This currently makes fit_ensemble raise + a MemoryError. Having this is very bad coding style, but I did not find a + way to make unittest.mock work through the pynisher with all spawn contexts. + If you know a better solution, please let us know by opening an issue. """ super(EnsembleBuilder, self).__init__() @@ -454,13 +497,15 @@ def __init__( self.performance_range_threshold = performance_range_threshold if isinstance(ensemble_nbest, numbers.Integral) and ensemble_nbest < 1: - raise ValueError("Integer ensemble_nbest has to be larger 1: %s" % - ensemble_nbest) + raise ValueError( + "Integer ensemble_nbest has to be larger 1: %s" % ensemble_nbest + ) elif not isinstance(ensemble_nbest, numbers.Integral): if ensemble_nbest < 0 or ensemble_nbest > 1: raise ValueError( - "Float ensemble_nbest best has to be >= 0 and <= 1: %s" % - ensemble_nbest) + "Float ensemble_nbest best has to be >= 0 and <= 1: %s" + % ensemble_nbest + ) self.ensemble_nbest = ensemble_nbest @@ -469,9 +514,7 @@ def __init__( # max number of ensemble models. 
max_resident_models keeps the # maximum number of models in disc if max_models_on_disc is not None and max_models_on_disc < 0: - raise ValueError( - "max_models_on_disc has to be a positive number or None" - ) + raise ValueError("max_models_on_disc has to be a positive number or None") self.max_models_on_disc = max_models_on_disc self.max_resident_models = None @@ -485,13 +528,15 @@ def __init__( # Setup the logger self.logger_port = logger_port self.logger = get_named_client_logger( - name='EnsembleBuilder', + name="EnsembleBuilder", port=self.logger_port, ) if ensemble_nbest == 1: - self.logger.debug("Behaviour depends on int/float: %s, %s (ensemble_nbest, type)" % - (ensemble_nbest, type(ensemble_nbest))) + self.logger.debug( + "Behaviour depends on int/float: %s, %s (ensemble_nbest, type)" + % (ensemble_nbest, type(ensemble_nbest)) + ) self.start_time = 0 self.model_fn_re = re.compile(MODEL_FN_RE) @@ -528,8 +573,7 @@ def __init__( # we save the state of this dictionary to memory # and read it if available self.ensemble_memory_file = os.path.join( - self.backend.internals_directory, - 'ensemble_read_preds.pkl' + self.backend.internals_directory, "ensemble_read_preds.pkl" ) if os.path.exists(self.ensemble_memory_file): try: @@ -537,15 +581,12 @@ def __init__( self.read_preds, self.last_hash = pickle.load(memory) except Exception as e: self.logger.warning( - "Could not load the previous iterations of ensemble_builder predictions." - "This might impact the quality of the run. Exception={} {}".format( - e, - traceback.format_exc(), - ) + "Could not load the previous iterations of ensemble_builder" + " predictions. This might impact the quality of the run." + f" Exception={e} {traceback.format_exc()}" ) self.ensemble_loss_file = os.path.join( - self.backend.internals_directory, - 'ensemble_read_losses.pkl' + self.backend.internals_directory, "ensemble_read_losses.pkl" ) if os.path.exists(self.ensemble_loss_file): try: @@ -560,17 +601,17 @@ def __init__( ) ) - # hidden feature which can be activated via an environment variable. This keeps all - # models and predictions which have ever been a candidate. This is necessary to post-hoc - # compute the whole ensemble building trajectory. + # hidden feature which can be activated via an environment variable. + # This keeps all models and predictions which have ever been a candidate. + # This is necessary to post-hoc compute the whole ensemble building trajectory. 
self._has_been_candidate = set() self.validation_performance_ = np.inf # Track the ensemble performance datamanager = self.backend.load_datamanager() - self.y_valid = datamanager.data.get('Y_valid') - self.y_test = datamanager.data.get('Y_test') + self.y_valid = datamanager.data.get("Y_valid") + self.y_test = datamanager.data.get("Y_test") del datamanager self.ensemble_history = [] @@ -585,12 +626,12 @@ def run( ): if time_left is None and end_at is None: - raise ValueError('Must provide either time_left or end_at.') + raise ValueError("Must provide either time_left or end_at.") elif time_left is not None and end_at is not None: - raise ValueError('Cannot provide both time_left and end_at.') + raise ValueError("Cannot provide both time_left and end_at.") self.logger = get_named_client_logger( - name='EnsembleBuilder', + name="EnsembleBuilder", port=self.logger_port, ) @@ -624,37 +665,44 @@ def run( # if ensemble script died because of memory error, # reduce nbest to reduce memory consumption and try it again - # ATTENTION: main will start from scratch; # all data structures are empty again + # ATTENTION: main will start from scratch; + # all data structures are empty again try: os.remove(self.ensemble_memory_file) except: # noqa E722 pass - if isinstance(self.ensemble_nbest, numbers.Integral) and self.ensemble_nbest <= 1: + if ( + isinstance(self.ensemble_nbest, numbers.Integral) + and self.ensemble_nbest <= 1 + ): if self.read_at_most == 1: self.logger.error( - "Memory Exception -- Unable to further reduce the number of ensemble " - "members and can no further limit the number of ensemble members " - "loaded per iteration -- please restart Auto-sklearn with a higher " - "value for the argument `memory_limit` (current limit is %s MB). " - "The ensemble builder will keep running to delete files from disk in " - "case this was enabled.", self.memory_limit + "Memory Exception -- Unable to further reduce the number" + " of ensemble members and can no further limit the number" + " of ensemble members loaded per iteration, please restart" + " Auto-sklearn with a higher value for the argument" + f" `memory_limit` (current limit is {self.memory_limit}MB)." + " The ensemble builder will keep running to delete files" + " from disk in case this was enabled.", ) self.ensemble_nbest = 0 else: self.read_at_most = 1 self.logger.warning( - "Memory Exception -- Unable to further reduce the number of ensemble " - "members -- Now reducing the number of predictions per call to read " - "at most to 1." + "Memory Exception -- Unable to further reduce the number of" + " ensemble members. Now reducing the number of predictions" + " per call to read at most to 1." ) else: if isinstance(self.ensemble_nbest, numbers.Integral): self.ensemble_nbest = max(1, int(self.ensemble_nbest / 2)) else: self.ensemble_nbest = self.ensemble_nbest / 2 - self.logger.warning("Memory Exception -- restart with " - "less ensemble_nbest: %d" % self.ensemble_nbest) + self.logger.warning( + "Memory Exception -- restart with " + "less ensemble_nbest: %d" % self.ensemble_nbest + ) return [], self.ensemble_nbest, None, None, None else: return safe_ensemble_script.result @@ -667,7 +715,7 @@ def main(self, time_left, iteration, return_predictions): # the logger configuration. 
So we have to set it up # accordingly self.logger = get_named_client_logger( - name='EnsembleBuilder', + name="EnsembleBuilder", port=self.logger_port, ) @@ -676,7 +724,7 @@ def main(self, time_left, iteration, return_predictions): used_time = time.time() - self.start_time self.logger.debug( - 'Starting iteration %d, time left: %f', + "Starting iteration %d, time left: %f", iteration, time_left - used_time, ) @@ -684,7 +732,13 @@ def main(self, time_left, iteration, return_predictions): # populates self.read_preds and self.read_losses if not self.compute_loss_per_model(): if return_predictions: - return self.ensemble_history, self.ensemble_nbest, train_pred, valid_pred, test_pred + return ( + self.ensemble_history, + self.ensemble_nbest, + train_pred, + valid_pred, + test_pred, + ) else: return self.ensemble_history, self.ensemble_nbest, None, None, None @@ -693,22 +747,40 @@ def main(self, time_left, iteration, return_predictions): candidate_models = self.get_n_best_preds() if not candidate_models: # no candidates yet if return_predictions: - return self.ensemble_history, self.ensemble_nbest, train_pred, valid_pred, test_pred + return ( + self.ensemble_history, + self.ensemble_nbest, + train_pred, + valid_pred, + test_pred, + ) else: return self.ensemble_history, self.ensemble_nbest, None, None, None # populates predictions in self.read_preds # reduces selected models if file reading failed - n_sel_valid, n_sel_test = self. \ - get_valid_test_preds(selected_keys=candidate_models) + n_sel_valid, n_sel_test = self.get_valid_test_preds( + selected_keys=candidate_models + ) # If valid/test predictions loaded, then reduce candidate models to this set - if len(n_sel_test) != 0 and len(n_sel_valid) != 0 \ - and len(set(n_sel_valid).intersection(set(n_sel_test))) == 0: + if ( + len(n_sel_test) != 0 + and len(n_sel_valid) != 0 + and len(set(n_sel_valid).intersection(set(n_sel_test))) == 0 + ): # Both n_sel_* have entries, but there is no overlap, this is critical - self.logger.error("n_sel_valid and n_sel_test are not empty, but do not overlap") + self.logger.error( + "n_sel_valid and n_sel_test are not empty, but do not overlap" + ) if return_predictions: - return self.ensemble_history, self.ensemble_nbest, train_pred, valid_pred, test_pred + return ( + self.ensemble_history, + self.ensemble_nbest, + train_pred, + valid_pred, + test_pred, + ) else: return self.ensemble_history, self.ensemble_nbest, None, None, None @@ -716,24 +788,31 @@ def main(self, time_left, iteration, return_predictions): # then ensure candidate_models AND n_sel_test are sorted the same candidate_models_set = set(candidate_models) if candidate_models_set.intersection(n_sel_valid).intersection(n_sel_test): - candidate_models = sorted(list(candidate_models_set.intersection( - n_sel_valid).intersection(n_sel_test))) + candidate_models = sorted( + list( + candidate_models_set.intersection(n_sel_valid).intersection( + n_sel_test + ) + ) + ) n_sel_test = candidate_models n_sel_valid = candidate_models elif candidate_models_set.intersection(n_sel_valid): - candidate_models = sorted(list(candidate_models_set.intersection( - n_sel_valid))) + candidate_models = sorted( + list(candidate_models_set.intersection(n_sel_valid)) + ) n_sel_valid = candidate_models elif candidate_models_set.intersection(n_sel_test): - candidate_models = sorted(list(candidate_models_set.intersection( - n_sel_test))) + candidate_models = sorted( + list(candidate_models_set.intersection(n_sel_test)) + ) n_sel_test = candidate_models else: # This has to be the 
case n_sel_test = [] n_sel_valid = [] - if os.environ.get('ENSEMBLE_KEEP_ALL_CANDIDATES'): + if os.environ.get("ENSEMBLE_KEEP_ALL_CANDIDATES"): for candidate in candidate_models: self._has_been_candidate.add(candidate) @@ -744,8 +823,9 @@ def main(self, time_left, iteration, return_predictions): if ensemble is not None and self.SAVE2DISC: self.backend.save_ensemble(ensemble, iteration, self.seed) - # Delete files of non-candidate models - can only be done after fitting the ensemble and - # saving it to disc so we do not accidentally delete models in the previous ensemble + # Delete files of non-candidate models - can only be done after fitting the + # ensemble and saving it to disc so we do not accidentally delete models in + # the previous ensemble if self.max_resident_models is not None: self._delete_excess_models(selected_keys=candidate_models) @@ -754,39 +834,47 @@ def main(self, time_left, iteration, return_predictions): pickle.dump(self.read_losses, memory) if ensemble is not None: - train_pred = self.predict(set_="train", - ensemble=ensemble, - selected_keys=candidate_models, - n_preds=len(candidate_models), - index_run=iteration) + train_pred = self.predict( + set_="train", + ensemble=ensemble, + selected_keys=candidate_models, + n_preds=len(candidate_models), + index_run=iteration, + ) # We can't use candidate_models here, as n_sel_* might be empty - valid_pred = self.predict(set_="valid", - ensemble=ensemble, - selected_keys=n_sel_valid, - n_preds=len(candidate_models), - index_run=iteration) + valid_pred = self.predict( + set_="valid", + ensemble=ensemble, + selected_keys=n_sel_valid, + n_preds=len(candidate_models), + index_run=iteration, + ) # TODO if predictions fails, build the model again during the # next iteration! - test_pred = self.predict(set_="test", - ensemble=ensemble, - selected_keys=n_sel_test, - n_preds=len(candidate_models), - index_run=iteration) + test_pred = self.predict( + set_="test", + ensemble=ensemble, + selected_keys=n_sel_test, + n_preds=len(candidate_models), + index_run=iteration, + ) # Add a score to run history to see ensemble progress - self._add_ensemble_trajectory( - train_pred, - valid_pred, - test_pred - ) + self._add_ensemble_trajectory(train_pred, valid_pred, test_pred) - # The loaded predictions and the hash can only be saved after the ensemble has been + # The loaded predictions and hash can only be saved after the ensemble has been # built, because the hash is computed during the construction of the ensemble with open(self.ensemble_memory_file, "wb") as memory: pickle.dump((self.read_preds, self.last_hash), memory) if return_predictions: - return self.ensemble_history, self.ensemble_nbest, train_pred, valid_pred, test_pred + return ( + self.ensemble_history, + self.ensemble_nbest, + train_pred, + valid_pred, + test_pred, + ) else: return self.ensemble_history, self.ensemble_nbest, None, None, None @@ -803,10 +891,14 @@ def get_disk_consumption(self, pred_path): _budget = float(match.group(3)) stored_files_for_run = os.listdir( - self.backend.get_numrun_directory(_seed, _num_run, _budget)) + self.backend.get_numrun_directory(_seed, _num_run, _budget) + ) stored_files_for_run = [ - os.path.join(self.backend.get_numrun_directory(_seed, _num_run, _budget), file_name) - for file_name in stored_files_for_run] + os.path.join( + self.backend.get_numrun_directory(_seed, _num_run, _budget), file_name + ) + for file_name in stored_files_for_run + ] this_model_cost = sum([os.path.getsize(path) for path in stored_files_for_run]) # get the megabytes @@ 
-814,8 +906,8 @@ def get_disk_consumption(self, pred_path): def compute_loss_per_model(self): """ - Compute the loss of the predictions on ensemble building data set; - populates self.read_preds and self.read_losses + Compute the loss of the predictions on ensemble building data set; + populates self.read_preds and self.read_losses """ self.logger.debug("Read ensemble data set predictions") @@ -832,17 +924,21 @@ def compute_loss_per_model(self): pred_path = os.path.join( glob.escape(self.backend.get_runs_directory()), - '%d_*_*' % self.seed, - 'predictions_ensemble_%s_*_*.npy*' % self.seed, + "%d_*_*" % self.seed, + "predictions_ensemble_%s_*_*.npy*" % self.seed, ) y_ens_files = glob.glob(pred_path) - y_ens_files = [y_ens_file for y_ens_file in y_ens_files - if y_ens_file.endswith('.npy') or y_ens_file.endswith('.npy.gz')] + y_ens_files = [ + y_ens_file + for y_ens_file in y_ens_files + if y_ens_file.endswith(".npy") or y_ens_file.endswith(".npy.gz") + ] self.y_ens_files = y_ens_files # no validation predictions so far -- no files if len(self.y_ens_files) == 0: - self.logger.debug("Found no prediction files on ensemble data set:" - " %s" % pred_path) + self.logger.debug( + "Found no prediction files on ensemble data set:" " %s" % pred_path + ) return False # First sort files chronologically @@ -858,15 +954,18 @@ def compute_loss_per_model(self): n_read_files = 0 # Now read file wrt to num_run - for y_ens_fn, match, _seed, _num_run, _budget, mtime in \ - sorted(to_read, key=lambda x: x[5]): + for y_ens_fn, match, _seed, _num_run, _budget, mtime in sorted( + to_read, key=lambda x: x[5] + ): if self.read_at_most and n_read_files >= self.read_at_most: # limit the number of files that will be read # to limit memory consumption break if not y_ens_fn.endswith(".npy") and not y_ens_fn.endswith(".npy.gz"): - self.logger.info('Error loading file (not .npy or .npy.gz): %s', y_ens_fn) + self.logger.info( + "Error loading file (not .npy or .npy.gz): %s", y_ens_fn + ) continue if not self.read_losses.get(y_ens_fn): @@ -884,7 +983,7 @@ def compute_loss_per_model(self): # 1 - loaded and in memory # 2 - loaded but dropped again # 3 - deleted from disk due to space constraints - "loaded": 0 + "loaded": 0, } if not self.read_preds.get(y_ens_fn): self.read_preds[y_ens_fn] = { @@ -900,16 +999,18 @@ def compute_loss_per_model(self): # actually read the predictions and compute their respective loss try: y_ensemble = self._read_np_fn(y_ens_fn) - loss = calculate_loss(solution=self.y_true_ensemble, - prediction=y_ensemble, - task_type=self.task_type, - metric=self.metric, - scoring_functions=None) + loss = calculate_loss( + solution=self.y_true_ensemble, + prediction=y_ensemble, + task_type=self.task_type, + metric=self.metric, + scoring_functions=None, + ) if np.isfinite(self.read_losses[y_ens_fn]["ens_loss"]): self.logger.debug( - 'Changing ensemble loss for file %s from %f to %f ' - 'because file modification time changed? %f - %f', + "Changing ensemble loss for file %s from %f to %f " + "because file modification time changed? %f - %f", y_ens_fn, self.read_losses[y_ens_fn]["ens_loss"], loss, @@ -923,39 +1024,38 @@ def compute_loss_per_model(self): # To save memory, we just compute the loss. 
self.read_losses[y_ens_fn]["mtime_ens"] = os.path.getmtime(y_ens_fn) self.read_losses[y_ens_fn]["loaded"] = 2 - self.read_losses[y_ens_fn]["disc_space_cost_mb"] = self.get_disk_consumption( - y_ens_fn - ) + self.read_losses[y_ens_fn][ + "disc_space_cost_mb" + ] = self.get_disk_consumption(y_ens_fn) n_read_files += 1 except Exception: self.logger.warning( - 'Error loading %s: %s', + "Error loading %s: %s", y_ens_fn, traceback.format_exc(), ) self.read_losses[y_ens_fn]["ens_loss"] = np.inf self.logger.debug( - 'Done reading %d new prediction files. Loaded %d predictions in ' - 'total.', + "Done reading %d new prediction files. Loaded %d predictions in " "total.", n_read_files, - np.sum([pred["loaded"] > 0 for pred in self.read_losses.values()]) + np.sum([pred["loaded"] > 0 for pred in self.read_losses.values()]), ) return True def get_n_best_preds(self): """ - get best n predictions (i.e., keys of self.read_losses) - according to the loss on the "ensemble set" - n: self.ensemble_nbest - - Side effects: - ->Define the n-best models to use in ensemble - ->Only the best models are loaded - ->Any model that is not best is candidate to deletion - if max models in disc is exceeded. + get best n predictions (i.e., keys of self.read_losses) + according to the loss on the "ensemble set" + n: self.ensemble_nbest + + Side effects: + ->Define the n-best models to use in ensemble + ->Only the best models are loaded + ->Any model that is not best is candidate to deletion + if max models in disc is exceeded. """ sorted_keys = self._get_list_of_sorted_preds() @@ -982,31 +1082,39 @@ def get_n_best_preds(self): # no model left; try to use dummy loss (num_run==0) # log warning when there are other models but not better than dummy model if num_keys > num_dummy: - self.logger.warning("No models better than random - using Dummy loss!" - "Number of models besides current dummy model: %d. " - "Number of dummy models: %d", - num_keys - 1, - num_dummy) + self.logger.warning( + "No models better than random - using Dummy loss!" + "Number of models besides current dummy model: %d. " + "Number of dummy models: %d", + num_keys - 1, + num_dummy, + ) sorted_keys = [ - (k, v["ens_loss"], v["num_run"]) for k, v in self.read_losses.items() + (k, v["ens_loss"], v["num_run"]) + for k, v in self.read_losses.items() if v["seed"] == self.seed and v["num_run"] == 1 ] # reload predictions if losses changed over time and a model is # considered to be in the top models again! if not isinstance(self.ensemble_nbest, numbers.Integral): # Transform to number of models to keep. 
Keep at least one - keep_nbest = max(1, min(len(sorted_keys), - int(len(sorted_keys) * self.ensemble_nbest))) + keep_nbest = max( + 1, min(len(sorted_keys), int(len(sorted_keys) * self.ensemble_nbest)) + ) self.logger.debug( "Library pruning: using only top %f percent of the models for ensemble " "(%d out of %d)", - self.ensemble_nbest * 100, keep_nbest, len(sorted_keys) + self.ensemble_nbest * 100, + keep_nbest, + len(sorted_keys), ) else: # Keep only at most ensemble_nbest keep_nbest = min(self.ensemble_nbest, len(sorted_keys)) - self.logger.debug("Library Pruning: using for ensemble only " - " %d (out of %d) models" % (keep_nbest, len(sorted_keys))) + self.logger.debug( + "Library Pruning: using for ensemble only " + " %d (out of %d) models" % (keep_nbest, len(sorted_keys)) + ) # If max_models_on_disc is None, do nothing # One can only read at most max_models_on_disc models @@ -1016,21 +1124,28 @@ def get_n_best_preds(self): [ v["ens_loss"], v["disc_space_cost_mb"], - ] for v in self.read_losses.values() if v["disc_space_cost_mb"] is not None + ] + for v in self.read_losses.values() + if v["disc_space_cost_mb"] is not None ] max_consumption = max(c[1] for c in consumption) # We are pessimistic with the consumption limit indicated by # max_models_on_disc by 1 model. Such model is assumed to spend # max_consumption megabytes - if (sum(c[1] for c in consumption) + max_consumption) > self.max_models_on_disc: + if ( + sum(c[1] for c in consumption) + max_consumption + ) > self.max_models_on_disc: # just leave the best -- smaller is better! # This list is in descending order, to preserve the best models - sorted_cum_consumption = np.cumsum([ - c[1] for c in list(sorted(consumption)) - ]) + max_consumption - max_models = np.argmax(sorted_cum_consumption > self.max_models_on_disc) + sorted_cum_consumption = ( + np.cumsum([c[1] for c in list(sorted(consumption))]) + + max_consumption + ) + max_models = np.argmax( + sorted_cum_consumption > self.max_models_on_disc + ) # Make sure that at least 1 model survives self.max_resident_models = max(1, max_models) @@ -1040,7 +1155,7 @@ def get_n_best_preds(self): self.max_models_on_disc, (sum(c[1] for c in consumption) + max_consumption), max_consumption, - self.max_resident_models + self.max_resident_models, ) ) else: @@ -1048,11 +1163,15 @@ def get_n_best_preds(self): else: self.max_resident_models = self.max_models_on_disc - if self.max_resident_models is not None and keep_nbest > self.max_resident_models: + if ( + self.max_resident_models is not None + and keep_nbest > self.max_resident_models + ): self.logger.debug( "Restricting the number of models to %d instead of %d due to argument " "max_models_on_disc", - self.max_resident_models, keep_nbest, + self.max_resident_models, + keep_nbest, ) keep_nbest = self.max_resident_models @@ -1069,9 +1188,12 @@ def get_n_best_preds(self): # but always keep at least one model current_loss = sorted_keys[i][1] if current_loss >= worst_loss: - self.logger.debug("Dynamic Performance range: " - "Further reduce from %d to %d models", - keep_nbest, max(1, i)) + self.logger.debug( + "Dynamic Performance range: " + "Further reduce from %d to %d models", + keep_nbest, + max(1, i), + ) keep_nbest = max(1, i) break ensemble_n_best = keep_nbest @@ -1085,38 +1207,33 @@ def get_n_best_preds(self): self.read_preds[k][Y_ENSEMBLE] = None self.read_preds[k][Y_VALID] = None self.read_preds[k][Y_TEST] = None - if self.read_losses[k]['loaded'] == 1: + if self.read_losses[k]["loaded"] == 1: self.logger.debug( - 'Dropping model %s 
(%d,%d) with loss %f.', + "Dropping model %s (%d,%d) with loss %f.", k, - self.read_losses[k]['seed'], - self.read_losses[k]['num_run'], - self.read_losses[k]['ens_loss'], + self.read_losses[k]["seed"], + self.read_losses[k]["num_run"], + self.read_losses[k]["ens_loss"], ) - self.read_losses[k]['loaded'] = 2 + self.read_losses[k]["loaded"] = 2 # Load the predictions for the winning for k in sorted_keys[:ensemble_n_best]: if ( - ( - k not in self.read_preds or - self.read_preds[k][Y_ENSEMBLE] is None - ) - and self.read_losses[k]['loaded'] != 3 - ): + k not in self.read_preds or self.read_preds[k][Y_ENSEMBLE] is None + ) and self.read_losses[k]["loaded"] != 3: self.read_preds[k][Y_ENSEMBLE] = self._read_np_fn(k) # No need to load valid and test here because they are loaded # only if the model ends up in the ensemble - self.read_losses[k]['loaded'] = 1 + self.read_losses[k]["loaded"] = 1 # return keys of self.read_losses with lowest losses return sorted_keys[:ensemble_n_best] - def get_valid_test_preds(self, selected_keys: List[str]) -> Tuple[List[str], List[str]]: - """ - get valid and test predictions from disc - and store them in self.read_preds - + def get_valid_test_preds( + self, selected_keys: List[str] + ) -> Tuple[List[str], List[str]]: + """Get valid and test predictions from disc and store them in self.read_preds Parameters --------- selected_keys: list @@ -1135,35 +1252,47 @@ def get_valid_test_preds(self, selected_keys: List[str]) -> Tuple[List[str], Lis valid_fn = glob.glob( os.path.join( glob.escape(self.backend.get_runs_directory()), - '%d_%d_%s' % ( + "%d_%d_%s" + % ( self.read_losses[k]["seed"], self.read_losses[k]["num_run"], self.read_losses[k]["budget"], ), - 'predictions_valid_%d_%d_%s.npy*' % ( + "predictions_valid_%d_%d_%s.npy*" + % ( self.read_losses[k]["seed"], self.read_losses[k]["num_run"], self.read_losses[k]["budget"], - ) + ), ) ) - valid_fn = [vfn for vfn in valid_fn if vfn.endswith('.npy') or vfn.endswith('.npy.gz')] + valid_fn = [ + vfn + for vfn in valid_fn + if vfn.endswith(".npy") or vfn.endswith(".npy.gz") + ] test_fn = glob.glob( os.path.join( glob.escape(self.backend.get_runs_directory()), - '%d_%d_%s' % ( + "%d_%d_%s" + % ( self.read_losses[k]["seed"], self.read_losses[k]["num_run"], self.read_losses[k]["budget"], ), - 'predictions_test_%d_%d_%s.npy*' % ( + "predictions_test_%d_%d_%s.npy*" + % ( self.read_losses[k]["seed"], self.read_losses[k]["num_run"], - self.read_losses[k]["budget"] - ) + self.read_losses[k]["budget"], + ), ) ) - test_fn = [tfn for tfn in test_fn if tfn.endswith('.npy') or tfn.endswith('.npy.gz')] + test_fn = [ + tfn + for tfn in test_fn + if tfn.endswith(".npy") or tfn.endswith(".npy.gz") + ] if len(valid_fn) == 0: # self.logger.debug("Not found validation prediction file " @@ -1185,8 +1314,9 @@ def get_valid_test_preds(self, selected_keys: List[str]) -> Tuple[List[str], Lis success_keys_valid.append(k) self.read_losses[k]["mtime_valid"] = os.path.getmtime(valid_fn) except Exception: - self.logger.warning('Error loading %s: %s', - valid_fn, traceback.format_exc()) + self.logger.warning( + "Error loading %s: %s", valid_fn, traceback.format_exc() + ) if len(test_fn) == 0: # self.logger.debug("Not found test prediction file (although " @@ -1208,26 +1338,24 @@ def get_valid_test_preds(self, selected_keys: List[str]) -> Tuple[List[str], Lis success_keys_test.append(k) self.read_losses[k]["mtime_test"] = os.path.getmtime(test_fn) except Exception: - self.logger.warning('Error loading %s: %s', - test_fn, traceback.format_exc()) + 
self.logger.warning( + "Error loading %s: %s", test_fn, traceback.format_exc() + ) return success_keys_valid, success_keys_test def fit_ensemble(self, selected_keys: list): """ - fit ensemble - - Parameters - --------- - selected_keys: list - list of selected keys of self.read_losses + Parameters + --------- + selected_keys: list + list of selected keys of self.read_losses - Returns - ------- - ensemble: EnsembleSelection - trained Ensemble + Returns + ------- + ensemble: EnsembleSelection + trained Ensemble """ - if self.unit_test: raise MemoryError() @@ -1238,13 +1366,16 @@ def fit_ensemble(self, selected_keys: list): self.read_losses[k]["num_run"], self.read_losses[k]["budget"], ) - for k in selected_keys] + for k in selected_keys + ] # check hash if ensemble training data changed - current_hash = "".join([ - str(zlib.adler32(predictions_train[i].data.tobytes())) - for i in range(len(predictions_train)) - ]) + current_hash = "".join( + [ + str(zlib.adler32(predictions_train[i].data.tobytes())) + for i in range(len(predictions_train)) + ] + ) if self.last_hash == current_hash: self.logger.debug( "No new model predictions selected -- skip ensemble building " @@ -1268,8 +1399,7 @@ def fit_ensemble(self, selected_keys: list): len(predictions_train), ) start_time = time.time() - ensemble.fit(predictions_train, self.y_true_ensemble, - include_num_runs) + ensemble.fit(predictions_train, self.y_true_ensemble, include_num_runs) end_time = time.time() self.logger.debug( "Fitting the ensemble took %.2f seconds.", @@ -1282,10 +1412,10 @@ def fit_ensemble(self, selected_keys: list): ) except ValueError: - self.logger.error('Caught ValueError: %s', traceback.format_exc()) + self.logger.error("Caught ValueError: %s", traceback.format_exc()) return None except IndexError: - self.logger.error('Caught IndexError: %s' + traceback.format_exc()) + self.logger.error("Caught IndexError: %s" + traceback.format_exc()) return None finally: # Explicitly free memory @@ -1293,37 +1423,39 @@ def fit_ensemble(self, selected_keys: list): return ensemble - def predict(self, set_: str, - ensemble: AbstractEnsemble, - selected_keys: list, - n_preds: int, - index_run: int): - """ - save preditions on ensemble, validation and test data on disc - - Parameters - ---------- - set_: ["valid","test"] - data split name - ensemble: EnsembleSelection - trained Ensemble - selected_keys: list - list of selected keys of self.read_losses - n_preds: int - number of prediction models used for ensemble building - same number of predictions on valid and test are necessary - index_run: int - n-th time that ensemble predictions are written to disc - - Return - ------ - y: np.ndarray + def predict( + self, + set_: str, + ensemble: AbstractEnsemble, + selected_keys: list, + n_preds: int, + index_run: int, + ): + """Save preditions on ensemble, validation and test data on disc + + Parameters + ---------- + set_: ["valid","test"] + data split name + ensemble: EnsembleSelection + trained Ensemble + selected_keys: list + list of selected keys of self.read_losses + n_preds: int + number of prediction models used for ensemble building + same number of predictions on valid and test are necessary + index_run: int + n-th time that ensemble predictions are written to disc + + Return + ------ + y: np.ndarray """ self.logger.debug("Predicting the %s set with the ensemble!", set_) - if set_ == 'valid': + if set_ == "valid": pred_set = Y_VALID - elif set_ == 'test': + elif set_ == "test": pred_set = Y_TEST else: pred_set = Y_ENSEMBLE @@ -1364,79 +1496,82 
@@ def _add_ensemble_trajectory(self, train_pred, valid_pred, test_pred): train_pred = np.vstack( ((1 - train_pred).reshape((1, -1)), train_pred.reshape((1, -1))) ).transpose() - if valid_pred is not None and (len(valid_pred.shape) == 1 or valid_pred.shape[1] == 1): + if valid_pred is not None and ( + len(valid_pred.shape) == 1 or valid_pred.shape[1] == 1 + ): valid_pred = np.vstack( ((1 - valid_pred).reshape((1, -1)), valid_pred.reshape((1, -1))) ).transpose() - if test_pred is not None and (len(test_pred.shape) == 1 or test_pred.shape[1] == 1): + if test_pred is not None and ( + len(test_pred.shape) == 1 or test_pred.shape[1] == 1 + ): test_pred = np.vstack( ((1 - test_pred).reshape((1, -1)), test_pred.reshape((1, -1))) ).transpose() performance_stamp = { - 'Timestamp': pd.Timestamp.now(), - 'ensemble_optimization_score': calculate_score( + "Timestamp": pd.Timestamp.now(), + "ensemble_optimization_score": calculate_score( solution=self.y_true_ensemble, prediction=train_pred, task_type=self.task_type, metric=self.metric, - scoring_functions=None - ) + scoring_functions=None, + ), } if valid_pred is not None: # TODO: valid_pred are a legacy from competition manager # and this if never happens. Re-evaluate Y_valid support - performance_stamp['ensemble_val_score'] = calculate_score( + performance_stamp["ensemble_val_score"] = calculate_score( solution=self.y_valid, prediction=valid_pred, task_type=self.task_type, metric=self.metric, - scoring_functions=None + scoring_functions=None, ) # In case test_pred was provided if test_pred is not None: - performance_stamp['ensemble_test_score'] = calculate_score( + performance_stamp["ensemble_test_score"] = calculate_score( solution=self.y_test, prediction=test_pred, task_type=self.task_type, metric=self.metric, - scoring_functions=None + scoring_functions=None, ) self.ensemble_history.append(performance_stamp) def _get_list_of_sorted_preds(self): """ - Returns a list of sorted predictions in descending order - Losses are taken from self.read_losses. + Returns a list of sorted predictions in descending order + Losses are taken from self.read_losses. - Parameters - ---------- - None + Parameters + ---------- + None - Return - ------ - sorted_keys: list + Return + ------ + sorted_keys: list """ # Sort by loss - smaller is better! - sorted_keys = list(sorted( - [ - (k, v["ens_loss"], v["num_run"]) - for k, v in self.read_losses.items() - ], - # Sort by loss as priority 1 and then by num_run on a ascending order - # We want small num_run first - key=lambda x: (x[1], x[2]), - )) + sorted_keys = list( + sorted( + [(k, v["ens_loss"], v["num_run"]) for k, v in self.read_losses.items()], + # Sort by loss as priority 1 and then by num_run on a ascending order + # We want small num_run first + key=lambda x: (x[1], x[2]), + ) + ) return sorted_keys def _delete_excess_models(self, selected_keys: List[str]): """ - Deletes models excess models on disc. self.max_models_on_disc - defines the upper limit on how many models to keep. - Any additional model with a worst loss than the top - self.max_models_on_disc is deleted. + Deletes models excess models on disc. self.max_models_on_disc + defines the upper limit on how many models to keep. + Any additional model with a worst loss than the top + self.max_models_on_disc is deleted. 
""" @@ -1461,8 +1596,8 @@ def _delete_excess_models(self, selected_keys: List[str]): numrun_dir = self.backend.get_numrun_directory(_seed, _num_run, _budget) try: - os.rename(numrun_dir, numrun_dir + '.old') - shutil.rmtree(numrun_dir + '.old') + os.rename(numrun_dir, numrun_dir + ".old") + shutil.rmtree(numrun_dir + ".old") self.logger.info("Deleted files of non-candidate model %s", pred_path) self.read_losses[pred_path]["disc_space_cost_mb"] = None self.read_losses[pred_path]["loaded"] = 3 @@ -1470,7 +1605,9 @@ def _delete_excess_models(self, selected_keys: List[str]): except Exception as e: self.logger.error( "Failed to delete files of non-candidate model %s due" - " to error %s", pred_path, e + " to error %s", + pred_path, + e, ) def _read_np_fn(self, path): @@ -1478,9 +1615,7 @@ def _read_np_fn(self, path): # Support for string precision if isinstance(self.precision, str): precision = int(self.precision) - self.logger.warning("Interpreted str-precision as {}".format( - precision - )) + self.logger.warning("Interpreted str-precision as {}".format(precision)) else: precision = self.precision @@ -1490,7 +1625,7 @@ def _read_np_fn(self, path): open_method = open else: raise ValueError("Unknown filetype %s" % path) - with open_method(path, 'rb') as fp: + with open_method(path, "rb") as fp: if precision == 16: predictions = np.load(fp, allow_pickle=True).astype(dtype=np.float16) elif precision == 32: diff --git a/autosklearn/ensembles/abstract_ensemble.py b/autosklearn/ensembles/abstract_ensemble.py index 752131407f..24d352ab5e 100644 --- a/autosklearn/ensembles/abstract_ensemble.py +++ b/autosklearn/ensembles/abstract_ensemble.py @@ -15,7 +15,7 @@ def fit( base_models_predictions: np.ndarray, true_targets: np.ndarray, model_identifiers: List[Tuple[int, int, float]], - ) -> 'AbstractEnsemble': + ) -> "AbstractEnsemble": """Fit an ensemble given predictions of base models and targets. Ensemble building maximizes performance (in contrast to @@ -23,7 +23,8 @@ def fit( Parameters ---------- - base_models_predictions : array of shape = [n_base_models, n_data_points, n_targets] + base_models_predictions: np.ndarray + shape = (n_base_models, n_data_points, n_targets) n_targets is the number of classes in case of classification, n_targets is 0 or 1 in case of regression @@ -40,12 +41,15 @@ def fit( pass @abstractmethod - def predict(self, base_models_predictions: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray: + def predict( + self, base_models_predictions: Union[np.ndarray, List[np.ndarray]] + ) -> np.ndarray: """Create ensemble predictions from the base model predictions. Parameters ---------- - base_models_predictions : array of shape = [n_base_models, n_data_points, n_targets] + base_models_predictions : np.ndarray + shape = (n_base_models, n_data_points, n_targets) Same as in the fit method. 
Returns diff --git a/autosklearn/ensembles/ensemble_selection.py b/autosklearn/ensembles/ensemble_selection.py index 1546c763c2..3ae216da01 100644 --- a/autosklearn/ensembles/ensemble_selection.py +++ b/autosklearn/ensembles/ensemble_selection.py @@ -1,9 +1,9 @@ +from typing import Any, Dict, List, Optional, Tuple, Union, cast + import random from collections import Counter -from typing import Any, Dict, List, Optional, Tuple, Union, cast import numpy as np - from sklearn.utils import check_random_state from autosklearn.constants import TASK_TYPES @@ -19,10 +19,10 @@ def __init__( task_type: int, metric: Scorer, bagging: bool = False, - mode: str = 'fast', + mode: str = "fast", random_state: Optional[Union[int, np.random.RandomState]] = None, ) -> None: - """ An ensemble of selected algorithms + """An ensemble of selected algorithms Fitting an EnsembleSelection generates an ensemble from the the models generated during the search process. Can be further used for prediction. @@ -79,17 +79,19 @@ def fit( ) -> AbstractEnsemble: self.ensemble_size = int(self.ensemble_size) if self.ensemble_size < 1: - raise ValueError('Ensemble size cannot be less than one!') + raise ValueError("Ensemble size cannot be less than one!") if self.task_type not in TASK_TYPES: - raise ValueError('Unknown task type %s.' % self.task_type) + raise ValueError("Unknown task type %s." % self.task_type) if not isinstance(self.metric, Scorer): - raise ValueError("The provided metric must be an instance of Scorer, " - "nevertheless it is {}({})".format( - self.metric, - type(self.metric), - )) - if self.mode not in ('fast', 'slow'): - raise ValueError('Unknown mode %s' % self.mode) + raise ValueError( + "The provided metric must be an instance of Scorer, " + "nevertheless it is {}({})".format( + self.metric, + type(self.metric), + ) + ) + if self.mode not in ("fast", "slow"): + raise ValueError("Unknown mode %s" % self.mode) if self.bagging: self._bagging(predictions, labels) @@ -104,7 +106,7 @@ def _fit( predictions: List[np.ndarray], labels: np.ndarray, ) -> AbstractEnsemble: - if self.mode == 'fast': + if self.mode == "fast": self._fast(predictions, labels) else: self._slow(predictions, labels) @@ -149,18 +151,17 @@ def _fast( # Memory-efficient averaging! for j, pred in enumerate(predictions): # fant_ensemble_prediction is the prediction of the current ensemble - # and should be ([predictions[selected_prev_iterations] + predictions[j])/(s+1) - # We overwrite the contents of fant_ensemble_prediction - # directly with weighted_ensemble_prediction + new_prediction and then scale for avg - np.add( - weighted_ensemble_prediction, - pred, - out=fant_ensemble_prediction - ) + # and should be + # + # ([predictions[selected_prev_iterations] + predictions[j])/(s+1) + # + # We overwrite the contents of fant_ensemble_prediction directly with + # weighted_ensemble_prediction + new_prediction and then scale for avg + np.add(weighted_ensemble_prediction, pred, out=fant_ensemble_prediction) np.multiply( fant_ensemble_prediction, - (1. 
/ float(s + 1)), - out=fant_ensemble_prediction + (1.0 / float(s + 1)), + out=fant_ensemble_prediction, ) # calculate_loss is versatile and can return a dict of losses @@ -172,8 +173,8 @@ def _fast( prediction=fant_ensemble_prediction, task_type=self.task_type, metric=self.metric, - scoring_functions=None - ) + scoring_functions=None, + ), ) all_best = np.argwhere(losses == np.nanmin(losses)).flatten() @@ -192,11 +193,7 @@ def _fast( self.trajectory_ = trajectory self.train_loss_ = trajectory[-1] - def _slow( - self, - predictions: List[np.ndarray], - labels: np.ndarray - ) -> None: + def _slow(self, predictions: List[np.ndarray], labels: np.ndarray) -> None: """Rich Caruana's ensemble selection method.""" self.num_input_models_ = len(predictions) @@ -223,8 +220,8 @@ def _slow( prediction=ensemble_prediction, task_type=self.task_type, metric=self.metric, - scoring_functions=None - ) + scoring_functions=None, + ), ) ensemble.pop() best = np.nanargmin(losses) @@ -269,7 +266,7 @@ def _bagging( n_bags: int = 20, ) -> np.ndarray: """Rich Caruana's ensemble selection method with bagging.""" - raise ValueError('Bagging might not work with class-based interface!') + raise ValueError("Bagging might not work with class-based interface!") n_models = predictions.shape[0] bag_size = int(n_models * fraction) @@ -308,30 +305,34 @@ def predict(self, predictions: Union[np.ndarray, List[np.ndarray]]) -> np.ndarra # If none of the above applies, then something must have gone wrong. else: - raise ValueError("The dimensions of ensemble predictions" - " and ensemble weights do not match!") + raise ValueError( + "The dimensions of ensemble predictions" + " and ensemble weights do not match!" + ) del tmp_predictions return average def __str__(self) -> str: - trajectory_str = ' '.join([ - f'{id}: {perf:.5f}' - for id, perf in enumerate(self.trajectory_) - ]) - identifiers_str = ' '.join([ - f'{identifier}' - for idx, identifier in enumerate(self.identifiers_) - if self.weights_[idx] > 0 - ]) - return ("Ensemble Selection:\n" - f"\tTrajectory: {trajectory_str}\n" - f"\tMembers: {self.indices_}\n" - f"\tWeights: {self.weights_}\n" - f"\tIdentifiers: {identifiers_str}\n") + trajectory_str = " ".join( + [f"{id}: {perf:.5f}" for id, perf in enumerate(self.trajectory_)] + ) + identifiers_str = " ".join( + [ + f"{identifier}" + for idx, identifier in enumerate(self.identifiers_) + if self.weights_[idx] > 0 + ] + ) + return ( + "Ensemble Selection:\n" + f"\tTrajectory: {trajectory_str}\n" + f"\tMembers: {self.indices_}\n" + f"\tWeights: {self.weights_}\n" + f"\tIdentifiers: {identifiers_str}\n" + ) def get_models_with_weights( - self, - models: BasePipeline + self, models: BasePipeline ) -> List[Tuple[float, BasePipeline]]: output = [] for i, weight in enumerate(self.weights_): diff --git a/autosklearn/ensembles/singlebest_ensemble.py b/autosklearn/ensembles/singlebest_ensemble.py index e10eee978f..58e026dff2 100644 --- a/autosklearn/ensembles/singlebest_ensemble.py +++ b/autosklearn/ensembles/singlebest_ensemble.py @@ -1,8 +1,8 @@ -import os from typing import List, Tuple, Union -import numpy as np +import os +import numpy as np from smac.runhistory.runhistory import RunHistory from autosklearn.automl_common.common.utils.backend import Backend @@ -20,6 +20,7 @@ class SingleBest(AbstractEnsemble): object, to comply with the expected interface of an AbstractEnsemble. 
""" + def __init__( self, metric: Scorer, @@ -38,12 +39,10 @@ def __init__( self.identifiers_ = self.get_identifiers_from_run_history() def get_identifiers_from_run_history(self) -> List[Tuple[int, int, float]]: - """ - This method parses the run history, to identify - the best performing model + """Parses the run history, to identify the best performing model - It populates the identifiers attribute, which is used - by the backend to access the actual model + Populates the identifiers attribute, which is used by the backend to access + the actual model. """ best_model_identifier = [] best_model_score = self.metric._worst_possible_result @@ -52,35 +51,38 @@ def get_identifiers_from_run_history(self) -> List[Tuple[int, int, float]]: run_value = self.run_history.data[run_key] score = self.metric._optimum - (self.metric._sign * run_value.cost) - if (score > best_model_score and self.metric._sign > 0) \ - or (score < best_model_score and self.metric._sign < 0): + if (score > best_model_score and self.metric._sign > 0) or ( + score < best_model_score and self.metric._sign < 0 + ): # Make sure that the individual best model actually exists model_dir = self.backend.get_numrun_directory( self.seed, - run_value.additional_info['num_run'], + run_value.additional_info["num_run"], run_key.budget, ) model_file_name = self.backend.get_model_filename( self.seed, - run_value.additional_info['num_run'], + run_value.additional_info["num_run"], run_key.budget, ) file_path = os.path.join(model_dir, model_file_name) if not os.path.exists(file_path): continue - best_model_identifier = [( - self.seed, - run_value.additional_info['num_run'], - run_key.budget, - )] + best_model_identifier = [ + ( + self.seed, + run_value.additional_info["num_run"], + run_key.budget, + ) + ] best_model_score = score if not best_model_identifier: raise ValueError( - "No valid model found in run history. This means smac was not able to fit" - " a valid model. Please check the log file for errors." + "No valid model found in run history. This means smac was not able to" + " fit a valid model. Please check the log file for errors." 
) return best_model_identifier @@ -89,15 +91,25 @@ def predict(self, predictions: Union[np.ndarray, List[np.ndarray]]) -> np.ndarra return predictions[0] def __str__(self) -> str: - return 'Single Model Selection:\n\tMembers: %s' \ - '\n\tWeights: %s\n\tIdentifiers: %s' % \ - (self.indices_, self.weights_, - ' '.join([str(identifier) for idx, identifier in - enumerate(self.identifiers_) - if self.weights_[idx] > 0])) - - def get_models_with_weights(self, models: BasePipeline - ) -> List[Tuple[float, BasePipeline]]: + return ( + "Single Model Selection:\n\tMembers: %s" + "\n\tWeights: %s\n\tIdentifiers: %s" + % ( + self.indices_, + self.weights_, + " ".join( + [ + str(identifier) + for idx, identifier in enumerate(self.identifiers_) + if self.weights_[idx] > 0 + ] + ), + ) + ) + + def get_models_with_weights( + self, models: BasePipeline + ) -> List[Tuple[float, BasePipeline]]: output = [] for i, weight in enumerate(self.weights_): if weight > 0.0: diff --git a/autosklearn/estimators.py b/autosklearn/estimators.py index 070230ae94..491309a7b8 100644 --- a/autosklearn/estimators.py +++ b/autosklearn/estimators.py @@ -1,29 +1,28 @@ # -*- encoding: utf-8 -*- -from typing import Any, Optional, Dict, List, Mapping, Tuple, Union, Iterable -from typing_extensions import Literal +from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union -from ConfigSpace.configuration_space import Configuration, ConfigurationSpace import dask.distributed import joblib import numpy as np import pandas as pd +from ConfigSpace.configuration_space import Configuration, ConfigurationSpace from scipy.sparse import spmatrix from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin from sklearn.utils.multiclass import type_of_target from smac.runhistory.runhistory import RunInfo, RunValue +from typing_extensions import Literal +from autosklearn.automl import AutoML, AutoMLClassifier, AutoMLRegressor from autosklearn.data.validation import ( - convert_if_sparse, SUPPORTED_FEAT_TYPES, SUPPORTED_TARGET_TYPES, + convert_if_sparse, ) -from autosklearn.pipeline.base import BasePipeline -from autosklearn.automl import AutoMLClassifier, AutoMLRegressor, AutoML from autosklearn.metrics import Scorer +from autosklearn.pipeline.base import BasePipeline class AutoSklearnEstimator(BaseEstimator): - def __init__( self, time_left_for_this_task=3600, @@ -36,7 +35,7 @@ def __init__( memory_limit=3072, include: Optional[Dict[str, List[str]]] = None, exclude: Optional[Dict[str, List[str]]] = None, - resampling_strategy='holdout', + resampling_strategy="holdout", resampling_strategy_arguments=None, tmp_folder=None, delete_tmp_folder_after_terminate=True, @@ -51,7 +50,7 @@ def __init__( scoring_functions: Optional[List[Scorer]] = None, load_models: bool = True, get_trials_callback=None, - dataset_compression: Union[bool, Mapping[str, Any]] = True + dataset_compression: Union[bool, Mapping[str, Any]] = True, ): """ Parameters @@ -339,11 +338,12 @@ def __init__( """ # noqa (links are too long) # Raise error if the given total time budget is less than 30 seconds. if time_left_for_this_task < 30: - raise ValueError("Time left for this task must be at least " - "30 seconds. ") + raise ValueError("Time left for this task must be at least " "30 seconds. 
") self.time_left_for_this_task = time_left_for_this_task self.per_run_time_limit = per_run_time_limit - self.initial_configurations_via_metalearning = initial_configurations_via_metalearning + self.initial_configurations_via_metalearning = ( + initial_configurations_via_metalearning + ) self.ensemble_size = ensemble_size self.ensemble_nbest = ensemble_nbest self.max_models_on_disc = max_models_on_disc @@ -388,12 +388,13 @@ def __getstate__(self): def build_automl(self): + initial_configs = self.initial_configurations_via_metalearning automl = self._get_automl_class()( temporary_directory=self.tmp_folder, delete_tmp_folder_after_terminate=self.delete_tmp_folder_after_terminate, time_left_for_this_task=self.time_left_for_this_task, per_run_time_limit=self.per_run_time_limit, - initial_configurations_via_metalearning=self.initial_configurations_via_metalearning, + initial_configurations_via_metalearning=initial_configs, ensemble_size=self.ensemble_size, ensemble_nbest=self.ensemble_nbest, max_models_on_disc=self.max_models_on_disc, @@ -413,7 +414,7 @@ def build_automl(self): metric=self.metric, scoring_functions=self.scoring_functions, get_trials_callback=self.get_trials_callback, - dataset_compression=self.dataset_compression + dataset_compression=self.dataset_compression, ) return automl @@ -434,7 +435,7 @@ def fit_pipeline( self, X: SUPPORTED_FEAT_TYPES, y: Union[SUPPORTED_TARGET_TYPES, spmatrix], - config: Union[Configuration, Dict[str, Union[str, float, int]]], + config: Union[Configuration, Dict[str, Union[str, float, int]]], dataset_name: Optional[str] = None, X_test: Optional[SUPPORTED_FEAT_TYPES] = None, y_test: Optional[Union[SUPPORTED_TARGET_TYPES, spmatrix]] = None, @@ -442,7 +443,7 @@ def fit_pipeline( *args, **kwargs: Dict, ) -> Tuple[Optional[BasePipeline], RunInfo, RunValue]: - """ Fits and individual pipeline configuration and returns + """Fits and individual pipeline configuration and returns the result to the user. The Estimator constraints are honored, for example the resampling @@ -451,7 +452,8 @@ def fit_pipeline( arguments are redirected to the TAE evaluation function, which allows for further customization while building a pipeline. - Any additional argument provided is directly passed to the worker exercising the run. + Any additional argument provided is directly passed to the + worker exercising the run. Parameters ---------- @@ -465,7 +467,7 @@ def fit_pipeline( If provided, the testing performance will be tracked on this labels config: Union[Configuration, Dict[str, Union[str, float, int]]] A configuration object used to define the pipeline steps. - If a dictionary is passed, a configuration is created based on this dictionary. + If a dict is passed, a configuration is created based on this dict. 
dataset_name: Optional[str] Name that will be used to tag the Auto-Sklearn run and identify the Auto-Sklearn run @@ -489,16 +491,27 @@ def fit_pipeline( """ if self.automl_ is None: self.automl_ = self.build_automl() - return self.automl_.fit_pipeline(X=X, y=y, - dataset_name=dataset_name, - config=config, - feat_type=feat_type, - X_test=X_test, y_test=y_test, - *args, **kwargs) - - def fit_ensemble(self, y, task=None, precision=32, - dataset_name=None, ensemble_nbest=None, - ensemble_size=None): + return self.automl_.fit_pipeline( + X=X, + y=y, + dataset_name=dataset_name, + config=config, + feat_type=feat_type, + X_test=X_test, + y_test=y_test, + *args, + **kwargs, + ) + + def fit_ensemble( + self, + y, + task=None, + precision=32, + dataset_name=None, + ensemble_nbest=None, + ensemble_size=None, + ): """Fit an ensemble to models trained during an optimization process. All parameters are ``None`` by default. If no other value is given, @@ -584,30 +597,42 @@ def predict(self, X, batch_size=None, n_jobs=1): return self.automl_.predict(X, batch_size=batch_size, n_jobs=n_jobs) def predict_proba(self, X, batch_size=None, n_jobs=1): - return self.automl_.predict_proba( - X, batch_size=batch_size, n_jobs=n_jobs) + return self.automl_.predict_proba(X, batch_size=batch_size, n_jobs=n_jobs) def score(self, X, y): return self.automl_.score(X, y) def show_models(self): - """ Returns a dictionary containing dictionaries of ensemble models. + """Returns a dictionary containing dictionaries of ensemble models. Each model in the ensemble can be accessed by giving its ``model_id`` as key. A model dictionary contains the following: * ``"model_id"`` - The id given to a model by ``autosklearn``. + * ``"rank"`` - The rank of the model based on it's ``"cost"``. + * ``"cost"`` - The loss of the model on the validation set. + * ``"ensemble_weight"`` - The weight given to the model in the ensemble. + * ``"voting_model"`` - The ``cv_voting_ensemble`` model (for 'cv' resampling). - * ``"estimators"`` - List of models (dicts) in ``cv_voting_ensemble`` (for 'cv' resampling). + + * ``"estimators"`` - List of models (dicts) in ``cv_voting_ensemble`` + ('cv' resampling). + * ``"data_preprocessor"`` - The preprocessor used on the data. + * ``"balancing"`` - The balancing used on the data (for classification). + * ``"feature_preprocessor"`` - The preprocessor for features types. - * ``"classifier"`` or ``"regressor"`` - The autosklearn wrapped classifier or regressor. - * ``"sklearn_classifier"`` or ``"sklearn_regressor"`` - The sklearn classifier or regressor. + + * ``"classifier"`` / ``"regressor"`` + - The autosklearn wrapped classifier or regressor. + + * ``"sklearn_classifier"`` or ``"sklearn_regressor"`` + - The sklearn classifier or regressor. **Example** @@ -657,7 +682,7 @@ def show_models(self): Dict(int, Any) : dictionary of length = number of models in the ensemble A dictionary of models in the ensemble, where ``model_id`` is the key. 
- """ + """ # noqa: E501 return self.automl_.show_models() @@ -709,12 +734,12 @@ def leaderboard( self, detailed: bool = False, ensemble_only: bool = True, - top_k: Union[int, Literal['all']] = 'all', - sort_by: str = 'cost', - sort_order: Literal['auto', 'ascending', 'descending'] = 'auto', - include: Optional[Union[str, Iterable[str]]] = None + top_k: Union[int, Literal["all"]] = "all", + sort_by: str = "cost", + sort_order: Literal["auto", "ascending", "descending"] = "auto", + include: Optional[Union[str, Iterable[str]]] = None, ) -> pd.DataFrame: - """ Returns a pandas table of results for all evaluated models. + """Returns a pandas table of results for all evaluated models. Gives an overview of all models trained during the search process along with various statistics about their training. @@ -789,46 +814,53 @@ def leaderboard( # Validation of top_k if ( not (isinstance(top_k, str) or isinstance(top_k, int)) - or (isinstance(top_k, str) and top_k != 'all') + or (isinstance(top_k, str) and top_k != "all") or (isinstance(top_k, int) and top_k <= 0) ): - raise ValueError(f"top_k={top_k} must be a positive integer or pass" - " `top_k`='all' to view results for all models") + raise ValueError( + f"top_k={top_k} must be a positive integer or pass" + " `top_k`='all' to view results for all models" + ) # Validate columns to include if isinstance(include, str): include = [include] - if include == ['model_id']: - raise ValueError('Must provide more than just `model_id`') + if include == ["model_id"]: + raise ValueError("Must provide more than just `model_id`") if include is not None: columns = [*include] # 'model_id' should always be present as it is the unique index # used for pandas - if 'model_id' not in columns: - columns.append('model_id') + if "model_id" not in columns: + columns.append("model_id") - invalid_include_items = set(columns) - set(column_types['all']) + invalid_include_items = set(columns) - set(column_types["all"]) if len(invalid_include_items) != 0: - raise ValueError(f"Values {invalid_include_items} are not known" - f" columns to include, must be contained in " - f"{column_types['all']}") + raise ValueError( + f"Values {invalid_include_items} are not known" + f" columns to include, must be contained in " + f"{column_types['all']}" + ) elif detailed: - columns = column_types['all'] + columns = column_types["all"] else: - columns = column_types['simple'] + columns = column_types["simple"] # Validation of sorting - if sort_by not in column_types['all']: - raise ValueError(f"sort_by='{sort_by}' must be one of included " - f"columns {set(column_types['all'])}") + if sort_by not in column_types["all"]: + raise ValueError( + f"sort_by='{sort_by}' must be one of included " + f"columns {set(column_types['all'])}" + ) - valid_sort_orders = ['auto', 'ascending', 'descending'] + valid_sort_orders = ["auto", "ascending", "descending"] if not (isinstance(sort_order, str) and sort_order in valid_sort_orders): - raise ValueError(f"`sort_order` = {sort_order} must be a str in " - f"{valid_sort_orders}") + raise ValueError( + f"`sort_order` = {sort_order} must be a str in " f"{valid_sort_orders}" + ) # To get all the models that were optmized, we collect what we can from # runhistory first. 
@@ -836,29 +868,31 @@ def has_key(rv, key): return rv.additional_info and key in rv.additional_info model_runs = { - rval.additional_info['num_run']: { - 'model_id': rval.additional_info['num_run'], - 'seed': rkey.seed, - 'budget': rkey.budget, - 'duration': rval.time, - 'config_id': rkey.config_id, - 'start_time': rval.starttime, - 'end_time': rval.endtime, - 'status': str(rval.status), - 'cost': rval.cost, - 'train_loss': rval.additional_info['train_loss'] - if has_key(rval, 'train_loss') else None, - 'config_origin': rval.additional_info['configuration_origin'] - if has_key(rval, 'configuration_origin') else None + rval.additional_info["num_run"]: { + "model_id": rval.additional_info["num_run"], + "seed": rkey.seed, + "budget": rkey.budget, + "duration": rval.time, + "config_id": rkey.config_id, + "start_time": rval.starttime, + "end_time": rval.endtime, + "status": str(rval.status), + "cost": rval.cost, + "train_loss": rval.additional_info["train_loss"] + if has_key(rval, "train_loss") + else None, + "config_origin": rval.additional_info["configuration_origin"] + if has_key(rval, "configuration_origin") + else None, } for rkey, rval in self.automl_.runhistory_.data.items() - if has_key(rval, 'num_run') + if has_key(rval, "num_run") } # Next we get some info about the model itself model_class_strings = { - AutoMLClassifier: 'classifier', - AutoMLRegressor: 'regressor' + AutoMLClassifier: "classifier", + AutoMLRegressor: "regressor", } model_type = model_class_strings.get(self._get_automl_class(), None) if model_type is None: @@ -868,21 +902,25 @@ def has_key(rv, key): configurations = self.automl_.runhistory_.ids_config for model_id, run_info in model_runs.items(): - config_id = run_info['config_id'] + config_id = run_info["config_id"] run_config = configurations[config_id]._values - run_info.update({ - 'balancing_strategy': run_config.get('balancing:strategy', None), - 'type': run_config[f'{model_type}:__choice__'], - 'data_preprocessors': [ - value for key, value in run_config.items() - if 'data_preprocessing' in key and '__choice__' in key - ], - 'feature_preprocessors': [ - value for key, value in run_config.items() - if 'feature_preprocessor' in key and '__choice__' in key - ] - }) + run_info.update( + { + "balancing_strategy": run_config.get("balancing:strategy", None), + "type": run_config[f"{model_type}:__choice__"], + "data_preprocessors": [ + value + for key, value in run_config.items() + if "data_preprocessing" in key and "__choice__" in key + ], + "feature_preprocessors": [ + value + for key, value in run_config.items() + if "feature_preprocessor" in key and "__choice__" in key + ], + } + ) # Get the models ensemble weight if it has one # TODO both implementing classes of AbstractEnsemble have a property @@ -892,7 +930,7 @@ def has_key(rv, key): # tied together by ordering, might be better to store as tuple for i, weight in enumerate(self.automl_.ensemble_.weights_): (_, model_id, _) = self.automl_.ensemble_.identifiers_[i] - model_runs[model_id]['ensemble_weight'] = weight + model_runs[model_id]["ensemble_weight"] = weight # Filter out non-ensemble members if needed, else fill in a default # value of 0 if it's missing @@ -900,65 +938,70 @@ def has_key(rv, key): model_runs = { model_id: info for model_id, info in model_runs.items() - if ('ensemble_weight' in info and info['ensemble_weight'] > 0) + if ("ensemble_weight" in info and info["ensemble_weight"] > 0) } else: for model_id, info in model_runs.items(): - if 'ensemble_weight' not in info: - info['ensemble_weight'] = 0 + 
if "ensemble_weight" not in info: + info["ensemble_weight"] = 0 # `rank` relies on `cost` so we include `cost` # We drop it later if it's not requested - if 'rank' in columns and 'cost' not in columns: - columns = [*columns, 'cost'] + if "rank" in columns and "cost" not in columns: + columns = [*columns, "cost"] # Finally, convert into a tabular format by converting the dict into # column wise orientation. - dataframe = pd.DataFrame({ - col: [run_info[col] for run_info in model_runs.values()] - for col in columns if col != 'rank' - }) + dataframe = pd.DataFrame( + { + col: [run_info[col] for run_info in model_runs.values()] + for col in columns + if col != "rank" + } + ) # Give it an index, even if not in the `include` - dataframe.set_index('model_id', inplace=True) + dataframe.set_index("model_id", inplace=True) # Add the `rank` column if needed, dropping `cost` if it's not # requested by the user - if 'rank' in columns: - dataframe.sort_values(by='cost', ascending=True, inplace=True) - dataframe.insert(column='rank', - value=range(1, len(dataframe) + 1), - loc=list(columns).index('rank') - 1) # account for `model_id` + if "rank" in columns: + dataframe.sort_values(by="cost", ascending=True, inplace=True) + dataframe.insert( + column="rank", + value=range(1, len(dataframe) + 1), + loc=list(columns).index("rank") - 1, + ) # account for `model_id` - if 'cost' not in columns: - dataframe.drop('cost', inplace=True) + if "cost" not in columns: + dataframe.drop("cost", inplace=True) # Decide on the sort order depending on what it gets sorted by - descending_columns = ['ensemble_weight', 'duration'] - if sort_order == 'auto': + descending_columns = ["ensemble_weight", "duration"] + if sort_order == "auto": ascending_param = False if sort_by in descending_columns else True else: - ascending_param = False if sort_order == 'descending' else True + ascending_param = False if sort_order == "descending" else True # Sort by the given column name, defaulting to 'model_id' if not present if sort_by not in dataframe.columns: - self.automl_._logger.warning(f"sort_by = '{sort_by}' was not present" - ", defaulting to sort on the index " - "'model_id'") - sort_by = 'model_id' + self.automl_._logger.warning( + f"sort_by = '{sort_by}' was not present" + ", defaulting to sort on the index " + "'model_id'" + ) + sort_by = "model_id" # Cost can be the same but leave rank all over the place - if 'rank' in columns and sort_by == 'cost': - dataframe.sort_values(by=[sort_by, 'rank'], - ascending=[ascending_param, True], - inplace=True) + if "rank" in columns and sort_by == "cost": + dataframe.sort_values( + by=[sort_by, "rank"], ascending=[ascending_param, True], inplace=True + ) else: - dataframe.sort_values(by=sort_by, - ascending=ascending_param, - inplace=True) + dataframe.sort_values(by=sort_by, ascending=ascending_param, inplace=True) # Lastly, just grab the top_k - if top_k == 'all' or top_k >= len(dataframe): + if top_k == "all" or top_k >= len(dataframe): top_k = len(dataframe) dataframe = dataframe.head(top_k) @@ -966,18 +1009,29 @@ def has_key(rv, key): return dataframe @staticmethod - def _leaderboard_columns() -> Dict[Literal['all', 'simple', 'detailed'], List[str]]: + def _leaderboard_columns() -> Dict[Literal["all", "simple", "detailed"], List[str]]: all = [ - "model_id", "rank", "ensemble_weight", "type", "cost", "duration", - "config_id", "train_loss", "seed", "start_time", "end_time", - "budget", "status", "data_preprocessors", "feature_preprocessors", - "balancing_strategy", "config_origin" - ] - 
simple = [ - "model_id", "rank", "ensemble_weight", "type", "cost", "duration" + "model_id", + "rank", + "ensemble_weight", + "type", + "cost", + "duration", + "config_id", + "train_loss", + "seed", + "start_time", + "end_time", + "budget", + "status", + "data_preprocessors", + "feature_preprocessors", + "balancing_strategy", + "config_origin", ] + simple = ["model_id", "rank", "ensemble_weight", "type", "cost", "duration"] detailed = all - return {'all': all, 'detailed': detailed, 'simple': simple} + return {"all": all, "detailed": detailed, "simple": simple} def _get_automl_class(self): raise NotImplementedError() @@ -1012,23 +1066,25 @@ def get_configuration_space( if self.automl_ is None: self.automl_ = self.build_automl() - return self.automl_.fit( - X, y, - X_test=X_test, y_test=y_test, - dataset_name=dataset_name, - feat_type=feat_type, - only_return_configuration_space=True, - ) if self.automl_.configuration_space is None else self.automl_.configuration_space + return ( + self.automl_.fit( + X, + y, + X_test=X_test, + y_test=y_test, + dataset_name=dataset_name, + feat_type=feat_type, + only_return_configuration_space=True, + ) + if self.automl_.configuration_space is None + else self.automl_.configuration_space + ) class AutoSklearnClassifier(AutoSklearnEstimator, ClassifierMixin): - """This class implements the classification task. """ + """This class implements the classification task.""" - def fit(self, X, y, - X_test=None, - y_test=None, - feat_type=None, - dataset_name=None): + def fit(self, X, y, X_test=None, y_test=None, feat_type=None, dataset_name=None): """Fit *auto-sklearn* to given training set (X, y). Fit both optimizes the machine learning models and builds an ensemble @@ -1075,18 +1131,16 @@ def fit(self, X, y, # type of data is compatible with auto-sklearn. Legal target # types are: binary, multiclass, multilabel-indicator. target_type = type_of_target(y) - supported_types = ['binary', 'multiclass', 'multilabel-indicator'] + supported_types = ["binary", "multiclass", "multilabel-indicator"] if target_type not in supported_types: - raise ValueError("Classification with data of type {} is " - "not supported. Supported types are {}. " - "You can find more information about scikit-learn " - "data types in: " - "https://scikit-learn.org/stable/modules/multiclass.html" - "".format( - target_type, - supported_types - ) - ) + raise ValueError( + "Classification with data of type {} is " + "not supported. Supported types are {}. " + "You can find more information about scikit-learn " + "data types in: " + "https://scikit-learn.org/stable/modules/multiclass.html" + "".format(target_type, supported_types) + ) # remember target type for using in predict_proba later. self.target_type = target_type @@ -1138,22 +1192,19 @@ def predict_proba(self, X, batch_size=None, n_jobs=1): y : array of shape = [n_samples, n_classes] or [n_samples, n_labels] The predicted class probabilities. """ - pred_proba = super().predict_proba( - X, batch_size=batch_size, n_jobs=n_jobs) + pred_proba = super().predict_proba(X, batch_size=batch_size, n_jobs=n_jobs) # Check if all probabilities sum up to 1. # Assert only if target type is not multilabel-indicator. - if self.target_type not in ['multilabel-indicator']: - assert( - np.allclose( - np.sum(pred_proba, axis=1), - np.ones_like(pred_proba[:, 0])) + if self.target_type not in ["multilabel-indicator"]: + assert np.allclose( + np.sum(pred_proba, axis=1), np.ones_like(pred_proba[:, 0]) ), "prediction probability does not sum up to 1!" 
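A short numeric illustration of why the sum-to-one assertion above is skipped for multilabel-indicator targets while the range check below still applies; the values are made up for illustration:

import numpy as np

# Binary / multiclass: each row is a distribution over classes, so rows sum to 1.
multiclass_proba = np.array([[0.7, 0.3], [0.1, 0.9]])
assert np.allclose(multiclass_proba.sum(axis=1), 1.0)

# Multilabel-indicator: each column is an independent per-label probability,
# so rows need not sum to 1 and only the [0, 1] range check is meaningful.
multilabel_proba = np.array([[0.9, 0.8], [0.1, 0.4]])
assert ((multilabel_proba >= 0) & (multilabel_proba <= 1)).all()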
# Check that all probability values lie between 0 and 1. - assert( - (pred_proba >= 0).all() and (pred_proba <= 1).all() - ), "found prediction probability value outside of [0, 1]!" + assert (pred_proba >= 0).all() and ( + pred_proba <= 1 + ).all(), "found prediction probability value outside of [0, 1]!" return pred_proba @@ -1167,11 +1218,7 @@ class AutoSklearnRegressor(AutoSklearnEstimator, RegressorMixin): """ - def fit(self, X, y, - X_test=None, - y_test=None, - feat_type=None, - dataset_name=None): + def fit(self, X, y, X_test=None, y_test=None, feat_type=None, dataset_name=None): """Fit *Auto-sklearn* to given training set (X, y). Fit both optimizes the machine learning models and builds an ensemble @@ -1219,18 +1266,21 @@ def fit(self, X, y, y = convert_if_sparse(y) target_type = type_of_target(y) - supported_types = ['continuous', 'binary', 'multiclass', 'continuous-multioutput'] + supported_types = [ + "continuous", + "binary", + "multiclass", + "continuous-multioutput", + ] if target_type not in supported_types: - raise ValueError("Regression with data of type {} is " - "not supported. Supported types are {}. " - "You can find more information about scikit-learn " - "data types in: " - "https://scikit-learn.org/stable/modules/multiclass.html" - "".format( - target_type, - supported_types - ) - ) + raise ValueError( + "Regression with data of type {} is " + "not supported. Supported types are {}. " + "You can find more information about scikit-learn " + "data types in: " + "https://scikit-learn.org/stable/modules/multiclass.html" + "".format(target_type, supported_types) + ) # Fit is supposed to be idempotent! # But not if we use share_mode. diff --git a/autosklearn/evaluation/__init__.py b/autosklearn/evaluation/__init__.py index 506cf51441..51ad69fbb3 100644 --- a/autosklearn/evaluation/__init__.py +++ b/autosklearn/evaluation/__init__.py @@ -1,42 +1,42 @@ # -*- encoding: utf-8 -*- +from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union, cast + import functools -import logging import json +import logging import math import multiprocessing -from queue import Empty import time import traceback -from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union, cast +from queue import Empty -from ConfigSpace import Configuration import numpy as np import pynisher +from ConfigSpace import Configuration +from sklearn.model_selection._split import ( + BaseCrossValidator, + BaseShuffleSplit, + _RepeatedSplits, +) from smac.runhistory.runhistory import RunInfo, RunValue from smac.stats.stats import Stats from smac.tae import StatusType, TAEAbortException from smac.tae.execute_func import AbstractTAFunc -from sklearn.model_selection._split import _RepeatedSplits, BaseShuffleSplit,\ - BaseCrossValidator - -from autosklearn.automl_common.common.utils.backend import Backend - -from autosklearn.metrics import Scorer -import autosklearn.evaluation.train_evaluator import autosklearn.evaluation.test_evaluator +import autosklearn.evaluation.train_evaluator import autosklearn.evaluation.util import autosklearn.pipeline.components +from autosklearn.automl_common.common.utils.backend import Backend from autosklearn.evaluation.train_evaluator import TYPE_ADDITIONAL_INFO +from autosklearn.metrics import Scorer from autosklearn.util.logging_ import PickableLoggerAdapter, get_named_client_logger from autosklearn.util.parallel import preload_modules def fit_predict_try_except_decorator( - ta: Callable, - queue: multiprocessing.Queue, - cost_for_crash: float, - **kwargs: Any) 
-> None: + ta: Callable, queue: multiprocessing.Queue, cost_for_crash: float, **kwargs: Any +) -> None: try: return ta(queue=queue, **kwargs) @@ -48,7 +48,8 @@ def fit_predict_try_except_decorator( exception_traceback = traceback.format_exc() error_message = repr(e) - # Printing stuff to stdout just in case the queue doesn't work, which happened with the + # Printing stuff to stdout just in case the queue doesn't work, + # which happened with the # following traceback: # File "auto-sklearn/autosklearn/evaluation/__init__.py", line 29, in fit_predict_try_except_decorator # noqa E501 # return ta(queue=queue, **kwargs) @@ -64,14 +65,23 @@ def fit_predict_try_except_decorator( # self._thread.start() # File "miniconda/3-4.5.4/envs/autosklearn/lib/python3.7/threading.py", line 847, in start # noqa E501 # RuntimeError: can't start new thread - print("Exception handling in `fit_predict_try_except_decorator`: " - "traceback: %s \nerror message: %s" % (exception_traceback, error_message)) - - queue.put({'loss': cost_for_crash, - 'additional_run_info': {'traceback': exception_traceback, - 'error': error_message}, - 'status': StatusType.CRASHED, - 'final_queue_element': True}, block=True) + print( + "Exception handling in `fit_predict_try_except_decorator`: " + "traceback: %s \nerror message: %s" % (exception_traceback, error_message) + ) + + queue.put( + { + "loss": cost_for_crash, + "additional_run_info": { + "traceback": exception_traceback, + "error": error_message, + }, + "status": StatusType.CRASHED, + "final_queue_element": True, + }, + block=True, + ) queue.close() @@ -94,8 +104,9 @@ def get_cost_of_crash(metric: Scorer) -> float: return worst_possible_result -def _encode_exit_status(exit_status: Union[str, int, Type[BaseException]] - ) -> Union[str, int]: +def _encode_exit_status( + exit_status: Union[str, int, Type[BaseException]] +) -> Union[str, int]: try: # If it can be dumped, then it is int exit_status = cast(int, exit_status) @@ -108,12 +119,13 @@ def _encode_exit_status(exit_status: Union[str, int, Type[BaseException]] # TODO potentially log all inputs to this class to pickle them in order to do # easier debugging of potential crashes class ExecuteTaFuncWithQueue(AbstractTAFunc): - def __init__( self, backend: Backend, autosklearn_seed: int, - resampling_strategy: Union[str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit], + resampling_strategy: Union[ + str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit + ], metric: Scorer, cost_for_crash: float, abort_on_first_run_crash: bool, @@ -121,7 +133,7 @@ def __init__( pynisher_context: str, initial_num_run: int = 1, stats: Optional[Stats] = None, - run_obj: str = 'quality', + run_obj: str = "quality", par_factor: int = 1, scoring_functions: Optional[List[Scorer]] = None, output_y_hat_optimization: bool = True, @@ -135,26 +147,29 @@ def __init__( **resampling_strategy_args: Any, ): - if resampling_strategy == 'holdout': + if resampling_strategy == "holdout": eval_function = autosklearn.evaluation.train_evaluator.eval_holdout - elif resampling_strategy == 'holdout-iterative-fit': - eval_function = autosklearn.evaluation.train_evaluator.eval_iterative_holdout - elif resampling_strategy == 'cv-iterative-fit': + elif resampling_strategy == "holdout-iterative-fit": + eval_function = ( + autosklearn.evaluation.train_evaluator.eval_iterative_holdout + ) + elif resampling_strategy == "cv-iterative-fit": eval_function = autosklearn.evaluation.train_evaluator.eval_iterative_cv - elif resampling_strategy == 'cv' or 
isinstance(resampling_strategy, ( - BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit) + elif resampling_strategy == "cv" or isinstance( + resampling_strategy, (BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit) ): eval_function = autosklearn.evaluation.train_evaluator.eval_cv - elif resampling_strategy == 'partial-cv': + elif resampling_strategy == "partial-cv": eval_function = autosklearn.evaluation.train_evaluator.eval_partial_cv - elif resampling_strategy == 'partial-cv-iterative-fit': - eval_function = autosklearn.evaluation.train_evaluator.eval_partial_cv_iterative - elif resampling_strategy == 'test': + elif resampling_strategy == "partial-cv-iterative-fit": + eval_function = ( + autosklearn.evaluation.train_evaluator.eval_partial_cv_iterative + ) + elif resampling_strategy == "test": eval_function = autosklearn.evaluation.test_evaluator.eval_t output_y_hat_optimization = False else: - raise ValueError('Unknown resampling strategy %s' % - resampling_strategy) + raise ValueError("Unknown resampling strategy %s" % resampling_strategy) self.worst_possible_result = cost_for_crash @@ -181,7 +196,7 @@ def __init__( self.resampling_strategy = resampling_strategy self.resampling_strategy_args = resampling_strategy_args self.scoring_functions = scoring_functions - # TODO deactivate output_y_hat_optimization and let the respective evaluator decide + # TODO deactivate output_y_hat_optimization and let respective evaluator decide self.output_y_hat_optimization = output_y_hat_optimization self.include = include self.exclude = exclude @@ -194,11 +209,11 @@ def __init__( self.memory_limit = memory_limit dm = self.backend.load_datamanager() - if 'X_valid' in dm.data and 'Y_valid' in dm.data: + if "X_valid" in dm.data and "Y_valid" in dm.data: self._get_validation_loss = True else: self._get_validation_loss = False - if 'X_test' in dm.data and 'Y_test' in dm.data: + if "X_test" in dm.data and "Y_test" in dm.data: self._get_test_loss = True else: self._get_test_loss = False @@ -206,7 +221,9 @@ def __init__( self.port = port self.pynisher_context = pynisher_context if self.port is None: - self.logger: Union[logging.Logger, PickableLoggerAdapter] = logging.getLogger("TAE") + self.logger: Union[ + logging.Logger, PickableLoggerAdapter + ] = logging.getLogger("TAE") else: self.logger = get_named_client_logger( name="TAE", @@ -236,18 +253,23 @@ def run_wrapper( if self.budget_type is None: if run_info.budget != 0: raise ValueError( - 'If budget_type is None, budget must be.0, but is %f' % run_info.budget + "If budget_type is None, budget must be.0, but is %f" + % run_info.budget ) else: if run_info.budget == 0: run_info = run_info._replace(budget=100) elif run_info.budget <= 0 or run_info.budget > 100: - raise ValueError('Illegal value for budget, must be >0 and <=100, but is %f' % - run_info.budget) - if self.budget_type not in ('subsample', 'iterations', 'mixed'): - raise ValueError("Illegal value for budget type, must be one of " - "('subsample', 'iterations', 'mixed'), but is : %s" % - self.budget_type) + raise ValueError( + "Illegal value for budget, must be >0 and <=100, but is %f" + % run_info.budget + ) + if self.budget_type not in ("subsample", "iterations", "mixed"): + raise ValueError( + "Illegal value for budget type, must be one of " + "('subsample', 'iterations', 'mixed'), but is : %s" + % self.budget_type + ) remaining_time = self.stats.get_remaing_time_budget() @@ -255,11 +277,15 @@ def run_wrapper( run_info = run_info._replace(cutoff=int(remaining_time - 5)) config_id = ( - 
run_info.config if isinstance(run_info.config, int) else run_info.config.config_id + run_info.config + if isinstance(run_info.config, int) + else run_info.config.config_id ) if run_info.cutoff < 1.0: - self.logger.info("Not starting configuration %d because time is up" % config_id) + self.logger.info( + "Not starting configuration %d because time is up" % config_id + ) return run_info, RunValue( status=StatusType.STOP, cost=self.worst_possible_result, @@ -268,9 +294,8 @@ def run_wrapper( starttime=time.time(), endtime=time.time(), ) - elif ( - run_info.cutoff != int(np.ceil(run_info.cutoff)) - and not isinstance(run_info.cutoff, int) + elif run_info.cutoff != int(np.ceil(run_info.cutoff)) and not isinstance( + run_info.cutoff, int ): run_info = run_info._replace(cutoff=int(np.ceil(run_info.cutoff))) @@ -285,7 +310,9 @@ def run( seed: int = 12345, budget: float = 0.0, instance_specific: Optional[str] = None, - ) -> Tuple[StatusType, float, float, Dict[str, Union[int, float, str, Dict, List, Tuple]]]: + ) -> Tuple[ + StatusType, float, float, Dict[str, Union[int, float, str, Dict, List, Tuple]] + ]: # Additional information of each of the tae executions # Defined upfront for mypy @@ -295,14 +322,16 @@ def run( preload_modules(context) queue = context.Queue() - if not (instance_specific is None or instance_specific == '0'): + if not (instance_specific is None or instance_specific == "0"): raise ValueError(instance_specific) - init_params = {'instance': instance} + init_params = {"instance": instance} if self.init_params is not None: init_params.update(self.init_params) if self.port is None: - logger: Union[logging.Logger, PickableLoggerAdapter] = logging.getLogger("pynisher") + logger: Union[logging.Logger, PickableLoggerAdapter] = logging.getLogger( + "pynisher" + ) else: logger = get_named_client_logger( name="pynisher", @@ -341,9 +370,9 @@ def run( additional_components=autosklearn.pipeline.components.base._addons, ) - if self.resampling_strategy != 'test': - obj_kwargs['resampling_strategy'] = self.resampling_strategy - obj_kwargs['resampling_strategy_args'] = self.resampling_strategy_args + if self.resampling_strategy != "test": + obj_kwargs["resampling_strategy"] = self.resampling_strategy + obj_kwargs["resampling_strategy_args"] = self.resampling_strategy_args try: obj = pynisher.enforce_limits(**arguments)(self.ta) @@ -351,31 +380,38 @@ def run( except Exception as e: exception_traceback = traceback.format_exc() error_message = repr(e) - additional_run_info.update({ - 'traceback': exception_traceback, - 'error': error_message - }) - return StatusType.CRASHED, self.worst_possible_result, 0.0, additional_run_info + additional_run_info.update( + {"traceback": exception_traceback, "error": error_message} + ) + return ( + StatusType.CRASHED, + self.worst_possible_result, + 0.0, + additional_run_info, + ) - if obj.exit_status in (pynisher.TimeoutException, pynisher.MemorylimitException): + if obj.exit_status in ( + pynisher.TimeoutException, + pynisher.MemorylimitException, + ): # Even if the pynisher thinks that a timeout or memout occured, # it can be that the target algorithm wrote something into the queue # - then we treat it as a succesful run try: info = autosklearn.evaluation.util.read_queue(queue) - result = info[-1]['loss'] - status = info[-1]['status'] - additional_run_info = info[-1]['additional_run_info'] + result = info[-1]["loss"] + status = info[-1]["status"] + additional_run_info = info[-1]["additional_run_info"] if obj.stdout: - additional_run_info['subprocess_stdout'] = 
obj.stdout + additional_run_info["subprocess_stdout"] = obj.stdout if obj.stderr: - additional_run_info['subprocess_stderr'] = obj.stderr + additional_run_info["subprocess_stderr"] = obj.stderr if obj.exit_status is pynisher.TimeoutException: - additional_run_info['info'] = 'Run stopped because of timeout.' + additional_run_info["info"] = "Run stopped because of timeout." elif obj.exit_status is pynisher.MemorylimitException: - additional_run_info['info'] = 'Run stopped because of memout.' + additional_run_info["info"] = "Run stopped because of memout." if status in [StatusType.SUCCESS, StatusType.DONOTADVANCE]: cost = result @@ -386,11 +422,13 @@ def run( info = None if obj.exit_status is pynisher.TimeoutException: status = StatusType.TIMEOUT - additional_run_info = {'error': 'Timeout'} + additional_run_info = {"error": "Timeout"} elif obj.exit_status is pynisher.MemorylimitException: status = StatusType.MEMOUT additional_run_info = { - "error": "Memout (used more than {} MB).".format(self.memory_limit) + "error": "Memout (used more than {} MB).".format( + self.memory_limit + ) } else: raise ValueError(obj.exit_status) @@ -400,99 +438,111 @@ def run( info = None status = StatusType.ABORT cost = self.worst_possible_result - additional_run_info = {'error': 'Your configuration of ' - 'auto-sklearn does not work!', - 'exit_status': _encode_exit_status(obj.exit_status), - 'subprocess_stdout': obj.stdout, - 'subprocess_stderr': obj.stderr, - } + additional_run_info = { + "error": "Your configuration of " "auto-sklearn does not work!", + "exit_status": _encode_exit_status(obj.exit_status), + "subprocess_stdout": obj.stdout, + "subprocess_stderr": obj.stderr, + } else: try: info = autosklearn.evaluation.util.read_queue(queue) - result = info[-1]['loss'] - status = info[-1]['status'] - additional_run_info = info[-1]['additional_run_info'] + result = info[-1]["loss"] + status = info[-1]["status"] + additional_run_info = info[-1]["additional_run_info"] if obj.exit_status == 0: cost = result else: status = StatusType.CRASHED cost = self.worst_possible_result - additional_run_info['info'] = 'Run treated as crashed ' \ - 'because the pynisher exit ' \ - 'status %s is unknown.' % \ - str(obj.exit_status) - additional_run_info['exit_status'] = _encode_exit_status(obj.exit_status) - additional_run_info['subprocess_stdout'] = obj.stdout - additional_run_info['subprocess_stderr'] = obj.stderr + additional_run_info["info"] = ( + "Run treated as crashed " + "because the pynisher exit " + "status %s is unknown." 
% str(obj.exit_status) + ) + additional_run_info["exit_status"] = _encode_exit_status( + obj.exit_status + ) + additional_run_info["subprocess_stdout"] = obj.stdout + additional_run_info["subprocess_stderr"] = obj.stderr except Empty: info = None additional_run_info = { - 'error': 'Result queue is empty', - 'exit_status': _encode_exit_status(obj.exit_status), - 'subprocess_stdout': obj.stdout, - 'subprocess_stderr': obj.stderr, - 'exitcode': obj.exitcode + "error": "Result queue is empty", + "exit_status": _encode_exit_status(obj.exit_status), + "subprocess_stdout": obj.stdout, + "subprocess_stderr": obj.stderr, + "exitcode": obj.exitcode, } status = StatusType.CRASHED cost = self.worst_possible_result if ( - (self.budget_type is None or budget == 0) - and status == StatusType.DONOTADVANCE - ): + self.budget_type is None or budget == 0 + ) and status == StatusType.DONOTADVANCE: status = StatusType.SUCCESS if not isinstance(additional_run_info, dict): - additional_run_info = {'message': additional_run_info} + additional_run_info = {"message": additional_run_info} if ( info is not None - and self.resampling_strategy in ('holdout-iterative-fit', 'cv-iterative-fit') + and self.resampling_strategy + in ("holdout-iterative-fit", "cv-iterative-fit") and status != StatusType.CRASHED ): learning_curve = autosklearn.evaluation.util.extract_learning_curve(info) learning_curve_runtime = autosklearn.evaluation.util.extract_learning_curve( - info, 'duration' + info, "duration" ) if len(learning_curve) > 1: - additional_run_info['learning_curve'] = learning_curve - additional_run_info['learning_curve_runtime'] = learning_curve_runtime + additional_run_info["learning_curve"] = learning_curve + additional_run_info["learning_curve_runtime"] = learning_curve_runtime train_learning_curve = autosklearn.evaluation.util.extract_learning_curve( - info, 'train_loss' + info, "train_loss" ) if len(train_learning_curve) > 1: - additional_run_info['train_learning_curve'] = train_learning_curve - additional_run_info['learning_curve_runtime'] = learning_curve_runtime + additional_run_info["train_learning_curve"] = train_learning_curve + additional_run_info["learning_curve_runtime"] = learning_curve_runtime if self._get_validation_loss: - validation_learning_curve = autosklearn.evaluation.util.extract_learning_curve( - info, 'validation_loss', + validation_learning_curve = ( + autosklearn.evaluation.util.extract_learning_curve( + info, + "validation_loss", + ) ) if len(validation_learning_curve) > 1: - additional_run_info['validation_learning_curve'] = \ - validation_learning_curve additional_run_info[ - 'learning_curve_runtime'] = learning_curve_runtime + "validation_learning_curve" + ] = validation_learning_curve + additional_run_info[ + "learning_curve_runtime" + ] = learning_curve_runtime if self._get_test_loss: - test_learning_curve = autosklearn.evaluation.util.extract_learning_curve( - info, 'test_loss', + test_learning_curve = ( + autosklearn.evaluation.util.extract_learning_curve( + info, + "test_loss", + ) ) if len(test_learning_curve) > 1: - additional_run_info['test_learning_curve'] = test_learning_curve + additional_run_info["test_learning_curve"] = test_learning_curve additional_run_info[ - 'learning_curve_runtime'] = learning_curve_runtime + "learning_curve_runtime" + ] = learning_curve_runtime if isinstance(config, int): - origin = 'DUMMY' + origin = "DUMMY" config_id = config else: - origin = getattr(config, 'origin', 'UNKNOWN') + origin = getattr(config, "origin", "UNKNOWN") config_id = config.config_id 
- additional_run_info['configuration_origin'] = origin + additional_run_info["configuration_origin"] = origin runtime = float(obj.wall_clock_time) diff --git a/autosklearn/evaluation/abstract_evaluator.py b/autosklearn/evaluation/abstract_evaluator.py index 36d51d7e0d..bc0be0e8d8 100644 --- a/autosklearn/evaluation/abstract_evaluator.py +++ b/autosklearn/evaluation/abstract_evaluator.py @@ -1,43 +1,36 @@ +from typing import Any, Dict, List, Optional, TextIO, Tuple, Type, Union, cast + import logging import multiprocessing import time import warnings -from typing import Any, Dict, List, Optional, TextIO, Tuple, Type, Union, cast import numpy as np - +from ConfigSpace import Configuration from sklearn.base import BaseEstimator from sklearn.dummy import DummyClassifier, DummyRegressor from sklearn.ensemble import VotingClassifier, VotingRegressor - from smac.tae import StatusType - from threadpoolctl import threadpool_limits -from autosklearn.automl_common.common.utils.backend import Backend - import autosklearn.pipeline.classification import autosklearn.pipeline.regression -from autosklearn.pipeline.components.base import ThirdPartyComponents, _addons +from autosklearn.automl_common.common.utils.backend import Backend from autosklearn.constants import ( CLASSIFICATION_TASKS, - REGRESSION_TASKS, - MULTILABEL_CLASSIFICATION, MULTICLASS_CLASSIFICATION, - MULTIOUTPUT_REGRESSION + MULTILABEL_CLASSIFICATION, + MULTIOUTPUT_REGRESSION, + REGRESSION_TASKS, ) +from autosklearn.metrics import Scorer, calculate_loss +from autosklearn.pipeline.components.base import ThirdPartyComponents, _addons from autosklearn.pipeline.implementations.util import ( - convert_multioutput_multiclass_to_multilabel + convert_multioutput_multiclass_to_multilabel, ) -from autosklearn.metrics import calculate_loss, Scorer from autosklearn.util.logging_ import PicklableClientLogger, get_named_client_logger -from ConfigSpace import Configuration - - -__all__ = [ - 'AbstractEvaluator' -] +__all__ = ["AbstractEvaluator"] # General TYPE definitions for numpy @@ -66,37 +59,39 @@ def __init__( self.exclude = exclude def pre_transform( - self, - X: np.ndarray, - y: np.ndarray, - fit_params: Optional[Dict[str, Any]] = None + self, X: np.ndarray, y: np.ndarray, fit_params: Optional[Dict[str, Any]] = None ) -> Tuple[np.ndarray, Dict[str, Any]]: # pylint: disable=R0201 if fit_params is None: fit_params = {} return X, fit_params - def fit(self, X: np.ndarray, y: np.ndarray, - sample_weight: Optional[Union[np.ndarray, List]] = None - ) -> DummyClassifier: - return super(MyDummyClassifier, self).fit(np.ones((X.shape[0], 1)), y, - sample_weight=sample_weight) + def fit( + self, + X: np.ndarray, + y: np.ndarray, + sample_weight: Optional[Union[np.ndarray, List]] = None, + ) -> DummyClassifier: + return super(MyDummyClassifier, self).fit( + np.ones((X.shape[0], 1)), y, sample_weight=sample_weight + ) - def fit_estimator(self, X: np.ndarray, y: np.ndarray, - fit_params: Optional[Dict[str, Any]] = None) -> DummyClassifier: + def fit_estimator( + self, X: np.ndarray, y: np.ndarray, fit_params: Optional[Dict[str, Any]] = None + ) -> DummyClassifier: return self.fit(X, y) - def predict_proba(self, X: np.ndarray, batch_size: int = 1000 - ) -> np.ndarray: + def predict_proba(self, X: np.ndarray, batch_size: int = 1000) -> np.ndarray: new_X = np.ones((X.shape[0], 1)) probas = super(MyDummyClassifier, self).predict_proba(new_X) - probas = convert_multioutput_multiclass_to_multilabel(probas).astype( - np.float32) + probas = 
convert_multioutput_multiclass_to_multilabel(probas).astype(np.float32) return probas def estimator_supports_iterative_fit(self) -> bool: # pylint: disable=R0201 return False - def get_additional_run_info(self) -> Optional[TYPE_ADDITIONAL_INFO]: # pylint: disable=R0201 + def get_additional_run_info( + self, + ) -> Optional[TYPE_ADDITIONAL_INFO]: # pylint: disable=R0201 return None @@ -112,9 +107,9 @@ def __init__( ): self.config = config if config == 1: - super(MyDummyRegressor, self).__init__(strategy='mean') + super(MyDummyRegressor, self).__init__(strategy="mean") else: - super(MyDummyRegressor, self).__init__(strategy='median') + super(MyDummyRegressor, self).__init__(strategy="median") self.random_state = random_state self.init_params = init_params self.dataset_properties = dataset_properties @@ -122,23 +117,25 @@ def __init__( self.exclude = exclude def pre_transform( - self, - X: np.ndarray, - y: np.ndarray, - fit_params: Optional[Dict[str, Any]] = None + self, X: np.ndarray, y: np.ndarray, fit_params: Optional[Dict[str, Any]] = None ) -> Tuple[np.ndarray, Dict[str, Any]]: # pylint: disable=R0201 if fit_params is None: fit_params = {} return X, fit_params - def fit(self, X: np.ndarray, y: np.ndarray, - sample_weight: Optional[Union[np.ndarray, List]] = None - ) -> DummyRegressor: - return super(MyDummyRegressor, self).fit(np.ones((X.shape[0], 1)), y, - sample_weight=sample_weight) + def fit( + self, + X: np.ndarray, + y: np.ndarray, + sample_weight: Optional[Union[np.ndarray, List]] = None, + ) -> DummyRegressor: + return super(MyDummyRegressor, self).fit( + np.ones((X.shape[0], 1)), y, sample_weight=sample_weight + ) - def fit_estimator(self, X: np.ndarray, y: np.ndarray, - fit_params: Optional[Dict[str, Any]] = None) -> DummyRegressor: + def fit_estimator( + self, X: np.ndarray, y: np.ndarray, fit_params: Optional[Dict[str, Any]] = None + ) -> DummyRegressor: return self.fit(X, y) def predict(self, X: np.ndarray, batch_size: int = 1000) -> np.ndarray: @@ -148,7 +145,9 @@ def predict(self, X: np.ndarray, batch_size: int = 1000) -> np.ndarray: def estimator_supports_iterative_fit(self) -> bool: # pylint: disable=R0201 return False - def get_additional_run_info(self) -> Optional[TYPE_ADDITIONAL_INFO]: # pylint: disable=R0201 + def get_additional_run_info( + self, + ) -> Optional[TYPE_ADDITIONAL_INFO]: # pylint: disable=R0201 return None @@ -156,7 +155,7 @@ def _fit_and_suppress_warnings( logger: Union[logging.Logger, PicklableClientLogger], model: BaseEstimator, X: np.ndarray, - y: np.ndarray + y: np.ndarray, ) -> BaseEstimator: def send_warnings_to_log( message: Union[Warning, str], @@ -166,8 +165,7 @@ def send_warnings_to_log( file: Optional[TextIO] = None, line: Optional[str] = None, ) -> None: - logger.debug('%s:%s: %s:%s' % - (filename, lineno, str(category), message)) + logger.debug("%s:%s: %s:%s" % (filename, lineno, str(category), message)) return with warnings.catch_warnings(): @@ -212,13 +210,13 @@ def __init__( self.include = include self.exclude = exclude - self.X_valid = self.datamanager.data.get('X_valid') - self.y_valid = self.datamanager.data.get('Y_valid') - self.X_test = self.datamanager.data.get('X_test') - self.y_test = self.datamanager.data.get('Y_test') + self.X_valid = self.datamanager.data.get("X_valid") + self.y_valid = self.datamanager.data.get("Y_valid") + self.X_test = self.datamanager.data.get("X_test") + self.y_test = self.datamanager.data.get("Y_test") self.metric = metric - self.task_type = self.datamanager.info['task'] + self.task_type = 
self.datamanager.info["task"] self.seed = seed self.output_y_hat_optimization = output_y_hat_optimization @@ -227,25 +225,26 @@ def __init__( if isinstance(disable_file_output, (bool, list)): self.disable_file_output: Union[bool, List[str]] = disable_file_output else: - raise ValueError('disable_file_output should be either a bool or a list') + raise ValueError("disable_file_output should be either a bool or a list") if self.task_type in REGRESSION_TASKS: if not isinstance(self.configuration, Configuration): self.model_class = MyDummyRegressor else: - self.model_class = \ + self.model_class = ( autosklearn.pipeline.regression.SimpleRegressionPipeline + ) self.predict_function = self._predict_regression else: if not isinstance(self.configuration, Configuration): self.model_class = MyDummyClassifier else: - self.model_class = autosklearn.pipeline.classification.SimpleClassificationPipeline + self.model_class = ( + autosklearn.pipeline.classification.SimpleClassificationPipeline + ) self.predict_function = self._predict_proba - self._init_params = { - 'data_preprocessor:feat_type': self.datamanager.feat_type - } + self._init_params = {"data_preprocessor:feat_type": self.datamanager.feat_type} if init_params is not None: self._init_params.update(init_params) @@ -254,8 +253,11 @@ def __init__( num_run = 0 self.num_run = num_run - logger_name = '%s(%d):%s' % (self.__class__.__name__.split('.')[-1], - self.seed, self.datamanager.name) + logger_name = "%s(%d):%s" % ( + self.__class__.__name__.split(".")[-1], + self.seed, + self.datamanager.name, + ) if self.port is None: self.logger = logging.getLogger(__name__) @@ -271,12 +273,14 @@ def __init__( self.budget = budget self.budget_type = budget_type - # Add 3rd-party components to the list of 3rd-party components in case this wasn't done - # before (this happens if we run in parallel and the components are only passed to the - # AbstractEvaluator via the TAE and are not there yet because the worker is in its own - # process). + # Add 3rd-party components to the list of 3rd-party components in case this + # wasn't done before (this happens if we run in parallel and the components + # are only passed to the AbstractEvaluator via the TAE and are not there + # yet because the worker is in its own process). 
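        # Editorial sketch, not part of this patch: `additional_components` is
        # assumed to map a component-type key (the same keys used by the
        # module-level `_addons` registry) to a `ThirdPartyComponents` holder
        # whose `.components` dict contains the user-registered classes, for
        # example (the key name here is purely illustrative):
        #
        #     additional_components = {"classification": third_party_holder}
        #
        # Because `_addons` lives at module level, a freshly spawned worker
        # process starts without any user-registered components, so the loop
        # below replays the registration; the membership check keeps it
        # idempotent.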
for key in additional_components: - for component_name, component in additional_components[key].components.items(): + for component_name, component in additional_components[ + key + ].components.items(): if component_name not in _addons[key].components: _addons[key].add_component(component) @@ -285,34 +289,41 @@ def __init__( def _get_model(self) -> BaseEstimator: if not isinstance(self.configuration, Configuration): - model = self.model_class(config=self.configuration, - random_state=self.seed, - init_params=self._init_params) + model = self.model_class( + config=self.configuration, + random_state=self.seed, + init_params=self._init_params, + ) else: if self.task_type in REGRESSION_TASKS: dataset_properties = { - 'task': self.task_type, - 'sparse': self.datamanager.info['is_sparse'] == 1, - 'multioutput': self.task_type == MULTIOUTPUT_REGRESSION, + "task": self.task_type, + "sparse": self.datamanager.info["is_sparse"] == 1, + "multioutput": self.task_type == MULTIOUTPUT_REGRESSION, } else: dataset_properties = { - 'task': self.task_type, - 'sparse': self.datamanager.info['is_sparse'] == 1, - 'multilabel': self.task_type == MULTILABEL_CLASSIFICATION, - 'multiclass': self.task_type == MULTICLASS_CLASSIFICATION, + "task": self.task_type, + "sparse": self.datamanager.info["is_sparse"] == 1, + "multilabel": self.task_type == MULTILABEL_CLASSIFICATION, + "multiclass": self.task_type == MULTICLASS_CLASSIFICATION, } - model = self.model_class(config=self.configuration, - dataset_properties=dataset_properties, - random_state=self.seed, - include=self.include, - exclude=self.exclude, - init_params=self._init_params) + model = self.model_class( + config=self.configuration, + dataset_properties=dataset_properties, + random_state=self.seed, + include=self.include, + exclude=self.exclude, + init_params=self._init_params, + ) return model - def _loss(self, y_true: np.ndarray, y_hat: np.ndarray, - scoring_functions: Optional[List[Scorer]] = None - ) -> Union[float, Dict[str, float]]: + def _loss( + self, + y_true: np.ndarray, + y_hat: np.ndarray, + scoring_functions: Optional[List[Scorer]] = None, + ) -> Union[float, Dict[str, float]]: """Auto-sklearn follows a minimization goal. The calculate_loss internally translate a score function to a minimization problem. 
@@ -324,9 +335,7 @@ def _loss(self, y_true: np.ndarray, y_hat: np.ndarray, y_true """ scoring_functions = ( - self.scoring_functions - if scoring_functions is None - else scoring_functions + self.scoring_functions if scoring_functions is None else scoring_functions ) if not isinstance(self.configuration, Configuration): if scoring_functions: @@ -335,8 +344,12 @@ def _loss(self, y_true: np.ndarray, y_hat: np.ndarray, return self.metric._worst_possible_result return calculate_loss( - y_true, y_hat, self.task_type, self.metric, - scoring_functions=scoring_functions) + y_true, + y_hat, + self.task_type, + self.metric, + scoring_functions=scoring_functions, + ) def finish_up( self, @@ -349,28 +362,35 @@ def finish_up( file_output: bool, final_call: bool, status: StatusType, - ) -> Tuple[float, Union[float, Dict[str, float]], int, - Dict[str, Union[str, int, float, Dict, List, Tuple]]]: - """This function does everything necessary after the fitting is done: + ) -> Tuple[ + float, + Union[float, Dict[str, float]], + int, + Dict[str, Union[str, int, float, Dict, List, Tuple]], + ]: + """Do everything necessary after the fitting is done: * predicting * saving the files for the ensembles_statistics * generate output for SMAC We use it as the signal handler so we can recycle the code for the - normal usecase and when the runsolver kills us here :)""" - + normal usecase and when the runsolver kills us here :) + """ self.duration = time.time() - self.starttime if file_output: file_out_loss, additional_run_info_ = self.file_output( - opt_pred, valid_pred, test_pred, + opt_pred, + valid_pred, + test_pred, ) else: file_out_loss = None additional_run_info_ = {} validation_loss, test_loss = self.calculate_auxiliary_losses( - valid_pred, test_pred, + valid_pred, + test_pred, ) if file_out_loss is not None: @@ -382,25 +402,25 @@ def finish_up( else: loss_ = {} - additional_run_info = ( - {} if additional_run_info is None else additional_run_info - ) + additional_run_info = {} if additional_run_info is None else additional_run_info for metric_name, value in loss_.items(): additional_run_info[metric_name] = value - additional_run_info['duration'] = self.duration - additional_run_info['num_run'] = self.num_run + additional_run_info["duration"] = self.duration + additional_run_info["num_run"] = self.num_run if train_loss is not None: - additional_run_info['train_loss'] = train_loss + additional_run_info["train_loss"] = train_loss if validation_loss is not None: - additional_run_info['validation_loss'] = validation_loss + additional_run_info["validation_loss"] = validation_loss if test_loss is not None: - additional_run_info['test_loss'] = test_loss + additional_run_info["test_loss"] = test_loss - rval_dict = {'loss': loss, - 'additional_run_info': additional_run_info, - 'status': status} + rval_dict = { + "loss": loss, + "additional_run_info": additional_run_info, + "status": status, + } if final_call: - rval_dict['final_queue_element'] = True + rval_dict["final_queue_element"] = True self.queue.put(rval_dict) return self.duration, loss_, self.seed, additional_run_info_ @@ -413,7 +433,8 @@ def calculate_auxiliary_losses( if Y_valid_pred is not None: if self.y_valid is not None: validation_loss: Optional[Union[float, Dict[str, float]]] = self._loss( - self.y_valid, Y_valid_pred) + self.y_valid, Y_valid_pred + ) if isinstance(validation_loss, dict): validation_loss = validation_loss[self.metric.name] else: @@ -424,7 +445,8 @@ def calculate_auxiliary_losses( if Y_test_pred is not None: if self.y_test is not None: 
test_loss: Optional[Union[float, Dict[str, float]]] = self._loss( - self.y_test, Y_test_pred) + self.y_test, Y_test_pred + ) if isinstance(test_loss, dict): test_loss = test_loss[self.metric.name] else: @@ -451,27 +473,24 @@ def file_output( return ( 1.0, { - 'error': - "Targets %s and prediction %s don't have " - "the same length. Probably training didn't " - "finish" % (np.shape(self.Y_optimization), Y_optimization_pred.shape) - }, + "error": "Targets %s and prediction %s don't have " + "the same length. Probably training didn't " + "finish" + % (np.shape(self.Y_optimization), Y_optimization_pred.shape) + }, ) # Abort if predictions contain NaNs for y, s in [ # Y_train_pred deleted here. Fix unittest accordingly. - [Y_optimization_pred, 'optimization'], - [Y_valid_pred, 'validation'], - [Y_test_pred, 'test'] + [Y_optimization_pred, "optimization"], + [Y_valid_pred, "validation"], + [Y_test_pred, "test"], ]: if y is not None and not np.all(np.isfinite(y)): return ( 1.0, - { - 'error': - 'Model predictions for %s set contains NaNs.' % s - }, + {"error": "Model predictions for %s set contains NaNs." % s}, ) # Abort if we don't want to output anything. @@ -489,17 +508,20 @@ def file_output( self.disable_file_output = cast(List, self.disable_file_output) # This file can be written independently of the others down bellow - if ('y_optimization' not in self.disable_file_output): + if "y_optimization" not in self.disable_file_output: if self.output_y_hat_optimization: self.backend.save_targets_ensemble(self.Y_optimization) models: Optional[BaseEstimator] = None - if hasattr(self, 'models'): - if len(self.models) > 0 and self.models[0] is not None: # type: ignore[attr-defined] - if ('models' not in self.disable_file_output): + if hasattr(self, "models"): + if len(self.models) > 0 and self.models[0] is not None: + if "models" not in self.disable_file_output: if self.task_type in CLASSIFICATION_TASKS: - models = VotingClassifier(estimators=None, voting='soft', ) + models = VotingClassifier( + estimators=None, + voting="soft", + ) else: models = VotingRegressor(estimators=None) # Mypy cannot understand hasattr yet @@ -509,24 +531,30 @@ def file_output( seed=self.seed, idx=self.num_run, budget=self.budget, - model=self.model if 'model' not in self.disable_file_output else None, - cv_model=models if 'cv_model' not in self.disable_file_output else None, + model=self.model if "model" not in self.disable_file_output else None, + cv_model=models if "cv_model" not in self.disable_file_output else None, ensemble_predictions=( - Y_optimization_pred if 'y_optimization' not in self.disable_file_output else None + Y_optimization_pred + if "y_optimization" not in self.disable_file_output + else None ), valid_predictions=( - Y_valid_pred if 'y_valid' not in self.disable_file_output else None + Y_valid_pred if "y_valid" not in self.disable_file_output else None ), test_predictions=( - Y_test_pred if 'y_test' not in self.disable_file_output else None + Y_test_pred if "y_test" not in self.disable_file_output else None ), ) return None, {} - def _predict_proba(self, X: np.ndarray, model: BaseEstimator, - task_type: int, Y_train: Optional[np.ndarray] = None, - ) -> np.ndarray: + def _predict_proba( + self, + X: np.ndarray, + model: BaseEstimator, + task_type: int, + Y_train: Optional[np.ndarray] = None, + ) -> np.ndarray: def send_warnings_to_log( message: Union[Warning, str], category: Type[Warning], @@ -535,8 +563,9 @@ def send_warnings_to_log( file: Optional[TextIO] = None, line: Optional[str] = None, ) -> None: - 
self.logger.debug('%s:%s: %s:%s' % - (filename, lineno, str(category), message)) + self.logger.debug( + "%s:%s: %s:%s" % (filename, lineno, str(category), message) + ) return with warnings.catch_warnings(): @@ -549,8 +578,13 @@ def send_warnings_to_log( Y_pred = self._ensure_prediction_array_sizes(Y_pred, Y_train) return Y_pred - def _predict_regression(self, X: np.ndarray, model: BaseEstimator, - task_type: int, Y_train: Optional[np.ndarray] = None) -> np.ndarray: + def _predict_regression( + self, + X: np.ndarray, + model: BaseEstimator, + task_type: int, + Y_train: Optional[np.ndarray] = None, + ) -> np.ndarray: def send_warnings_to_log( message: Union[Warning, str], category: Type[Warning], @@ -559,8 +593,9 @@ def send_warnings_to_log( file: Optional[TextIO] = None, line: Optional[str] = None, ) -> None: - self.logger.debug('%s:%s: %s:%s' % - (filename, lineno, str(category), message)) + self.logger.debug( + "%s:%s: %s:%s" % (filename, lineno, str(category), message) + ) return with warnings.catch_warnings(): @@ -572,14 +607,17 @@ def send_warnings_to_log( return Y_pred - def _ensure_prediction_array_sizes(self, prediction: np.ndarray, Y_train: np.ndarray - ) -> np.ndarray: - num_classes = self.datamanager.info['label_num'] + def _ensure_prediction_array_sizes( + self, prediction: np.ndarray, Y_train: np.ndarray + ) -> np.ndarray: + num_classes = self.datamanager.info["label_num"] - if self.task_type == MULTICLASS_CLASSIFICATION and \ - prediction.shape[1] < num_classes: + if ( + self.task_type == MULTICLASS_CLASSIFICATION + and prediction.shape[1] < num_classes + ): if Y_train is None: - raise ValueError('Y_train must not be None!') + raise ValueError("Y_train must not be None!") classes = list(np.unique(Y_train)) mapping = dict() @@ -587,8 +625,9 @@ def _ensure_prediction_array_sizes(self, prediction: np.ndarray, Y_train: np.nda if class_number in classes: index = classes.index(class_number) mapping[index] = class_number - new_predictions = np.zeros((prediction.shape[0], num_classes), - dtype=np.float32) + new_predictions = np.zeros( + (prediction.shape[0], num_classes), dtype=np.float32 + ) for index in mapping: class_index = mapping[index] diff --git a/autosklearn/evaluation/splitter.py b/autosklearn/evaluation/splitter.py index a18e29e08a..586d92c88f 100644 --- a/autosklearn/evaluation/splitter.py +++ b/autosklearn/evaluation/splitter.py @@ -1,31 +1,30 @@ import warnings import numpy as np - -from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold +from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit from sklearn.model_selection._split import _validate_shuffle_split -from sklearn.utils import indexable, check_random_state -from sklearn.utils import _approximate_mode -from sklearn.utils.validation import _num_samples, column_or_1d -from sklearn.utils.validation import check_array +from sklearn.utils import _approximate_mode, check_random_state, indexable from sklearn.utils.multiclass import type_of_target +from sklearn.utils.validation import _num_samples, check_array, column_or_1d class CustomStratifiedShuffleSplit(StratifiedShuffleSplit): - """Stratified ShuffleSplit cross-validator that deals with classes with too few samples - """ + """Splitter that deals with classes with too few samples""" def _iter_indices(self, X, y, groups=None): # type: ignore n_samples = _num_samples(X) y = check_array(y, ensure_2d=False, dtype=None) n_train, n_test = _validate_shuffle_split( - n_samples, self.test_size, self.train_size, - 
default_test_size=self._default_test_size) + n_samples, + self.test_size, + self.train_size, + default_test_size=self._default_test_size, + ) if y.ndim == 2: # for multi-label y, map each distinct row to a string repr # using join because str(row) uses an ellipsis if len(row) > 1000 - y = np.array([' '.join(row.astype('str')) for row in y]) + y = np.array([" ".join(row.astype("str")) for row in y]) classes, y_indices = np.unique(y, return_inverse=True) n_classes = classes.shape[0] @@ -33,18 +32,21 @@ def _iter_indices(self, X, y, groups=None): # type: ignore class_counts = np.bincount(y_indices) if n_train < n_classes: - raise ValueError('The train_size = %d should be greater or ' - 'equal to the number of classes = %d' % - (n_train, n_classes)) + raise ValueError( + "The train_size = %d should be greater or " + "equal to the number of classes = %d" % (n_train, n_classes) + ) if n_test < n_classes: - raise ValueError('The test_size = %d should be greater or ' - 'equal to the number of classes = %d' % - (n_test, n_classes)) + raise ValueError( + "The test_size = %d should be greater or " + "equal to the number of classes = %d" % (n_test, n_classes) + ) # Find the sorted list of instances for each class: # (np.unique above performs a sort, so code is O(n logn) already) - class_indices = np.split(np.argsort(y_indices, kind='mergesort'), - np.cumsum(class_counts)[:-1]) + class_indices = np.split( + np.argsort(y_indices, kind="mergesort"), np.cumsum(class_counts)[:-1] + ) rng = check_random_state(self.random_state) @@ -62,18 +64,18 @@ def _iter_indices(self, X, y, groups=None): # type: ignore # Each list n_i, t_i represent the list of class in the # training_set and test_set resepectively. # - # n_i = [100, 100, 0, 3] # 100 instance of class '0', 0 instance of class '2' - # t_i = [300, 300, 1, 3] # 300 instances of class '0', 1 instance of class '2' + # n_i = [100, 100, 0, 3] # 100 of class '0', 0 of class '2' + # t_i = [300, 300, 1, 3] # 300 of class '0', 1 of class '2' # # To support unique labels such as class '2', which only has one sample # between both n_i and t_i, we need to make sure that n_i has at least # one sample of all classes. There is also the extra check to ensure # that the sizes stay the same. 
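            # (Editorial worked example, not part of the original comment:
            # starting from n_i = [100, 100, 0, 3] and t_i = [300, 300, 1, 3],
            # the loop below claims the single sample of class '2' for the
            # training set and, to keep the split sizes unchanged, moves one
            # sample of the largest training class, class '0', into the test
            # set.  The result is the pair of arrays shown next.)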
            #
-            # n_i = [ 99, 100, 1, 3] # 100 instance of class '0', 0 instance of class '2'
+            # n_i = [ 99, 100, 1, 3]  # 99 of class '0', 1 of class '2'
            #          |              ^
            #          v              |
-            # t_i = [301, 300, 0, 3] # 300 instances of class '0', 1 instance of class '2'
+            # t_i = [301, 300, 0, 3]  # 301 of class '0', 0 of class '2'
            #
            for i, class_count in enumerate(n_i):
                if class_count == 0:
@@ -82,20 +84,21 @@ def _iter_indices(self, X, y, groups=None):  # type: ignore
                    j = np.argmax(n_i)
                    if n_i[j] == 1:
-                        warnings.warn("Can't respect size requirements for split.",
-                                      " The training set must contain all of the unique"
-                                      " labels that exist in the dataset.")
+                        warnings.warn(
+                            "Can't respect size requirements for split."
+                            " The training set must contain all of the unique"
+                            " labels that exist in the dataset."
+                        )
                    else:
                        n_i[j] -= 1
                        t_i[j] += 1

            for i in range(n_classes):
                permutation = rng.permutation(class_counts[i])
-                perm_indices_class_i = class_indices[i].take(permutation,
-                                                             mode='clip')
+                perm_indices_class_i = class_indices[i].take(permutation, mode="clip")

-                train.extend(perm_indices_class_i[:n_i[i]])
-                test.extend(perm_indices_class_i[n_i[i]:n_i[i] + t_i[i]])
+                train.extend(perm_indices_class_i[: n_i[i]])
+                test.extend(perm_indices_class_i[n_i[i] : n_i[i] + t_i[i]])

            train = rng.permutation(train)
            test = rng.permutation(test)

@@ -112,11 +115,13 @@ def _make_test_folds(self, X, y=None):  # type: ignore
        rng = check_random_state(self.random_state)
        y = np.asarray(y)
        type_of_target_y = type_of_target(y)
-        allowed_target_types = ('binary', 'multiclass')
+        allowed_target_types = ("binary", "multiclass")
        if type_of_target_y not in allowed_target_types:
            raise ValueError(
-                'Supported target types are: {}. Got {!r} instead.'.format(
-                    allowed_target_types, type_of_target_y))
+                "Supported target types are: {}. Got {!r} instead.".format(
+                    allowed_target_types, type_of_target_y
+                )
+            )

        y = column_or_1d(y)

@@ -134,13 +139,16 @@ def _make_test_folds(self, X, y=None):  # type: ignore
        # counts, but that code is unreadable.)
        y_order = np.sort(y_encoded)
        allocation = np.asarray(
-            [np.bincount(y_order[i::self.n_splits], minlength=n_classes)
-             for i in range(self.n_splits)])
+            [
+                np.bincount(y_order[i :: self.n_splits], minlength=n_classes)
+                for i in range(self.n_splits)
+            ]
+        )

        # To maintain the data order dependencies as best as possible within
        # the stratification constraint, we assign samples from each class in
        # blocks (and then mess that up when shuffle=True).
-        test_folds = np.empty(len(y), dtype='i')
+        test_folds = np.empty(len(y), dtype="i")
        for k in range(n_classes):
            # since the kth column of allocation stores the number of samples
            # of class k in each test set, this generates blocks of fold
@@ -157,12 +165,11 @@ def split(self, X, y=None, groups=None):  # type: ignore
        n_samples = _num_samples(X)
        if self.n_splits > n_samples:
            raise ValueError(
-                ("Cannot have number of splits n_splits={0} greater"
-                 " than the number of samples: n_samples={1}.")
-                .format(self.n_splits, n_samples))
+                f"Cannot have number of splits n_splits={self.n_splits} greater"
+                f" than the number of samples: n_samples={n_samples}."
+ ) for train, test in super().split(X, y, groups): - # print(len(np.unique(y)), len(np.unique(y[train])), len(np.unique(y[test]))) all_classes = np.unique(y) train_classes = np.unique(y[train]) train = list(train) @@ -179,11 +186,5 @@ def split(self, X, y=None, groups=None): # type: ignore # print(len(train), len(test)) train = np.array(train, dtype=int) test = np.array(test, dtype=int) - # print( - # len(np.unique(y)), - # len(np.unique(y[train])), - # len(np.unique(y[test])), - # len(train), len(test), - # ) yield train, test diff --git a/autosklearn/evaluation/test_evaluator.py b/autosklearn/evaluation/test_evaluator.py index 181ebce233..4b6cf8452c 100644 --- a/autosklearn/evaluation/test_evaluator.py +++ b/autosklearn/evaluation/test_evaluator.py @@ -1,31 +1,24 @@ # -*- encoding: utf-8 -*- -import multiprocessing from typing import Any, Dict, List, Optional, Tuple, Union -from ConfigSpace import Configuration +import multiprocessing import numpy as np - +from ConfigSpace import Configuration from smac.tae import StatusType from autosklearn.automl_common.common.utils.backend import Backend - from autosklearn.evaluation.abstract_evaluator import ( AbstractEvaluator, _fit_and_suppress_warnings, ) +from autosklearn.metrics import Scorer, calculate_loss from autosklearn.pipeline.components.base import ThirdPartyComponents -from autosklearn.metrics import calculate_loss, Scorer - -__all__ = [ - 'eval_t', - 'TestEvaluator' -] +__all__ = ["eval_t", "TestEvaluator"] class TestEvaluator(AbstractEvaluator): - def __init__( self, backend: Backend, @@ -55,15 +48,15 @@ def __init__( include=include, exclude=exclude, disable_file_output=disable_file_output, - init_params=init_params + init_params=init_params, ) self.configuration = configuration - self.X_train = self.datamanager.data['X_train'] - self.Y_train = self.datamanager.data['Y_train'] + self.X_train = self.datamanager.data["X_train"] + self.Y_train = self.datamanager.data["Y_train"] - self.X_test = self.datamanager.data.get('X_test') - self.Y_test = self.datamanager.data.get('Y_test') + self.X_test = self.datamanager.data.get("X_test") + self.Y_test = self.datamanager.data.get("Y_test") self.model = self._get_model() @@ -87,23 +80,27 @@ def predict_and_loss( ) -> Tuple[Union[Dict[str, float], float], np.array, Any, Any]: if train: - Y_pred = self.predict_function(self.X_train, self.model, - self.task_type, self.Y_train) + Y_pred = self.predict_function( + self.X_train, self.model, self.task_type, self.Y_train + ) err = calculate_loss( solution=self.Y_train, prediction=Y_pred, task_type=self.task_type, metric=self.metric, - scoring_functions=self.scoring_functions) + scoring_functions=self.scoring_functions, + ) else: - Y_pred = self.predict_function(self.X_test, self.model, - self.task_type, self.Y_train) + Y_pred = self.predict_function( + self.X_test, self.model, self.task_type, self.Y_train + ) err = calculate_loss( solution=self.Y_test, prediction=Y_pred, task_type=self.task_type, metric=self.metric, - scoring_functions=self.scoring_functions) + scoring_functions=self.scoring_functions, + ) return err, Y_pred, None, None @@ -129,14 +126,19 @@ def eval_t( budget: Optional[float] = None, budget_type: Optional[str] = None, ) -> None: - evaluator = TestEvaluator(configuration=config, - backend=backend, metric=metric, seed=seed, - port=port, - queue=queue, - scoring_functions=scoring_functions, - include=include, exclude=exclude, - disable_file_output=disable_file_output, - additional_components=additional_components, - 
init_params=init_params,) + evaluator = TestEvaluator( + configuration=config, + backend=backend, + metric=metric, + seed=seed, + port=port, + queue=queue, + scoring_functions=scoring_functions, + include=include, + exclude=exclude, + disable_file_output=disable_file_output, + additional_components=additional_components, + init_params=init_params, + ) evaluator.fit_predict_and_loss() diff --git a/autosklearn/evaluation/train_evaluator.py b/autosklearn/evaluation/train_evaluator.py index 558fdd3b67..7a047d3e10 100644 --- a/autosklearn/evaluation/train_evaluator.py +++ b/autosklearn/evaluation/train_evaluator.py @@ -1,54 +1,64 @@ -import logging -import multiprocessing -import warnings from typing import Any, Dict, List, Optional, Tuple, Union, cast import copy import json - -from ConfigSpace import Configuration +import logging +import multiprocessing +import warnings import numpy as np - -from smac.tae import TAEAbortException, StatusType - +from ConfigSpace import Configuration from sklearn.base import BaseEstimator -from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit, KFold, \ - StratifiedKFold, train_test_split, BaseCrossValidator, PredefinedSplit -from sklearn.model_selection._split import _RepeatedSplits, BaseShuffleSplit +from sklearn.model_selection import ( + BaseCrossValidator, + KFold, + PredefinedSplit, + ShuffleSplit, + StratifiedKFold, + StratifiedShuffleSplit, + train_test_split, +) +from sklearn.model_selection._split import BaseShuffleSplit, _RepeatedSplits +from smac.tae import StatusType, TAEAbortException from autosklearn.automl_common.common.utils.backend import Backend - -from autosklearn.evaluation.abstract_evaluator import ( - AbstractEvaluator, - TYPE_ADDITIONAL_INFO, - _fit_and_suppress_warnings, -) -from autosklearn.evaluation.splitter import CustomStratifiedShuffleSplit, CustomStratifiedKFold -from autosklearn.data.abstract_data_manager import AbstractDataManager from autosklearn.constants import ( CLASSIFICATION_TASKS, MULTILABEL_CLASSIFICATION, + MULTIOUTPUT_REGRESSION, REGRESSION_TASKS, - MULTIOUTPUT_REGRESSION ) -from autosklearn.data.validation import ( - SUPPORTED_FEAT_TYPES, - SUPPORTED_TARGET_TYPES, - ) -from autosklearn.pipeline.base import PIPELINE_DATA_DTYPE -from autosklearn.pipeline.components.base import IterativeComponent, ThirdPartyComponents +from autosklearn.data.abstract_data_manager import AbstractDataManager +from autosklearn.data.validation import SUPPORTED_FEAT_TYPES, SUPPORTED_TARGET_TYPES +from autosklearn.evaluation.abstract_evaluator import ( + TYPE_ADDITIONAL_INFO, + AbstractEvaluator, + _fit_and_suppress_warnings, +) +from autosklearn.evaluation.splitter import ( + CustomStratifiedKFold, + CustomStratifiedShuffleSplit, +) from autosklearn.metrics import Scorer +from autosklearn.pipeline.base import PIPELINE_DATA_DTYPE +from autosklearn.pipeline.components.base import ( + IterativeComponent, + ThirdPartyComponents, +) from autosklearn.util.logging_ import PicklableClientLogger - -__all__ = ['TrainEvaluator', 'eval_holdout', 'eval_iterative_holdout', - 'eval_cv', 'eval_partial_cv', 'eval_partial_cv_iterative'] +__all__ = [ + "TrainEvaluator", + "eval_holdout", + "eval_iterative_holdout", + "eval_cv", + "eval_partial_cv", + "eval_partial_cv_iterative", +] def _get_y_array(y: SUPPORTED_TARGET_TYPES, task_type: int) -> SUPPORTED_TARGET_TYPES: - if task_type in CLASSIFICATION_TASKS and task_type != \ - MULTILABEL_CLASSIFICATION: + if task_type in CLASSIFICATION_TASKS and task_type != MULTILABEL_CLASSIFICATION: return 
y.ravel() else: return y @@ -58,29 +68,26 @@ def subsample_indices( train_indices: List[int], subsample: Optional[float], task_type: int, - Y_train: SUPPORTED_TARGET_TYPES + Y_train: SUPPORTED_TARGET_TYPES, ) -> List[int]: if not isinstance(subsample, float): raise ValueError( - 'Subsample must be of type float, but is of type %s' - % type(subsample) + "Subsample must be of type float, but is of type %s" % type(subsample) ) elif subsample > 1: - raise ValueError( - 'Subsample must not be larger than 1, but is %f' - % subsample - ) + raise ValueError("Subsample must not be larger than 1, but is %f" % subsample) if subsample is not None and subsample < 1: # Only subsample if there are more indices given to this method than # required to subsample because otherwise scikit-learn will complain if task_type in CLASSIFICATION_TASKS and task_type != MULTILABEL_CLASSIFICATION: - stratify: Optional[ - SUPPORTED_TARGET_TYPES - ] = Y_train.iloc[train_indices] if hasattr( - Y_train, 'iloc') else Y_train[train_indices] + stratify: Optional[SUPPORTED_TARGET_TYPES] = ( + Y_train.iloc[train_indices] + if hasattr(Y_train, "iloc") + else Y_train[train_indices] + ) else: stratify = None @@ -109,40 +116,55 @@ def _fit_with_budget( task_type: int, ) -> None: if ( - budget_type == 'iterations' - or budget_type == 'mixed' and model.estimator_supports_iterative_fit() + budget_type == "iterations" + or budget_type == "mixed" + and model.estimator_supports_iterative_fit() ): if model.estimator_supports_iterative_fit(): budget_factor = model.get_max_iter() Xt, fit_params = model.fit_transformer( - X_train.iloc[train_indices] if hasattr(X_train, 'iloc') else X_train[train_indices], - Y_train.iloc[train_indices] if hasattr(Y_train, 'iloc') else Y_train[train_indices], + X_train.iloc[train_indices] + if hasattr(X_train, "iloc") + else X_train[train_indices], + Y_train.iloc[train_indices] + if hasattr(Y_train, "iloc") + else Y_train[train_indices], ) n_iter = int(np.ceil(budget / 100 * budget_factor)) model.iterative_fit( Xt, - Y_train.iloc[train_indices] if hasattr(Y_train, 'iloc') else Y_train[train_indices], + Y_train.iloc[train_indices] + if hasattr(Y_train, "iloc") + else Y_train[train_indices], n_iter=n_iter, refit=True, - **fit_params + **fit_params, ) else: _fit_and_suppress_warnings( logger, model, - X_train.iloc[train_indices] if hasattr(X_train, 'iloc') else X_train[train_indices], - Y_train.iloc[train_indices] if hasattr(Y_train, 'iloc') else Y_train[train_indices], + X_train.iloc[train_indices] + if hasattr(X_train, "iloc") + else X_train[train_indices], + Y_train.iloc[train_indices] + if hasattr(Y_train, "iloc") + else Y_train[train_indices], ) elif ( - budget_type == 'subsample' - or budget_type == 'mixed' and not model.estimator_supports_iterative_fit() + budget_type == "subsample" + or budget_type == "mixed" + and not model.estimator_supports_iterative_fit() ): subsample = budget / 100 train_indices_subset = subsample_indices( - train_indices, subsample, task_type, Y_train, + train_indices, + subsample, + task_type, + Y_train, ) _fit_and_suppress_warnings( logger, @@ -167,9 +189,12 @@ def __init__( scoring_functions: Optional[List[Scorer]] = None, seed: int = 1, output_y_hat_optimization: bool = True, - resampling_strategy: Optional[Union[str, BaseCrossValidator, - _RepeatedSplits, BaseShuffleSplit]] = None, - resampling_strategy_args: Optional[Dict[str, Optional[Union[float, int, str]]]] = None, + resampling_strategy: Optional[ + Union[str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit] + ] = 
None, + resampling_strategy_args: Optional[ + Dict[str, Optional[Union[float, int, str]]] + ] = None, num_run: Optional[int] = None, budget: Optional[float] = None, budget_type: Optional[str] = None, @@ -206,15 +231,17 @@ def __init__( self.resampling_strategy_args = resampling_strategy_args self.splitter = self.get_splitter(self.datamanager) self.num_cv_folds = self.splitter.get_n_splits( - groups=self.resampling_strategy_args.get('groups') + groups=self.resampling_strategy_args.get("groups") ) - self.X_train = self.datamanager.data['X_train'] - self.Y_train = self.datamanager.data['Y_train'] + self.X_train = self.datamanager.data["X_train"] + self.Y_train = self.datamanager.data["Y_train"] self.Y_optimization: Optional[SUPPORTED_TARGET_TYPES] = None self.Y_targets = [None] * self.num_cv_folds self.Y_train_targets = np.ones(self.Y_train.shape) * np.NaN self.models = [None] * self.num_cv_folds - self.indices: List[Optional[Tuple[List[int], List[int]]]] = [None] * self.num_cv_folds + self.indices: List[Optional[Tuple[List[int], List[int]]]] = [ + None + ] * self.num_cv_folds # Necessary for full CV. Makes full CV not write predictions if only # a subset of folds is evaluated but time is up. Complicated, because @@ -225,8 +252,8 @@ def __init__( def fit_predict_and_loss(self, iterative: bool = False) -> None: """Fit, predict and compute the loss for cross-validation and - holdout (both iterative and non-iterative)""" - + holdout (both iterative and non-iterative) + """ # Define beforehand for mypy additional_run_info: Optional[TYPE_ADDITIONAL_INFO] = None @@ -234,14 +261,18 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: if self.num_cv_folds == 1: for train_split, test_split in self.splitter.split( - self.X_train, self.Y_train, - groups=self.resampling_strategy_args.get('groups') + self.X_train, + self.Y_train, + groups=self.resampling_strategy_args.get("groups"), ): self.Y_optimization = self.Y_train[test_split] self.Y_actual_train = self.Y_train[train_split] - self._partial_fit_and_predict_iterative(0, train_indices=train_split, - test_indices=test_split, - add_model_to_self=True) + self._partial_fit_and_predict_iterative( + 0, + train_indices=train_split, + test_indices=test_split, + add_model_to_self=True, + ) else: # Test if the model allows for an iterative fit, if not, @@ -266,16 +297,24 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: total_n_iterations = [0] * self.num_cv_folds # model.estimator_supports_iterative_fit -> true # After the if above, we know estimator support iterative fit - model_max_iter = [cast(IterativeComponent, model).get_max_iter() - for model in self.models] - - if self.budget_type in ['iterations', 'mixed'] and self.budget is None: - raise ValueError(f"When budget type is {self.budget_type} the budget " - "can not be None") + model_max_iter = [ + cast(IterativeComponent, model).get_max_iter() + for model in self.models + ] + + if self.budget_type in ["iterations", "mixed"] and self.budget is None: + raise ValueError( + f"When budget type is {self.budget_type} the budget " + "can not be None" + ) - if self.budget_type in ['iterations', 'mixed'] and cast(float, self.budget) > 0: + if ( + self.budget_type in ["iterations", "mixed"] + and cast(float, self.budget) > 0 + ): max_n_iter_budget = int( - np.ceil(cast(float, self.budget) / 100 * model_max_iter[0])) + np.ceil(cast(float, self.budget) / 100 * model_max_iter[0]) + ) max_iter = min(model_max_iter[0], max_n_iter_budget) else: max_iter = model_max_iter[0] @@ -283,7 +322,9 
@@ def fit_predict_and_loss(self, iterative: bool = False) -> None: models_current_iters = [0] * self.num_cv_folds Xt_array = [None] * self.num_cv_folds - fit_params_array = [{}] * self.num_cv_folds # type: List[Dict[str, Any]] + fit_params_array = [ + {} + ] * self.num_cv_folds # type: List[Dict[str, Any]] y = _get_y_array(self.Y_train, self.task_type) @@ -300,26 +341,33 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: splitter = self.get_splitter(self.datamanager) - for i, (train_indices, test_indices) in enumerate(splitter.split( - self.X_train, y, - groups=self.resampling_strategy_args.get('groups') - )): + for i, (train_indices, test_indices) in enumerate( + splitter.split( + self.X_train, + y, + groups=self.resampling_strategy_args.get("groups"), + ) + ): if converged[i]: continue model = self.models[i] if iterations[i] == 1: - self.Y_train_targets[train_indices] = \ - self.Y_train.iloc[train_indices] if hasattr( - self.Y_train, 'iloc') else self.Y_train[train_indices] + self.Y_train_targets[train_indices] = ( + self.Y_train.iloc[train_indices] + if hasattr(self.Y_train, "iloc") + else self.Y_train[train_indices] + ) self.Y_targets[i] = self.Y_train[test_indices] Xt, fit_params = model.fit_transformer( - self.X_train.iloc[train_indices] if hasattr( - self.X_train, 'iloc') else self.X_train[train_indices], - self.Y_train.iloc[train_indices] if hasattr( - self.Y_train, 'iloc') else self.Y_train[train_indices], + self.X_train.iloc[train_indices] + if hasattr(self.X_train, "iloc") + else self.X_train[train_indices], + self.Y_train.iloc[train_indices] + if hasattr(self.Y_train, "iloc") + else self.Y_train[train_indices], ) Xt_array[i] = Xt fit_params_array[i] = fit_params @@ -328,17 +376,14 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: model.iterative_fit( Xt_array[i], - self.Y_train.iloc[train_indices] if hasattr( - self.Y_train, 'iloc') else self.Y_train[train_indices], - n_iter=n_iter, **fit_params_array[i] + self.Y_train.iloc[train_indices] + if hasattr(self.Y_train, "iloc") + else self.Y_train[train_indices], + n_iter=n_iter, + **fit_params_array[i], ) - ( - train_pred, - opt_pred, - valid_pred, - test_pred - ) = self._predict( + (train_pred, opt_pred, valid_pred, test_pred) = self._predict( model, train_indices=train_indices, test_indices=test_indices, @@ -353,13 +398,14 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: # Compute train loss of this fold and store it. train_loss could # either be a scalar or a dict of scalars with metrics as keys. train_loss = self._loss( - self.Y_train.iloc[train_indices] if hasattr( - self.Y_train, 'iloc') else self.Y_train[train_indices], + self.Y_train.iloc[train_indices] + if hasattr(self.Y_train, "iloc") + else self.Y_train[train_indices], train_pred, ) train_losses[i] = train_loss - # number of training data points for this fold. Used for weighting - # the average. + # Number of training data points for this fold. + # Used for weighting the average. train_fold_weights[i] = len(train_indices) # Compute validation loss of this fold and store it. @@ -382,8 +428,8 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: iterations[i] = iterations[i] + 1 - # Compute weights of each fold based on the number of samples in each - # fold. + # Compute weights of each fold based on the number of samples + # in each fold. 
train_fold_weights_percentage = [ w / sum(train_fold_weights) for w in train_fold_weights ] @@ -395,12 +441,17 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: # dicts, then train_loss is computed using the target metric # (self.metric). if all(isinstance(elem, dict) for elem in train_losses): - train_loss = np.average([train_losses[i][str(self.metric)] - for i in range(self.num_cv_folds)], - weights=train_fold_weights_percentage, - ) + train_loss = np.average( + [ + train_losses[i][str(self.metric)] + for i in range(self.num_cv_folds) + ], + weights=train_fold_weights_percentage, + ) else: - train_loss = np.average(train_losses, weights=train_fold_weights_percentage) + train_loss = np.average( + train_losses, weights=train_fold_weights_percentage + ) # if all_scoring_function is true, return a dict of opt_loss. # Otherwise, return a scalar. @@ -415,23 +466,36 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: weights=opt_fold_weights_percentage, ) else: - opt_loss = np.average(opt_losses, weights=opt_fold_weights_percentage) + opt_loss = np.average( + opt_losses, weights=opt_fold_weights_percentage + ) Y_targets = self.Y_targets Y_train_targets = self.Y_train_targets Y_optimization_preds = np.concatenate( - [Y_optimization_pred[i] for i in range(self.num_cv_folds) - if Y_optimization_pred[i] is not None]) - Y_targets = np.concatenate([ - Y_targets[i] for i in range(self.num_cv_folds) - if Y_targets[i] is not None - ]) + [ + Y_optimization_pred[i] + for i in range(self.num_cv_folds) + if Y_optimization_pred[i] is not None + ] + ) + Y_targets = np.concatenate( + [ + Y_targets[i] + for i in range(self.num_cv_folds) + if Y_targets[i] is not None + ] + ) if self.X_valid is not None: - Y_valid_preds = np.array([Y_valid_pred[i] - for i in range(self.num_cv_folds) - if Y_valid_pred[i] is not None]) + Y_valid_preds = np.array( + [ + Y_valid_pred[i] + for i in range(self.num_cv_folds) + if Y_valid_pred[i] is not None + ] + ) # Average the predictions of several models if len(Y_valid_preds.shape) == 3: Y_valid_preds = np.nanmean(Y_valid_preds, axis=0) @@ -439,9 +503,13 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: Y_valid_preds = None if self.X_test is not None: - Y_test_preds = np.array([Y_test_pred[i] - for i in range(self.num_cv_folds) - if Y_test_pred[i] is not None]) + Y_test_preds = np.array( + [ + Y_test_pred[i] + for i in range(self.num_cv_folds) + if Y_test_pred[i] is not None + ] + ) # Average the predictions of several models if len(Y_test_preds.shape) == 3: Y_test_preds = np.nanmean(Y_test_preds, axis=0) @@ -453,8 +521,12 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: self.model = self._get_model() status = StatusType.DONOTADVANCE - if any([model_current_iter == max_iter - for model_current_iter in models_current_iters]): + if any( + [ + model_current_iter == max_iter + for model_current_iter in models_current_iters + ] + ): status = StatusType.SUCCESS self.finish_up( loss=opt_loss, @@ -488,10 +560,11 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: # TODO: mention that no additional run info is possible in this # case! -> maybe remove full CV from the train evaluator anyway and # make the user implement this! 
- for i, (train_split, test_split) in enumerate(self.splitter.split( - self.X_train, y, - groups=self.resampling_strategy_args.get('groups') - )): + for i, (train_split, test_split) in enumerate( + self.splitter.split( + self.X_train, y, groups=self.resampling_strategy_args.get("groups") + ) + ): # TODO add check that split is actually an integer array, # not a boolean array (to allow indexed assignement of @@ -504,11 +577,11 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: valid_pred, test_pred, additional_run_info, - ) = ( - self._partial_fit_and_predict_standard( - i, train_indices=train_split, test_indices=test_split, - add_model_to_self=self.num_cv_folds == 1, - ) + ) = self._partial_fit_and_predict_standard( + i, + train_indices=train_split, + test_indices=test_split, + add_model_to_self=self.num_cv_folds == 1, ) else: ( @@ -517,11 +590,11 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: valid_pred, test_pred, additional_run_info, - ) = ( - self._partial_fit_and_predict_budget( - i, train_indices=train_split, test_indices=test_split, - add_model_to_self=self.num_cv_folds == 1, - ) + ) = self._partial_fit_and_predict_budget( + i, + train_indices=train_split, + test_indices=test_split, + add_model_to_self=self.num_cv_folds == 1, ) if ( @@ -531,8 +604,8 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: ): raise TAEAbortException( 'Found additional run info "%s" in fold %d, ' - 'but cannot handle additional run info if fold >= 1.' % - (additional_run_info, i) + "but cannot handle additional run info if fold >= 1." + % (additional_run_info, i) ) Y_train_pred[i] = train_pred @@ -564,16 +637,21 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: # Compute weights of each fold based on the number of samples in each # fold. - train_fold_weights = [w / sum(train_fold_weights) for w in train_fold_weights] + train_fold_weights = [ + w / sum(train_fold_weights) for w in train_fold_weights + ] opt_fold_weights = [w / sum(opt_fold_weights) for w in opt_fold_weights] # train_losses is a list of either scalars or dicts. If it contains dicts, # then train_loss is computed using the target metric (self.metric). 
if all(isinstance(elem, dict) for elem in train_losses): - train_loss = np.average([train_losses[i][str(self.metric)] - for i in range(self.num_cv_folds)], - weights=train_fold_weights, - ) + train_loss = np.average( + [ + train_losses[i][str(self.metric)] + for i in range(self.num_cv_folds) + ], + weights=train_fold_weights, + ) else: train_loss = np.average(train_losses, weights=train_fold_weights) @@ -582,10 +660,10 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: if self.scoring_functions: opt_loss = {} for metric in opt_losses[0].keys(): - opt_loss[metric] = np.average([opt_losses[i][metric] - for i in range(self.num_cv_folds)], - weights=opt_fold_weights, - ) + opt_loss[metric] = np.average( + [opt_losses[i][metric] for i in range(self.num_cv_folds)], + weights=opt_fold_weights, + ) else: opt_loss = np.average(opt_losses, weights=opt_fold_weights) @@ -593,23 +671,40 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: Y_train_targets = self.Y_train_targets Y_optimization_pred = np.concatenate( - [Y_optimization_pred[i] for i in range(self.num_cv_folds) - if Y_optimization_pred[i] is not None]) - Y_targets = np.concatenate([Y_targets[i] for i in range(self.num_cv_folds) - if Y_targets[i] is not None]) + [ + Y_optimization_pred[i] + for i in range(self.num_cv_folds) + if Y_optimization_pred[i] is not None + ] + ) + Y_targets = np.concatenate( + [ + Y_targets[i] + for i in range(self.num_cv_folds) + if Y_targets[i] is not None + ] + ) if self.X_valid is not None: - Y_valid_pred = np.array([Y_valid_pred[i] - for i in range(self.num_cv_folds) - if Y_valid_pred[i] is not None]) + Y_valid_pred = np.array( + [ + Y_valid_pred[i] + for i in range(self.num_cv_folds) + if Y_valid_pred[i] is not None + ] + ) # Average the predictions of several models if len(np.shape(Y_valid_pred)) == 3: Y_valid_pred = np.nanmean(Y_valid_pred, axis=0) if self.X_test is not None: - Y_test_pred = np.array([Y_test_pred[i] - for i in range(self.num_cv_folds) - if Y_test_pred[i] is not None]) + Y_test_pred = np.array( + [ + Y_test_pred[i] + for i in range(self.num_cv_folds) + if Y_test_pred[i] is not None + ] + ) # Average the predictions of several models if len(np.shape(Y_test_pred)) == 3: Y_test_pred = np.nanmean(Y_test_pred, axis=0) @@ -625,8 +720,8 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: # TODO check if there might be reasons for do-not-advance here! status = StatusType.SUCCESS elif ( - self.budget_type == 'iterations' - or self.budget_type == 'mixed' + self.budget_type == "iterations" + or self.budget_type == "mixed" and self.model.estimator_supports_iterative_fit() ): budget_factor = self.model.get_max_iter() @@ -661,19 +756,21 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: ) def partial_fit_predict_and_loss(self, fold: int, iterative: bool = False) -> None: - """Fit, predict and compute the loss for eval_partial_cv (both iterative and normal)""" - + """Fit, predict and get loss for eval_partial_cv (iterative and normal)""" if fold > self.num_cv_folds: - raise ValueError('Cannot evaluate a fold %d which is higher than ' - 'the number of folds %d.' % (fold, self.num_cv_folds)) + raise ValueError( + "Cannot evaluate a fold %d which is higher than " + "the number of folds %d." 
% (fold, self.num_cv_folds) + ) if self.budget_type is not None: raise NotImplementedError() y = _get_y_array(self.Y_train, self.task_type) - for i, (train_split, test_split) in enumerate(self.splitter.split( - self.X_train, y, - groups=self.resampling_strategy_args.get('groups') - )): + for i, (train_split, test_split) in enumerate( + self.splitter.split( + self.X_train, y, groups=self.resampling_strategy_args.get("groups") + ) + ): if i != fold: continue else: @@ -685,18 +782,25 @@ def partial_fit_predict_and_loss(self, fold: int, iterative: bool = False) -> No if iterative: self._partial_fit_and_predict_iterative( - fold, train_indices=train_split, test_indices=test_split, - add_model_to_self=True) + fold, + train_indices=train_split, + test_indices=test_split, + add_model_to_self=True, + ) elif self.budget_type is not None: raise NotImplementedError() else: - train_pred, opt_pred, valid_pred, test_pred, additional_run_info = ( - self._partial_fit_and_predict_standard( - fold, - train_indices=train_split, - test_indices=test_split, - add_model_to_self=True, - ) + ( + train_pred, + opt_pred, + valid_pred, + test_pred, + additional_run_info, + ) = self._partial_fit_and_predict_standard( + fold, + train_indices=train_split, + test_indices=test_split, + add_model_to_self=True, ) train_loss = self._loss(self.Y_actual_train, train_pred) loss = self._loss(self.Y_targets[fold], opt_pred) @@ -720,15 +824,19 @@ def partial_fit_predict_and_loss(self, fold: int, iterative: bool = False) -> No file_output=False, final_call=True, additional_run_info=None, - status=status + status=status, ) - def _partial_fit_and_predict_iterative(self, fold: int, train_indices: List[int], - test_indices: List[int], - add_model_to_self: bool) -> None: + def _partial_fit_and_predict_iterative( + self, + fold: int, + train_indices: List[int], + test_indices: List[int], + add_model_to_self: bool, + ) -> None: model = self._get_model() - self.indices[fold] = ((train_indices, test_indices)) + self.indices[fold] = (train_indices, test_indices) # Do only output the files in the case of iterative holdout, # In case of iterative partial cv, no file output is needed @@ -737,14 +845,19 @@ def _partial_fit_and_predict_iterative(self, fold: int, train_indices: List[int] if model.estimator_supports_iterative_fit(): Xt, fit_params = model.fit_transformer( - self.X_train.iloc[train_indices] if hasattr( - self.Y_train, 'iloc') else self.X_train[train_indices], - self.Y_train.iloc[train_indices] if hasattr( - self.Y_train, 'iloc') else self.Y_train[train_indices], + self.X_train.iloc[train_indices] + if hasattr(self.Y_train, "iloc") + else self.X_train[train_indices], + self.Y_train.iloc[train_indices] + if hasattr(self.Y_train, "iloc") + else self.Y_train[train_indices], ) - self.Y_train_targets[train_indices] = self.Y_train.iloc[train_indices] if hasattr( - self.Y_train, 'iloc') else self.Y_train[train_indices] + self.Y_train_targets[train_indices] = ( + self.Y_train.iloc[train_indices] + if hasattr(self.Y_train, "iloc") + else self.Y_train[train_indices] + ) iteration = 1 total_n_iteration = 0 @@ -760,19 +873,21 @@ def _partial_fit_and_predict_iterative(self, fold: int, train_indices: List[int] while ( not model.configuration_fully_fitted() and model_current_iter < max_iter ): - n_iter = int(2**iteration/2) if iteration > 1 else 2 + n_iter = int(2**iteration / 2) if iteration > 1 else 2 total_n_iteration += n_iter model.iterative_fit( Xt, - self.Y_train.iloc[train_indices] if hasattr( - self.Y_train, 'iloc') else 
self.Y_train[train_indices], - n_iter=n_iter, **fit_params + self.Y_train.iloc[train_indices] + if hasattr(self.Y_train, "iloc") + else self.Y_train[train_indices], + n_iter=n_iter, + **fit_params, ) ( Y_train_pred, Y_optimization_pred, Y_valid_pred, - Y_test_pred + Y_test_pred, ) = self._predict( model, train_indices=train_indices, @@ -783,9 +898,10 @@ def _partial_fit_and_predict_iterative(self, fold: int, train_indices: List[int] self.model = model train_loss = self._loss( - self.Y_train.iloc[train_indices] if hasattr( - self.Y_train, 'iloc') else self.Y_train[train_indices], - Y_train_pred + self.Y_train.iloc[train_indices] + if hasattr(self.Y_train, "iloc") + else self.Y_train[train_indices], + Y_train_pred, ) loss = self._loss(self.Y_train[test_indices], Y_optimization_pred) additional_run_info = model.get_additional_run_info() @@ -822,13 +938,15 @@ def _partial_fit_and_predict_iterative(self, fold: int, train_indices: List[int] Y_optimization_pred, Y_valid_pred, Y_test_pred, - additional_run_info - ) = self._partial_fit_and_predict_standard(fold, train_indices, test_indices, - add_model_to_self) + additional_run_info, + ) = self._partial_fit_and_predict_standard( + fold, train_indices, test_indices, add_model_to_self + ) train_loss = self._loss( - self.Y_train.iloc[train_indices] if hasattr( - self.Y_train, 'iloc') else self.Y_train[train_indices], - Y_train_pred + self.Y_train.iloc[train_indices] + if hasattr(self.Y_train, "iloc") + else self.Y_train[train_indices], + Y_train_pred, ) loss = self._loss(self.Y_train[test_indices], Y_optimization_pred) if self.model.estimator_supports_iterative_fit(): @@ -855,25 +973,30 @@ def _partial_fit_and_predict_iterative(self, fold: int, train_indices: List[int] def _partial_fit_and_predict_standard( self, - fold: int, train_indices: List[int], + fold: int, + train_indices: List[int], test_indices: List[int], - add_model_to_self: bool = False - ) -> Tuple[PIPELINE_DATA_DTYPE, # train_pred - PIPELINE_DATA_DTYPE, # opt_pred - PIPELINE_DATA_DTYPE, # valid_pred - PIPELINE_DATA_DTYPE, # test_pred - TYPE_ADDITIONAL_INFO]: + add_model_to_self: bool = False, + ) -> Tuple[ + PIPELINE_DATA_DTYPE, # train_pred + PIPELINE_DATA_DTYPE, # opt_pred + PIPELINE_DATA_DTYPE, # valid_pred + PIPELINE_DATA_DTYPE, # test_pred + TYPE_ADDITIONAL_INFO, + ]: model = self._get_model() - self.indices[fold] = ((train_indices, test_indices)) + self.indices[fold] = (train_indices, test_indices) _fit_and_suppress_warnings( self.logger, model, - self.X_train.iloc[train_indices] if hasattr( - self.X_train, 'iloc') else self.X_train[train_indices], - self.Y_train.iloc[train_indices] if hasattr( - self.Y_train, 'iloc') else self.Y_train[train_indices], + self.X_train.iloc[train_indices] + if hasattr(self.X_train, "iloc") + else self.X_train[train_indices], + self.Y_train.iloc[train_indices] + if hasattr(self.Y_train, "iloc") + else self.Y_train[train_indices], ) if add_model_to_self: @@ -881,10 +1004,16 @@ def _partial_fit_and_predict_standard( else: self.models[fold] = model - self.Y_targets[fold] = self.Y_train.iloc[test_indices] if hasattr( - self.Y_train, 'iloc') else self.Y_train[test_indices] - self.Y_train_targets[train_indices] = self.Y_train.iloc[train_indices] if hasattr( - self.Y_train, 'iloc') else self.Y_train[train_indices] + self.Y_targets[fold] = ( + self.Y_train.iloc[test_indices] + if hasattr(self.Y_train, "iloc") + else self.Y_train[test_indices] + ) + self.Y_train_targets[train_indices] = ( + self.Y_train.iloc[train_indices] + if hasattr(self.Y_train, "iloc") + 
else self.Y_train[train_indices] + ) train_pred, opt_pred, valid_pred, test_pred = self._predict( model=model, @@ -902,24 +1031,30 @@ def _partial_fit_and_predict_standard( def _partial_fit_and_predict_budget( self, - fold: int, train_indices: List[int], + fold: int, + train_indices: List[int], test_indices: List[int], add_model_to_self: bool = False, - ) -> Tuple[PIPELINE_DATA_DTYPE, # train_pred - PIPELINE_DATA_DTYPE, # opt_pred - PIPELINE_DATA_DTYPE, # valid_pred - PIPELINE_DATA_DTYPE, # test_pred - TYPE_ADDITIONAL_INFO]: + ) -> Tuple[ + PIPELINE_DATA_DTYPE, # train_pred + PIPELINE_DATA_DTYPE, # opt_pred + PIPELINE_DATA_DTYPE, # valid_pred + PIPELINE_DATA_DTYPE, # test_pred + TYPE_ADDITIONAL_INFO, + ]: # This function is only called in the event budget is not None # Add this statement for mypy assert self.budget is not None model = self._get_model() - self.indices[fold] = ((train_indices, test_indices)) + self.indices[fold] = (train_indices, test_indices) self.Y_targets[fold] = self.Y_train[test_indices] - self.Y_train_targets[train_indices] = self.Y_train.iloc[train_indices] if hasattr( - self.Y_train, 'iloc') else self.Y_train[train_indices], + self.Y_train_targets[train_indices] = ( + self.Y_train.iloc[train_indices] + if hasattr(self.Y_train, "iloc") + else self.Y_train[train_indices], + ) _fit_with_budget( X_train=self.X_train, @@ -952,93 +1087,110 @@ def _partial_fit_and_predict_budget( additional_run_info, ) - def _predict(self, model: BaseEstimator, test_indices: List[int], - train_indices: List[int]) -> Tuple[PIPELINE_DATA_DTYPE, - PIPELINE_DATA_DTYPE, - PIPELINE_DATA_DTYPE, - PIPELINE_DATA_DTYPE]: + def _predict( + self, model: BaseEstimator, test_indices: List[int], train_indices: List[int] + ) -> Tuple[ + PIPELINE_DATA_DTYPE, + PIPELINE_DATA_DTYPE, + PIPELINE_DATA_DTYPE, + PIPELINE_DATA_DTYPE, + ]: train_pred = self.predict_function( - self.X_train.iloc[train_indices] if hasattr( - self.X_train, 'iloc') else self.X_train[train_indices], - model, self.task_type, - self.Y_train.iloc[train_indices] if hasattr( - self.Y_train, 'iloc') else self.Y_train[train_indices] + self.X_train.iloc[train_indices] + if hasattr(self.X_train, "iloc") + else self.X_train[train_indices], + model, + self.task_type, + self.Y_train.iloc[train_indices] + if hasattr(self.Y_train, "iloc") + else self.Y_train[train_indices], ) opt_pred = self.predict_function( - self.X_train.iloc[test_indices] if hasattr( - self.X_train, 'iloc') else self.X_train[test_indices], - model, self.task_type, - self.Y_train.iloc[train_indices] if hasattr( - self.Y_train, 'iloc') else self.Y_train[train_indices] + self.X_train.iloc[test_indices] + if hasattr(self.X_train, "iloc") + else self.X_train[test_indices], + model, + self.task_type, + self.Y_train.iloc[train_indices] + if hasattr(self.Y_train, "iloc") + else self.Y_train[train_indices], ) if self.X_valid is not None: X_valid = self.X_valid.copy() - valid_pred = self.predict_function(X_valid, model, - self.task_type, - self.Y_train[train_indices]) + valid_pred = self.predict_function( + X_valid, model, self.task_type, self.Y_train[train_indices] + ) else: valid_pred = None if self.X_test is not None: X_test = self.X_test.copy() test_pred = self.predict_function( - X_test, model, + X_test, + model, self.task_type, - self.Y_train.iloc[train_indices] if hasattr( - self.Y_train, 'iloc') else self.Y_train[train_indices] + self.Y_train.iloc[train_indices] + if hasattr(self.Y_train, "iloc") + else self.Y_train[train_indices], ) else: test_pred = None return train_pred, 
opt_pred, valid_pred, test_pred - def get_splitter(self, D: AbstractDataManager) -> Union[BaseCrossValidator, _RepeatedSplits, - BaseShuffleSplit]: + def get_splitter( + self, D: AbstractDataManager + ) -> Union[BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit]: if self.resampling_strategy_args is None: self.resampling_strategy_args = {} - if ( - self.resampling_strategy is not None - and not isinstance(self.resampling_strategy, str) + if self.resampling_strategy is not None and not isinstance( + self.resampling_strategy, str ): - if 'groups' not in self.resampling_strategy_args: - self.resampling_strategy_args['groups'] = None + if "groups" not in self.resampling_strategy_args: + self.resampling_strategy_args["groups"] = None - if isinstance(self.resampling_strategy, (BaseCrossValidator, - _RepeatedSplits, - BaseShuffleSplit)): + if isinstance( + self.resampling_strategy, + (BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit), + ): self.check_splitter_resampling_strategy( - X=D.data['X_train'], y=D.data['Y_train'], - groups=self.resampling_strategy_args.get('groups'), - task=D.info['task'], + X=D.data["X_train"], + y=D.data["Y_train"], + groups=self.resampling_strategy_args.get("groups"), + task=D.info["task"], resampling_strategy=self.resampling_strategy, ) return self.resampling_strategy # If it got to this point, we are dealing with a non-supported # re-sampling strategy - raise ValueError("Unsupported resampling strategy {}/{} provided".format( - self.resampling_strategy, - type(self.resampling_strategy), - )) + raise ValueError( + "Unsupported resampling strategy {}/{} provided".format( + self.resampling_strategy, + type(self.resampling_strategy), + ) + ) - y = D.data['Y_train'] - shuffle = self.resampling_strategy_args.get('shuffle', True) + y = D.data["Y_train"] + shuffle = self.resampling_strategy_args.get("shuffle", True) train_size = 0.67 if self.resampling_strategy_args: - train_size_from_user = self.resampling_strategy_args.get('train_size') + train_size_from_user = self.resampling_strategy_args.get("train_size") if train_size_from_user is not None: train_size = float(train_size_from_user) test_size = float("%.4f" % (1 - train_size)) - if D.info['task'] in CLASSIFICATION_TASKS and D.info['task'] != MULTILABEL_CLASSIFICATION: + if ( + D.info["task"] in CLASSIFICATION_TASKS + and D.info["task"] != MULTILABEL_CLASSIFICATION + ): y = y.ravel() - if self.resampling_strategy in ['holdout', - 'holdout-iterative-fit']: + if self.resampling_strategy in ["holdout", "holdout-iterative-fit"]: if shuffle: try: @@ -1050,7 +1202,7 @@ def get_splitter(self, D: AbstractDataManager) -> Union[BaseCrossValidator, _Rep test_cv = copy.deepcopy(cv) next(test_cv.split(y, y)) except ValueError as e: - if 'The least populated class in y has only' in e.args[0]: + if "The least populated class in y has only" in e.args[0]: cv = CustomStratifiedShuffleSplit( n_splits=1, test_size=test_size, @@ -1064,14 +1216,18 @@ def get_splitter(self, D: AbstractDataManager) -> Union[BaseCrossValidator, _Rep test_fold[:tmp_train_size] = -1 cv = PredefinedSplit(test_fold=test_fold) cv.n_splits = 1 # As sklearn is inconsistent here - elif self.resampling_strategy in ['cv', 'cv-iterative-fit', 'partial-cv', - 'partial-cv-iterative-fit']: + elif self.resampling_strategy in [ + "cv", + "cv-iterative-fit", + "partial-cv", + "partial-cv-iterative-fit", + ]: if shuffle: try: with warnings.catch_warnings(): - warnings.simplefilter('error') + warnings.simplefilter("error") cv = StratifiedKFold( - 
n_splits=self.resampling_strategy_args['folds'], + n_splits=self.resampling_strategy_args["folds"], shuffle=shuffle, random_state=1, ) @@ -1079,37 +1235,39 @@ def get_splitter(self, D: AbstractDataManager) -> Union[BaseCrossValidator, _Rep next(test_cv.split(y, y)) except UserWarning as e: print(e) - if 'The least populated class in y has only' in e.args[0]: + if "The least populated class in y has only" in e.args[0]: cv = CustomStratifiedKFold( - n_splits=self.resampling_strategy_args['folds'], + n_splits=self.resampling_strategy_args["folds"], shuffle=shuffle, random_state=1, ) else: raise e else: - cv = KFold(n_splits=self.resampling_strategy_args['folds'], - shuffle=shuffle) + cv = KFold( + n_splits=self.resampling_strategy_args["folds"], shuffle=shuffle + ) else: raise ValueError(self.resampling_strategy) else: - if self.resampling_strategy in ['holdout', - 'holdout-iterative-fit']: + if self.resampling_strategy in ["holdout", "holdout-iterative-fit"]: # TODO shuffle not taken into account for this if shuffle: - cv = ShuffleSplit(n_splits=1, test_size=test_size, - random_state=1) + cv = ShuffleSplit(n_splits=1, test_size=test_size, random_state=1) else: tmp_train_size = int(np.floor(train_size * y.shape[0])) test_fold = np.zeros(y.shape[0]) test_fold[:tmp_train_size] = -1 cv = PredefinedSplit(test_fold=test_fold) cv.n_splits = 1 # As sklearn is inconsistent here - elif self.resampling_strategy in ['cv', 'partial-cv', - 'partial-cv-iterative-fit']: + elif self.resampling_strategy in [ + "cv", + "partial-cv", + "partial-cv-iterative-fit", + ]: random_state = 1 if shuffle else None cv = KFold( - n_splits=self.resampling_strategy_args['folds'], + n_splits=self.resampling_strategy_args["folds"], shuffle=shuffle, random_state=random_state, ) @@ -1124,16 +1282,14 @@ def check_splitter_resampling_strategy( y: np.ndarray, task: int, groups: Any, - resampling_strategy: Union[BaseCrossValidator, _RepeatedSplits, - BaseShuffleSplit], + resampling_strategy: Union[ + BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit + ], ) -> None: if ( task in CLASSIFICATION_TASKS and task != MULTILABEL_CLASSIFICATION - or ( - task in REGRESSION_TASKS - and task != MULTIOUTPUT_REGRESSION - ) + or (task in REGRESSION_TASKS and task != MULTIOUTPUT_REGRESSION) ): y = y.ravel() @@ -1141,12 +1297,14 @@ def check_splitter_resampling_strategy( resampling_strategy.get_n_splits(X=X, y=y, groups=groups) next(resampling_strategy.split(X=X, y=y, groups=groups)) except Exception as e: - raise ValueError("Unsupported resampling strategy " - "{}/{} cause exception: {}".format( - resampling_strategy, - groups, - str(e), - )) + raise ValueError( + "Unsupported resampling strategy " + "{}/{} cause exception: {}".format( + resampling_strategy, + groups, + str(e), + ) + ) # create closure for evaluating an algorithm @@ -1154,7 +1312,9 @@ def eval_holdout( queue: multiprocessing.Queue, config: Union[int, Configuration], backend: Backend, - resampling_strategy: Union[str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit], + resampling_strategy: Union[ + str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit + ], resampling_strategy_args: Dict[str, Optional[Union[float, int, str]]], metric: Scorer, seed: int, @@ -1199,7 +1359,9 @@ def eval_iterative_holdout( queue: multiprocessing.Queue, config: Union[int, Configuration], backend: Backend, - resampling_strategy: Union[str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit], + resampling_strategy: Union[ + str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit + ], 
resampling_strategy_args: Dict[str, Optional[Union[float, int, str]]], metric: Scorer, seed: int, @@ -1236,7 +1398,7 @@ def eval_iterative_holdout( additional_components=additional_components, init_params=init_params, budget=budget, - budget_type=budget_type + budget_type=budget_type, ) @@ -1244,7 +1406,9 @@ def eval_partial_cv( queue: multiprocessing.Queue, config: Union[int, Configuration], backend: Backend, - resampling_strategy: Union[str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit], + resampling_strategy: Union[ + str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit + ], resampling_strategy_args: Dict[str, Optional[Union[float, int, str]]], metric: Scorer, seed: int, @@ -1265,7 +1429,7 @@ def eval_partial_cv( if budget_type is not None: raise NotImplementedError() instance_dict: Dict[str, int] = json.loads(instance) if instance is not None else {} - fold = instance_dict['fold'] + fold = instance_dict["fold"] evaluator = TrainEvaluator( backend=backend, @@ -1295,7 +1459,9 @@ def eval_partial_cv_iterative( queue: multiprocessing.Queue, config: Union[int, Configuration], backend: Backend, - resampling_strategy: Union[str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit], + resampling_strategy: Union[ + str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit + ], resampling_strategy_args: Dict[str, Optional[Union[float, int, str]]], metric: Scorer, seed: int, @@ -1341,7 +1507,9 @@ def eval_cv( queue: multiprocessing.Queue, config: Union[int, Configuration], backend: Backend, - resampling_strategy: Union[str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit], + resampling_strategy: Union[ + str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit + ], resampling_strategy_args: Dict[str, Optional[Union[float, int, str]]], metric: Scorer, seed: int, @@ -1387,7 +1555,9 @@ def eval_iterative_cv( queue: multiprocessing.Queue, config: Union[int, Configuration], backend: Backend, - resampling_strategy: Union[str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit], + resampling_strategy: Union[ + str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit + ], resampling_strategy_args: Dict[str, Optional[Union[float, int, str]]], metric: Scorer, seed: int, diff --git a/autosklearn/evaluation/util.py b/autosklearn/evaluation/util.py index e7483cbd26..c249c8be1c 100644 --- a/autosklearn/evaluation/util.py +++ b/autosklearn/evaluation/util.py @@ -3,14 +3,12 @@ import multiprocessing import queue +__all__ = ["read_queue"] -__all__ = [ - 'read_queue' -] - -def read_queue(queue_: multiprocessing.Queue - ) -> List[Dict[str, Union[str, bool, int, float, List, Dict, Tuple]]]: +def read_queue( + queue_: multiprocessing.Queue, +) -> List[Dict[str, Union[str, bool, int, float, List, Dict, Tuple]]]: stack = [] while True: try: @@ -21,8 +19,8 @@ def read_queue(queue_: multiprocessing.Queue # Check if there is a special placeholder value which tells us that # we don't have to wait until the queue times out in order to # retrieve the final value! 
- if 'final_queue_element' in rval: - del rval['final_queue_element'] + if "final_queue_element" in rval: + del rval["final_queue_element"] do_break = True else: do_break = False @@ -46,12 +44,13 @@ def empty_queue(queue_: multiprocessing.Queue) -> None: queue_.close() -def extract_learning_curve(stack: List[Dict[str, Any]], - key: Optional[str] = None) -> List[float]: +def extract_learning_curve( + stack: List[Dict[str, Any]], key: Optional[str] = None +) -> List[float]: learning_curve = [] for entry in stack: if key: - learning_curve.append(entry['additional_run_info'][key]) + learning_curve.append(entry["additional_run_info"][key]) else: - learning_curve.append(entry['loss']) + learning_curve.append(entry["loss"]) return list(learning_curve) diff --git a/autosklearn/experimental/askl2.py b/autosklearn/experimental/askl2.py index 749dfc6611..7068270a8e 100644 --- a/autosklearn/experimental/askl2.py +++ b/autosklearn/experimental/askl2.py @@ -1,59 +1,65 @@ +from typing import Any, Dict, List, Mapping, Optional, Union + import hashlib import json import os import pathlib import pickle -from typing import Any, Dict, List, Optional, Union, Mapping import dask.distributed -import scipy.sparse - -from ConfigSpace import Configuration import numpy as np import pandas as pd +import scipy.sparse import sklearn +from ConfigSpace import Configuration import autosklearn -from autosklearn.classification import AutoSklearnClassifier import autosklearn.experimental.selector -from autosklearn.metrics import Scorer, balanced_accuracy, roc_auc, log_loss, accuracy +from autosklearn.classification import AutoSklearnClassifier +from autosklearn.metrics import Scorer, accuracy, balanced_accuracy, log_loss, roc_auc metrics = (balanced_accuracy, roc_auc, log_loss) selector_files = {} this_directory = pathlib.Path(__file__).resolve().parent for metric in metrics: - training_data_file = this_directory / metric.name / 'askl2_training_data.json' + training_data_file = this_directory / metric.name / "askl2_training_data.json" with open(training_data_file) as fh: training_data = json.load(fh) fh.seek(0) m = hashlib.md5() - m.update(fh.read().encode('utf8')) + m.update(fh.read().encode("utf8")) training_data_hash = m.hexdigest()[:10] selector_filename = "askl2_selector_%s_%s_%s_%s.pkl" % ( autosklearn.__version__, sklearn.__version__, metric.name, - training_data_hash + training_data_hash, ) - selector_directory = os.environ.get('XDG_CACHE_HOME') + selector_directory = os.environ.get("XDG_CACHE_HOME") if selector_directory is None: selector_directory = pathlib.Path.home() - selector_directory = pathlib.Path(selector_directory).joinpath('auto-sklearn').expanduser() + selector_directory = ( + pathlib.Path(selector_directory).joinpath("auto-sklearn").expanduser() + ) selector_files[metric.name] = selector_directory / selector_filename - metafeatures = pd.DataFrame(training_data['metafeatures']) - strategies = training_data['strategies'] - y_values = pd.DataFrame(training_data['y_values'], columns=strategies, index=metafeatures.index) - minima_for_methods = training_data['minima_for_methods'] - maxima_for_methods = training_data['maxima_for_methods'] - default_strategies = training_data['tie_break_order'] + metafeatures = pd.DataFrame(training_data["metafeatures"]) + strategies = training_data["strategies"] + y_values = pd.DataFrame( + training_data["y_values"], columns=strategies, index=metafeatures.index + ) + minima_for_methods = training_data["minima_for_methods"] + maxima_for_methods = 
training_data["maxima_for_methods"] + default_strategies = training_data["tie_break_order"] if not selector_files[metric.name].exists(): selector = autosklearn.experimental.selector.OVORF( - configuration=training_data['configuration'], + configuration=training_data["configuration"], random_state=np.random.RandomState(1), n_estimators=500, tie_break_order=default_strategies, ) - selector = autosklearn.experimental.selector.FallbackWrapper(selector, default_strategies) + selector = autosklearn.experimental.selector.FallbackWrapper( + selector, default_strategies + ) selector.fit( X=metafeatures, y=y_values, @@ -63,12 +69,14 @@ selector_files[metric.name].parent.mkdir(exist_ok=True, parents=True) try: - with open(selector_files[metric.name], 'wb') as fh: + with open(selector_files[metric.name], "wb") as fh: pickle.dump(selector, fh) except Exception as e: - print("AutoSklearn2Classifier needs to create a selector file under " - "the user's home directory or XDG_CACHE_HOME. Nevertheless " - "the path {} is not writable.".format(selector_files[metric.name])) + print( + "AutoSklearn2Classifier needs to create a selector file under " + "the user's home directory or XDG_CACHE_HOME. Nevertheless " + "the path {} is not writable.".format(selector_files[metric.name]) + ) raise e @@ -87,9 +95,9 @@ def __call__( dask_client, ): from smac.facade.smac_ac_facade import SMAC4AC + from smac.intensification.simple_intensifier import SimpleIntensifier from smac.runhistory.runhistory2epm import RunHistory2EPM4LogCost from smac.scenario.scenario import Scenario - from smac.intensification.simple_intensifier import SimpleIntensifier scenario = Scenario(scenario_dict) @@ -151,7 +159,7 @@ def __call__( pass rh2EPM = RunHistory2EPM4LogCost - ta_kwargs['budget_type'] = self.budget_type + ta_kwargs["budget_type"] = self.budget_type smac4ac = SMAC4AC( scenario=scenario, @@ -163,10 +171,10 @@ def __call__( run_id=seed, intensifier=SuccessiveHalving, intensifier_kwargs={ - 'initial_budget': self.initial_budget, - 'max_budget': 100, - 'eta': self.eta, - 'min_chall': 1, + "initial_budget": self.initial_budget, + "max_budget": 100, + "eta": self.eta, + "min_chall": 1, }, dask_client=dask_client, n_jobs=n_jobs, @@ -178,7 +186,6 @@ def __call__( class AutoSklearn2Classifier(AutoSklearnClassifier): - def __init__( self, time_left_for_this_task: int = 3600, @@ -198,7 +205,7 @@ def __init__( metric: Optional[Scorer] = None, scoring_functions: Optional[List[Scorer]] = None, load_models: bool = True, - dataset_compression: Union[bool, Mapping[str, Any]] = True + dataset_compression: Union[bool, Mapping[str, Any]] = True, ): """ @@ -240,11 +247,11 @@ def __init__( Memory limit in MB for the machine learning algorithm. `auto-sklearn` will stop fitting the machine learning algorithm if it tries to allocate more than ``memory_limit`` MB. - - **Important notes:** - + + **Important notes:** + * If ``None`` is provided, no memory limit is set. - * In case of multi-processing, ``memory_limit`` will be *per job*, so the total usage is + * In case of multi-processing, ``memory_limit`` will be *per job*, so the total usage is ``n_jobs x memory_limit``. * The memory limit also applies to the ensemble creation process. @@ -258,12 +265,12 @@ def __init__( n_jobs : int, optional, experimental The number of jobs to run in parallel for ``fit()``. ``-1`` means - using all processors. - - **Important notes**: - - * By default, Auto-sklearn uses one core. 
- * Ensemble building is not affected by ``n_jobs`` but can be controlled by the number + using all processors. + + **Important notes**: + + * By default, Auto-sklearn uses one core. + * Ensemble building is not affected by ``n_jobs`` but can be controlled by the number of models in the ensemble. * ``predict()`` is not affected by ``n_jobs`` (in contrast to most scikit-learn models) * If ``dask_client`` is ``None``, a new dask client is created. @@ -319,11 +326,18 @@ def __init__( """ # noqa (links are too long) include_estimators = [ - 'extra_trees', 'passive_aggressive', 'random_forest', 'sgd', 'gradient_boosting', 'mlp', + "extra_trees", + "passive_aggressive", + "random_forest", + "sgd", + "gradient_boosting", + "mlp", ] include_preprocessors = ["no_preprocessing"] - include = {'classifier': include_estimators, - 'feature_preprocessor': include_preprocessors} + include = { + "classifier": include_estimators, + "feature_preprocessor": include_preprocessors, + } super().__init__( time_left_for_this_task=time_left_for_this_task, per_run_time_limit=per_run_time_limit, @@ -351,31 +365,40 @@ def __init__( load_models=load_models, ) - def fit(self, X, y, - X_test=None, - y_test=None, - metric=None, - feat_type=None, - dataset_name=None): + def fit( + self, + X, + y, + X_test=None, + y_test=None, + metric=None, + feat_type=None, + dataset_name=None, + ): # TODO - # regularly check https://github.com/scikit-learn/scikit-learn/issues/15336 whether - # histogram gradient boosting in scikit-learn finally support sparse data + # regularly check https://github.com/scikit-learn/scikit-learn/issues/15336 + # whether histogram gradient boosting in scikit-learn finally support + # sparse data is_sparse = scipy.sparse.issparse(X) if is_sparse: include_estimators = [ - 'extra_trees', 'passive_aggressive', 'random_forest', 'sgd', 'mlp', + "extra_trees", + "passive_aggressive", + "random_forest", + "sgd", + "mlp", ] else: include_estimators = [ - 'extra_trees', - 'passive_aggressive', - 'random_forest', - 'sgd', - 'gradient_boosting', - 'mlp', + "extra_trees", + "passive_aggressive", + "random_forest", + "sgd", + "gradient_boosting", + "mlp", ] - self.include['classifier'] = include_estimators + self.include["classifier"] = include_estimators if self.metric is None: if len(y.shape) == 1 or y.shape[1] == 1: @@ -387,71 +410,76 @@ def fit(self, X, y, metric_name = self.metric.name selector_file = selector_files[metric_name] else: - metric_name = 'balanced_accuracy' + metric_name = "balanced_accuracy" selector_file = selector_files[metric_name] - with open(selector_file, 'rb') as fh: + with open(selector_file, "rb") as fh: selector = pickle.load(fh) - metafeatures = pd.DataFrame({dataset_name: [X.shape[1], X.shape[0]]}).transpose() + metafeatures = pd.DataFrame( + {dataset_name: [X.shape[1], X.shape[0]]} + ).transpose() selection = np.argmax(selector.predict(metafeatures)) automl_policy = strategies[selection] setting = { - 'RF_None_holdout_iterative_es_if': { - 'resampling_strategy': 'holdout-iterative-fit', - 'fidelity': None, + "RF_None_holdout_iterative_es_if": { + "resampling_strategy": "holdout-iterative-fit", + "fidelity": None, + }, + "RF_None_3CV_iterative_es_if": { + "resampling_strategy": "cv-iterative-fit", + "folds": 3, + "fidelity": None, }, - 'RF_None_3CV_iterative_es_if': { - 'resampling_strategy': 'cv-iterative-fit', - 'folds': 3, - 'fidelity': None, + "RF_None_5CV_iterative_es_if": { + "resampling_strategy": "cv-iterative-fit", + "folds": 5, + "fidelity": 
None, }, - 'RF_None_5CV_iterative_es_if': { - 'resampling_strategy': 'cv-iterative-fit', - 'folds': 5, - 'fidelity': None, + "RF_None_10CV_iterative_es_if": { + "resampling_strategy": "cv-iterative-fit", + "folds": 10, + "fidelity": None, }, - 'RF_None_10CV_iterative_es_if': { - 'resampling_strategy': 'cv-iterative-fit', - 'folds': 10, - 'fidelity': None, + "RF_SH-eta4-i_holdout_iterative_es_if": { + "resampling_strategy": "holdout-iterative-fit", + "fidelity": "SH", }, - 'RF_SH-eta4-i_holdout_iterative_es_if': { - 'resampling_strategy': 'holdout-iterative-fit', - 'fidelity': 'SH', + "RF_SH-eta4-i_3CV_iterative_es_if": { + "resampling_strategy": "cv-iterative-fit", + "folds": 3, + "fidelity": "SH", }, - 'RF_SH-eta4-i_3CV_iterative_es_if': { - 'resampling_strategy': 'cv-iterative-fit', - 'folds': 3, - 'fidelity': 'SH', + "RF_SH-eta4-i_5CV_iterative_es_if": { + "resampling_strategy": "cv-iterative-fit", + "folds": 5, + "fidelity": "SH", }, - 'RF_SH-eta4-i_5CV_iterative_es_if': { - 'resampling_strategy': 'cv-iterative-fit', - 'folds': 5, - 'fidelity': 'SH', + "RF_SH-eta4-i_10CV_iterative_es_if": { + "resampling_strategy": "cv-iterative-fit", + "folds": 10, + "fidelity": "SH", }, - 'RF_SH-eta4-i_10CV_iterative_es_if': { - 'resampling_strategy': 'cv-iterative-fit', - 'folds': 10, - 'fidelity': 'SH', - } }[automl_policy] - resampling_strategy = setting['resampling_strategy'] - if resampling_strategy == 'cv-iterative-fit': - resampling_strategy_kwargs = {'folds': setting['folds']} + resampling_strategy = setting["resampling_strategy"] + if resampling_strategy == "cv-iterative-fit": + resampling_strategy_kwargs = {"folds": setting["folds"]} else: resampling_strategy_kwargs = None portfolio_file = ( - this_directory / metric_name / 'askl2_portfolios' / ('%s.json' % automl_policy) + this_directory + / metric_name + / "askl2_portfolios" + / ("%s.json" % automl_policy) ) with open(portfolio_file) as fh: portfolio_json = json.load(fh) - portfolio = portfolio_json['portfolio'] + portfolio = portfolio_json["portfolio"] - if setting['fidelity'] == 'SH': - smac_callback = SHObjectCallback('iterations', 4, 5.0, portfolio) + if setting["fidelity"] == "SH": + smac_callback = SHObjectCallback("iterations", 4, 5.0, portfolio) else: smac_callback = SmacObjectCallback(portfolio) diff --git a/autosklearn/experimental/selector.py b/autosklearn/experimental/selector.py index 3ef681bef8..125cba6125 100644 --- a/autosklearn/experimental/selector.py +++ b/autosklearn/experimental/selector.py @@ -1,6 +1,7 @@ +import typing + import copy import itertools -import typing import numpy as np import pandas as pd @@ -9,7 +10,6 @@ class AbstractSelector: - def fit( self, X: pd.DataFrame, @@ -19,18 +19,22 @@ def fit( ) -> None: raise NotImplementedError() - def predict(self, X: pd.DataFrame, y: typing.Optional[pd.DataFrame] = None) -> pd.DataFrame: + def predict( + self, X: pd.DataFrame, y: typing.Optional[pd.DataFrame] = None + ) -> pd.DataFrame: prediction = self._predict(X, y) for col, series in prediction.iteritems(): assert series.dtype == float, (col, series) np.testing.assert_array_almost_equal( - prediction.sum(axis='columns').to_numpy(), + prediction.sum(axis="columns").to_numpy(), np.ones(X.shape[0]), err_msg=prediction.to_csv(), ) return prediction - def _predict(self, X: pd.DataFrame, y: typing.Optional[pd.DataFrame]) -> pd.DataFrame: + def _predict( + self, X: pd.DataFrame, y: typing.Optional[pd.DataFrame] + ) -> pd.DataFrame: raise NotImplementedError() @@ -68,43 +72,60 @@ def fit( weights[i] = dict() for j in 
range(i + 1, len(target_indices)): - if self.configuration['normalization'] in ('all', 'binary', 'y', 'all1', - 'binary1'): + if self.configuration["normalization"] in ( + "all", + "binary", + "y", + "all1", + "binary1", + ): minimum2 = np.ones(len(X)) * np.inf maximum2 = np.zeros(len(X)) - if self.configuration['normalization'] in ('all', 'all1'): + if self.configuration["normalization"] in ("all", "all1"): for idx, task_id in enumerate(X.index): for method_id in range(len(target_indices)): - minimum2[idx] = np.nanmin(( - minimum2[idx], - minima[task_id][self.strategies_[method_id]] - )) - maximum2[idx] = np.nanmax(( - maximum2[idx], - maxima[task_id][self.strategies_[method_id]] - )) - if self.configuration['normalization'] == 'all1': + minimum2[idx] = np.nanmin( + ( + minimum2[idx], + minima[task_id][self.strategies_[method_id]], + ) + ) + maximum2[idx] = np.nanmax( + ( + maximum2[idx], + maxima[task_id][self.strategies_[method_id]], + ) + ) + if self.configuration["normalization"] == "all1": maximum2 = np.ones_like(maximum2) - elif self.configuration['normalization'] in ('binary', 'binary1'): + elif self.configuration["normalization"] in ("binary", "binary1"): for idx, task_id in enumerate(X.index): for method_id in (i, j): - minimum2[idx] = np.nanmin(( - minimum2[idx], - minima[task_id][self.strategies_[method_id]] - )) - maximum2[idx] = np.nanmax(( - maximum2[idx], - maxima[task_id][self.strategies_[method_id]] - )) - if self.configuration['normalization'] == 'binary1': + minimum2[idx] = np.nanmin( + ( + minimum2[idx], + minima[task_id][self.strategies_[method_id]], + ) + ) + maximum2[idx] = np.nanmax( + ( + maximum2[idx], + maxima[task_id][self.strategies_[method_id]], + ) + ) + if self.configuration["normalization"] == "binary1": maximum2 = np.ones_like(maximum2) - elif self.configuration['normalization'] == 'y': + elif self.configuration["normalization"] == "y": for idx, task_id in enumerate(X.index): - minimum2[idx] = np.nanmin((minimum2[idx], y_pd.loc[task_id].min())) - maximum2[idx] = np.nanmax((maximum2[idx], y_pd.loc[task_id].max())) + minimum2[idx] = np.nanmin( + (minimum2[idx], y_pd.loc[task_id].min()) + ) + maximum2[idx] = np.nanmax( + (maximum2[idx], y_pd.loc[task_id].max()) + ) else: - raise ValueError(self.configuration['normalization']) + raise ValueError(self.configuration["normalization"]) y_i_j = y[:, i] < y[:, j] mask = np.isfinite(y[:, i]) & np.isfinite(y[:, j]) @@ -121,7 +142,7 @@ def fit( weights_i_j = np.abs(normalized_y_i - normalized_y_j) - elif self.configuration['normalization'] == 'rank': + elif self.configuration["normalization"] == "rank": y_i_j = y[:, i] < y[:, j] mask = np.isfinite(y[:, i]) & np.isfinite(y[:, j]) X_ = X.to_numpy()[mask] @@ -129,7 +150,7 @@ def fit( ranks = scipy.stats.rankdata(y[mask], axis=1) weights_i_j = np.abs(ranks[:, i] - ranks[:, j]) - elif self.configuration['normalization'] == 'None': + elif self.configuration["normalization"] == "None": y_i_j = y[:, i] < y[:, j] mask = np.isfinite(y[:, i]) & np.isfinite(y[:, j]) X_ = X.to_numpy()[mask] @@ -137,7 +158,7 @@ def fit( weights_i_j = np.ones_like(y_i_j).astype(int) else: - raise ValueError(self.configuration['normalization']) + raise ValueError(self.configuration["normalization"]) if len(y_i_j) == 0: models[i][j] = None @@ -148,21 +169,25 @@ def fit( n_zeros = int(np.ceil(len(y_i_j) / 2)) n_ones = int(np.floor(len(y_i_j) / 2)) import sklearn.dummy - base_model = sklearn.dummy.DummyClassifier(strategy='constant', - constant=y_i_j[0]) + + base_model = sklearn.dummy.DummyClassifier( + 
strategy="constant", constant=y_i_j[0] + ) base_model.fit( X_, np.array(([[0]] * n_zeros) + ([[1]] * n_ones)).flatten(), sample_weight=weights_i_j, ) else: - if self.configuration.get('max_depth') == 0: + if self.configuration.get("max_depth") == 0: import sklearn.dummy + loss_i = np.sum((y_i_j == 0) * weights_i_j) loss_j = np.sum((y_i_j == 1) * weights_i_j) base_model = sklearn.dummy.DummyClassifier( - strategy='constant', constant=1 if loss_i < loss_j else 0, + strategy="constant", + constant=1 if loss_i < loss_j else 0, ) base_model.fit( X_, @@ -171,7 +196,11 @@ def fit( ) else: base_model = self.fit_pairwise_model( - X_, y_i_j, weights_i_j, self.rng, self.configuration, + X_, + y_i_j, + weights_i_j, + self.rng, + self.configuration, ) models[i][j] = base_model weights[i][j] = weights_i_j @@ -179,7 +208,9 @@ def fit( self.weights_ = weights self.target_indices = target_indices - def _predict(self, X: pd.DataFrame, y: typing.Optional[pd.DataFrame]) -> pd.DataFrame: + def _predict( + self, X: pd.DataFrame, y: typing.Optional[pd.DataFrame] + ) -> pd.DataFrame: if y is not None: raise ValueError("y must not be provided") @@ -193,7 +224,9 @@ def _predict(self, X: pd.DataFrame, y: typing.Optional[pd.DataFrame]) -> pd.Data raw_probas[(i, j)] = self.models[i][j].predict_proba(X) if len(raw_predictions) == 0: - predictions = pd.DataFrame(0, index=X.index, columns=self.strategies_).astype(float) + predictions = pd.DataFrame( + 0, index=X.index, columns=self.strategies_ + ).astype(float) predictions.iloc[:, self.single_strategy_idx] = 1.0 return predictions @@ -203,21 +236,21 @@ def _predict(self, X: pd.DataFrame, y: typing.Optional[pd.DataFrame]) -> pd.Data for i in range(len(self.target_indices)): for j in range(i + 1, len(self.target_indices)): if (i, j) in raw_predictions: - if self.configuration['prediction'] == 'soft': + if self.configuration["prediction"] == "soft": if raw_probas[(i, j)].shape[1] == 1: proba = raw_probas[(i, j)][x_idx][0] else: proba = raw_probas[(i, j)][x_idx][1] wins[i] += proba wins[j] += 1 - proba - elif self.configuration['prediction'] == 'hard': + elif self.configuration["prediction"] == "hard": prediction = raw_predictions[(i, j)][x_idx] if prediction == 1: wins[i] += 1 else: wins[j] += 1 else: - raise ValueError(self.configuration['prediction']) + raise ValueError(self.configuration["prediction"]) n_prev = np.inf # Tie breaking @@ -236,7 +269,9 @@ def _predict(self, X: pd.DataFrame, y: typing.Optional[pd.DataFrame]) -> pd.Data hit = True break if not hit: - wins[int(self.rng.choice(np.argwhere(most_wins_mask).flatten()))] += 1 + wins[ + int(self.rng.choice(np.argwhere(most_wins_mask).flatten())) + ] += 1 elif np.sum(most_wins_mask) > 1: n_prev = np.sum(most_wins_mask) where = np.argwhere(most_wins_mask).flatten() @@ -250,10 +285,9 @@ def _predict(self, X: pd.DataFrame, y: typing.Optional[pd.DataFrame]) -> pd.Data else: method_i = self.strategies_[i] method_j = self.strategies_[j] - if ( - self.tie_break_order.index(method_i) - < self.tie_break_order.index(method_j) - ): + if self.tie_break_order.index( + method_i + ) < self.tie_break_order.index(method_j): wins[i] += 1 else: wins[j] += 1 @@ -288,18 +322,17 @@ def fit_pairwise_model(self, X, y, weights, rng, configuration): base_model = sklearn.ensemble.RandomForestClassifier( random_state=rng, n_estimators=self.n_estimators, - bootstrap=True if configuration['bootstrap'] == 'True' else False, - min_samples_split=configuration['min_samples_split'], - min_samples_leaf=configuration['min_samples_leaf'], - 
max_features=int(configuration['max_features']), - max_depth=configuration['max_depth'], + bootstrap=True if configuration["bootstrap"] == "True" else False, + min_samples_split=configuration["min_samples_split"], + min_samples_leaf=configuration["min_samples_leaf"], + max_features=int(configuration["max_features"]), + max_depth=configuration["max_depth"], ) base_model.fit(X, y, sample_weight=weights) return base_model class FallbackWrapper(AbstractSelector): - def __init__(self, selector, default_strategies: typing.List[str]): self.selector = selector self.default_strategies = default_strategies @@ -313,16 +346,19 @@ def fit( ) -> None: self.X_ = X self.strategies_ = y.columns - self.rval_ = np.array([ - ( - len(self.strategies_) - self.default_strategies.index(strategy) - 1 - ) / (len(self.strategies_) - 1) - for strategy in self.strategies_ - ]) + self.rval_ = np.array( + [ + (len(self.strategies_) - self.default_strategies.index(strategy) - 1) + / (len(self.strategies_) - 1) + for strategy in self.strategies_ + ] + ) self.rval_ = self.rval_ / np.sum(self.rval_) self.selector.fit(X, y, minima, maxima) - def _predict(self, X: pd.DataFrame, y: typing.Optional[pd.DataFrame]) -> pd.DataFrame: + def _predict( + self, X: pd.DataFrame, y: typing.Optional[pd.DataFrame] + ) -> pd.DataFrame: if y is not None: prediction = self.selector.predict(X, y) @@ -338,8 +374,11 @@ def _predict(self, X: pd.DataFrame, y: typing.Optional[pd.DataFrame]) -> pd.Data counter += 1 if counter == 0: - prediction.loc[task_id] = pd.Series({ - strategy: value for strategy, value in zip(self.strategies_, self.rval_) - }) + prediction.loc[task_id] = pd.Series( + { + strategy: value + for strategy, value in zip(self.strategies_, self.rval_) + } + ) return prediction diff --git a/autosklearn/metalearning/__init__.py b/autosklearn/metalearning/__init__.py index cc3cd7becd..e298f0f075 100644 --- a/autosklearn/metalearning/__init__.py +++ b/autosklearn/metalearning/__init__.py @@ -1,2 +1,2 @@ # -*- encoding: utf-8 -*- -__author__ = 'feurerm' +__author__ = "feurerm" diff --git a/autosklearn/metalearning/input/aslib_simple.py b/autosklearn/metalearning/input/aslib_simple.py index 7bac637c50..833242729d 100644 --- a/autosklearn/metalearning/input/aslib_simple.py +++ b/autosklearn/metalearning/input/aslib_simple.py @@ -1,7 +1,7 @@ -from collections import defaultdict, OrderedDict import csv import logging import os +from collections import OrderedDict, defaultdict import arff import pandas as pd @@ -24,7 +24,7 @@ def __init__(self, directory): # "feature_runstatus.arff": self._read_feature_runstatus, # "ground_truth.arff": self._read_ground_truth, # "cv.arff": self._read_cv, - "configurations.csv": self._read_configurations + "configurations.csv": self._read_configurations, } self.found_files = [] @@ -33,24 +33,28 @@ def __init__(self, directory): self._read_files() def _find_files(self): - ''' - find all expected files in self.dir_ - fills self.found_files - ''' + """ + find all expected files in self.dir_ + fills self.found_files + """ expected = [ # "description.txt", "algorithm_runs.arff", "feature_values.arff", # "feature_runstatus.arff", ] - optional = ["ground_truth.arff", "feature_costs.arff", "citation.bib", - "cv.arff", "configurations.csv"] + optional = [ + "ground_truth.arff", + "feature_costs.arff", + "citation.bib", + "cv.arff", + "configurations.csv", + ] for expected_file in expected: full_path = os.path.join(self.dir_, expected_file) if not os.path.isfile(full_path): - self.logger.error( - "Not found: %s (has to 
be added)" % (full_path)) + self.logger.error("Not found: %s (has to be added)" % (full_path)) else: self.found_files.append(full_path) @@ -64,10 +68,10 @@ def _find_files(self): self.found_files.append(full_path) def _read_files(self): - ''' - iterates over all found files (self.found_files) and - calls the corresponding function to validate file - ''' + """ + iterates over all found files (self.found_files) and + calls the corresponding function to validate file + """ for file_ in self.found_files: read_func = self.read_funcs.get(os.path.basename(file_)) if read_func: @@ -79,15 +83,18 @@ def _read_algorithm_runs(self, filename): if arff_dict["attributes"][0][0].upper() != "INSTANCE_ID": self.logger.error( - "instance_id as first attribute is missing in %s" % (filename)) + "instance_id as first attribute is missing in %s" % (filename) + ) if arff_dict["attributes"][1][0].upper() != "REPETITION": self.logger.error( - "repetition as second attribute is missing in %s" % (filename)) + "repetition as second attribute is missing in %s" % (filename) + ) if arff_dict["attributes"][2][0].upper() != "ALGORITHM": self.logger.error( - "algorithm as third attribute is missing in %s" % (filename)) + "algorithm as third attribute is missing in %s" % (filename) + ) - performance_measures = [pm[0] for pm in arff_dict['attributes'][3:-1]] + performance_measures = [pm[0] for pm in arff_dict["attributes"][3:-1]] measure_instance_algorithm_triples = defaultdict(lambda: defaultdict(dict)) for data in arff_dict["data"]: @@ -97,18 +104,20 @@ def _read_algorithm_runs(self, filename): perf_list = data[3:-1] status = data[-1] - if status != 'ok': + if status != "ok": continue for i, performance_measure in enumerate(performance_measures): - measure_instance_algorithm_triples[performance_measure][ - inst_name][algorithm] = perf_list[i] + measure_instance_algorithm_triples[performance_measure][inst_name][ + algorithm + ] = perf_list[i] # TODO: this does not support any repetitions! 
measure_algorithm_matrices = OrderedDict() for pm in performance_measures: measure_algorithm_matrices[pm] = pd.DataFrame( - measure_instance_algorithm_triples[pm]).transpose() + measure_instance_algorithm_triples[pm] + ).transpose() self.algorithm_runs = measure_algorithm_matrices @@ -122,9 +131,10 @@ def _read_feature_values(self, filename): # repetition = data[1] features = data[2:] - metafeatures[inst_name] = {feature[0]: feature_value - for feature, feature_value in - zip(arff_dict['attributes'][2:], features)} + metafeatures[inst_name] = { + feature[0]: feature_value + for feature, feature_value in zip(arff_dict["attributes"][2:], features) + } self.metafeatures = pd.DataFrame(metafeatures).transpose() @@ -135,9 +145,9 @@ def _read_configurations(self, filename): configurations = dict() for line in csv_reader: configuration = dict() - algorithm_id = line['idx'] + algorithm_id = line["idx"] for hp_name, value in line.items(): - if not value or hp_name == 'idx': + if not value or hp_name == "idx": continue try: diff --git a/autosklearn/metalearning/metafeatures/metafeature.py b/autosklearn/metalearning/metafeatures/metafeature.py index 821a5033f4..033b76116b 100644 --- a/autosklearn/metalearning/metafeatures/metafeature.py +++ b/autosklearn/metalearning/metafeatures/metafeature.py @@ -1,6 +1,7 @@ from abc import ABCMeta, abstractmethod -from io import StringIO + import time +from io import StringIO import arff import scipy.sparse @@ -33,8 +34,15 @@ def __call__(self, X, y, logger, categorical=None): comment = "Memory Error" endtime = time.time() - return MetaFeatureValue(self.__class__.__name__, self.type_, - 0, 0, value, endtime-starttime, comment=comment) + return MetaFeatureValue( + self.__class__.__name__, + self.type_, + 0, + 0, + value, + endtime - starttime, + comment=comment, + ) class MetaFeature(AbstractMetaFeature): @@ -65,15 +73,26 @@ def to_arff_row(self): else: value = "?" 
- return [self.name, self.type_, self.fold, - self.repeat, value, self.time, self.comment] + return [ + self.name, + self.type_, + self.fold, + self.repeat, + value, + self.time, + self.comment, + ] def __repr__(self): - repr = "%s (type: %s, fold: %d, repeat: %d, value: %s, time: %3.3f, " \ - "comment: %s)" - repr = repr % tuple(self.to_arff_row()[:4] + - [str(self.to_arff_row()[4])] + - self.to_arff_row()[5:]) + repr = ( + "%s (type: %s, fold: %d, repeat: %d, value: %s, time: %3.3f, " + "comment: %s)" + ) + repr = repr % tuple( + self.to_arff_row()[:4] + + [str(self.to_arff_row()[4])] + + self.to_arff_row()[5:] + ) return repr @@ -84,19 +103,21 @@ def __init__(self, dataset_name, metafeature_values): def _get_arff(self): output = dict() - output['relation'] = "metafeatures_%s" % (self.dataset_name) - output['description'] = "" - output['attributes'] = [('name', 'STRING'), - ('type', 'STRING'), - ('fold', 'NUMERIC'), - ('repeat', 'NUMERIC'), - ('value', 'NUMERIC'), - ('time', 'NUMERIC'), - ('comment', 'STRING')] - output['data'] = [] + output["relation"] = "metafeatures_%s" % (self.dataset_name) + output["description"] = "" + output["attributes"] = [ + ("name", "STRING"), + ("type", "STRING"), + ("fold", "NUMERIC"), + ("repeat", "NUMERIC"), + ("value", "NUMERIC"), + ("time", "NUMERIC"), + ("comment", "STRING"), + ] + output["data"] = [] for key in sorted(self.metafeature_values): - output['data'].append(self.metafeature_values[key].to_arff_row()) + output["data"].append(self.metafeature_values[key].to_arff_row()) return output def dumps(self): @@ -120,9 +141,9 @@ def load(cls, path_or_filehandle): else: input = arff.load(path_or_filehandle) - dataset_name = input['relation'].replace('metafeatures_', '') + dataset_name = input["relation"].replace("metafeatures_", "") metafeature_values = [] - for item in input['data']: + for item in input["data"]: mf = MetaFeatureValue(*item) metafeature_values.append(mf) @@ -135,13 +156,18 @@ def __repr__(self, verbosity=0): if verbosity == 0 and self.metafeature_values[name].type_ != "METAFEATURE": continue if verbosity == 0: - repr.write(" %s: %s\n" % - (str(name), str(self.metafeature_values[name].value))) + repr.write( + " %s: %s\n" % (str(name), str(self.metafeature_values[name].value)) + ) elif verbosity >= 1: - repr.write(" %s: %10s (%10fs)\n" % - (str(name), str(self.metafeature_values[ - name].value)[:10], - self.metafeature_values[name].time)) + repr.write( + " %s: %10s (%10fs)\n" + % ( + str(name), + str(self.metafeature_values[name].value)[:10], + self.metafeature_values[name].time, + ) + ) # Add the reason for a crash if one happened! if verbosity > 1 and self.metafeature_values[name].comment: diff --git a/autosklearn/metalearning/metafeatures/metafeatures.py b/autosklearn/metalearning/metafeatures/metafeatures.py index 9652cfc673..3c95fbf22f 100644 --- a/autosklearn/metalearning/metafeatures/metafeatures.py +++ b/autosklearn/metalearning/metafeatures/metafeatures.py @@ -1,22 +1,22 @@ -from collections import defaultdict, OrderedDict, deque import copy +from collections import OrderedDict, defaultdict, deque import numpy as np - import pandas as pd - +import scipy.sparse import scipy.stats from scipy.linalg import LinAlgError -import scipy.sparse # TODO use balanced accuracy! 
from sklearn.multiclass import OneVsRestClassifier from sklearn.utils import check_array from sklearn.utils.multiclass import type_of_target -from autosklearn.pipeline.components.data_preprocessing.feature_type \ - import FeatTypeSplit -from .metafeature import MetaFeature, HelperFunction, DatasetMetafeatures +from autosklearn.pipeline.components.data_preprocessing.feature_type import ( + FeatTypeSplit, +) + +from .metafeature import DatasetMetafeatures, HelperFunction, MetaFeature # TODO Allow multiple dependencies for a metafeature @@ -62,10 +62,12 @@ def define(self, name): """Decorator for adding helper functions to a "dictionary". This behaves like a function decorating a function, not a class decorating a function""" + def wrapper(metafeature_class): instance = metafeature_class() self.__setitem__(name, instance) return instance + return wrapper @@ -107,19 +109,20 @@ def is_calculated(self, key): return key in self.values def get_dependency(self, name): - """Return the dependency of metafeature "name". - """ + """Return the dependency of metafeature "name".""" return self.dependencies.get(name) def define(self, name, dependency=None): """Decorator for adding metafeature functions to a "dictionary" of metafeatures. This behaves like a function decorating a function, not a class decorating a function""" + def wrapper(metafeature_class): instance = metafeature_class() self.__setitem__(name, instance) self.dependencies[name] = dependency return instance + return wrapper @@ -136,8 +139,7 @@ def _calculate(self, X, y, logger, categorical): return float(X.shape[0]) -@metafeatures.define("LogNumberOfInstances", - dependency="NumberOfInstances") +@metafeatures.define("LogNumberOfInstances", dependency="NumberOfInstances") class LogNumberOfInstances(MetaFeature): def _calculate(self, X, y, logger, categorical): return np.log(metafeatures.get_value("NumberOfInstances")) @@ -151,8 +153,9 @@ class NumberOfClasses(MetaFeature): Calls np.unique on the targets. If the dataset is a multilabel dataset, does this for each label seperately and returns the mean. 
""" + def _calculate(self, X, y, logger, categorical): - if type_of_target(y) == 'multilabel-indicator': + if type_of_target(y) == "multilabel-indicator": # We have a label binary indicator array: # each sample is one row of a 2d array of shape (n_samples, n_classes) return y.shape[1] @@ -168,8 +171,7 @@ def _calculate(self, X, y, logger, categorical): return float(X.shape[1]) -@metafeatures.define("LogNumberOfFeatures", - dependency="NumberOfFeatures") +@metafeatures.define("LogNumberOfFeatures", dependency="NumberOfFeatures") class LogNumberOfFeatures(MetaFeature): def _calculate(self, X, y, logger, categorical): return np.log(metafeatures.get_value("NumberOfFeatures")) @@ -183,13 +185,11 @@ def _calculate(self, X, y, logger, categorical): def _calculate_sparse(self, X, y, logger, categorical): data = [True if not np.isfinite(x) else False for x in X.data] - missing = X.__class__((data, X.indices, X.indptr), shape=X.shape, - dtype=bool) + missing = X.__class__((data, X.indices, X.indptr), shape=X.shape, dtype=bool) return missing -@metafeatures.define("NumberOfInstancesWithMissingValues", - dependency="MissingValues") +@metafeatures.define("NumberOfInstancesWithMissingValues", dependency="MissingValues") class NumberOfInstancesWithMissingValues(MetaFeature): def _calculate(self, X, y, logger, categorical): missing = helper_functions.get_value("MissingValues") @@ -199,14 +199,18 @@ def _calculate(self, X, y, logger, categorical): def _calculate_sparse(self, X, y, logger, categorical): missing = helper_functions.get_value("MissingValues") new_missing = missing.tocsr() - num_missing = [np.sum(new_missing.data[new_missing.indptr[i]:new_missing.indptr[i + 1]]) - for i in range(new_missing.shape[0])] + num_missing = [ + np.sum(new_missing.data[new_missing.indptr[i] : new_missing.indptr[i + 1]]) + for i in range(new_missing.shape[0]) + ] return float(np.sum([1 if num > 0 else 0 for num in num_missing])) -@metafeatures.define("PercentageOfInstancesWithMissingValues", - dependency="NumberOfInstancesWithMissingValues") +@metafeatures.define( + "PercentageOfInstancesWithMissingValues", + dependency="NumberOfInstancesWithMissingValues", +) class PercentageOfInstancesWithMissingValues(MetaFeature): def _calculate(self, X, y, logger, categorical): n_missing = metafeatures.get_value("NumberOfInstancesWithMissingValues") @@ -214,8 +218,7 @@ def _calculate(self, X, y, logger, categorical): return float(n_missing / n_total) -@metafeatures.define("NumberOfFeaturesWithMissingValues", - dependency="MissingValues") +@metafeatures.define("NumberOfFeaturesWithMissingValues", dependency="MissingValues") class NumberOfFeaturesWithMissingValues(MetaFeature): def _calculate(self, X, y, logger, categorical): missing = helper_functions.get_value("MissingValues") @@ -225,15 +228,18 @@ def _calculate(self, X, y, logger, categorical): def _calculate_sparse(self, X, y, logger, categorical): missing = helper_functions.get_value("MissingValues") new_missing = missing.tocsc() - num_missing = [np.sum( - new_missing.data[new_missing.indptr[i]:new_missing.indptr[i+1]]) - for i in range(missing.shape[1])] + num_missing = [ + np.sum(new_missing.data[new_missing.indptr[i] : new_missing.indptr[i + 1]]) + for i in range(missing.shape[1]) + ] return float(np.sum([1 if num > 0 else 0 for num in num_missing])) -@metafeatures.define("PercentageOfFeaturesWithMissingValues", - dependency="NumberOfFeaturesWithMissingValues") +@metafeatures.define( + "PercentageOfFeaturesWithMissingValues", + dependency="NumberOfFeaturesWithMissingValues", 
+) class PercentageOfFeaturesWithMissingValues(MetaFeature): def _calculate(self, X, y, logger, categorical): n_missing = metafeatures.get_value("NumberOfFeaturesWithMissingValues") @@ -250,12 +256,12 @@ def _calculate(self, X, y, logger, categorical): return float(np.count_nonzero(helper_functions.get_value("MissingValues"))) -@metafeatures.define("PercentageOfMissingValues", - dependency="NumberOfMissingValues") +@metafeatures.define("PercentageOfMissingValues", dependency="NumberOfMissingValues") class PercentageOfMissingValues(MetaFeature): def _calculate(self, X, y, logger, categorical): - return float(metafeatures.get_value("NumberOfMissingValues")) / \ - float(X.shape[0]*X.shape[1]) + return float(metafeatures.get_value("NumberOfMissingValues")) / float( + X.shape[0] * X.shape[1] + ) # TODO: generalize this! @@ -274,24 +280,28 @@ def _calculate(self, X, y, logger, categorical): @metafeatures.define("RatioNumericalToNominal") class RatioNumericalToNominal(MetaFeature): def _calculate(self, X, y, logger, categorical): - num_categorical = float(metafeatures[ - "NumberOfCategoricalFeatures"](X, y, logger, categorical).value) - num_numerical = float(metafeatures[ - "NumberOfNumericFeatures"](X, y, logger, categorical).value) + num_categorical = float( + metafeatures["NumberOfCategoricalFeatures"](X, y, logger, categorical).value + ) + num_numerical = float( + metafeatures["NumberOfNumericFeatures"](X, y, logger, categorical).value + ) if num_categorical == 0.0: - return 0. + return 0.0 return num_numerical / num_categorical @metafeatures.define("RatioNominalToNumerical") class RatioNominalToNumerical(MetaFeature): def _calculate(self, X, y, logger, categorical): - num_categorical = float(metafeatures[ - "NumberOfCategoricalFeatures"](X, y, logger, categorical).value) - num_numerical = float(metafeatures[ - "NumberOfNumericFeatures"](X, y, logger, categorical).value) + num_categorical = float( + metafeatures["NumberOfCategoricalFeatures"](X, y, logger, categorical).value + ) + num_numerical = float( + metafeatures["NumberOfNumericFeatures"](X, y, logger, categorical).value + ) if num_numerical == 0.0: - return 0. 
+ return 0.0 else: return num_categorical / num_numerical @@ -300,8 +310,9 @@ def _calculate(self, X, y, logger, categorical): @metafeatures.define("DatasetRatio") class DatasetRatio(MetaFeature): def _calculate(self, X, y, logger, categorical): - return float(metafeatures["NumberOfFeatures"](X, y, logger).value) /\ - float(metafeatures["NumberOfInstances"](X, y, logger).value) + return float(metafeatures["NumberOfFeatures"](X, y, logger).value) / float( + metafeatures["NumberOfInstances"](X, y, logger).value + ) @metafeatures.define("LogDatasetRatio", dependency="DatasetRatio") @@ -313,12 +324,12 @@ def _calculate(self, X, y, logger, categorical): @metafeatures.define("InverseDatasetRatio") class InverseDatasetRatio(MetaFeature): def _calculate(self, X, y, logger, categorical): - return float(metafeatures["NumberOfInstances"](X, y, logger).value) /\ - float(metafeatures["NumberOfFeatures"](X, y, logger).value) + return float(metafeatures["NumberOfInstances"](X, y, logger).value) / float( + metafeatures["NumberOfFeatures"](X, y, logger).value + ) -@metafeatures.define("LogInverseDatasetRatio", - dependency="InverseDatasetRatio") +@metafeatures.define("LogInverseDatasetRatio", dependency="InverseDatasetRatio") class LogInverseDatasetRatio(MetaFeature): def _calculate(self, X, y, logger, categorical): return np.log(metafeatures.get_value("InverseDatasetRatio")) @@ -385,12 +396,13 @@ def _calculate(self, X, y, logger, categorical): occurences = [] for i in range(y.shape[1]): occurences.extend( - [occurrence for occurrence in occurence_dict[ - i].values()]) + [occurrence for occurrence in occurence_dict[i].values()] + ) occurences = np.array(occurences) else: - occurences = np.array([occurrence for occurrence in occurence_dict.values()], - dtype=np.float64) + occurences = np.array( + [occurrence for occurrence in occurence_dict.values()], dtype=np.float64 + ) return (occurences / y.shape[0]).mean() @@ -403,15 +415,16 @@ def _calculate(self, X, y, logger, categorical): stds = [] for i in range(y.shape[1]): std = np.array( - [occurrence for occurrence in occurence_dict[ - i].values()], - dtype=np.float64) + [occurrence for occurrence in occurence_dict[i].values()], + dtype=np.float64, + ) std = (std / y.shape[0]).std() stds.append(std) return np.mean(stds) else: - occurences = np.array([occurrence for occurrence in occurence_dict.values()], - dtype=np.float64) + occurences = np.array( + [occurrence for occurrence in occurence_dict.values()], dtype=np.float64 + ) return (occurences / y.shape[0]).std() @@ -424,10 +437,11 @@ class NumSymbols(HelperFunction): def _calculate(self, X, y, logger, categorical): symbols_per_column = [] for i in range(X.shape[1]): - if categorical[X.columns[i] if hasattr(X, 'columns') else i]: - column = X.iloc[:, i] if hasattr(X, 'iloc') else X[:, i] - unique_values = column.unique() if hasattr( - column, 'unique') else np.unique(column) + if categorical[X.columns[i] if hasattr(X, "columns") else i]: + column = X.iloc[:, i] if hasattr(X, "iloc") else X[:, i] + unique_values = ( + column.unique() if hasattr(column, "unique") else np.unique(column) + ) num_unique = np.sum(pd.notna(unique_values)) symbols_per_column.append(num_unique) return symbols_per_column @@ -436,7 +450,7 @@ def _calculate_sparse(self, X, y, logger, categorical): symbols_per_column = [] new_X = X.tocsc() for i in range(new_X.shape[1]): - if categorical[X.columns[i] if hasattr(X, 'columns') else i]: + if categorical[X.columns[i] if hasattr(X, "columns") else i]: unique_values = 
np.unique(new_X.getcol(i).data) num_unique = np.sum(np.isfinite(unique_values)) symbols_per_column.append(num_unique) @@ -489,6 +503,7 @@ def _calculate(self, X, y, logger, categorical): sum = np.nansum(helper_functions.get_value("NumSymbols")) return sum if np.isfinite(sum) else 0 + ################################################################################ # Statistical meta features # Only use third and fourth statistical moment because it is common to @@ -502,19 +517,21 @@ class Kurtosisses(HelperFunction): def _calculate(self, X, y, logger, categorical): kurts = [] for i in range(X.shape[1]): - if not categorical[X.columns[i] if hasattr(X, 'columns') else i]: - kurts.append(scipy.stats.kurtosis( - X.iloc[:, i] if hasattr(X, 'iloc') else X[:, i] - )) + if not categorical[X.columns[i] if hasattr(X, "columns") else i]: + kurts.append( + scipy.stats.kurtosis( + X.iloc[:, i] if hasattr(X, "iloc") else X[:, i] + ) + ) return kurts def _calculate_sparse(self, X, y, logger, categorical): kurts = [] X_new = X.tocsc() for i in range(X_new.shape[1]): - if not categorical[X.columns[i] if hasattr(X, 'columns') else i]: + if not categorical[X.columns[i] if hasattr(X, "columns") else i]: start = X_new.indptr[i] - stop = X_new.indptr[i+1] + stop = X_new.indptr[i + 1] kurts.append(scipy.stats.kurtosis(X_new.data[start:stop])) return kurts @@ -556,17 +573,17 @@ class Skewnesses(HelperFunction): def _calculate(self, X, y, logger, categorical): skews = [] for i in range(X.shape[1]): - if not categorical[X.columns[i] if hasattr(X, 'columns') else i]: - skews.append(scipy.stats.skew( - X.iloc[:, i] if hasattr(X, 'iloc') else X[:, i] - )) + if not categorical[X.columns[i] if hasattr(X, "columns") else i]: + skews.append( + scipy.stats.skew(X.iloc[:, i] if hasattr(X, "iloc") else X[:, i]) + ) return skews def _calculate_sparse(self, X, y, logger, categorical): skews = [] X_new = X.tocsc() for i in range(X_new.shape[1]): - if not categorical[X.columns[i] if hasattr(X, 'columns') else i]: + if not categorical[X.columns[i] if hasattr(X, "columns") else i]: start = X_new.indptr[i] stop = X_new.indptr[i + 1] skews.append(scipy.stats.skew(X_new.data[start:stop])) @@ -628,8 +645,11 @@ def _calculate(self, X, y, logger, categorical): occurence_dict = defaultdict(float) for value in y if labels == 1 else y[:, i]: occurence_dict[value] += 1 - entropies.append(scipy.stats.entropy([occurence_dict[key] for key in - occurence_dict], base=2)) + entropies.append( + scipy.stats.entropy( + [occurence_dict[key] for key in occurence_dict], base=2 + ) + ) return np.mean(entropies) @@ -669,34 +689,35 @@ def _calculate(self, X, y, logger, categorical): class LandmarkLDA(MetaFeature): def _calculate(self, X, y, logger, categorical): import sklearn.discriminant_analysis - if type(y) in ('binary', 'multiclass'): + + if type(y) in ("binary", "multiclass"): kf = sklearn.model_selection.StratifiedKFold(n_splits=5) else: kf = sklearn.model_selection.KFold(n_splits=5) - accuracy = 0. 
+ accuracy = 0.0 try: for train, test in kf.split(X, y): lda = sklearn.discriminant_analysis.LinearDiscriminantAnalysis() if len(y.shape) == 1 or y.shape[1] == 1: lda.fit( - X.iloc[train] if hasattr(X, 'iloc') else X[train], - y.iloc[train] if hasattr(y, 'iloc') else y[train], + X.iloc[train] if hasattr(X, "iloc") else X[train], + y.iloc[train] if hasattr(y, "iloc") else y[train], ) else: lda = OneVsRestClassifier(lda) lda.fit( - X.iloc[train] if hasattr(X, 'iloc') else X[train], - y.iloc[train] if hasattr(y, 'iloc') else y[train], + X.iloc[train] if hasattr(X, "iloc") else X[train], + y.iloc[train] if hasattr(y, "iloc") else y[train], ) predictions = lda.predict( - X.iloc[test] if hasattr(X, 'iloc') else X[test], + X.iloc[test] if hasattr(X, "iloc") else X[test], ) accuracy += sklearn.metrics.accuracy_score( predictions, - y.iloc[test] if hasattr(y, 'iloc') else y[test], + y.iloc[test] if hasattr(y, "iloc") else y[test], ) return accuracy / 5 except scipy.linalg.LinAlgError as e: @@ -716,33 +737,33 @@ class LandmarkNaiveBayes(MetaFeature): def _calculate(self, X, y, logger, categorical): import sklearn.naive_bayes - if type(y) in ('binary', 'multiclass'): + if type(y) in ("binary", "multiclass"): kf = sklearn.model_selection.StratifiedKFold(n_splits=5) else: kf = sklearn.model_selection.KFold(n_splits=5) - accuracy = 0. + accuracy = 0.0 for train, test in kf.split(X, y): nb = sklearn.naive_bayes.GaussianNB() if len(y.shape) == 1 or y.shape[1] == 1: nb.fit( - X.iloc[train] if hasattr(X, 'iloc') else X[train], - y.iloc[train] if hasattr(y, 'iloc') else y[train], + X.iloc[train] if hasattr(X, "iloc") else X[train], + y.iloc[train] if hasattr(y, "iloc") else y[train], ) else: nb = OneVsRestClassifier(nb) nb.fit( - X.iloc[train] if hasattr(X, 'iloc') else X[train], - y.iloc[train] if hasattr(y, 'iloc') else y[train], + X.iloc[train] if hasattr(X, "iloc") else X[train], + y.iloc[train] if hasattr(y, "iloc") else y[train], ) predictions = nb.predict( - X.iloc[test] if hasattr(X, 'iloc') else X[test], + X.iloc[test] if hasattr(X, "iloc") else X[test], ) accuracy += sklearn.metrics.accuracy_score( predictions, - y.iloc[test] if hasattr(y, 'iloc') else y[test], + y.iloc[test] if hasattr(y, "iloc") else y[test], ) return accuracy / 5 @@ -756,34 +777,34 @@ class LandmarkDecisionTree(MetaFeature): def _calculate(self, X, y, logger, categorical): import sklearn.tree - if type(y) in ('binary', 'multiclass'): + if type(y) in ("binary", "multiclass"): kf = sklearn.model_selection.StratifiedKFold(n_splits=5) else: kf = sklearn.model_selection.KFold(n_splits=5) - accuracy = 0. 
+ accuracy = 0.0 for train, test in kf.split(X, y): random_state = sklearn.utils.check_random_state(42) tree = sklearn.tree.DecisionTreeClassifier(random_state=random_state) if len(y.shape) == 1 or y.shape[1] == 1: tree.fit( - X.iloc[train] if hasattr(X, 'iloc') else X[train], - y.iloc[train] if hasattr(y, 'iloc') else y[train], + X.iloc[train] if hasattr(X, "iloc") else X[train], + y.iloc[train] if hasattr(y, "iloc") else y[train], ) else: tree = OneVsRestClassifier(tree) tree.fit( - X.iloc[train] if hasattr(X, 'iloc') else X[train], - y.iloc[train] if hasattr(y, 'iloc') else y[train], + X.iloc[train] if hasattr(X, "iloc") else X[train], + y.iloc[train] if hasattr(y, "iloc") else y[train], ) predictions = tree.predict( - X.iloc[test] if hasattr(X, 'iloc') else X[test], + X.iloc[test] if hasattr(X, "iloc") else X[test], ) accuracy += sklearn.metrics.accuracy_score( predictions, - y.iloc[test] if hasattr(y, 'iloc') else y[test], + y.iloc[test] if hasattr(y, "iloc") else y[test], ) return accuracy / 5 @@ -803,34 +824,39 @@ class LandmarkDecisionNodeLearner(MetaFeature): def _calculate(self, X, y, logger, categorical): import sklearn.tree - if type(y) in ('binary', 'multiclass'): + if type(y) in ("binary", "multiclass"): kf = sklearn.model_selection.StratifiedKFold(n_splits=5) else: kf = sklearn.model_selection.KFold(n_splits=5) - accuracy = 0. + accuracy = 0.0 for train, test in kf.split(X, y): random_state = sklearn.utils.check_random_state(42) node = sklearn.tree.DecisionTreeClassifier( - criterion="entropy", max_depth=1, random_state=random_state, - min_samples_split=2, min_samples_leaf=1, max_features=None) + criterion="entropy", + max_depth=1, + random_state=random_state, + min_samples_split=2, + min_samples_leaf=1, + max_features=None, + ) if len(y.shape) == 1 or y.shape[1] == 1: node.fit( - X.iloc[train] if hasattr(X, 'iloc') else X[train], - y.iloc[train] if hasattr(y, 'iloc') else y[train], + X.iloc[train] if hasattr(X, "iloc") else X[train], + y.iloc[train] if hasattr(y, "iloc") else y[train], ) else: node = OneVsRestClassifier(node) node.fit( - X.iloc[train] if hasattr(X, 'iloc') else X[train], - y.iloc[train] if hasattr(y, 'iloc') else y[train], + X.iloc[train] if hasattr(X, "iloc") else X[train], + y.iloc[train] if hasattr(y, "iloc") else y[train], ) predictions = node.predict( - X.iloc[test] if hasattr(X, 'iloc') else X[test], + X.iloc[test] if hasattr(X, "iloc") else X[test], ) accuracy += sklearn.metrics.accuracy_score( predictions, - y.iloc[test] if hasattr(y, 'iloc') else y[test], + y.iloc[test] if hasattr(y, "iloc") else y[test], ) return accuracy / 5 @@ -843,27 +869,32 @@ class LandmarkRandomNodeLearner(MetaFeature): def _calculate(self, X, y, logger, categorical): import sklearn.tree - if type(y) in ('binary', 'multiclass'): + if type(y) in ("binary", "multiclass"): kf = sklearn.model_selection.StratifiedKFold(n_splits=5) else: kf = sklearn.model_selection.KFold(n_splits=5) - accuracy = 0. 
+ accuracy = 0.0 for train, test in kf.split(X, y): random_state = sklearn.utils.check_random_state(42) node = sklearn.tree.DecisionTreeClassifier( - criterion="entropy", max_depth=1, random_state=random_state, - min_samples_split=2, min_samples_leaf=1, max_features=1) + criterion="entropy", + max_depth=1, + random_state=random_state, + min_samples_split=2, + min_samples_leaf=1, + max_features=1, + ) node.fit( - X.iloc[train] if hasattr(X, 'iloc') else X[train], - y.iloc[train] if hasattr(y, 'iloc') else y[train], + X.iloc[train] if hasattr(X, "iloc") else X[train], + y.iloc[train] if hasattr(y, "iloc") else y[train], ) predictions = node.predict( - X.iloc[test] if hasattr(X, 'iloc') else X[test], + X.iloc[test] if hasattr(X, "iloc") else X[test], ) accuracy += sklearn.metrics.accuracy_score( predictions, - y.iloc[test] if hasattr(y, 'iloc') else y[test], + y.iloc[test] if hasattr(y, "iloc") else y[test], ) return accuracy / 5 @@ -903,31 +934,31 @@ class Landmark1NN(MetaFeature): def _calculate(self, X, y, logger, categorical): import sklearn.neighbors - if type(y) in ('binary', 'multiclass'): + if type(y) in ("binary", "multiclass"): kf = sklearn.model_selection.StratifiedKFold(n_splits=5) else: kf = sklearn.model_selection.KFold(n_splits=5) - accuracy = 0. + accuracy = 0.0 for train, test in kf.split(X, y): kNN = sklearn.neighbors.KNeighborsClassifier(n_neighbors=1) if len(y.shape) == 1 or y.shape[1] == 1: kNN.fit( - X.iloc[train] if hasattr(X, 'iloc') else X[train], - y.iloc[train] if hasattr(y, 'iloc') else y[train], + X.iloc[train] if hasattr(X, "iloc") else X[train], + y.iloc[train] if hasattr(y, "iloc") else y[train], ) else: kNN = OneVsRestClassifier(kNN) kNN.fit( - X.iloc[train] if hasattr(X, 'iloc') else X[train], - y.iloc[train] if hasattr(y, 'iloc') else y[train], + X.iloc[train] if hasattr(X, "iloc") else X[train], + y.iloc[train] if hasattr(y, "iloc") else y[train], ) predictions = kNN.predict( - X.iloc[test] if hasattr(X, 'iloc') else X[test], + X.iloc[test] if hasattr(X, "iloc") else X[test], ) accuracy += sklearn.metrics.accuracy_score( predictions, - y.iloc[test] if hasattr(y, 'iloc') else y[test], + y.iloc[test] if hasattr(y, "iloc") else y[test], ) return accuracy / 5 @@ -945,6 +976,7 @@ def _calculate(self, X, y, logger, categorical): class PCA(HelperFunction): def _calculate(self, X, y, logger, categorical): import sklearn.decomposition + pca = sklearn.decomposition.PCA(copy=True) rs = np.random.RandomState(42) indices = np.arange(X.shape[0]) @@ -952,7 +984,7 @@ def _calculate(self, X, y, logger, categorical): try: rs.shuffle(indices) pca.fit( - X.iloc[indices] if hasattr(X, 'iloc') else X[indices], + X.iloc[indices] if hasattr(X, "iloc") else X[indices], ) return pca except LinAlgError: @@ -962,6 +994,7 @@ def _calculate(self, X, y, logger, categorical): def _calculate_sparse(self, X, y, logger, categorical): import sklearn.decomposition + rs = np.random.RandomState(42) indices = np.arange(X.shape[0]) # This is expensive, but necessary with scikit-learn 0.15 @@ -970,8 +1003,8 @@ def _calculate_sparse(self, X, y, logger, categorical): try: rs.shuffle(indices) truncated_svd = sklearn.decomposition.TruncatedSVD( - n_components=X.shape[1]-1, random_state=i, - algorithm="randomized") + n_components=X.shape[1] - 1, random_state=i, algorithm="randomized" + ) truncated_svd.fit(Xt[indices]) return truncated_svd except LinAlgError: @@ -987,12 +1020,12 @@ def _calculate(self, X, y, logger, categorical): pca_ = helper_functions.get_value("PCA") if pca_ is None: return np.NaN - 
sum_ = 0. + sum_ = 0.0 idx = 0 while sum_ < 0.95 and idx < len(pca_.explained_variance_ratio_): sum_ += pca_.explained_variance_ratio_[idx] idx += 1 - return float(idx)/float(X.shape[1]) + return float(idx) / float(X.shape[1]) # Kurtosis of first PC @@ -1027,8 +1060,9 @@ def _calculate(self, X, y, logger, categorical): return skewness[0] -def calculate_all_metafeatures_encoded_labels(X, y, categorical, dataset_name, logger, - calculate=None, dont_calculate=None): +def calculate_all_metafeatures_encoded_labels( + X, y, categorical, dataset_name, logger, calculate=None, dont_calculate=None +): """ Calculate only metafeatures for which a 1HotEncoded feature matrix is necessery. """ @@ -1036,25 +1070,46 @@ def calculate_all_metafeatures_encoded_labels(X, y, categorical, dataset_name, l calculate = set() calculate.update(npy_metafeatures) - return calculate_all_metafeatures(X, y, categorical, dataset_name, - calculate=calculate, - dont_calculate=dont_calculate, logger=logger) + return calculate_all_metafeatures( + X, + y, + categorical, + dataset_name, + calculate=calculate, + dont_calculate=dont_calculate, + logger=logger, + ) -def calculate_all_metafeatures_with_labels(X, y, categorical, dataset_name, logger, - calculate=None, dont_calculate=None): +def calculate_all_metafeatures_with_labels( + X, y, categorical, dataset_name, logger, calculate=None, dont_calculate=None +): if dont_calculate is None: dont_calculate = set() else: dont_calculate = copy.deepcopy(dont_calculate) dont_calculate.update(npy_metafeatures) - return calculate_all_metafeatures(X, y, categorical, dataset_name, - calculate=calculate, - dont_calculate=dont_calculate, logger=logger) - - -def calculate_all_metafeatures(X, y, categorical, dataset_name, logger, - calculate=None, dont_calculate=None, densify_threshold=1000): + return calculate_all_metafeatures( + X, + y, + categorical, + dataset_name, + calculate=calculate, + dont_calculate=dont_calculate, + logger=logger, + ) + + +def calculate_all_metafeatures( + X, + y, + categorical, + dataset_name, + logger, + calculate=None, + dont_calculate=None, + densify_threshold=1000, +): """Calculate all metafeatures.""" helper_functions.clear() @@ -1083,8 +1138,10 @@ def calculate_all_metafeatures(X, y, categorical, dataset_name, logger, # sparse matrices because of wrong sparse format) sparse = scipy.sparse.issparse(X) - feat_type = {key: 'categorical' if value else 'numerical' - for key, value in categorical.items()} + feat_type = { + key: "categorical" if value else "numerical" + for key, value in categorical.items() + } # TODO make this more cohesive to the overall structure (quick bug fix) if isinstance(X, pd.DataFrame): @@ -1095,9 +1152,12 @@ def calculate_all_metafeatures(X, y, categorical, dataset_name, logger, # The difference between feat_type and categorical, is that # categorical has True/False instead of categorical/numerical feat_type=feat_type, - force_sparse_output=True) + force_sparse_output=True, + ) X_transformed = DPP.fit_transform(X) - categorical_transformed = {i: False for i in range(X_transformed.shape[1])} + categorical_transformed = { + i: False for i in range(X_transformed.shape[1]) + } # Densify the transformed matrix if not sparse and scipy.sparse.issparse(X_transformed): @@ -1111,9 +1171,9 @@ def calculate_all_metafeatures(X, y, categorical, dataset_name, logger, # sorted in a strange way, but also prevents lda from failing in # some cases. # Because this is advanced indexing, a copy of the data is returned!!! 
- X_transformed = check_array(X_transformed, - force_all_finite=True, - accept_sparse='csr') + X_transformed = check_array( + X_transformed, force_all_finite=True, accept_sparse="csr" + ) indices = np.arange(X_transformed.shape[0]) rs = np.random.RandomState(42) @@ -1143,17 +1203,15 @@ def calculate_all_metafeatures(X, y, categorical, dataset_name, logger, elif is_metafeature and not metafeatures.is_calculated(dependency): to_visit.appendleft(name) continue - elif is_helper_function and not helper_functions.is_calculated( - dependency): - logger.info("%s: Going to calculate: %s", dataset_name, - dependency) + elif is_helper_function and not helper_functions.is_calculated(dependency): + logger.info("%s: Going to calculate: %s", dataset_name, dependency) value = helper_functions[dependency]( - X_, y_, categorical=categorical_, logger=logger) + X_, y_, categorical=categorical_, logger=logger + ) helper_functions.set_value(dependency, value) mf_[dependency] = value - logger.info("%s: Going to calculate: %s", dataset_name, - name) + logger.info("%s: Going to calculate: %s", dataset_name, name) value = metafeatures[name](X_, y_, logger, categorical_) metafeatures.set_value(name, value) @@ -1164,40 +1222,48 @@ def calculate_all_metafeatures(X, y, categorical, dataset_name, logger, return mf_ -npy_metafeatures = set(["LandmarkLDA", - "LandmarkNaiveBayes", - "LandmarkDecisionTree", - "LandmarkDecisionNodeLearner", - "LandmarkRandomNodeLearner", - "LandmarkWorstNodeLearner", - "Landmark1NN", - "PCAFractionOfComponentsFor95PercentVariance", - "PCAKurtosisFirstPC", - "PCASkewnessFirstPC", - "Skewnesses", - "SkewnessMin", - "SkewnessMax", - "SkewnessMean", - "SkewnessSTD", - "Kurtosisses", - "KurtosisMin", - "KurtosisMax", - "KurtosisMean", - "KurtosisSTD"]) +npy_metafeatures = set( + [ + "LandmarkLDA", + "LandmarkNaiveBayes", + "LandmarkDecisionTree", + "LandmarkDecisionNodeLearner", + "LandmarkRandomNodeLearner", + "LandmarkWorstNodeLearner", + "Landmark1NN", + "PCAFractionOfComponentsFor95PercentVariance", + "PCAKurtosisFirstPC", + "PCASkewnessFirstPC", + "Skewnesses", + "SkewnessMin", + "SkewnessMax", + "SkewnessMean", + "SkewnessSTD", + "Kurtosisses", + "KurtosisMin", + "KurtosisMax", + "KurtosisMean", + "KurtosisSTD", + ] +) subsets = dict() # All implemented metafeatures subsets["all"] = set(metafeatures.functions.keys()) # Metafeatures used by Pfahringer et al. (2000) in the first experiment -subsets["pfahringer_2000_experiment1"] = set(["number_of_features", - "number_of_numeric_features", - "number_of_categorical_features", - "number_of_classes", - "class_probability_max", - "landmark_lda", - "landmark_naive_bayes", - "landmark_decision_tree"]) +subsets["pfahringer_2000_experiment1"] = set( + [ + "number_of_features", + "number_of_numeric_features", + "number_of_categorical_features", + "number_of_classes", + "class_probability_max", + "landmark_lda", + "landmark_naive_bayes", + "landmark_decision_tree", + ] +) # Metafeatures used by Pfahringer et al. (2000) in the second experiment # worst node learner not implemented yet @@ -1209,19 +1275,27 @@ def calculate_all_metafeatures(X, y, categorical, dataset_name, logger, """ # Metafeatures used by Yogatama and Mann (2014) -subsets["yogotama_2014"] = set(["log_number_of_features", - "log_number_of_instances", - "number_of_classes"]) +subsets["yogotama_2014"] = set( + ["log_number_of_features", "log_number_of_instances", "number_of_classes"] +) # Metafeatures used by Bardenet et al. 
(2013) for the AdaBoost.MH experiment -subsets["bardenet_2013_boost"] = set(["number_of_classes", - "log_number_of_features", - "log_inverse_dataset_ratio", - "pca_95percent"]) +subsets["bardenet_2013_boost"] = set( + [ + "number_of_classes", + "log_number_of_features", + "log_inverse_dataset_ratio", + "pca_95percent", + ] +) # Metafeatures used by Bardenet et al. (2013) for the Neural Net experiment -subsets["bardenet_2013_nn"] = set(["number_of_classes", - "log_number_of_features", - "log_inverse_dataset_ratio", - "pca_kurtosis_first_pc", - "pca_skewness_first_pc"]) +subsets["bardenet_2013_nn"] = set( + [ + "number_of_classes", + "log_number_of_features", + "log_inverse_dataset_ratio", + "pca_kurtosis_first_pc", + "pca_skewness_first_pc", + ] +) diff --git a/autosklearn/metalearning/metalearning/__init__.py b/autosklearn/metalearning/metalearning/__init__.py index 8f0ce6cb7c..92bf78f389 100644 --- a/autosklearn/metalearning/metalearning/__init__.py +++ b/autosklearn/metalearning/metalearning/__init__.py @@ -1 +1 @@ -__author__ = 'feurerm' +__author__ = "feurerm" diff --git a/autosklearn/metalearning/metalearning/clustering/gmeans.py b/autosklearn/metalearning/metalearning/clustering/gmeans.py index 573537e446..5c5233f284 100644 --- a/autosklearn/metalearning/metalearning/clustering/gmeans.py +++ b/autosklearn/metalearning/metalearning/clustering/gmeans.py @@ -9,8 +9,14 @@ class GMeans(object): - def __init__(self, minimum_samples_per_cluster=2, n_init=10, significance=4, - restarts=10, random_state=None, ): + def __init__( + self, + minimum_samples_per_cluster=2, + n_init=10, + significance=4, + restarts=10, + random_state=None, + ): self.minimum_samples_per_cluster = minimum_samples_per_cluster self.n_init = n_init self.significance = significance @@ -21,8 +27,9 @@ def fit(self, X): self.inertia_ = np.inf for i in range(self.restarts): - KMeans = sklearn.cluster.KMeans(n_clusters=1, n_init=1, - random_state=self.random_state) + KMeans = sklearn.cluster.KMeans( + n_clusters=1, n_init=1, random_state=self.random_state + ) KMeans.fit(X) while True: @@ -34,14 +41,16 @@ def fit(self, X): indices = KMeans.labels_ == i X_ = X[indices] - if np.sum(indices) < self.minimum_samples_per_cluster*2: + if np.sum(indices) < self.minimum_samples_per_cluster * 2: cluster_centers.append(cluster_center) continue for i in range(10): - KMeans_ = sklearn.cluster.KMeans(n_clusters=2, - n_init=self.n_init, - random_state=self.random_state) + KMeans_ = sklearn.cluster.KMeans( + n_clusters=2, + n_init=self.n_init, + random_state=self.random_state, + ) predictions = KMeans_.fit_predict(X_) bins = np.bincount(predictions) minimum = np.min(bins) @@ -73,9 +82,12 @@ def fit(self, X): break # Refinement - KMeans = sklearn.cluster.KMeans(n_clusters=len(cluster_centers), n_init=1, - init=np.array(cluster_centers), - random_state=self.random_state) + KMeans = sklearn.cluster.KMeans( + n_clusters=len(cluster_centers), + n_init=1, + init=np.array(cluster_centers), + random_state=self.random_state, + ) KMeans.fit(X) if KMeans.inertia_ < self.inertia_: diff --git a/autosklearn/metalearning/metalearning/create_datasets.py b/autosklearn/metalearning/metalearning/create_datasets.py index a65b7840ed..1a60bebb7a 100644 --- a/autosklearn/metalearning/metalearning/create_datasets.py +++ b/autosklearn/metalearning/metalearning/create_datasets.py @@ -1,5 +1,6 @@ import itertools import logging + import numpy as np import pandas as pd import scipy.stats @@ -13,11 +14,13 @@ def create_regression_dataset(metafeatures, experiments): 
experiment = experiments[dataset_name] mf = metafeatures.loc[dataset_name] for i, run in enumerate(experiment): - x1 = pd.Series(data=[run.params[param] for param in run.params], - index=run.params.keys()) + x1 = pd.Series( + data=[run.params[param] for param in run.params], + index=run.params.keys(), + ) x2 = mf X.append(x1.append(x2)) - X_indices.append('%s_%d' % (dataset_name, i)) + X_indices.append("%s_%d" % (dataset_name, i)) Y.append(run.result) X = pd.DataFrame(X, index=X_indices) Y = pd.DataFrame(Y, index=X_indices) @@ -67,14 +70,19 @@ def create_predict_spearman_rank(metafeatures, experiments, iterator): responses_1 = np.zeros((len(experiments_1)), dtype=np.float64) responses_2 = np.zeros((len(experiments_1)), dtype=np.float64) - for idx, zipped in enumerate(zip( + for idx, zipped in enumerate( + zip( sorted(experiments_1, key=lambda t: str(t.configuration)), - sorted(experiments_2, key=lambda t: str(t.configuration)))): + sorted(experiments_2, key=lambda t: str(t.configuration)), + ) + ): # Test if the order of the params is the same exp_1, exp_2 = zipped print(exp_1.configuration, exp_2.configuration) - assert exp_1.configuration == exp_2.configuration,\ - (experiments_1, experiments_2) + assert exp_1.configuration == exp_2.configuration, ( + experiments_1, + experiments_2, + ) responses_1[idx] = exp_1.result if np.isfinite(exp_1.result) else 1 responses_2[idx] = exp_2.result if np.isfinite(exp_2.result) else 1 @@ -91,9 +99,11 @@ def create_predict_spearman_rank(metafeatures, experiments, iterator): logging.info("Metafeatures %s", metafeatures.shape) logging.info("X.shape %s", X.shape) logging.info("Y.shape %s", Y.shape) - assert X.shape == (len(cross_product), metafeatures.shape[1] * 2), \ - (X.shape, (len(cross), metafeatures.shape[1] * 2)) - assert Y.shape == (len(cross_product), ) + assert X.shape == (len(cross_product), metafeatures.shape[1] * 2), ( + X.shape, + (len(cross), metafeatures.shape[1] * 2), + ) + assert Y.shape == (len(cross_product),) # train sklearn regressor (tree) with 10fold CV indices = range(len(X)) np_rs = np.random.RandomState(42) @@ -103,8 +113,7 @@ def create_predict_spearman_rank(metafeatures, experiments, iterator): return X, Y -def create_predict_spearman_rank_with_cv(cv_metafeatures, cv_experiments, - iterator): +def create_predict_spearman_rank_with_cv(cv_metafeatures, cv_experiments, iterator): X = [] Y = [] Y_names = [] @@ -128,13 +137,18 @@ def create_predict_spearman_rank_with_cv(cv_metafeatures, cv_experiments, logging.info("Create spearman rank dataset with CV data %s", iterator) logging.info("Using %d datasets", len(dataset_names)) - logging.info("This will results in %d training points", len(cross_product) * len(folds_product)) + logging.info( + "This will results in %d training points", + len(cross_product) * len(folds_product), + ) logging.info("Length of dataset crossproduct %s", len(cross_product)) logging.info("Length of folds crossproduct %s", len(folds_product)) # Create inputs and targets for i, cross in enumerate(cross_product): - print("%d/%d: %s" % (i, len(cross_product), cross),) + print( + "%d/%d: %s" % (i, len(cross_product), cross), + ) for folds in folds_product: name = "%s-%d_%s-%d" % (cross[0], folds[0], cross[1], folds[1]) mf_1 = cv_metafeatures[cross[0]][folds[0]] @@ -266,7 +280,7 @@ def create_smac_files_file(cv_metafeatures, cv_experiments, dataset, train_instances_file.seek(0) for line in train_instances_file: fh.write(line) -""" +""" # noqa: E501 if __name__ == "__main__": diff --git 
a/autosklearn/metalearning/metalearning/kNearestDatasets/__init__.py b/autosklearn/metalearning/metalearning/kNearestDatasets/__init__.py index 8f0ce6cb7c..92bf78f389 100644 --- a/autosklearn/metalearning/metalearning/kNearestDatasets/__init__.py +++ b/autosklearn/metalearning/metalearning/kNearestDatasets/__init__.py @@ -1 +1 @@ -__author__ = 'feurerm' +__author__ = "feurerm" diff --git a/autosklearn/metalearning/metalearning/kNearestDatasets/kND.py b/autosklearn/metalearning/metalearning/kNearestDatasets/kND.py index 336d3b6bb3..f6c10c95d2 100644 --- a/autosklearn/metalearning/metalearning/kNearestDatasets/kND.py +++ b/autosklearn/metalearning/metalearning/kNearestDatasets/kND.py @@ -1,13 +1,12 @@ import numpy as np import pandas as pd - +import sklearn.utils from sklearn.neighbors import NearestNeighbors from sklearn.preprocessing import MinMaxScaler -import sklearn.utils class KNearestDatasets(object): - def __init__(self, logger, metric='l1', random_state=None, metric_params=None): + def __init__(self, logger, metric="l1", random_state=None, metric_params=None): self.logger = logger self.metric = metric @@ -37,8 +36,10 @@ def fit(self, metafeatures, runs): assert metafeatures.values.dtype in (np.float32, np.float64) assert np.isfinite(metafeatures.values).all() assert isinstance(runs, pd.DataFrame) - assert runs.shape[1] == metafeatures.shape[0], \ - (runs.shape[1], metafeatures.shape[0]) + assert runs.shape[1] == metafeatures.shape[0], ( + runs.shape[1], + metafeatures.shape[0], + ) self.metafeatures = metafeatures self.runs = runs @@ -54,7 +55,8 @@ def fit(self, metafeatures, runs): best_configuration_per_dataset[dataset_name] = None else: configuration_idx = runs[dataset_name].index[ - np.nanargmin(runs[dataset_name].values)] + np.nanargmin(runs[dataset_name].values) + ] best_configuration_per_dataset[dataset_name] = configuration_idx self.best_configuration_per_dataset = best_configuration_per_dataset @@ -72,9 +74,14 @@ def fit(self, metafeatures, runs): raise ValueError(self.metric) self._nearest_neighbors = NearestNeighbors( - n_neighbors=self.num_datasets, radius=None, algorithm="brute", - leaf_size=30, metric=self._metric, p=self._p, - metric_params=self.metric_params) + n_neighbors=self.num_datasets, + radius=None, + algorithm="brute", + leaf_size=30, + metric=self._metric, + p=self._p, + metric_params=self.metric_params, + ) def kNearestDatasets(self, x, k=1, return_distance=False): """Return the k most similar datasets with respect to self.metric @@ -101,7 +108,7 @@ def kNearestDatasets(self, x, k=1, return_distance=False): """ assert type(x) == pd.Series if k < -1 or k == 0: - raise ValueError('Number of neighbors k cannot be zero or negative.') + raise ValueError("Number of neighbors k cannot be zero or negative.") elif k == -1: k = self.num_datasets @@ -110,14 +117,17 @@ def kNearestDatasets(self, x, k=1, return_distance=False): x = self.scaler.transform(x) self._nearest_neighbors.fit(X_train) distances, neighbor_indices = self._nearest_neighbors.kneighbors( - x, n_neighbors=k, return_distance=True) + x, n_neighbors=k, return_distance=True + ) assert k == neighbor_indices.shape[1] - rval = [self.metafeatures.index[i] - # Neighbor indices is 2d, each row is the indices for one - # dataset in x. - for i in neighbor_indices[0]] + rval = [ + self.metafeatures.index[i] + # Neighbor indices is 2d, each row is the indices for one + # dataset in x. 
+ for i in neighbor_indices[0] + ] if return_distance is False: return rval @@ -127,19 +137,19 @@ def kNearestDatasets(self, x, k=1, return_distance=False): def kBestSuggestions(self, x, k=1, exclude_double_configurations=True): assert type(x) == pd.Series if k < -1 or k == 0: - raise ValueError('Number of neighbors k cannot be zero or negative.') - nearest_datasets, distances = self.kNearestDatasets(x, -1, - return_distance=True) + raise ValueError("Number of neighbors k cannot be zero or negative.") + nearest_datasets, distances = self.kNearestDatasets(x, -1, return_distance=True) kbest = [] added_configurations = set() for dataset_name, distance in zip(nearest_datasets, distances): - best_configuration = self.best_configuration_per_dataset[ - dataset_name] + best_configuration = self.best_configuration_per_dataset[dataset_name] if best_configuration is None: - self.logger.info("Found no best configuration for instance %s" % dataset_name) + self.logger.info( + "Found no best configuration for instance %s" % dataset_name + ) continue if exclude_double_configurations: diff --git a/autosklearn/metalearning/metalearning/meta_base.py b/autosklearn/metalearning/metalearning/meta_base.py index 13653de528..f193a61fef 100644 --- a/autosklearn/metalearning/metalearning/meta_base.py +++ b/autosklearn/metalearning/metalearning/meta_base.py @@ -2,10 +2,10 @@ import numpy as np import pandas as pd +from ConfigSpace.configuration_space import Configuration from ..input import aslib_simple from ..metafeatures.metafeature import DatasetMetafeatures -from ConfigSpace.configuration_space import Configuration class Run(object): @@ -15,8 +15,11 @@ def __init__(self, configuration, result, runtime): self.runtime = runtime def __repr__(self): - return "Run:\nresult: %3.3f\nruntime: %3.3f\n%s" % \ - (self.result, self.runtime, str(self.configuration)) + return "Run:\nresult: %3.3f\nruntime: %3.3f\n%s" % ( + self.result, + self.runtime, + str(self.configuration), + ) class Instance(object): @@ -41,15 +44,18 @@ def __init__(self, configuration_space, aslib_directory, logger): aslib_reader = aslib_simple.AlgorithmSelectionProblem(self.aslib_directory) self.metafeatures = aslib_reader.metafeatures - self.algorithm_runs: OrderedDict[str, pd.DataFrame] = aslib_reader.algorithm_runs + self.algorithm_runs: OrderedDict[ + str, pd.DataFrame + ] = aslib_reader.algorithm_runs self.configurations = aslib_reader.configurations configurations = dict() for algorithm_id in self.configurations: configuration = self.configurations[algorithm_id] try: - configurations[str(algorithm_id)] = \ - (Configuration(configuration_space, values=configuration)) + configurations[str(algorithm_id)] = Configuration( + configuration_space, values=configuration + ) except (ValueError, KeyError) as e: self.logger.debug("Error reading configurations: %s", e) @@ -58,11 +64,13 @@ def __init__(self, configuration_space, aslib_directory, logger): def add_dataset(self, name, metafeatures): metafeatures.name = name if isinstance(metafeatures, DatasetMetafeatures): - data_ = {mf.name: mf.value for mf in metafeatures.metafeature_values.values()} + data_ = { + mf.name: mf.value for mf in metafeatures.metafeature_values.values() + } metafeatures = pd.Series(name=name, data=data_, dtype=np.float64) if name.lower() in self.metafeatures.index: self.logger.warning( - 'Dataset %s already in meta-data. Removing occurence.', name.lower() + "Dataset %s already in meta-data. 
Removing occurence.", name.lower() ) self.metafeatures.drop(name.lower(), inplace=True) self.metafeatures = self.metafeatures.append(metafeatures) @@ -97,8 +105,7 @@ def _get_metafeatures(self, features): """This is inside an extra function for testing purpose""" # Load the task - self.logger.info("Going to use the following metafeature subset: %s", - features) + self.logger.info("Going to use the following metafeature subset: %s", features) all_metafeatures = self.metafeatures all_metafeatures = all_metafeatures.loc[:, features] diff --git a/autosklearn/metalearning/mismbo.py b/autosklearn/metalearning/mismbo.py index 8a4f2e2bed..7b2956c489 100644 --- a/autosklearn/metalearning/mismbo.py +++ b/autosklearn/metalearning/mismbo.py @@ -2,15 +2,19 @@ import time -from autosklearn.metalearning.optimizers.metalearn_optimizer.metalearner \ - import MetaLearningOptimizer -from autosklearn.constants \ - import MULTILABEL_CLASSIFICATION, MULTICLASS_CLASSIFICATION, TASK_TYPES_TO_STRING +from autosklearn.constants import ( + MULTICLASS_CLASSIFICATION, + MULTILABEL_CLASSIFICATION, + TASK_TYPES_TO_STRING, +) +from autosklearn.metalearning.optimizers.metalearn_optimizer.metalearner import ( + MetaLearningOptimizer, +) def suggest_via_metalearning( - meta_base, dataset_name, metric, task, sparse, - num_initial_configurations, logger): + meta_base, dataset_name, metric, task, sparse, num_initial_configurations, logger +): if task == MULTILABEL_CLASSIFICATION: task = MULTICLASS_CLASSIFICATION @@ -24,10 +28,10 @@ def suggest_via_metalearning( dataset_name=dataset_name, configuration_space=meta_base.configuration_space, meta_base=meta_base, - distance='l1', + distance="l1", seed=1, - logger=logger,) - logger.info('Reading meta-data took %5.2f seconds', - time.time() - start) + logger=logger, + ) + logger.info("Reading meta-data took %5.2f seconds", time.time() - start) runs = ml.metalearning_suggest_all(exclude_double_configurations=True) return runs[:num_initial_configurations] diff --git a/autosklearn/metalearning/optimizers/__init__.py b/autosklearn/metalearning/optimizers/__init__.py index 8f0ce6cb7c..92bf78f389 100644 --- a/autosklearn/metalearning/optimizers/__init__.py +++ b/autosklearn/metalearning/optimizers/__init__.py @@ -1 +1 @@ -__author__ = 'feurerm' +__author__ = "feurerm" diff --git a/autosklearn/metalearning/optimizers/metalearn_optimizer/metalearner.py b/autosklearn/metalearning/optimizers/metalearn_optimizer/metalearner.py index 987f40b0f7..eff2956b9b 100644 --- a/autosklearn/metalearning/optimizers/metalearn_optimizer/metalearner.py +++ b/autosklearn/metalearning/optimizers/metalearn_optimizer/metalearner.py @@ -1,6 +1,7 @@ import ast -import pandas as pd + import numpy as np +import pandas as pd import sklearn.utils from autosklearn.metalearning.metalearning.kNearestDatasets.kND import KNearestDatasets @@ -11,9 +12,17 @@ def test_function(params): class MetaLearningOptimizer(object): - def __init__(self, dataset_name, configuration_space, - meta_base, logger, distance='l1', seed=None, use_features=None, - distance_kwargs=None): + def __init__( + self, + dataset_name, + configuration_space, + meta_base, + logger, + distance="l1", + seed=None, + use_features=None, + distance_kwargs=None, + ): self.dataset_name = dataset_name self.configuration_space = configuration_space self.meta_base = meta_base @@ -21,7 +30,7 @@ def __init__(self, dataset_name, configuration_space, self.seed = seed self.use_features = use_features self.distance_kwargs = distance_kwargs - self.kND = None # For caching, 
makes things faster... + self.kND = None # For caching, makes things faster... self.logger = logger @@ -32,7 +41,9 @@ def metalearning_suggest_all(self, exclude_double_configurations=True): hp_list = [] for neighbor in neighbors: try: - configuration = self.meta_base.get_configuration_from_algorithm_index(neighbor[2]) + configuration = self.meta_base.get_configuration_from_algorithm_index( + neighbor[2] + ) self.logger.info("%s %s %s" % (neighbor[0], neighbor[1], configuration)) except (KeyError): self.logger.warning("Configuration %s not found" % neighbor[2]) @@ -42,14 +53,16 @@ def metalearning_suggest_all(self, exclude_double_configurations=True): return hp_list def metalearning_suggest(self, history): - """Suggest the next most promising hyperparameters which were not yet evaluated""" + """Suggest the next promosing hyperparameters which were not yet evaluated""" # TODO test the object in the history! neighbors = self._learn() # Iterate over all datasets which are sorted ascending by distance history_with_indices = [] for run in history: - history_with_indices.append(self.meta_base.get_algorithm_index_from_configuration(run)) + history_with_indices.append( + self.meta_base.get_algorithm_index_from_configuration(run) + ) for idx, neighbor in enumerate(neighbors): already_evaluated = False @@ -62,16 +75,18 @@ def metalearning_suggest(self, history): break if not already_evaluated: - self.logger.info("Nearest dataset with hyperparameters of best value " - "not evaluated yet is %s with a distance of %f" % - (neighbor[0], neighbor[1])) + self.logger.info( + "Nearest dataset with hyperparameters of best value " + "not evaluated yet is %s with a distance of %f" + % (neighbor[0], neighbor[1]) + ) return self.meta_base.get_configuration_from_algorithm_index( - neighbor[2]) + neighbor[2] + ) raise StopIteration("No more values available.") def _learn(self, exclude_double_configurations=True): - dataset_metafeatures, all_other_metafeatures = \ - self._split_metafeature_array() + dataset_metafeatures, all_other_metafeatures = self._split_metafeature_array() # Remove metafeatures which could not be calculated for the target # dataset @@ -85,7 +100,8 @@ def _learn(self, exclude_double_configurations=True): # Do mean imputation of all other metafeatures all_other_metafeatures = all_other_metafeatures.fillna( - all_other_metafeatures.mean()) + all_other_metafeatures.mean() + ) if self.kND is None: # In case that we learn our distance function, get_value the parameters for @@ -98,10 +114,12 @@ def _learn(self, exclude_double_configurations=True): # To keep the distance the same in every iteration, we create a new # random state random_state = sklearn.utils.check_random_state(self.seed) - kND = KNearestDatasets(metric=self.distance, - random_state=random_state, - logger=self.logger, - metric_params=rf_params) + kND = KNearestDatasets( + metric=self.distance, + random_state=random_state, + logger=self.logger, + metric_params=rf_params, + ) runs = dict() # TODO move this code to the metabase @@ -121,13 +139,15 @@ def _learn(self, exclude_double_configurations=True): dataset_metafeatures, k=-1, exclude_double_configurations=exclude_double_configurations, - ) + ) def _split_metafeature_array(self): dataset_metafeatures = self.meta_base.get_metafeatures( - self.dataset_name, self.use_features) + self.dataset_name, self.use_features + ) all_other_datasets = self.meta_base.get_all_dataset_names() all_other_datasets.remove(self.dataset_name) all_other_metafeatures = self.meta_base.get_metafeatures( - 
all_other_datasets, self.use_features) + all_other_datasets, self.use_features + ) return dataset_metafeatures, all_other_metafeatures diff --git a/autosklearn/metalearning/optimizers/optimizer_base.py b/autosklearn/metalearning/optimizers/optimizer_base.py index dd336fa4a8..e437f1bb64 100644 --- a/autosklearn/metalearning/optimizers/optimizer_base.py +++ b/autosklearn/metalearning/optimizers/optimizer_base.py @@ -1,7 +1,7 @@ +import subprocess from collections import OrderedDict -from itertools import product from io import StringIO -import subprocess +from itertools import product def _parse_categorical(line): @@ -26,7 +26,7 @@ def _parse_categorical(line): first_bracket = line.find("{") second_bracket = line.find("}") - domain_values = line[first_bracket + 1:second_bracket] + domain_values = line[first_bracket + 1 : second_bracket] cat_values = domain_values.split(",") if len(cat_values) < 1: raise ValueError("Expected at least one value in %s" % line) diff --git a/autosklearn/metrics/__init__.py b/autosklearn/metrics/__init__.py index cb6920979f..3234329658 100644 --- a/autosklearn/metrics/__init__.py +++ b/autosklearn/metrics/__init__.py @@ -1,18 +1,22 @@ from abc import ABCMeta, abstractmethod +from typing import Any, Callable, Dict, List, Optional, Union, cast + from functools import partial from itertools import product -from typing import Any, Callable, Dict, List, Optional, Union, cast import numpy as np - import sklearn.metrics from sklearn.utils.multiclass import type_of_target - from smac.utils.constants import MAXINT from autosklearn.constants import ( - BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION, MULTILABEL_CLASSIFICATION, - MULTIOUTPUT_REGRESSION, REGRESSION, REGRESSION_TASKS, TASK_TYPES, + BINARY_CLASSIFICATION, + MULTICLASS_CLASSIFICATION, + MULTILABEL_CLASSIFICATION, + MULTIOUTPUT_REGRESSION, + REGRESSION, + REGRESSION_TASKS, + TASK_TYPES, ) from .util import sanitize_array @@ -26,7 +30,7 @@ def __init__( optimum: float, worst_possible_result: float, sign: float, - kwargs: Any + kwargs: Any, ) -> None: self.name = name self._kwargs = kwargs @@ -40,7 +44,7 @@ def __call__( self, y_true: np.ndarray, y_pred: np.ndarray, - sample_weight: Optional[List[float]] = None + sample_weight: Optional[List[float]] = None, ) -> float: pass @@ -53,7 +57,7 @@ def __call__( self, y_true: np.ndarray, y_pred: np.ndarray, - sample_weight: Optional[List[float]] = None + sample_weight: Optional[List[float]] = None, ) -> float: """Evaluate predicted target values for X relative to y_true. @@ -74,34 +78,37 @@ def __call__( Score function applied to prediction of estimator on X. 
""" type_true = type_of_target(y_true) - if type_true == 'binary' and type_of_target(y_pred) == 'continuous' and \ - len(y_pred.shape) == 1: + if ( + type_true == "binary" + and type_of_target(y_pred) == "continuous" + and len(y_pred.shape) == 1 + ): # For a pred scorer, no threshold, nor probability is required # If y_true is binary, and y_pred is continuous # it means that a rounding is necessary to obtain the binary class y_pred = np.around(y_pred, decimals=0) - elif len(y_pred.shape) == 1 or y_pred.shape[1] == 1 or \ - type_true == 'continuous': + elif ( + len(y_pred.shape) == 1 or y_pred.shape[1] == 1 or type_true == "continuous" + ): # must be regression, all other task types would return at least # two probabilities pass - elif type_true in ['binary', 'multiclass']: + elif type_true in ["binary", "multiclass"]: y_pred = np.argmax(y_pred, axis=1) - elif type_true == 'multilabel-indicator': + elif type_true == "multilabel-indicator": y_pred[y_pred > 0.5] = 1.0 y_pred[y_pred <= 0.5] = 0.0 - elif type_true == 'continuous-multioutput': + elif type_true == "continuous-multioutput": pass else: raise ValueError(type_true) if sample_weight is not None: - return self._sign * self._score_func(y_true, y_pred, - sample_weight=sample_weight, - **self._kwargs) + return self._sign * self._score_func( + y_true, y_pred, sample_weight=sample_weight, **self._kwargs + ) else: - return self._sign * self._score_func(y_true, y_pred, - **self._kwargs) + return self._sign * self._score_func(y_true, y_pred, **self._kwargs) class _ProbaScorer(Scorer): @@ -109,7 +116,7 @@ def __call__( self, y_true: np.ndarray, y_pred: np.ndarray, - sample_weight: Optional[List[float]] = None + sample_weight: Optional[List[float]] = None, ) -> float: """Evaluate predicted probabilities for X relative to y_true. Parameters @@ -136,21 +143,24 @@ def __call__( if n_labels_pred != n_labels_test: labels = list(range(n_labels_pred)) if sample_weight is not None: - return self._sign * self._score_func(y_true, y_pred, - sample_weight=sample_weight, - labels=labels, - **self._kwargs) + return self._sign * self._score_func( + y_true, + y_pred, + sample_weight=sample_weight, + labels=labels, + **self._kwargs, + ) else: - return self._sign * self._score_func(y_true, y_pred, - labels=labels, **self._kwargs) + return self._sign * self._score_func( + y_true, y_pred, labels=labels, **self._kwargs + ) if sample_weight is not None: - return self._sign * self._score_func(y_true, y_pred, - sample_weight=sample_weight, - **self._kwargs) + return self._sign * self._score_func( + y_true, y_pred, sample_weight=sample_weight, **self._kwargs + ) else: - return self._sign * self._score_func(y_true, y_pred, - **self._kwargs) + return self._sign * self._score_func(y_true, y_pred, **self._kwargs) class _ThresholdScorer(Scorer): @@ -158,7 +168,7 @@ def __call__( self, y_true: np.ndarray, y_pred: np.ndarray, - sample_weight: Optional[List[float]] = None + sample_weight: Optional[List[float]] = None, ) -> float: """Evaluate decision function output for X relative to y_true. 
Parameters @@ -189,9 +199,9 @@ def __call__( y_pred = np.vstack([p[:, -1] for p in y_pred]).T if sample_weight is not None: - return self._sign * self._score_func(y_true, y_pred, - sample_weight=sample_weight, - **self._kwargs) + return self._sign * self._score_func( + y_true, y_pred, sample_weight=sample_weight, **self._kwargs + ) else: return self._sign * self._score_func(y_true, y_pred, **self._kwargs) @@ -204,7 +214,7 @@ def make_scorer( greater_is_better: bool = True, needs_proba: bool = False, needs_threshold: bool = False, - **kwargs: Any + **kwargs: Any, ) -> Scorer: """Make a scorer from a performance metric or loss function. @@ -244,96 +254,114 @@ def make_scorer( """ sign = 1 if greater_is_better else -1 if needs_proba: - return _ProbaScorer(name, score_func, optimum, worst_possible_result, sign, kwargs) + return _ProbaScorer( + name, score_func, optimum, worst_possible_result, sign, kwargs + ) elif needs_threshold: - return _ThresholdScorer(name, score_func, optimum, worst_possible_result, sign, kwargs) + return _ThresholdScorer( + name, score_func, optimum, worst_possible_result, sign, kwargs + ) else: - return _PredictScorer(name, score_func, optimum, worst_possible_result, sign, kwargs) + return _PredictScorer( + name, score_func, optimum, worst_possible_result, sign, kwargs + ) # Standard regression scores -mean_absolute_error = make_scorer('mean_absolute_error', - sklearn.metrics.mean_absolute_error, - optimum=0, - worst_possible_result=MAXINT, - greater_is_better=False) -mean_squared_error = make_scorer('mean_squared_error', - sklearn.metrics.mean_squared_error, - optimum=0, - worst_possible_result=MAXINT, - greater_is_better=False, - squared=True) -root_mean_squared_error = make_scorer('root_mean_squared_error', - sklearn.metrics.mean_squared_error, - optimum=0, - worst_possible_result=MAXINT, - greater_is_better=False, - squared=False) -mean_squared_log_error = make_scorer('mean_squared_log_error', - sklearn.metrics.mean_squared_log_error, - optimum=0, - worst_possible_result=MAXINT, - greater_is_better=False,) -median_absolute_error = make_scorer('median_absolute_error', - sklearn.metrics.median_absolute_error, - optimum=0, - worst_possible_result=MAXINT, - greater_is_better=False) - -r2 = make_scorer('r2', sklearn.metrics.r2_score) +mean_absolute_error = make_scorer( + "mean_absolute_error", + sklearn.metrics.mean_absolute_error, + optimum=0, + worst_possible_result=MAXINT, + greater_is_better=False, +) +mean_squared_error = make_scorer( + "mean_squared_error", + sklearn.metrics.mean_squared_error, + optimum=0, + worst_possible_result=MAXINT, + greater_is_better=False, + squared=True, +) +root_mean_squared_error = make_scorer( + "root_mean_squared_error", + sklearn.metrics.mean_squared_error, + optimum=0, + worst_possible_result=MAXINT, + greater_is_better=False, + squared=False, +) +mean_squared_log_error = make_scorer( + "mean_squared_log_error", + sklearn.metrics.mean_squared_log_error, + optimum=0, + worst_possible_result=MAXINT, + greater_is_better=False, +) +median_absolute_error = make_scorer( + "median_absolute_error", + sklearn.metrics.median_absolute_error, + optimum=0, + worst_possible_result=MAXINT, + greater_is_better=False, +) + +r2 = make_scorer("r2", sklearn.metrics.r2_score) # Standard Classification Scores -accuracy = make_scorer('accuracy', - sklearn.metrics.accuracy_score) -balanced_accuracy = make_scorer('balanced_accuracy', - sklearn.metrics.balanced_accuracy_score) +accuracy = make_scorer("accuracy", sklearn.metrics.accuracy_score) 
+balanced_accuracy = make_scorer( + "balanced_accuracy", sklearn.metrics.balanced_accuracy_score +) # Score functions that need decision values -roc_auc = make_scorer('roc_auc', - sklearn.metrics.roc_auc_score, - greater_is_better=True, - needs_threshold=True) -average_precision = make_scorer('average_precision', - sklearn.metrics.average_precision_score, - needs_threshold=True) +roc_auc = make_scorer( + "roc_auc", + sklearn.metrics.roc_auc_score, + greater_is_better=True, + needs_threshold=True, +) +average_precision = make_scorer( + "average_precision", sklearn.metrics.average_precision_score, needs_threshold=True +) # NOTE: zero_division # # Specified as the explicit default, see sklearn docs: # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html#sklearn-metrics-precision-score precision = make_scorer( - 'precision', partial(sklearn.metrics.precision_score, zero_division=0) -) -recall = make_scorer( - 'recall', partial(sklearn.metrics.recall_score, zero_division=0) -) -f1 = make_scorer( - 'f1', partial(sklearn.metrics.f1_score, zero_division=0) + "precision", partial(sklearn.metrics.precision_score, zero_division=0) ) +recall = make_scorer("recall", partial(sklearn.metrics.recall_score, zero_division=0)) +f1 = make_scorer("f1", partial(sklearn.metrics.f1_score, zero_division=0)) # Score function for probabilistic classification -log_loss = make_scorer('log_loss', - sklearn.metrics.log_loss, - optimum=0, - worst_possible_result=MAXINT, - greater_is_better=False, - needs_proba=True) +log_loss = make_scorer( + "log_loss", + sklearn.metrics.log_loss, + optimum=0, + worst_possible_result=MAXINT, + greater_is_better=False, + needs_proba=True, +) # TODO what about mathews correlation coefficient etc? REGRESSION_METRICS = { scorer.name: scorer for scorer in [ - mean_absolute_error, mean_squared_error, root_mean_squared_error, - mean_squared_log_error, median_absolute_error, r2 + mean_absolute_error, + mean_squared_error, + root_mean_squared_error, + mean_squared_log_error, + median_absolute_error, + r2, ] } CLASSIFICATION_METRICS = { scorer.name: scorer - for scorer in [ - accuracy, balanced_accuracy, roc_auc, average_precision, log_loss - ] + for scorer in [accuracy, balanced_accuracy, roc_auc, average_precision, log_loss] } # NOTE: zero_division @@ -342,13 +370,13 @@ def make_scorer( # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html#sklearn-metrics-precision-score for (base_name, sklearn_metric), average in product( [ - ('precision', sklearn.metrics.precision_score), - ('recall', sklearn.metrics.recall_score), - ('f1', sklearn.metrics.f1_score), + ("precision", sklearn.metrics.precision_score), + ("recall", sklearn.metrics.recall_score), + ("f1", sklearn.metrics.f1_score), ], - ['macro', 'micro', 'samples', 'weighted'] + ["macro", "micro", "samples", "weighted"], ): - name = f'{base_name}_{average}' + name = f"{base_name}_{average}" scorer = make_scorer( name, partial(sklearn_metric, pos_label=None, average=average, zero_division=0) ) @@ -361,7 +389,7 @@ def calculate_score( prediction: np.ndarray, task_type: int, metric: Scorer, - scoring_functions: Optional[List[Scorer]] = None + scoring_functions: Optional[List[Scorer]] = None, ) -> Union[float, Dict[str, float]]: """ Returns a score (a magnitude that allows casting the @@ -396,11 +424,15 @@ def calculate_score( try: score_dict[metric_.name] = _compute_scorer( - metric_, prediction, solution, task_type) + metric_, prediction, solution, task_type + ) except ValueError 
as e: print(e, e.args[0]) - if e.args[0] == "Mean Squared Logarithmic Error cannot be used when " \ - "targets contain negative values.": + if ( + e.args[0] + == "Mean Squared Logarithmic Error cannot be used when " + "targets contain negative values." + ): continue else: raise e @@ -413,16 +445,21 @@ def calculate_score( try: score_dict[metric_.name] = _compute_scorer( - metric_, prediction, solution, task_type) + metric_, prediction, solution, task_type + ) except ValueError as e: - if e.args[0] == 'multiclass format is not supported': + if e.args[0] == "multiclass format is not supported": continue - elif e.args[0] == "Samplewise metrics are not available "\ - "outside of multilabel classification.": + elif ( + e.args[0] == "Samplewise metrics are not available " + "outside of multilabel classification." + ): continue - elif e.args[0] == "Target is multiclass but "\ - "average='binary'. Please choose another average "\ - "setting, one of [None, 'micro', 'macro', 'weighted'].": + elif ( + e.args[0] == "Target is multiclass but " + "average='binary'. Please choose another average " + "setting, one of [None, 'micro', 'macro', 'weighted']." + ): continue else: raise e @@ -438,7 +475,7 @@ def calculate_loss( prediction: np.ndarray, task_type: int, metric: Scorer, - scoring_functions: Optional[List[Scorer]] = None + scoring_functions: Optional[List[Scorer]] = None, ) -> Union[float, Dict[str, float]]: """ Returns a loss (a magnitude that allows casting the @@ -493,10 +530,7 @@ def calculate_loss( def calculate_metric( - metric: Scorer, - prediction: np.ndarray, - solution: np.ndarray, - task_type: int + metric: Scorer, prediction: np.ndarray, solution: np.ndarray, task_type: int ) -> float: """ Returns a metric for the given Auto-Sklearn Scorer object. 
@@ -529,10 +563,7 @@ def calculate_metric( def _compute_scorer( - metric: Scorer, - prediction: np.ndarray, - solution: np.ndarray, - task_type: int + metric: Scorer, prediction: np.ndarray, solution: np.ndarray, task_type: int ) -> float: """ Returns a score (a magnitude that allows casting the @@ -566,9 +597,9 @@ def _compute_scorer( # Must be at bottom so all metrics are defined default_metric_for_task: Dict[int, Scorer] = { - BINARY_CLASSIFICATION: CLASSIFICATION_METRICS['accuracy'], - MULTICLASS_CLASSIFICATION: CLASSIFICATION_METRICS['accuracy'], - MULTILABEL_CLASSIFICATION: CLASSIFICATION_METRICS['f1_macro'], - REGRESSION: REGRESSION_METRICS['r2'], - MULTIOUTPUT_REGRESSION: REGRESSION_METRICS['r2'], + BINARY_CLASSIFICATION: CLASSIFICATION_METRICS["accuracy"], + MULTICLASS_CLASSIFICATION: CLASSIFICATION_METRICS["accuracy"], + MULTILABEL_CLASSIFICATION: CLASSIFICATION_METRICS["f1_macro"], + REGRESSION: REGRESSION_METRICS["r2"], + MULTIOUTPUT_REGRESSION: REGRESSION_METRICS["r2"], } diff --git a/autosklearn/metrics/util.py b/autosklearn/metrics/util.py index b4537d13aa..c25a25b4dc 100644 --- a/autosklearn/metrics/util.py +++ b/autosklearn/metrics/util.py @@ -11,8 +11,8 @@ def sanitize_array(array: np.ndarray) -> np.ndarray: a = np.ravel(array) maxi = np.nanmax(a[np.isfinite(a)]) mini = np.nanmin(a[np.isfinite(a)]) - array[array == float('inf')] = maxi - array[array == float('-inf')] = mini + array[array == float("inf")] = maxi + array[array == float("-inf")] = mini mid = (maxi + mini) / 2 array[np.isnan(array)] = mid return array diff --git a/autosklearn/pipeline/base.py b/autosklearn/pipeline/base.py index 467e7b26c1..93c73b4716 100644 --- a/autosklearn/pipeline/base.py +++ b/autosklearn/pipeline/base.py @@ -1,17 +1,15 @@ from abc import ABCMeta from typing import Dict, Union -from ConfigSpace import Configuration - import numpy as np - import scipy.sparse - +from ConfigSpace import Configuration from sklearn.pipeline import Pipeline -from .components.base import AutoSklearnChoice, AutoSklearnComponent import autosklearn.pipeline.create_searchspace_util +from .components.base import AutoSklearnChoice, AutoSklearnComponent + DATASET_PROPERTIES_TYPE = Dict[str, Union[str, int, bool]] PIPELINE_DATA_DTYPE = Union[ np.ndarray, @@ -31,17 +29,26 @@ class BasePipeline(Pipeline): Notes ----- This class should not be instantiated, only subclassed.""" + __metaclass__ = ABCMeta - def __init__(self, config=None, steps=None, dataset_properties=None, - include=None, exclude=None, random_state=None, - init_params=None): + def __init__( + self, + config=None, + steps=None, + dataset_properties=None, + include=None, + exclude=None, + random_state=None, + init_params=None, + ): self.init_params = init_params if init_params is not None else {} self.include = include if include is not None else {} self.exclude = exclude if exclude is not None else {} - self.dataset_properties = dataset_properties if \ - dataset_properties is not None else {} + self.dataset_properties = ( + dataset_properties if dataset_properties is not None else {} + ) self.random_state = random_state if steps is None: @@ -62,13 +69,17 @@ def __init__(self, config=None, steps=None, dataset_properties=None, print(self.config_space._children) print(config.configuration_space._children) import difflib + diff = difflib.unified_diff( str(self.config_space).splitlines(), - str(config.configuration_space).splitlines()) - diff = '\n'.join(diff) - raise ValueError('Configuration passed does not come from the ' - 'same configuration space. 
Differences are: ' - '%s' % diff) + str(config.configuration_space).splitlines(), + ) + diff = "\n".join(diff) + raise ValueError( + "Configuration passed does not come from the " + "same configuration space. Differences are: " + "%s" % diff + ) self.config = config self.set_hyperparameters(self.config, init_params=init_params) @@ -111,21 +122,22 @@ def fit_transformer(self, X, y, fit_params=None): self.num_targets = 1 if len(y.shape) == 1 else y.shape[1] if fit_params is None: fit_params = {} - fit_params = {key.replace(":", "__"): value for key, value in - fit_params.items()} + fit_params = { + key.replace(":", "__"): value for key, value in fit_params.items() + } fit_params_steps = self._check_fit_params(**fit_params) Xt = self._fit(X, y, **fit_params_steps) return Xt, fit_params_steps[self.steps[-1][0]] def fit_estimator(self, X, y, **fit_params): - fit_params = {key.replace(":", "__"): value for key, value in - fit_params.items()} + fit_params = { + key.replace(":", "__"): value for key, value in fit_params.items() + } self._final_estimator.fit(X, y, **fit_params) return self def iterative_fit(self, X, y, n_iter=1, **fit_params): - self._final_estimator.iterative_fit(X, y, n_iter=n_iter, - **fit_params) + self._final_estimator.iterative_fit(X, y, n_iter=n_iter, **fit_params) def estimator_supports_iterative_fit(self): return self._final_estimator.estimator_supports_iterative_fit() @@ -163,26 +175,30 @@ def predict(self, X, batch_size=None): return super().predict(X).astype(self._output_dtype) else: if not isinstance(batch_size, int): - raise ValueError("Argument 'batch_size' must be of type int, " - "but is '%s'" % type(batch_size)) + raise ValueError( + "Argument 'batch_size' must be of type int, " + "but is '%s'" % type(batch_size) + ) if batch_size <= 0: - raise ValueError("Argument 'batch_size' must be positive, " - "but is %d" % batch_size) + raise ValueError( + "Argument 'batch_size' must be positive, " "but is %d" % batch_size + ) else: if self.num_targets == 1: y = np.zeros((X.shape[0],), dtype=self._output_dtype) else: - y = np.zeros((X.shape[0], self.num_targets), - dtype=self._output_dtype) + y = np.zeros( + (X.shape[0], self.num_targets), dtype=self._output_dtype + ) # Copied and adapted from the scikit-learn GP code - for k in range(max(1, int(np.ceil(float(X.shape[0]) / - batch_size)))): + for k in range(max(1, int(np.ceil(float(X.shape[0]) / batch_size)))): batch_from = k * batch_size batch_to = min([(k + 1) * batch_size, X.shape[0]]) - y[batch_from:batch_to] = \ - self.predict(X[batch_from:batch_to], batch_size=None) + y[batch_from:batch_to] = self.predict( + X[batch_from:batch_to], batch_size=None + ) return y @@ -197,29 +213,33 @@ def set_hyperparameters(self, configuration, init_params=None): ) sub_config_dict = {} for param in configuration: - if param.startswith('%s:' % node_name): + if param.startswith("%s:" % node_name): value = configuration[param] - new_name = param.replace('%s:' % node_name, '', 1) + new_name = param.replace("%s:" % node_name, "", 1) sub_config_dict[new_name] = value - sub_configuration = Configuration(sub_configuration_space, - values=sub_config_dict) + sub_configuration = Configuration( + sub_configuration_space, values=sub_config_dict + ) if init_params is not None: sub_init_params_dict = {} for param in init_params: - if param.startswith('%s:' % node_name): + if param.startswith("%s:" % node_name): value = init_params[param] - new_name = param.replace('%s:' % node_name, '', 1) + new_name = param.replace("%s:" % node_name, "", 1) 
sub_init_params_dict[new_name] = value else: sub_init_params_dict = None - if isinstance(node, (AutoSklearnChoice, AutoSklearnComponent, BasePipeline)): - node.set_hyperparameters(configuration=sub_configuration, - init_params=sub_init_params_dict) + if isinstance( + node, (AutoSklearnChoice, AutoSklearnComponent, BasePipeline) + ): + node.set_hyperparameters( + configuration=sub_configuration, init_params=sub_init_params_dict + ) else: - raise NotImplementedError('Not supported yet!') + raise NotImplementedError("Not supported yet!") # In-code check to make sure init params # is checked after pipeline creation @@ -236,14 +256,17 @@ def get_hyperparameter_search_space(self, dataset_properties=None): The configuration space describing the AutoSklearnClassifier. """ - if not hasattr(self, 'config_space') or self.config_space is None: + if not hasattr(self, "config_space") or self.config_space is None: self.config_space = self._get_hyperparameter_search_space( - include=self.include, exclude=self.exclude, - dataset_properties=self.dataset_properties) + include=self.include, + exclude=self.exclude, + dataset_properties=self.dataset_properties, + ) return self.config_space - def _get_hyperparameter_search_space(self, include=None, exclude=None, - dataset_properties=None): + def _get_hyperparameter_search_space( + self, include=None, exclude=None, dataset_properties=None + ): """Return the configuration space for the CASH problem. This method should be called by the method @@ -283,8 +306,9 @@ def _get_hyperparameter_search_space(self, include=None, exclude=None, """ raise NotImplementedError() - def _get_base_search_space(self, cs, dataset_properties, exclude, - include, pipeline): + def _get_base_search_space( + self, cs, dataset_properties, exclude, include, pipeline + ): if include is None: if self.include is None: include = {} @@ -294,8 +318,9 @@ def _get_base_search_space(self, cs, dataset_properties, exclude, keys = [pair[0] for pair in pipeline] for key in include: if key not in keys: - raise ValueError('Invalid key in include: %s; should be one ' - 'of %s' % (key, keys)) + raise ValueError( + "Invalid key in include: %s; should be one " "of %s" % (key, keys) + ) if exclude is None: if self.exclude is None: @@ -306,26 +331,32 @@ def _get_base_search_space(self, cs, dataset_properties, exclude, keys = [pair[0] for pair in pipeline] for key in exclude: if key not in keys: - raise ValueError('Invalid key in exclude: %s; should be one ' - 'of %s' % (key, keys)) + raise ValueError( + "Invalid key in exclude: %s; should be one " "of %s" % (key, keys) + ) - if 'sparse' not in dataset_properties: + if "sparse" not in dataset_properties: # This dataset is probably dense - dataset_properties['sparse'] = False - if 'signed' not in dataset_properties: + dataset_properties["sparse"] = False + if "signed" not in dataset_properties: # This dataset probably contains unsigned data - dataset_properties['signed'] = False + dataset_properties["signed"] = False matches = autosklearn.pipeline.create_searchspace_util.get_match_array( - pipeline, dataset_properties, include=include, exclude=exclude) + pipeline, dataset_properties, include=include, exclude=exclude + ) # Now we have only legal combinations at this step of the pipeline # Simple sanity checks assert np.sum(matches) != 0, "No valid pipeline found." 
- assert np.sum(matches) <= np.size(matches), \ - "'matches' is not binary; %s <= %d, %s" % \ - (str(np.sum(matches)), np.size(matches), str(matches.shape)) + assert np.sum(matches) <= np.size( + matches + ), "'matches' is not binary; %s <= %d, %s" % ( + str(np.sum(matches)), + np.size(matches), + str(matches.shape), + ) # Iterate each dimension of the matches array (each step of the # pipeline) to see if we can add a hyperparameter for that step @@ -340,26 +371,36 @@ def _get_base_search_space(self, cs, dataset_properties, exclude, cs.add_configuration_space( node_name, node.get_hyperparameter_search_space(dataset_properties), - ) + ) # If the node is a choice, we have to figure out which of its # choices are actually legal choices else: - choices_list = autosklearn.pipeline.create_searchspace_util.\ - find_active_choices(matches, node, node_idx, - dataset_properties, - include.get(node_name), - exclude.get(node_name)) + choices_list = ( + autosklearn.pipeline.create_searchspace_util.find_active_choices( + matches, + node, + node_idx, + dataset_properties, + include.get(node_name), + exclude.get(node_name), + ) + ) sub_config_space = node.get_hyperparameter_search_space( - dataset_properties, include=choices_list) + dataset_properties, include=choices_list + ) cs.add_configuration_space(node_name, sub_config_space) # And now add forbidden parameter configurations # According to matches if np.sum(matches) < np.size(matches): cs = autosklearn.pipeline.create_searchspace_util.add_forbidden( - conf_space=cs, pipeline=pipeline, matches=matches, - dataset_properties=dataset_properties, include=include, - exclude=exclude) + conf_space=cs, + pipeline=pipeline, + matches=matches, + dataset_properties=dataset_properties, + include=include, + exclude=exclude, + ) return cs @@ -371,36 +412,35 @@ def _check_init_params_honored(self, init_params): # None/empty dict, so no further check required return - # There is the scenario, where instance is passed as an argument to the init_params - # 'instance': '{"task_id": "73543c4a360aa24498c0967fbc2f926b"}'} + # There is the scenario, where instance is passed as an argument to the + # init_params 'instance': '{"task_id": "73543c4a360aa24498c0967fbc2f926b"}'} # coming from smac instance. Remove this key to make the testing stricter - init_params.pop('instance', None) + init_params.pop("instance", None) for key, value in init_params.items(): - if ':' not in key: - raise ValueError("Unsupported argument to init_params {}." - "When using init_params, a hierarchical format like " - "node_name:parameter must be provided.".format(key) - ) - node_name = key.split(':', 1)[0] + if ":" not in key: + raise ValueError( + "Unsupported argument to init_params {}." + "When using init_params, a hierarchical format like " + "node_name:parameter must be provided.".format(key) + ) + node_name = key.split(":", 1)[0] if node_name not in self.named_steps.keys(): - raise ValueError("The current node name specified via key={} of init_params " - "is not valid. Valid node names are {}".format( - key, - self.named_steps.keys() - ) - ) + raise ValueError( + "The current node name specified via key={} of init_params " + "is not valid. 
Valid node names are {}".format( + key, self.named_steps.keys() + ) + ) continue - variable_name = key.split(':')[-1] + variable_name = key.split(":")[-1] node = self.named_steps[node_name] if isinstance(node, BasePipeline): # If dealing with a sub pipe, # Call the child _check_init_params_honored with the updated config node._check_init_params_honored( - { - key.replace('%s:' % node_name, '', 1): value - } + {key.replace("%s:" % node_name, "", 1): value} ) continue @@ -412,8 +452,10 @@ def _check_init_params_honored(self, init_params): raise ValueError("Unsupported node type {}".format(type(node))) if variable_name not in node_dict or node_dict[variable_name] != value: - raise ValueError("Cannot properly set the pair {}->{} via init_params" - "".format(key, value)) + raise ValueError( + "Cannot properly set the pair {}->{} via init_params" + "".format(key, value) + ) def __repr__(self): class_name = self.__class__.__name__ @@ -424,34 +466,42 @@ def __repr__(self): if self.config[hp_name] is not None: configuration[hp_name] = self.config[hp_name] - configuration_string = ''.join( - ['configuration={\n ', - ',\n '.join(["'%s': %s" % (hp_name, repr(configuration[hp_name])) - for hp_name in sorted(configuration)]), - '}']) + configuration_string = "".join( + [ + "configuration={\n ", + ",\n ".join( + [ + "'%s': %s" % (hp_name, repr(configuration[hp_name])) + for hp_name in sorted(configuration) + ] + ), + "}", + ] + ) if len(self.dataset_properties) > 0: dataset_properties_string = [] - dataset_properties_string.append('dataset_properties={') + dataset_properties_string.append("dataset_properties={") for i, item in enumerate(self.dataset_properties.items()): if i != 0: - dataset_properties_string.append(',\n ') + dataset_properties_string.append(",\n ") else: - dataset_properties_string.append('\n ') + dataset_properties_string.append("\n ") if isinstance(item[1], str): - dataset_properties_string.append("'%s': '%s'" % (item[0], - item[1])) + dataset_properties_string.append("'%s': '%s'" % (item[0], item[1])) else: - dataset_properties_string.append("'%s': %s" % (item[0], - item[1])) - dataset_properties_string.append('}') - dataset_properties_string = ''.join(dataset_properties_string) - - rval = '%s(%s,\n%s)' % (class_name, configuration, - dataset_properties_string) + dataset_properties_string.append("'%s': %s" % (item[0], item[1])) + dataset_properties_string.append("}") + dataset_properties_string = "".join(dataset_properties_string) + + rval = "%s(%s,\n%s)" % ( + class_name, + configuration, + dataset_properties_string, + ) else: - rval = '%s(%s)' % (class_name, configuration_string) + rval = "%s(%s)" % (class_name, configuration_string) return rval @@ -473,32 +523,54 @@ def _validate_include_exclude_params(self): if self.include is not None and self.exclude is not None: for key in self.include.keys(): if key in self.exclude.keys(): - raise ValueError("Cannot specify include and exclude for same step '{}'." 
- .format(key)) + raise ValueError( + "Cannot specify include and exclude for same step '{}'.".format( + key + ) + ) - supported_steps = {step[0]: step[1] for step in self.steps - if isinstance(step[1], AutoSklearnChoice)} - for arg in ['include', 'exclude']: + supported_steps = { + step[0]: step[1] + for step in self.steps + if isinstance(step[1], AutoSklearnChoice) + } + for arg in ["include", "exclude"]: argument = getattr(self, arg) if not argument: continue for key in list(argument.keys()): if key not in supported_steps: - raise ValueError("The provided key '{}' in the '{}' argument is not valid. The" - " only supported keys for this task are {}" - .format(key, arg, list(supported_steps.keys()))) + raise ValueError( + "The provided key '{}' in the '{}' argument is not valid. The" + " only supported keys for this task are {}".format( + key, arg, list(supported_steps.keys()) + ) + ) candidate_components = argument[key] - if not (isinstance(candidate_components, list) and candidate_components): - raise ValueError("The provided value of the key '{}' in the '{}' argument is " - "not valid. The value must be a non-empty list." - .format(key, arg)) + if not ( + isinstance(candidate_components, list) and candidate_components + ): + raise ValueError( + "The provided value of the key '{}' in the '{}' argument is " + "not valid. The value must be a non-empty list.".format( + key, arg + ) + ) - available_components = list(supported_steps[key].get_available_components( - dataset_properties=self.dataset_properties).keys()) + available_components = list( + supported_steps[key] + .get_available_components( + dataset_properties=self.dataset_properties + ) + .keys() + ) for component in candidate_components: if component not in available_components: - raise ValueError("The provided component '{}' for the key '{}' in the '{}'" - " argument is not valid. The supported components for the" - " step '{}' for this task are {}" - .format(component, key, arg, key, available_components)) + raise ValueError( + "The provided component '{}' for the key '{}' in the '{}'" + " argument is not valid. 
The supported components for the" + " step '{}' for this task are {}".format( + component, key, arg, key, available_components + ) + ) diff --git a/autosklearn/pipeline/classification.py b/autosklearn/pipeline/classification.py index 44ae129f4d..1686e02809 100644 --- a/autosklearn/pipeline/classification.py +++ b/autosklearn/pipeline/classification.py @@ -1,21 +1,22 @@ +from typing import Optional, Union + import copy from itertools import product -from typing import Optional, Union import numpy as np - +from ConfigSpace.configuration_space import Configuration, ConfigurationSpace +from ConfigSpace.forbidden import ForbiddenAndConjunction, ForbiddenEqualsClause from sklearn.base import ClassifierMixin -from ConfigSpace.configuration_space import ConfigurationSpace, Configuration -from ConfigSpace.forbidden import ForbiddenEqualsClause, ForbiddenAndConjunction - -from autosklearn.pipeline.components.data_preprocessing import DataPreprocessorChoice - -from autosklearn.pipeline.components.classification import ClassifierChoice -from autosklearn.pipeline.components.data_preprocessing.balancing.balancing import \ - Balancing -from autosklearn.pipeline.components.feature_preprocessing import FeaturePreprocessorChoice from autosklearn.pipeline.base import BasePipeline +from autosklearn.pipeline.components.classification import ClassifierChoice +from autosklearn.pipeline.components.data_preprocessing import DataPreprocessorChoice +from autosklearn.pipeline.components.data_preprocessing.balancing.balancing import ( + Balancing, +) +from autosklearn.pipeline.components.feature_preprocessing import ( + FeaturePreprocessorChoice, +) from autosklearn.pipeline.constants import SPARSE @@ -75,13 +76,13 @@ def __init__( include=None, exclude=None, random_state: Optional[Union[int, np.random.RandomState]] = None, - init_params=None + init_params=None, ): self._output_dtype = np.int32 if dataset_properties is None: dataset_properties = dict() - if 'target_type' not in dataset_properties: - dataset_properties['target_type'] = 'classification' + if "target_type" not in dataset_properties: + dataset_properties["target_type"] = "classification" super().__init__( config=config, steps=steps, @@ -89,7 +90,7 @@ def __init__( include=include, exclude=exclude, random_state=random_state, - init_params=init_params + init_params=init_params, ) def fit_transformer(self, X, y, fit_params=None): @@ -97,21 +98,24 @@ def fit_transformer(self, X, y, fit_params=None): if fit_params is None: fit_params = {} - if self.config['balancing:strategy'] == 'weighting': - balancing = Balancing(strategy='weighting') + if self.config["balancing:strategy"] == "weighting": + balancing = Balancing(strategy="weighting") _init_params, _fit_params = balancing.get_weights( - y, self.config['classifier:__choice__'], - self.config['feature_preprocessor:__choice__'], - {}, {}) + y, + self.config["classifier:__choice__"], + self.config["feature_preprocessor:__choice__"], + {}, + {}, + ) _init_params.update(self.init_params) - self.set_hyperparameters(configuration=self.config, - init_params=_init_params) + self.set_hyperparameters( + configuration=self.config, init_params=_init_params + ) if _fit_params is not None: fit_params.update(_fit_params) - X, fit_params = super().fit_transformer( - X, y, fit_params=fit_params) + X, fit_params = super().fit_transformer(X, y, fit_params=fit_params) return X, fit_params @@ -136,29 +140,34 @@ def predict_proba(self, X, batch_size=None): else: if not isinstance(batch_size, int): - raise ValueError("Argument 
'batch_size' must be of type int, " - "but is '%s'" % type(batch_size)) + raise ValueError( + "Argument 'batch_size' must be of type int, " + "but is '%s'" % type(batch_size) + ) if batch_size <= 0: - raise ValueError("Argument 'batch_size' must be positive, " - "but is %d" % batch_size) + raise ValueError( + "Argument 'batch_size' must be positive, " "but is %d" % batch_size + ) else: # Probe for the target array dimensions target = self.predict_proba(X[0:2].copy()) - y = np.zeros((X.shape[0], target.shape[1]), - dtype=np.float32) + y = np.zeros((X.shape[0], target.shape[1]), dtype=np.float32) for k in range(max(1, int(np.ceil(float(X.shape[0]) / batch_size)))): batch_from = k * batch_size batch_to = min([(k + 1) * batch_size, X.shape[0]]) - pred_prob = self.predict_proba(X[batch_from:batch_to], batch_size=None) + pred_prob = self.predict_proba( + X[batch_from:batch_to], batch_size=None + ) y[batch_from:batch_to] = pred_prob.astype(np.float32) return y - def _get_hyperparameter_search_space(self, include=None, exclude=None, - dataset_properties=None): + def _get_hyperparameter_search_space( + self, include=None, exclude=None, dataset_properties=None + ): """Create the hyperparameter configuration space. Parameters @@ -174,42 +183,52 @@ def _get_hyperparameter_search_space(self, include=None, exclude=None, if dataset_properties is None or not isinstance(dataset_properties, dict): dataset_properties = dict() - if 'target_type' not in dataset_properties: - dataset_properties['target_type'] = 'classification' - if dataset_properties['target_type'] != 'classification': - dataset_properties['target_type'] = 'classification' + if "target_type" not in dataset_properties: + dataset_properties["target_type"] = "classification" + if dataset_properties["target_type"] != "classification": + dataset_properties["target_type"] = "classification" - if 'sparse' not in dataset_properties: + if "sparse" not in dataset_properties: # This dataset is probably dense - dataset_properties['sparse'] = False + dataset_properties["sparse"] = False cs = self._get_base_search_space( - cs=cs, dataset_properties=dataset_properties, - exclude=exclude, include=include, pipeline=self.steps) + cs=cs, + dataset_properties=dataset_properties, + exclude=exclude, + include=include, + pipeline=self.steps, + ) - classifiers = cs.get_hyperparameter('classifier:__choice__').choices - preprocessors = cs.get_hyperparameter('feature_preprocessor:__choice__').choices + classifiers = cs.get_hyperparameter("classifier:__choice__").choices + preprocessors = cs.get_hyperparameter("feature_preprocessor:__choice__").choices available_classifiers = self._final_estimator.get_available_components( - dataset_properties) + dataset_properties + ) - possible_default_classifier = copy.copy(list( - available_classifiers.keys())) - default = cs.get_hyperparameter('classifier:__choice__').default_value + possible_default_classifier = copy.copy(list(available_classifiers.keys())) + default = cs.get_hyperparameter("classifier:__choice__").default_value del possible_default_classifier[possible_default_classifier.index(default)] # A classifier which can handle sparse data after the densifier is # forbidden for memory issues for key in classifiers: - if SPARSE in available_classifiers[key].get_properties()['input']: - if 'densifier' in preprocessors: + if SPARSE in available_classifiers[key].get_properties()["input"]: + if "densifier" in preprocessors: while True: try: forb_cls = ForbiddenEqualsClause( - cs.get_hyperparameter('classifier:__choice__'), key) - 
forb_fpp = ForbiddenEqualsClause(cs.get_hyperparameter( - 'feature_preprocessor:__choice__'), 'densifier') + cs.get_hyperparameter("classifier:__choice__"), key + ) + forb_fpp = ForbiddenEqualsClause( + cs.get_hyperparameter( + "feature_preprocessor:__choice__" + ), + "densifier", + ) cs.add_forbidden_clause( - ForbiddenAndConjunction(forb_cls, forb_fpp)) + ForbiddenAndConjunction(forb_cls, forb_fpp) + ) # Success break except ValueError: @@ -218,20 +237,29 @@ def _get_hyperparameter_search_space(self, include=None, exclude=None, default = possible_default_classifier.pop() except IndexError: raise ValueError( - "Cannot find a legal default configuration.") + "Cannot find a legal default configuration." + ) cs.get_hyperparameter( - 'classifier:__choice__').default_value = default + "classifier:__choice__" + ).default_value = default # which would take too long # Combinations of non-linear models with feature learning: classifiers_ = [ - "adaboost", "decision_tree", "extra_trees", - "gradient_boosting", "k_nearest_neighbors", - "libsvm_svc", "mlp", "random_forest", + "adaboost", + "decision_tree", + "extra_trees", + "gradient_boosting", + "k_nearest_neighbors", + "libsvm_svc", + "mlp", + "random_forest", "gaussian_nb", ] feature_learning = [ - "kernel_pca", "kitchen_sinks", "nystroem_sampler", + "kernel_pca", + "kitchen_sinks", + "nystroem_sampler", ] for c, f in product(classifiers_, feature_learning): @@ -241,11 +269,19 @@ def _get_hyperparameter_search_space(self, include=None, exclude=None, continue while True: try: - cs.add_forbidden_clause(ForbiddenAndConjunction( - ForbiddenEqualsClause(cs.get_hyperparameter( - "classifier:__choice__"), c), - ForbiddenEqualsClause(cs.get_hyperparameter( - "feature_preprocessor:__choice__"), f))) + cs.add_forbidden_clause( + ForbiddenAndConjunction( + ForbiddenEqualsClause( + cs.get_hyperparameter("classifier:__choice__"), c + ), + ForbiddenEqualsClause( + cs.get_hyperparameter( + "feature_preprocessor:__choice__" + ), + f, + ), + ) + ) break except KeyError: break @@ -254,16 +290,22 @@ def _get_hyperparameter_search_space(self, include=None, exclude=None, try: default = possible_default_classifier.pop() except IndexError: - raise ValueError( - "Cannot find a legal default configuration.") + raise ValueError("Cannot find a legal default configuration.") cs.get_hyperparameter( - 'classifier:__choice__').default_value = default + "classifier:__choice__" + ).default_value = default # Won't work # Multinomial NB etc don't use with features learning, pca etc classifiers_ = ["multinomial_nb"] - preproc_with_negative_X = ["kitchen_sinks", "pca", "truncatedSVD", - "fast_ica", "kernel_pca", "nystroem_sampler"] + preproc_with_negative_X = [ + "kitchen_sinks", + "pca", + "truncatedSVD", + "fast_ica", + "kernel_pca", + "nystroem_sampler", + ] for c, f in product(classifiers_, preproc_with_negative_X): if c not in classifiers: @@ -272,11 +314,19 @@ def _get_hyperparameter_search_space(self, include=None, exclude=None, continue while True: try: - cs.add_forbidden_clause(ForbiddenAndConjunction( - ForbiddenEqualsClause(cs.get_hyperparameter( - "feature_preprocessor:__choice__"), f), - ForbiddenEqualsClause(cs.get_hyperparameter( - "classifier:__choice__"), c))) + cs.add_forbidden_clause( + ForbiddenAndConjunction( + ForbiddenEqualsClause( + cs.get_hyperparameter( + "feature_preprocessor:__choice__" + ), + f, + ), + ForbiddenEqualsClause( + cs.get_hyperparameter("classifier:__choice__"), c + ), + ) + ) break except KeyError: break @@ -285,10 +335,10 @@ def 
_get_hyperparameter_search_space(self, include=None, exclude=None, try: default = possible_default_classifier.pop() except IndexError: - raise ValueError( - "Cannot find a legal default configuration.") + raise ValueError("Cannot find a legal default configuration.") cs.get_hyperparameter( - 'classifier:__choice__').default_value = default + "classifier:__choice__" + ).default_value = default self.configuration_space = cs self.dataset_properties = dataset_properties @@ -297,30 +347,36 @@ def _get_hyperparameter_search_space(self, include=None, exclude=None, def _get_pipeline_steps(self, dataset_properties): steps = [] - default_dataset_properties = {'target_type': 'classification'} + default_dataset_properties = {"target_type": "classification"} if dataset_properties is not None and isinstance(dataset_properties, dict): default_dataset_properties.update(dataset_properties) - steps.extend([ - [ - "data_preprocessor", DataPreprocessorChoice( - dataset_properties=default_dataset_properties, - random_state=self.random_state) - ], - [ - "balancing", Balancing(random_state=self.random_state) - ], + steps.extend( [ - "feature_preprocessor", FeaturePreprocessorChoice( - dataset_properties=default_dataset_properties, - random_state=self.random_state) - ], - [ - 'classifier', ClassifierChoice( - dataset_properties=default_dataset_properties, - random_state=self.random_state) + [ + "data_preprocessor", + DataPreprocessorChoice( + dataset_properties=default_dataset_properties, + random_state=self.random_state, + ), + ], + ["balancing", Balancing(random_state=self.random_state)], + [ + "feature_preprocessor", + FeaturePreprocessorChoice( + dataset_properties=default_dataset_properties, + random_state=self.random_state, + ), + ], + [ + "classifier", + ClassifierChoice( + dataset_properties=default_dataset_properties, + random_state=self.random_state, + ), + ], ] - ]) + ) return steps diff --git a/autosklearn/pipeline/components/base.py b/autosklearn/pipeline/components/base.py index 5864a2a5d6..c4a95df08c 100644 --- a/autosklearn/pipeline/components/base.py +++ b/autosklearn/pipeline/components/base.py @@ -1,9 +1,10 @@ -from collections import OrderedDict +from typing import Dict + import importlib import inspect import pkgutil import sys -from typing import Dict +from collections import OrderedDict from sklearn.base import BaseEstimator, TransformerMixin @@ -21,8 +22,11 @@ def find_components(package, directory, base_class): module = importlib.import_module(full_module_name) for member_name, obj in inspect.getmembers(module): - if inspect.isclass(obj) and issubclass(obj, base_class) and \ - obj != base_class: + if ( + inspect.isclass(obj) + and issubclass(obj, base_class) + and obj != base_class + ): # TODO test if the obj implements the interface # Keep in mind that this only instantiates the ensemble_wrapper, # but not the real target classifier @@ -42,24 +46,35 @@ def add_component(self, obj): name = obj.__name__ classifier = obj else: - raise TypeError('add_component works only with a subclass of %s' % - str(self.base_class)) + raise TypeError( + "add_component works only with a subclass of %s" % str(self.base_class) + ) properties = set(classifier.get_properties()) - should_be_there = {'shortname', 'name', 'handles_regression', - 'handles_classification', 'handles_multiclass', - 'handles_multilabel', 'handles_multioutput', - 'is_deterministic', 'input', 'output'} + should_be_there = { + "shortname", + "name", + "handles_regression", + "handles_classification", + "handles_multiclass", + 
"handles_multilabel", + "handles_multioutput", + "is_deterministic", + "input", + "output", + } for property in properties: if property not in should_be_there: - raise ValueError('Property %s must not be specified for ' - 'algorithm %s. Only the following properties ' - 'can be specified: %s' % - (property, name, str(should_be_there))) + raise ValueError( + "Property %s must not be specified for " + "algorithm %s. Only the following properties " + "can be specified: %s" % (property, name, str(should_be_there)) + ) for property in should_be_there: if property not in properties: - raise ValueError('Property %s not specified for algorithm %s' % - (property, name)) + raise ValueError( + "Property %s not specified for algorithm %s" % (property, name) + ) self.components[name] = classifier @@ -126,34 +141,35 @@ def set_hyperparameters(self, configuration, init_params=None): for param, value in params.items(): if not hasattr(self, param): - raise ValueError('Cannot set hyperparameter %s for %s because ' - 'the hyperparameter does not exist.' % - (param, str(self))) + raise ValueError( + "Cannot set hyperparameter %s for %s because " + "the hyperparameter does not exist." % (param, str(self)) + ) setattr(self, param, value) if init_params is not None: for param, value in init_params.items(): if not hasattr(self, param): - raise ValueError('Cannot set init param %s for %s because ' - 'the init param does not exist.' % - (param, str(self))) + raise ValueError( + "Cannot set init param %s for %s because " + "the init param does not exist." % (param, str(self)) + ) setattr(self, param, value) return self def __str__(self): - name = self.get_properties()['name'] + name = self.get_properties()["name"] return "autosklearn.pipeline %s" % name class IterativeComponent(AutoSklearnComponent): - def fit(self, X, y, sample_weight=None): self.iterative_fit(X, y, n_iter=2, refit=True) iteration = 2 while not self.configuration_fully_fitted(): - n_iter = int(2 ** iteration / 2) + n_iter = int(2**iteration / 2) self.iterative_fit(X, y, n_iter=n_iter, refit=False) iteration += 1 @@ -168,14 +184,15 @@ def get_current_iter(self): class IterativeComponentWithSampleWeight(AutoSklearnComponent): - def fit(self, X, y, sample_weight=None): self.iterative_fit(X, y, n_iter=2, refit=True, sample_weight=sample_weight) iteration = 2 while not self.configuration_fully_fitted(): - n_iter = int(2 ** iteration / 2) - self.iterative_fit(X, y, n_iter=n_iter, refit=False, sample_weight=sample_weight) + n_iter = int(2**iteration / 2) + self.iterative_fit( + X, y, n_iter=n_iter, refit=False, sample_weight=sample_weight + ) iteration += 1 return self @@ -356,23 +373,25 @@ def __init__(self, dataset_properties, random_state=None): def get_components(cls): raise NotImplementedError() - def get_available_components(self, dataset_properties=None, - include=None, - exclude=None): + def get_available_components( + self, dataset_properties=None, include=None, exclude=None + ): if dataset_properties is None: dataset_properties = {} if include is not None and exclude is not None: raise ValueError( - "The argument include and exclude cannot be used together.") + "The argument include and exclude cannot be used together." 
+ ) available_comp = self.get_components() if include is not None: for incl in include: if incl not in available_comp: - raise ValueError("Trying to include unknown component: " - "%s" % incl) + raise ValueError( + "Trying to include unknown component: " "%s" % incl + ) components_dict = OrderedDict() for name in available_comp: @@ -381,14 +400,14 @@ def get_available_components(self, dataset_properties=None, elif exclude is not None and name in exclude: continue - if 'sparse' in dataset_properties and dataset_properties['sparse']: + if "sparse" in dataset_properties and dataset_properties["sparse"]: # In case the dataset is sparse, ignore # components that do not handle sparse data # Auto-sklearn uses SPARSE constant as a mechanism # to indicate whether a component can handle sparse data. # If SPARSE is not in the input properties of the component, it # means SPARSE is not a valid input to this component, so filter it out - if SPARSE not in available_comp[name].get_properties()['input']: + if SPARSE not in available_comp[name].get_properties()["input"]: continue components_dict[name] = available_comp[name] @@ -399,29 +418,28 @@ def set_hyperparameters(self, configuration, init_params=None): new_params = {} params = configuration.get_dictionary() - choice = params['__choice__'] - del params['__choice__'] + choice = params["__choice__"] + del params["__choice__"] for param, value in params.items(): - param = param.replace(choice, '').replace(':', '') + param = param.replace(choice, "").replace(":", "") new_params[param] = value if init_params is not None: for param, value in init_params.items(): - param = param.replace(choice, '').replace(':', '') + param = param.replace(choice, "").replace(":", "") new_params[param] = value - new_params['random_state'] = self.random_state + new_params["random_state"] = self.random_state self.new_params = new_params self.choice = self.get_components()[choice](**new_params) return self - def get_hyperparameter_search_space(self, dataset_properties=None, - default=None, - include=None, - exclude=None): + def get_hyperparameter_search_space( + self, dataset_properties=None, default=None, include=None, exclude=None + ): raise NotImplementedError() def fit(self, X, y, **kwargs): diff --git a/autosklearn/pipeline/components/classification/__init__.py b/autosklearn/pipeline/components/classification/__init__.py index 2dca6623ae..c95334273a 100644 --- a/autosklearn/pipeline/components/classification/__init__.py +++ b/autosklearn/pipeline/components/classification/__init__.py @@ -1,20 +1,27 @@ -__author__ = 'feurerm' +__author__ = "feurerm" -from collections import OrderedDict from typing import Type + import os +from collections import OrderedDict -from ..base import AutoSklearnClassificationAlgorithm, find_components, \ - ThirdPartyComponents, AutoSklearnChoice, _addons from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import CategoricalHyperparameter +from ..base import ( + AutoSklearnChoice, + AutoSklearnClassificationAlgorithm, + ThirdPartyComponents, + _addons, + find_components, +) + classifier_directory = os.path.split(__file__)[0] -_classifiers = find_components(__package__, - classifier_directory, - AutoSklearnClassificationAlgorithm) +_classifiers = find_components( + __package__, classifier_directory, AutoSklearnClassificationAlgorithm +) additional_components = ThirdPartyComponents(AutoSklearnClassificationAlgorithm) -_addons['classification'] = additional_components +_addons["classification"] = 
additional_components def add_classifier(classifier: Type[AutoSklearnClassificationAlgorithm]) -> None: @@ -22,7 +29,6 @@ def add_classifier(classifier: Type[AutoSklearnClassificationAlgorithm]) -> None class ClassifierChoice(AutoSklearnChoice): - @classmethod def get_components(cls): components = OrderedDict() @@ -30,9 +36,9 @@ def get_components(cls): components.update(additional_components.components) return components - def get_available_components(cls, dataset_properties=None, - include=None, - exclude=None): + def get_available_components( + cls, dataset_properties=None, include=None, exclude=None + ): if dataset_properties is None: dataset_properties = {} @@ -40,13 +46,16 @@ def get_available_components(cls, dataset_properties=None, components_dict = OrderedDict() if include is not None and exclude is not None: - raise ValueError("The argument include and exclude cannot be used together.") + raise ValueError( + "The argument include and exclude cannot be used together." + ) if include is not None: for incl in include: if incl not in available_comp: - raise ValueError("Trying to include unknown component: " - "%s" % incl) + raise ValueError( + "Trying to include unknown component: " "%s" % incl + ) for name in available_comp: if include is not None and name not in include: @@ -60,43 +69,47 @@ def get_available_components(cls, dataset_properties=None, if entry == ClassifierChoice: continue - if entry.get_properties()['handles_classification'] is False: + if entry.get_properties()["handles_classification"] is False: continue - if dataset_properties.get('multiclass') is True and \ - entry.get_properties()['handles_multiclass'] is False: + if ( + dataset_properties.get("multiclass") is True + and entry.get_properties()["handles_multiclass"] is False + ): continue - if dataset_properties.get('multilabel') is True and \ - available_comp[name].get_properties()['handles_multilabel'] is False: + if ( + dataset_properties.get("multilabel") is True + and available_comp[name].get_properties()["handles_multilabel"] is False + ): continue components_dict[name] = entry return components_dict - def get_hyperparameter_search_space(self, dataset_properties=None, - default=None, - include=None, - exclude=None): + def get_hyperparameter_search_space( + self, dataset_properties=None, default=None, include=None, exclude=None + ): if dataset_properties is None: dataset_properties = {} if include is not None and exclude is not None: - raise ValueError("The arguments include and " - "exclude cannot be used together.") + raise ValueError( + "The arguments include and " "exclude cannot be used together." 
+ ) cs = ConfigurationSpace() # Compile a list of all estimator objects for this problem available_estimators = self.get_available_components( - dataset_properties=dataset_properties, - include=include, - exclude=exclude) + dataset_properties=dataset_properties, include=include, exclude=exclude + ) if len(available_estimators) == 0: raise ValueError("No classifiers found") if default is None: - defaults = ['random_forest', 'liblinear_svc', 'sgd', - 'libsvm_svc'] + list(available_estimators.keys()) + defaults = ["random_forest", "liblinear_svc", "sgd", "libsvm_svc"] + list( + available_estimators.keys() + ) for default_ in defaults: if default_ in available_estimators: if include is not None and default_ not in include: @@ -106,18 +119,20 @@ def get_hyperparameter_search_space(self, dataset_properties=None, default = default_ break - estimator = CategoricalHyperparameter('__choice__', - list(available_estimators.keys()), - default_value=default) + estimator = CategoricalHyperparameter( + "__choice__", list(available_estimators.keys()), default_value=default + ) cs.add_hyperparameter(estimator) for estimator_name in available_estimators.keys(): - estimator_configuration_space = available_estimators[estimator_name].\ - get_hyperparameter_search_space(dataset_properties) - parent_hyperparameter = {'parent': estimator, - 'value': estimator_name} - cs.add_configuration_space(estimator_name, - estimator_configuration_space, - parent_hyperparameter=parent_hyperparameter) + estimator_configuration_space = available_estimators[ + estimator_name + ].get_hyperparameter_search_space(dataset_properties) + parent_hyperparameter = {"parent": estimator, "value": estimator_name} + cs.add_configuration_space( + estimator_name, + estimator_configuration_space, + parent_hyperparameter=parent_hyperparameter, + ) self.configuration_space = cs self.dataset_properties = dataset_properties @@ -127,7 +142,7 @@ def predict_proba(self, X): return self.choice.predict_proba(X) def estimator_supports_iterative_fit(self): - return hasattr(self.choice, 'iterative_fit') + return hasattr(self.choice, "iterative_fit") def get_max_iter(self): if self.estimator_supports_iterative_fit(): diff --git a/autosklearn/pipeline/components/classification/adaboost.py b/autosklearn/pipeline/components/classification/adaboost.py index 31567aaeae..3634f53956 100644 --- a/autosklearn/pipeline/components/classification/adaboost.py +++ b/autosklearn/pipeline/components/classification/adaboost.py @@ -1,15 +1,18 @@ from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import UniformFloatHyperparameter, \ - UniformIntegerHyperparameter, CategoricalHyperparameter +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformFloatHyperparameter, + UniformIntegerHyperparameter, +) from autosklearn.pipeline.components.base import AutoSklearnClassificationAlgorithm -from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA, PREDICTIONS, SPARSE +from autosklearn.pipeline.constants import DENSE, PREDICTIONS, SPARSE, UNSIGNED_DATA class AdaboostClassifier(AutoSklearnClassificationAlgorithm): - - def __init__(self, n_estimators, learning_rate, algorithm, max_depth, - random_state=None): + def __init__( + self, n_estimators, learning_rate, algorithm, max_depth, random_state=None + ): self.n_estimators = n_estimators self.learning_rate = learning_rate self.algorithm = algorithm @@ -31,7 +34,7 @@ def fit(self, X, Y, sample_weight=None): n_estimators=self.n_estimators, 
learning_rate=self.learning_rate, algorithm=self.algorithm, - random_state=self.random_state + random_state=self.random_state, ) estimator.fit(X, Y, sample_weight=sample_weight) @@ -51,29 +54,35 @@ def predict_proba(self, X): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'AB', - 'name': 'AdaBoost Classifier', - 'handles_regression': False, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': False, - 'handles_multioutput': False, - 'is_deterministic': True, - 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,)} + return { + "shortname": "AB", + "name": "AdaBoost Classifier", + "handles_regression": False, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": False, + "handles_multioutput": False, + "is_deterministic": True, + "input": (DENSE, SPARSE, UNSIGNED_DATA), + "output": (PREDICTIONS,), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): cs = ConfigurationSpace() n_estimators = UniformIntegerHyperparameter( - name="n_estimators", lower=50, upper=500, default_value=50, log=False) + name="n_estimators", lower=50, upper=500, default_value=50, log=False + ) learning_rate = UniformFloatHyperparameter( - name="learning_rate", lower=0.01, upper=2, default_value=0.1, log=True) + name="learning_rate", lower=0.01, upper=2, default_value=0.1, log=True + ) algorithm = CategoricalHyperparameter( - name="algorithm", choices=["SAMME.R", "SAMME"], default_value="SAMME.R") + name="algorithm", choices=["SAMME.R", "SAMME"], default_value="SAMME.R" + ) max_depth = UniformIntegerHyperparameter( - name="max_depth", lower=1, upper=10, default_value=1, log=False) + name="max_depth", lower=1, upper=10, default_value=1, log=False + ) cs.add_hyperparameters([n_estimators, learning_rate, algorithm, max_depth]) return cs diff --git a/autosklearn/pipeline/components/classification/bernoulli_nb.py b/autosklearn/pipeline/components/classification/bernoulli_nb.py index 9bb2f8c590..8271c5f602 100644 --- a/autosklearn/pipeline/components/classification/bernoulli_nb.py +++ b/autosklearn/pipeline/components/classification/bernoulli_nb.py @@ -1,13 +1,12 @@ import numpy as np - from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import UniformFloatHyperparameter, \ - CategoricalHyperparameter - -from autosklearn.pipeline.components.base import ( - AutoSklearnClassificationAlgorithm, +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformFloatHyperparameter, ) -from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA, PREDICTIONS, SPARSE + +from autosklearn.pipeline.components.base import AutoSklearnClassificationAlgorithm +from autosklearn.pipeline.constants import DENSE, PREDICTIONS, SPARSE, UNSIGNED_DATA from autosklearn.util.common import check_for_bool @@ -23,14 +22,18 @@ def fit(self, X, y): import sklearn.naive_bayes self.fit_prior = check_for_bool(self.fit_prior) - self.estimator = sklearn.naive_bayes.BernoulliNB(alpha=self.alpha, fit_prior=self.fit_prior) + self.estimator = sklearn.naive_bayes.BernoulliNB( + alpha=self.alpha, fit_prior=self.fit_prior + ) self.classes_ = np.unique(y.astype(int)) # Fallback for multilabel classification if len(y.shape) > 1 and y.shape[1] > 1: import sklearn.multiclass + self.estimator = sklearn.multiclass.OneVsRestClassifier( - self.estimator, n_jobs=1) + self.estimator, n_jobs=1 + ) self.estimator.fit(X, y) return self @@ -47,16 +50,18 @@ def 
predict_proba(self, X): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'BernoulliNB', - 'name': 'Bernoulli Naive Bayes classifier', - 'handles_regression': False, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': False, - 'is_deterministic': True, - 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,)} + return { + "shortname": "BernoulliNB", + "name": "Bernoulli Naive Bayes classifier", + "handles_regression": False, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": False, + "is_deterministic": True, + "input": (DENSE, SPARSE, UNSIGNED_DATA), + "output": (PREDICTIONS,), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): @@ -65,12 +70,13 @@ def get_hyperparameter_search_space(dataset_properties=None): # the smoothing parameter is a non-negative float # I will limit it to 1000 and put it on a logarithmic scale. (SF) # Please adjust that, if you know a proper range, this is just a guess. - alpha = UniformFloatHyperparameter(name="alpha", lower=1e-2, upper=100, - default_value=1, log=True) + alpha = UniformFloatHyperparameter( + name="alpha", lower=1e-2, upper=100, default_value=1, log=True + ) - fit_prior = CategoricalHyperparameter(name="fit_prior", - choices=["True", "False"], - default_value="True") + fit_prior = CategoricalHyperparameter( + name="fit_prior", choices=["True", "False"], default_value="True" + ) cs.add_hyperparameters([alpha, fit_prior]) diff --git a/autosklearn/pipeline/components/classification/decision_tree.py b/autosklearn/pipeline/components/classification/decision_tree.py index 045e5c3e44..fbfc6b7c6a 100644 --- a/autosklearn/pipeline/components/classification/decision_tree.py +++ b/autosklearn/pipeline/components/classification/decision_tree.py @@ -1,22 +1,35 @@ import numpy as np - from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import UniformFloatHyperparameter, \ - UniformIntegerHyperparameter, CategoricalHyperparameter, \ - UnParametrizedHyperparameter, Constant +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + Constant, + UniformFloatHyperparameter, + UniformIntegerHyperparameter, + UnParametrizedHyperparameter, +) -from autosklearn.pipeline.components.base import \ - AutoSklearnClassificationAlgorithm -from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA, PREDICTIONS, SPARSE -from autosklearn.pipeline.implementations.util import convert_multioutput_multiclass_to_multilabel +from autosklearn.pipeline.components.base import AutoSklearnClassificationAlgorithm +from autosklearn.pipeline.constants import DENSE, PREDICTIONS, SPARSE, UNSIGNED_DATA +from autosklearn.pipeline.implementations.util import ( + convert_multioutput_multiclass_to_multilabel, +) from autosklearn.util.common import check_none class DecisionTree(AutoSklearnClassificationAlgorithm): - def __init__(self, criterion, max_features, max_depth_factor, - min_samples_split, min_samples_leaf, min_weight_fraction_leaf, - max_leaf_nodes, min_impurity_decrease, class_weight=None, - random_state=None): + def __init__( + self, + criterion, + max_features, + max_depth_factor, + min_samples_split, + min_samples_leaf, + min_weight_fraction_leaf, + max_leaf_nodes, + min_impurity_decrease, + class_weight=None, + random_state=None, + ): self.criterion = criterion self.max_features = max_features self.max_depth_factor = 
max_depth_factor @@ -40,8 +53,8 @@ def fit(self, X, y, sample_weight=None): num_features = X.shape[1] self.max_depth_factor = int(self.max_depth_factor) max_depth_factor = max( - 1, - int(np.round(self.max_depth_factor * num_features, 0))) + 1, int(np.round(self.max_depth_factor * num_features, 0)) + ) self.min_samples_split = int(self.min_samples_split) self.min_samples_leaf = int(self.min_samples_leaf) if check_none(self.max_leaf_nodes): @@ -60,7 +73,8 @@ def fit(self, X, y, sample_weight=None): min_weight_fraction_leaf=self.min_weight_fraction_leaf, min_impurity_decrease=self.min_impurity_decrease, class_weight=self.class_weight, - random_state=self.random_state) + random_state=self.random_state, + ) self.estimator.fit(X, y, sample_weight=sample_weight) return self @@ -78,37 +92,53 @@ def predict_proba(self, X): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'DT', - 'name': 'Decision Tree Classifier', - 'handles_regression': False, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': False, - 'is_deterministic': True, - 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,)} + return { + "shortname": "DT", + "name": "Decision Tree Classifier", + "handles_regression": False, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": False, + "is_deterministic": True, + "input": (DENSE, SPARSE, UNSIGNED_DATA), + "output": (PREDICTIONS,), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): cs = ConfigurationSpace() criterion = CategoricalHyperparameter( - "criterion", ["gini", "entropy"], default_value="gini") + "criterion", ["gini", "entropy"], default_value="gini" + ) max_depth_factor = UniformFloatHyperparameter( - 'max_depth_factor', 0., 2., default_value=0.5) + "max_depth_factor", 0.0, 2.0, default_value=0.5 + ) min_samples_split = UniformIntegerHyperparameter( - "min_samples_split", 2, 20, default_value=2) + "min_samples_split", 2, 20, default_value=2 + ) min_samples_leaf = UniformIntegerHyperparameter( - "min_samples_leaf", 1, 20, default_value=1) + "min_samples_leaf", 1, 20, default_value=1 + ) min_weight_fraction_leaf = Constant("min_weight_fraction_leaf", 0.0) - max_features = UnParametrizedHyperparameter('max_features', 1.0) + max_features = UnParametrizedHyperparameter("max_features", 1.0) max_leaf_nodes = UnParametrizedHyperparameter("max_leaf_nodes", "None") - min_impurity_decrease = UnParametrizedHyperparameter('min_impurity_decrease', 0.0) + min_impurity_decrease = UnParametrizedHyperparameter( + "min_impurity_decrease", 0.0 + ) - cs.add_hyperparameters([criterion, max_features, max_depth_factor, - min_samples_split, min_samples_leaf, - min_weight_fraction_leaf, max_leaf_nodes, - min_impurity_decrease]) + cs.add_hyperparameters( + [ + criterion, + max_features, + max_depth_factor, + min_samples_split, + min_samples_leaf, + min_weight_fraction_leaf, + max_leaf_nodes, + min_impurity_decrease, + ] + ) return cs diff --git a/autosklearn/pipeline/components/classification/extra_trees.py b/autosklearn/pipeline/components/classification/extra_trees.py index bc3eb5e4d7..5c7ce1879a 100644 --- a/autosklearn/pipeline/components/classification/extra_trees.py +++ b/autosklearn/pipeline/components/classification/extra_trees.py @@ -1,13 +1,19 @@ from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import UniformFloatHyperparameter, \ - 
UniformIntegerHyperparameter, CategoricalHyperparameter, UnParametrizedHyperparameter +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformFloatHyperparameter, + UniformIntegerHyperparameter, + UnParametrizedHyperparameter, +) from autosklearn.pipeline.components.base import ( AutoSklearnClassificationAlgorithm, IterativeComponentWithSampleWeight, ) -from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA, PREDICTIONS, SPARSE -from autosklearn.pipeline.implementations.util import convert_multioutput_multiclass_to_multilabel +from autosklearn.pipeline.constants import DENSE, PREDICTIONS, SPARSE, UNSIGNED_DATA +from autosklearn.pipeline.implementations.util import ( + convert_multioutput_multiclass_to_multilabel, +) from autosklearn.util.common import check_for_bool, check_none @@ -15,12 +21,23 @@ class ExtraTreesClassifier( IterativeComponentWithSampleWeight, AutoSklearnClassificationAlgorithm, ): - - def __init__(self, criterion, min_samples_leaf, - min_samples_split, max_features, bootstrap, max_leaf_nodes, - max_depth, min_weight_fraction_leaf, min_impurity_decrease, - oob_score=False, n_jobs=1, random_state=None, verbose=0, - class_weight=None): + def __init__( + self, + criterion, + min_samples_leaf, + min_samples_split, + max_features, + bootstrap, + max_leaf_nodes, + max_depth, + min_weight_fraction_leaf, + min_impurity_decrease, + oob_score=False, + n_jobs=1, + random_state=None, + verbose=0, + class_weight=None, + ): self.n_estimators = self.get_max_iter() self.criterion = criterion @@ -55,8 +72,9 @@ def iterative_fit(self, X, y, sample_weight=None, n_iter=1, refit=False): if self.estimator is None: max_features = int(X.shape[1] ** float(self.max_features)) if self.criterion not in ("gini", "entropy"): - raise ValueError("'criterion' is not in ('gini', 'entropy'): " - "%s" % self.criterion) + raise ValueError( + "'criterion' is not in ('gini', 'entropy'): " "%s" % self.criterion + ) if check_none(self.max_depth): self.max_depth = None @@ -77,27 +95,30 @@ def iterative_fit(self, X, y, sample_weight=None, n_iter=1, refit=False): self.n_jobs = int(self.n_jobs) self.verbose = int(self.verbose) - self.estimator = ETC(n_estimators=n_iter, - criterion=self.criterion, - max_depth=self.max_depth, - min_samples_split=self.min_samples_split, - min_samples_leaf=self.min_samples_leaf, - bootstrap=self.bootstrap, - max_features=max_features, - max_leaf_nodes=self.max_leaf_nodes, - min_weight_fraction_leaf=self.min_weight_fraction_leaf, - min_impurity_decrease=self.min_impurity_decrease, - oob_score=self.oob_score, - n_jobs=self.n_jobs, - verbose=self.verbose, - random_state=self.random_state, - class_weight=self.class_weight, - warm_start=True) + self.estimator = ETC( + n_estimators=n_iter, + criterion=self.criterion, + max_depth=self.max_depth, + min_samples_split=self.min_samples_split, + min_samples_leaf=self.min_samples_leaf, + bootstrap=self.bootstrap, + max_features=max_features, + max_leaf_nodes=self.max_leaf_nodes, + min_weight_fraction_leaf=self.min_weight_fraction_leaf, + min_impurity_decrease=self.min_impurity_decrease, + oob_score=self.oob_score, + n_jobs=self.n_jobs, + verbose=self.verbose, + random_state=self.random_state, + class_weight=self.class_weight, + warm_start=True, + ) else: self.estimator.n_estimators += n_iter - self.estimator.n_estimators = min(self.estimator.n_estimators, - self.n_estimators) + self.estimator.n_estimators = min( + self.estimator.n_estimators, self.n_estimators + ) self.estimator.fit(X, y, 
sample_weight=sample_weight) return self @@ -121,46 +142,67 @@ def predict_proba(self, X): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'ET', - 'name': 'Extra Trees Classifier', - 'handles_regression': False, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': False, - 'is_deterministic': True, - 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,)} + return { + "shortname": "ET", + "name": "Extra Trees Classifier", + "handles_regression": False, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": False, + "is_deterministic": True, + "input": (DENSE, SPARSE, UNSIGNED_DATA), + "output": (PREDICTIONS,), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): cs = ConfigurationSpace() criterion = CategoricalHyperparameter( - "criterion", ["gini", "entropy"], default_value="gini") - - # The maximum number of features used in the forest is calculated as m^max_features, where - # m is the total number of features, and max_features is the hyperparameter specified below. - # The default is 0.5, which yields sqrt(m) features as max_features in the estimator. This - # corresponds with Geurts' heuristic. + "criterion", ["gini", "entropy"], default_value="gini" + ) + + # The maximum number of features used in the forest is calculated as + # m^max_features, where m is the total number of features, + # and max_features is the hyperparameter specified below. + # The default is 0.5, which yields sqrt(m) features as max_features + # in the estimator. This corresponds with Geurts' heuristic. max_features = UniformFloatHyperparameter( - "max_features", 0., 1., default_value=0.5) + "max_features", 0.0, 1.0, default_value=0.5 + ) max_depth = UnParametrizedHyperparameter(name="max_depth", value="None") min_samples_split = UniformIntegerHyperparameter( - "min_samples_split", 2, 20, default_value=2) + "min_samples_split", 2, 20, default_value=2 + ) min_samples_leaf = UniformIntegerHyperparameter( - "min_samples_leaf", 1, 20, default_value=1) - min_weight_fraction_leaf = UnParametrizedHyperparameter('min_weight_fraction_leaf', 0.) 
+ "min_samples_leaf", 1, 20, default_value=1 + ) + min_weight_fraction_leaf = UnParametrizedHyperparameter( + "min_weight_fraction_leaf", 0.0 + ) max_leaf_nodes = UnParametrizedHyperparameter("max_leaf_nodes", "None") - min_impurity_decrease = UnParametrizedHyperparameter('min_impurity_decrease', 0.0) + min_impurity_decrease = UnParametrizedHyperparameter( + "min_impurity_decrease", 0.0 + ) bootstrap = CategoricalHyperparameter( - "bootstrap", ["True", "False"], default_value="False") - cs.add_hyperparameters([criterion, max_features, - max_depth, min_samples_split, min_samples_leaf, - min_weight_fraction_leaf, max_leaf_nodes, - min_impurity_decrease, bootstrap]) + "bootstrap", ["True", "False"], default_value="False" + ) + cs.add_hyperparameters( + [ + criterion, + max_features, + max_depth, + min_samples_split, + min_samples_leaf, + min_weight_fraction_leaf, + max_leaf_nodes, + min_impurity_decrease, + bootstrap, + ] + ) return cs diff --git a/autosklearn/pipeline/components/classification/gaussian_nb.py b/autosklearn/pipeline/components/classification/gaussian_nb.py index cae1733baf..8e978e9631 100644 --- a/autosklearn/pipeline/components/classification/gaussian_nb.py +++ b/autosklearn/pipeline/components/classification/gaussian_nb.py @@ -1,15 +1,11 @@ import numpy as np - from ConfigSpace.configuration_space import ConfigurationSpace -from autosklearn.pipeline.components.base import ( - AutoSklearnClassificationAlgorithm, -) -from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA, PREDICTIONS +from autosklearn.pipeline.components.base import AutoSklearnClassificationAlgorithm +from autosklearn.pipeline.constants import DENSE, PREDICTIONS, UNSIGNED_DATA class GaussianNB(AutoSklearnClassificationAlgorithm): - def __init__(self, random_state=None, verbose=0): self.random_state = random_state @@ -25,8 +21,10 @@ def fit(self, X, y): # Fallback for multilabel classification if len(y.shape) > 1 and y.shape[1] > 1: import sklearn.multiclass + self.estimator = sklearn.multiclass.OneVsRestClassifier( - self.estimator, n_jobs=1) + self.estimator, n_jobs=1 + ) self.estimator.fit(X, y) return self @@ -43,16 +41,18 @@ def predict_proba(self, X): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'GaussianNB', - 'name': 'Gaussian Naive Bayes classifier', - 'handles_regression': False, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': False, - 'is_deterministic': True, - 'input': (DENSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,)} + return { + "shortname": "GaussianNB", + "name": "Gaussian Naive Bayes classifier", + "handles_regression": False, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": False, + "is_deterministic": True, + "input": (DENSE, UNSIGNED_DATA), + "output": (PREDICTIONS,), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/components/classification/gradient_boosting.py b/autosklearn/pipeline/components/classification/gradient_boosting.py index 0faca0faa2..50b0b284bd 100644 --- a/autosklearn/pipeline/components/classification/gradient_boosting.py +++ b/autosklearn/pipeline/components/classification/gradient_boosting.py @@ -1,26 +1,42 @@ import numpy as np - -from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import UniformFloatHyperparameter, \ - UniformIntegerHyperparameter, 
UnParametrizedHyperparameter, Constant, \ - CategoricalHyperparameter from ConfigSpace.conditions import EqualsCondition, InCondition +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + Constant, + UniformFloatHyperparameter, + UniformIntegerHyperparameter, + UnParametrizedHyperparameter, +) from autosklearn.pipeline.components.base import ( AutoSklearnClassificationAlgorithm, - IterativeComponentWithSampleWeight) -from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA, PREDICTIONS + IterativeComponentWithSampleWeight, +) +from autosklearn.pipeline.constants import DENSE, PREDICTIONS, UNSIGNED_DATA from autosklearn.util.common import check_none class GradientBoostingClassifier( - IterativeComponentWithSampleWeight, - AutoSklearnClassificationAlgorithm + IterativeComponentWithSampleWeight, AutoSklearnClassificationAlgorithm ): - def __init__(self, loss, learning_rate, min_samples_leaf, max_depth, - max_leaf_nodes, max_bins, l2_regularization, early_stop, tol, scoring, - n_iter_no_change=0, validation_fraction=None, random_state=None, - verbose=0): + def __init__( + self, + loss, + learning_rate, + min_samples_leaf, + max_depth, + max_leaf_nodes, + max_bins, + l2_regularization, + early_stop, + tol, + scoring, + n_iter_no_change=0, + validation_fraction=None, + random_state=None, + verbose=0, + ): self.loss = loss self.learning_rate = learning_rate self.max_iter = self.get_max_iter() @@ -119,13 +135,14 @@ def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None): ) else: self.estimator.max_iter += n_iter - self.estimator.max_iter = min(self.estimator.max_iter, - self.max_iter) + self.estimator.max_iter = min(self.estimator.max_iter, self.max_iter) self.estimator.fit(X, y, sample_weight=sample_weight) - if self.estimator.max_iter >= self.max_iter \ - or self.estimator.max_iter > self.estimator.n_iter_: + if ( + self.estimator.max_iter >= self.max_iter + or self.estimator.max_iter > self.estimator.n_iter_ + ): self.fully_fit_ = True @@ -134,7 +151,7 @@ def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None): def configuration_fully_fitted(self): if self.estimator is None: return False - elif not hasattr(self, 'fully_fit_'): + elif not hasattr(self, "fully_fit_"): return False else: return self.fully_fit_ @@ -151,53 +168,77 @@ def predict_proba(self, X): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'GB', - 'name': 'Gradient Boosting Classifier', - 'handles_regression': False, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': False, - 'handles_multioutput': False, - 'is_deterministic': True, - 'input': (DENSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,)} + return { + "shortname": "GB", + "name": "Gradient Boosting Classifier", + "handles_regression": False, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": False, + "handles_multioutput": False, + "is_deterministic": True, + "input": (DENSE, UNSIGNED_DATA), + "output": (PREDICTIONS,), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): cs = ConfigurationSpace() loss = Constant("loss", "auto") learning_rate = UniformFloatHyperparameter( - name="learning_rate", lower=0.01, upper=1, default_value=0.1, log=True) + name="learning_rate", lower=0.01, upper=1, default_value=0.1, log=True + ) min_samples_leaf = UniformIntegerHyperparameter( - name="min_samples_leaf", lower=1, upper=200, 
default_value=20, log=True) - max_depth = UnParametrizedHyperparameter( - name="max_depth", value="None") + name="min_samples_leaf", lower=1, upper=200, default_value=20, log=True + ) + max_depth = UnParametrizedHyperparameter(name="max_depth", value="None") max_leaf_nodes = UniformIntegerHyperparameter( - name="max_leaf_nodes", lower=3, upper=2047, default_value=31, log=True) + name="max_leaf_nodes", lower=3, upper=2047, default_value=31, log=True + ) max_bins = Constant("max_bins", 255) l2_regularization = UniformFloatHyperparameter( - name="l2_regularization", lower=1E-10, upper=1, default_value=1E-10, log=True) + name="l2_regularization", + lower=1e-10, + upper=1, + default_value=1e-10, + log=True, + ) early_stop = CategoricalHyperparameter( - name="early_stop", choices=["off", "valid", "train"], default_value="off") - tol = UnParametrizedHyperparameter( - name="tol", value=1e-7) - scoring = UnParametrizedHyperparameter( - name="scoring", value="loss") + name="early_stop", choices=["off", "valid", "train"], default_value="off" + ) + tol = UnParametrizedHyperparameter(name="tol", value=1e-7) + scoring = UnParametrizedHyperparameter(name="scoring", value="loss") n_iter_no_change = UniformIntegerHyperparameter( - name="n_iter_no_change", lower=1, upper=20, default_value=10) + name="n_iter_no_change", lower=1, upper=20, default_value=10 + ) validation_fraction = UniformFloatHyperparameter( - name="validation_fraction", lower=0.01, upper=0.4, default_value=0.1) - - cs.add_hyperparameters([loss, learning_rate, min_samples_leaf, - max_depth, max_leaf_nodes, max_bins, l2_regularization, - early_stop, tol, scoring, n_iter_no_change, - validation_fraction]) + name="validation_fraction", lower=0.01, upper=0.4, default_value=0.1 + ) + + cs.add_hyperparameters( + [ + loss, + learning_rate, + min_samples_leaf, + max_depth, + max_leaf_nodes, + max_bins, + l2_regularization, + early_stop, + tol, + scoring, + n_iter_no_change, + validation_fraction, + ] + ) n_iter_no_change_cond = InCondition( - n_iter_no_change, early_stop, ["valid", "train"]) + n_iter_no_change, early_stop, ["valid", "train"] + ) validation_fraction_cond = EqualsCondition( - validation_fraction, early_stop, "valid") + validation_fraction, early_stop, "valid" + ) cs.add_conditions([n_iter_no_change_cond, validation_fraction_cond]) diff --git a/autosklearn/pipeline/components/classification/k_nearest_neighbors.py b/autosklearn/pipeline/components/classification/k_nearest_neighbors.py index 6901451f11..fe55e0783d 100644 --- a/autosklearn/pipeline/components/classification/k_nearest_neighbors.py +++ b/autosklearn/pipeline/components/classification/k_nearest_neighbors.py @@ -1,12 +1,14 @@ from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import CategoricalHyperparameter, UniformIntegerHyperparameter +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformIntegerHyperparameter, +) from autosklearn.pipeline.components.base import AutoSklearnClassificationAlgorithm -from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA, PREDICTIONS, SPARSE +from autosklearn.pipeline.constants import DENSE, PREDICTIONS, SPARSE, UNSIGNED_DATA class KNearestNeighborsClassifier(AutoSklearnClassificationAlgorithm): - def __init__(self, n_neighbors, weights, p, random_state=None): self.n_neighbors = n_neighbors self.weights = weights @@ -14,13 +16,12 @@ def __init__(self, n_neighbors, weights, p, random_state=None): self.random_state = random_state def fit(self, X, Y): - import 
sklearn.neighbors import sklearn.multiclass + import sklearn.neighbors - estimator = \ - sklearn.neighbors.KNeighborsClassifier(n_neighbors=self.n_neighbors, - weights=self.weights, - p=self.p) + estimator = sklearn.neighbors.KNeighborsClassifier( + n_neighbors=self.n_neighbors, weights=self.weights, p=self.p + ) if len(Y.shape) == 2 and Y.shape[1] > 1: self.estimator = sklearn.multiclass.OneVsRestClassifier(estimator, n_jobs=1) @@ -42,25 +43,29 @@ def predict_proba(self, X): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'KNN', - 'name': 'K-Nearest Neighbor Classification', - 'handles_regression': False, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': False, - 'is_deterministic': True, - 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,)} + return { + "shortname": "KNN", + "name": "K-Nearest Neighbor Classification", + "handles_regression": False, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": False, + "is_deterministic": True, + "input": (DENSE, SPARSE, UNSIGNED_DATA), + "output": (PREDICTIONS,), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): cs = ConfigurationSpace() n_neighbors = UniformIntegerHyperparameter( - name="n_neighbors", lower=1, upper=100, log=True, default_value=1) + name="n_neighbors", lower=1, upper=100, log=True, default_value=1 + ) weights = CategoricalHyperparameter( - name="weights", choices=["uniform", "distance"], default_value="uniform") + name="weights", choices=["uniform", "distance"], default_value="uniform" + ) p = CategoricalHyperparameter(name="p", choices=[1, 2], default_value=2) cs.add_hyperparameters([n_neighbors, weights, p]) diff --git a/autosklearn/pipeline/components/classification/lda.py b/autosklearn/pipeline/components/classification/lda.py index 1897db78ca..29a08f80b5 100644 --- a/autosklearn/pipeline/components/classification/lda.py +++ b/autosklearn/pipeline/components/classification/lda.py @@ -1,17 +1,18 @@ -from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import UniformFloatHyperparameter, CategoricalHyperparameter from ConfigSpace.conditions import EqualsCondition +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformFloatHyperparameter, +) -from autosklearn.pipeline.components.base import \ - AutoSklearnClassificationAlgorithm -from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA, PREDICTIONS +from autosklearn.pipeline.components.base import AutoSklearnClassificationAlgorithm +from autosklearn.pipeline.constants import DENSE, PREDICTIONS, UNSIGNED_DATA from autosklearn.pipeline.implementations.util import softmax from autosklearn.util.common import check_none class LDA(AutoSklearnClassificationAlgorithm): - def __init__(self, shrinkage, tol, shrinkage_factor=0.5, - random_state=None): + def __init__(self, shrinkage, tol, shrinkage_factor=0.5, random_state=None): self.shrinkage = shrinkage self.tol = tol self.shrinkage_factor = shrinkage_factor @@ -23,20 +24,21 @@ def fit(self, X, Y): if check_none(self.shrinkage): self.shrinkage_ = None - solver = 'svd' + solver = "svd" elif self.shrinkage == "auto": - self.shrinkage_ = 'auto' - solver = 'lsqr' + self.shrinkage_ = "auto" + solver = "lsqr" elif self.shrinkage == "manual": self.shrinkage_ = float(self.shrinkage_factor) - 
solver = 'lsqr' + solver = "lsqr" else: raise ValueError(self.shrinkage) self.tol = float(self.tol) estimator = sklearn.discriminant_analysis.LinearDiscriminantAnalysis( - shrinkage=self.shrinkage_, tol=self.tol, solver=solver) + shrinkage=self.shrinkage_, tol=self.tol, solver=solver + ) if len(Y.shape) == 2 and Y.shape[1] > 1: self.estimator = sklearn.multiclass.OneVsRestClassifier(estimator, n_jobs=1) @@ -60,25 +62,29 @@ def predict_proba(self, X): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'LDA', - 'name': 'Linear Discriminant Analysis', - 'handles_regression': False, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': False, - 'is_deterministic': True, - 'input': (DENSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,)} + return { + "shortname": "LDA", + "name": "Linear Discriminant Analysis", + "handles_regression": False, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": False, + "is_deterministic": True, + "input": (DENSE, UNSIGNED_DATA), + "output": (PREDICTIONS,), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): cs = ConfigurationSpace() shrinkage = CategoricalHyperparameter( - "shrinkage", ["None", "auto", "manual"], default_value="None") - shrinkage_factor = UniformFloatHyperparameter( - "shrinkage_factor", 0., 1., 0.5) - tol = UniformFloatHyperparameter("tol", 1e-5, 1e-1, default_value=1e-4, log=True) + "shrinkage", ["None", "auto", "manual"], default_value="None" + ) + shrinkage_factor = UniformFloatHyperparameter("shrinkage_factor", 0.0, 1.0, 0.5) + tol = UniformFloatHyperparameter( + "tol", 1e-5, 1e-1, default_value=1e-4, log=True + ) cs.add_hyperparameters([shrinkage, shrinkage_factor, tol]) cs.add_condition(EqualsCondition(shrinkage_factor, shrinkage, "manual")) diff --git a/autosklearn/pipeline/components/classification/liblinear_svc.py b/autosklearn/pipeline/components/classification/liblinear_svc.py index 9c625139f5..3f57ef8f94 100644 --- a/autosklearn/pipeline/components/classification/liblinear_svc.py +++ b/autosklearn/pipeline/components/classification/liblinear_svc.py @@ -1,20 +1,32 @@ from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import UniformFloatHyperparameter, \ - CategoricalHyperparameter, Constant -from ConfigSpace.forbidden import ForbiddenEqualsClause, \ - ForbiddenAndConjunction +from ConfigSpace.forbidden import ForbiddenAndConjunction, ForbiddenEqualsClause +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + Constant, + UniformFloatHyperparameter, +) from autosklearn.pipeline.components.base import AutoSklearnClassificationAlgorithm +from autosklearn.pipeline.constants import DENSE, PREDICTIONS, SPARSE, UNSIGNED_DATA from autosklearn.pipeline.implementations.util import softmax -from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA, PREDICTIONS, SPARSE from autosklearn.util.common import check_for_bool, check_none class LibLinear_SVC(AutoSklearnClassificationAlgorithm): # Liblinear is not deterministic as it uses a RNG inside - def __init__(self, penalty, loss, dual, tol, C, multi_class, - fit_intercept, intercept_scaling, class_weight=None, - random_state=None): + def __init__( + self, + penalty, + loss, + dual, + tol, + C, + multi_class, + fit_intercept, + intercept_scaling, + class_weight=None, + random_state=None, + ): self.penalty = penalty self.loss = loss self.dual = 
dual @@ -28,8 +40,8 @@ def __init__(self, penalty, loss, dual, tol, C, multi_class, self.estimator = None def fit(self, X, Y): - import sklearn.svm import sklearn.multiclass + import sklearn.svm self.C = float(self.C) self.tol = float(self.tol) @@ -43,16 +55,18 @@ def fit(self, X, Y): if check_none(self.class_weight): self.class_weight = None - estimator = sklearn.svm.LinearSVC(penalty=self.penalty, - loss=self.loss, - dual=self.dual, - tol=self.tol, - C=self.C, - class_weight=self.class_weight, - fit_intercept=self.fit_intercept, - intercept_scaling=self.intercept_scaling, - multi_class=self.multi_class, - random_state=self.random_state) + estimator = sklearn.svm.LinearSVC( + penalty=self.penalty, + loss=self.loss, + dual=self.dual, + tol=self.tol, + C=self.C, + class_weight=self.class_weight, + fit_intercept=self.fit_intercept, + intercept_scaling=self.intercept_scaling, + multi_class=self.multi_class, + random_state=self.random_state, + ) if len(Y.shape) == 2 and Y.shape[1] > 1: self.estimator = sklearn.multiclass.OneVsRestClassifier(estimator, n_jobs=1) @@ -76,50 +90,51 @@ def predict_proba(self, X): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'Liblinear-SVC', - 'name': 'Liblinear Support Vector Classification', - 'handles_regression': False, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': False, - 'is_deterministic': False, - 'input': (SPARSE, DENSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,)} + return { + "shortname": "Liblinear-SVC", + "name": "Liblinear Support Vector Classification", + "handles_regression": False, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": False, + "is_deterministic": False, + "input": (SPARSE, DENSE, UNSIGNED_DATA), + "output": (PREDICTIONS,), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): cs = ConfigurationSpace() - penalty = CategoricalHyperparameter( - "penalty", ["l1", "l2"], default_value="l2") + penalty = CategoricalHyperparameter("penalty", ["l1", "l2"], default_value="l2") loss = CategoricalHyperparameter( - "loss", ["hinge", "squared_hinge"], default_value="squared_hinge") + "loss", ["hinge", "squared_hinge"], default_value="squared_hinge" + ) dual = Constant("dual", "False") # This is set ad-hoc tol = UniformFloatHyperparameter( - "tol", 1e-5, 1e-1, default_value=1e-4, log=True) - C = UniformFloatHyperparameter( - "C", 0.03125, 32768, log=True, default_value=1.0) + "tol", 1e-5, 1e-1, default_value=1e-4, log=True + ) + C = UniformFloatHyperparameter("C", 0.03125, 32768, log=True, default_value=1.0) multi_class = Constant("multi_class", "ovr") # These are set ad-hoc fit_intercept = Constant("fit_intercept", "True") intercept_scaling = Constant("intercept_scaling", 1) - cs.add_hyperparameters([penalty, loss, dual, tol, C, multi_class, - fit_intercept, intercept_scaling]) + cs.add_hyperparameters( + [penalty, loss, dual, tol, C, multi_class, fit_intercept, intercept_scaling] + ) penalty_and_loss = ForbiddenAndConjunction( - ForbiddenEqualsClause(penalty, "l1"), - ForbiddenEqualsClause(loss, "hinge") + ForbiddenEqualsClause(penalty, "l1"), ForbiddenEqualsClause(loss, "hinge") ) constant_penalty_and_loss = ForbiddenAndConjunction( ForbiddenEqualsClause(dual, "False"), ForbiddenEqualsClause(penalty, "l2"), - ForbiddenEqualsClause(loss, "hinge") + ForbiddenEqualsClause(loss, "hinge"), ) penalty_and_dual = ForbiddenAndConjunction( - 
ForbiddenEqualsClause(dual, "False"), - ForbiddenEqualsClause(penalty, "l1") + ForbiddenEqualsClause(dual, "False"), ForbiddenEqualsClause(penalty, "l1") ) cs.add_forbidden_clause(penalty_and_loss) cs.add_forbidden_clause(constant_penalty_and_loss) diff --git a/autosklearn/pipeline/components/classification/libsvm_svc.py b/autosklearn/pipeline/components/classification/libsvm_svc.py index 97c55be49d..ba423161c1 100644 --- a/autosklearn/pipeline/components/classification/libsvm_svc.py +++ b/autosklearn/pipeline/components/classification/libsvm_svc.py @@ -1,21 +1,35 @@ import resource import sys -from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.conditions import EqualsCondition, InCondition -from ConfigSpace.hyperparameters import UniformFloatHyperparameter, \ - UniformIntegerHyperparameter, CategoricalHyperparameter, \ - UnParametrizedHyperparameter +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformFloatHyperparameter, + UniformIntegerHyperparameter, + UnParametrizedHyperparameter, +) from autosklearn.pipeline.components.base import AutoSklearnClassificationAlgorithm -from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA, PREDICTIONS, SPARSE +from autosklearn.pipeline.constants import DENSE, PREDICTIONS, SPARSE, UNSIGNED_DATA from autosklearn.pipeline.implementations.util import softmax from autosklearn.util.common import check_for_bool, check_none class LibSVM_SVC(AutoSklearnClassificationAlgorithm): - def __init__(self, C, kernel, gamma, shrinking, tol, max_iter, - class_weight=None, degree=3, coef0=0, random_state=None): + def __init__( + self, + C, + kernel, + gamma, + shrinking, + tol, + max_iter, + class_weight=None, + degree=3, + coef0=0, + random_state=None, + ): self.C = C self.kernel = kernel self.degree = degree @@ -31,9 +45,9 @@ def __init__(self, C, kernel, gamma, shrinking, tol, max_iter, def fit(self, X, Y): import sklearn.svm - # Calculate the size of the kernel cache (in MB) for sklearn's LibSVM. The cache size is - # calculated as 2/3 of the available memory (which is calculated as the memory limit minus - # the used memory) + # Calculate the size of the kernel cache (in MB) for sklearn's LibSVM. 
+ # The cache size is calculated as 2/3 of the available memory + # (which is calculated as the memory limit minus the used memory) try: # Retrieve memory limits imposed on the process soft, hard = resource.getrlimit(resource.RLIMIT_AS) @@ -45,9 +59,9 @@ def fit(self, X, Y): # Retrieve memory used by this process maxrss = resource.getrusage(resource.RUSAGE_SELF)[2] / 1024 - # In MacOS, the MaxRSS output of resource.getrusage in bytes; on other platforms, - # it's in kilobytes - if sys.platform == 'darwin': + # In MacOS, the MaxRSS output of resource.getrusage in bytes; + # on other platforms, it's in kilobytes + if sys.platform == "darwin": maxrss = maxrss / 1024 cache_size = (soft - maxrss) / 1.5 @@ -80,18 +94,20 @@ def fit(self, X, Y): if check_none(self.class_weight): self.class_weight = None - self.estimator = sklearn.svm.SVC(C=self.C, - kernel=self.kernel, - degree=self.degree, - gamma=self.gamma, - coef0=self.coef0, - shrinking=self.shrinking, - tol=self.tol, - class_weight=self.class_weight, - max_iter=self.max_iter, - random_state=self.random_state, - cache_size=cache_size, - decision_function_shape='ovr') + self.estimator = sklearn.svm.SVC( + C=self.C, + kernel=self.kernel, + degree=self.degree, + gamma=self.gamma, + coef0=self.coef0, + shrinking=self.shrinking, + tol=self.tol, + class_weight=self.class_weight, + max_iter=self.max_iter, + random_state=self.random_state, + cache_size=cache_size, + decision_function_shape="ovr", + ) self.estimator.fit(X, Y) return self @@ -109,41 +125,45 @@ def predict_proba(self, X): @staticmethod def get_properties(dataset_properties=None): return { - 'shortname': 'LibSVM-SVC', - 'name': 'LibSVM Support Vector Classification', - 'handles_regression': False, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': False, - 'handles_multioutput': False, - 'is_deterministic': True, - 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,)} + "shortname": "LibSVM-SVC", + "name": "LibSVM Support Vector Classification", + "handles_regression": False, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": False, + "handles_multioutput": False, + "is_deterministic": True, + "input": (DENSE, SPARSE, UNSIGNED_DATA), + "output": (PREDICTIONS,), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): - C = UniformFloatHyperparameter("C", 0.03125, 32768, log=True, - default_value=1.0) + C = UniformFloatHyperparameter("C", 0.03125, 32768, log=True, default_value=1.0) # No linear kernel here, because we have liblinear - kernel = CategoricalHyperparameter(name="kernel", - choices=["rbf", "poly", "sigmoid"], - default_value="rbf") + kernel = CategoricalHyperparameter( + name="kernel", choices=["rbf", "poly", "sigmoid"], default_value="rbf" + ) degree = UniformIntegerHyperparameter("degree", 2, 5, default_value=3) - gamma = UniformFloatHyperparameter("gamma", 3.0517578125e-05, 8, - log=True, default_value=0.1) + gamma = UniformFloatHyperparameter( + "gamma", 3.0517578125e-05, 8, log=True, default_value=0.1 + ) # TODO this is totally ad-hoc coef0 = UniformFloatHyperparameter("coef0", -1, 1, default_value=0) # probability is no hyperparameter, but an argument to the SVM algo - shrinking = CategoricalHyperparameter("shrinking", ["True", "False"], - default_value="True") - tol = UniformFloatHyperparameter("tol", 1e-5, 1e-1, default_value=1e-3, - log=True) + shrinking = CategoricalHyperparameter( + "shrinking", ["True", "False"], default_value="True" + ) + tol = 
UniformFloatHyperparameter( + "tol", 1e-5, 1e-1, default_value=1e-3, log=True + ) # cache size is not a hyperparameter, but an argument to the program! max_iter = UnParametrizedHyperparameter("max_iter", -1) cs = ConfigurationSpace() - cs.add_hyperparameters([C, kernel, degree, gamma, coef0, shrinking, - tol, max_iter]) + cs.add_hyperparameters( + [C, kernel, degree, gamma, coef0, shrinking, tol, max_iter] + ) degree_depends_on_poly = EqualsCondition(degree, kernel, "poly") coef0_condition = InCondition(coef0, kernel, ["poly", "sigmoid"]) diff --git a/autosklearn/pipeline/components/classification/mlp.py b/autosklearn/pipeline/components/classification/mlp.py index e26f2318cf..f7001d7bc1 100644 --- a/autosklearn/pipeline/components/classification/mlp.py +++ b/autosklearn/pipeline/components/classification/mlp.py @@ -1,30 +1,45 @@ import copy -import numpy as np -from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import UniformFloatHyperparameter, \ - UniformIntegerHyperparameter, UnParametrizedHyperparameter, Constant, \ - CategoricalHyperparameter +import numpy as np from ConfigSpace.conditions import InCondition +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + Constant, + UniformFloatHyperparameter, + UniformIntegerHyperparameter, + UnParametrizedHyperparameter, +) from autosklearn.pipeline.components.base import ( AutoSklearnClassificationAlgorithm, IterativeComponent, ) -from autosklearn.pipeline.constants import SPARSE, DENSE, UNSIGNED_DATA, PREDICTIONS +from autosklearn.pipeline.constants import DENSE, PREDICTIONS, SPARSE, UNSIGNED_DATA from autosklearn.util.common import check_for_bool -class MLPClassifier( - IterativeComponent, - AutoSklearnClassificationAlgorithm -): - def __init__(self, hidden_layer_depth, num_nodes_per_layer, activation, alpha, - learning_rate_init, early_stopping, solver, batch_size, - n_iter_no_change, tol, - shuffle, beta_1, beta_2, epsilon, - validation_fraction=None, - random_state=None, verbose=0): +class MLPClassifier(IterativeComponent, AutoSklearnClassificationAlgorithm): + def __init__( + self, + hidden_layer_depth, + num_nodes_per_layer, + activation, + alpha, + learning_rate_init, + early_stopping, + solver, + batch_size, + n_iter_no_change, + tol, + shuffle, + beta_1, + beta_2, + epsilon, + validation_fraction=None, + random_state=None, + verbose=0, + ): self.hidden_layer_depth = hidden_layer_depth self.num_nodes_per_layer = num_nodes_per_layer self.max_iter = self.get_max_iter() @@ -60,6 +75,7 @@ def iterative_fit(self, X, y, n_iter=2, refit=False): Set n_iter=2 for the same reason as for SGD """ from sklearn.neural_network import MLPClassifier + n_iter = max(n_iter, 2) if refit: @@ -71,8 +87,9 @@ def iterative_fit(self, X, y, n_iter=2, refit=False): self.max_iter = int(self.max_iter) self.hidden_layer_depth = int(self.hidden_layer_depth) self.num_nodes_per_layer = int(self.num_nodes_per_layer) - self.hidden_layer_sizes = tuple(self.num_nodes_per_layer - for i in range(self.hidden_layer_depth)) + self.hidden_layer_sizes = tuple( + self.num_nodes_per_layer for i in range(self.hidden_layer_depth) + ) self.activation = str(self.activation) self.alpha = float(self.alpha) self.learning_rate_init = float(self.learning_rate_init) @@ -88,7 +105,9 @@ def iterative_fit(self, X, y, n_iter=2, refit=False): self.n_iter_no_change = int(self.n_iter_no_change) self.early_stopping_val = True else: - raise ValueError("Set early 
stopping to unknown value %s" % self.early_stopping) + raise ValueError( + "Set early stopping to unknown value %s" % self.early_stopping + ) # elif self.early_stopping == "off": # self.validation_fraction = 0 # self.tol = 10000
@@ -142,8 +161,10 @@ def iterative_fit(self, X, y, n_iter=2, refit=False): new_max_iter = min(self.max_iter - self.estimator.n_iter_, n_iter) self.estimator.max_iter = new_max_iter self.estimator.fit(X, y) - if self.estimator.n_iter_ >= self.max_iter or \ - self.estimator._no_improvement_count > self.n_iter_no_change: + if ( + self.estimator.n_iter_ >= self.max_iter + or self.estimator._no_improvement_count > self.n_iter_no_change + ): self._fully_fit = True return self
@@ -151,7 +172,7 @@ def iterative_fit(self, X, y, n_iter=2, refit=False): def configuration_fully_fitted(self): if self.estimator is None: return False - elif not hasattr(self, '_fully_fit'): + elif not hasattr(self, "_fully_fit"): return False else: return self._fully_fit
@@ -168,42 +189,55 @@ def predict_proba(self, X): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'MLP', - 'name': 'Multilayer Percepton', - 'handles_regression': False, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': False, - 'is_deterministic': True, - 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,)} + return { + "shortname": "MLP", + "name": "Multilayer Perceptron", + "handles_regression": False, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": False, + "is_deterministic": True, + "input": (DENSE, SPARSE, UNSIGNED_DATA), + "output": (PREDICTIONS,), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): cs = ConfigurationSpace() - hidden_layer_depth = UniformIntegerHyperparameter(name="hidden_layer_depth", - lower=1, upper=3, default_value=1) - num_nodes_per_layer = UniformIntegerHyperparameter(name="num_nodes_per_layer", - lower=16, upper=264, default_value=32, - log=True) - activation = CategoricalHyperparameter(name="activation", choices=['tanh', 'relu'], - default_value='relu') - alpha = UniformFloatHyperparameter(name="alpha", lower=1e-7, upper=1e-1, default_value=1e-4, - log=True) - - learning_rate_init = UniformFloatHyperparameter(name="learning_rate_init", - lower=1e-4, upper=0.5, default_value=1e-3, - log=True) + hidden_layer_depth = UniformIntegerHyperparameter( + name="hidden_layer_depth", lower=1, upper=3, default_value=1 + ) + num_nodes_per_layer = UniformIntegerHyperparameter( + name="num_nodes_per_layer", lower=16, upper=264, default_value=32, log=True + ) + activation = CategoricalHyperparameter( + name="activation", choices=["tanh", "relu"], default_value="relu" + ) + alpha = UniformFloatHyperparameter( + name="alpha", lower=1e-7, upper=1e-1, default_value=1e-4, log=True + ) + + learning_rate_init = UniformFloatHyperparameter( + name="learning_rate_init", + lower=1e-4, + upper=0.5, + default_value=1e-3, + log=True, + ) # Not allowing to turn off early stopping - early_stopping = CategoricalHyperparameter(name="early_stopping", - choices=["valid", "train"], # , "off"], - default_value="valid") + early_stopping = CategoricalHyperparameter( + name="early_stopping", + choices=["valid", "train"], # , "off"], + default_value="valid", + ) # Constants - n_iter_no_change = Constant(name="n_iter_no_change", value=32) # default=10 is too low + n_iter_no_change = Constant( + name="n_iter_no_change", value=32
+ ) # default=10 is too low validation_fraction = Constant(name="validation_fraction", value=0.1) tol = UnParametrizedHyperparameter(name="tol", value=1e-4) - solver = Constant(name="solver", value='adam') + solver = Constant(name="solver", value="adam") # Relying on sklearn defaults for now batch_size = UnParametrizedHyperparameter(name="batch_size", value="auto") @@ -221,17 +255,33 @@ def get_hyperparameter_search_space(dataset_properties=None): # max_fun --> only used when solver=lbfgs # activation=["identity", "logistic"] --> not useful for classification - cs.add_hyperparameters([hidden_layer_depth, num_nodes_per_layer, - activation, alpha, - learning_rate_init, early_stopping, - n_iter_no_change, validation_fraction, tol, - solver, batch_size, shuffle, - beta_1, beta_2, epsilon]) + cs.add_hyperparameters( + [ + hidden_layer_depth, + num_nodes_per_layer, + activation, + alpha, + learning_rate_init, + early_stopping, + n_iter_no_change, + validation_fraction, + tol, + solver, + batch_size, + shuffle, + beta_1, + beta_2, + epsilon, + ] + ) - validation_fraction_cond = InCondition(validation_fraction, early_stopping, ["valid"]) + validation_fraction_cond = InCondition( + validation_fraction, early_stopping, ["valid"] + ) cs.add_conditions([validation_fraction_cond]) # We always use early stopping - # n_iter_no_change_cond = InCondition(n_iter_no_change, early_stopping, ["valid", "train"]) + # n_iter_no_change_cond = \ + # InCondition(n_iter_no_change, early_stopping, ["valid", "train"]) # tol_cond = InCondition(n_iter_no_change, early_stopping, ["valid", "train"]) # cs.add_conditions([n_iter_no_change_cond, tol_cond]) return cs diff --git a/autosklearn/pipeline/components/classification/multinomial_nb.py b/autosklearn/pipeline/components/classification/multinomial_nb.py index e678bd4c77..7b65be8a5c 100644 --- a/autosklearn/pipeline/components/classification/multinomial_nb.py +++ b/autosklearn/pipeline/components/classification/multinomial_nb.py @@ -1,18 +1,16 @@ import numpy as np - from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import UniformFloatHyperparameter, \ - CategoricalHyperparameter - -from autosklearn.pipeline.components.base import ( - AutoSklearnClassificationAlgorithm, +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformFloatHyperparameter, ) -from autosklearn.pipeline.constants import DENSE, PREDICTIONS, SPARSE, SIGNED_DATA + +from autosklearn.pipeline.components.base import AutoSklearnClassificationAlgorithm +from autosklearn.pipeline.constants import DENSE, PREDICTIONS, SIGNED_DATA, SPARSE from autosklearn.util.common import check_for_bool class MultinomialNB(AutoSklearnClassificationAlgorithm): - def __init__(self, alpha, fit_prior, random_state=None, verbose=0): self.alpha = alpha self.fit_prior = fit_prior @@ -21,8 +19,8 @@ def __init__(self, alpha, fit_prior, random_state=None, verbose=0): self.estimator = None def fit(self, X, y): - import sklearn.naive_bayes import scipy.sparse + import sklearn.naive_bayes self.fit_prior = check_for_bool(self.fit_prior) self.alpha = float(self.alpha) @@ -31,7 +29,7 @@ def fit(self, X, y): self.estimator = sklearn.naive_bayes.MultinomialNB( alpha=self.alpha, fit_prior=self.fit_prior, - ) + ) self.classes_ = np.unique(y.astype(int)) # Because the pipeline guarantees that each feature is positive, @@ -44,8 +42,10 @@ def fit(self, X, y): # Fallback for multilabel classification if len(y.shape) > 1 and y.shape[1] > 1: import sklearn.multiclass + 
self.estimator = sklearn.multiclass.OneVsRestClassifier( - self.estimator, n_jobs=1) + self.estimator, n_jobs=1 + ) self.estimator.fit(X, y) return self @@ -62,16 +62,18 @@ def predict_proba(self, X): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'MultinomialNB', - 'name': 'Multinomial Naive Bayes classifier', - 'handles_regression': False, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': False, - 'is_deterministic': True, - 'input': (DENSE, SPARSE, SIGNED_DATA), - 'output': (PREDICTIONS,)} + return { + "shortname": "MultinomialNB", + "name": "Multinomial Naive Bayes classifier", + "handles_regression": False, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": False, + "is_deterministic": True, + "input": (DENSE, SPARSE, SIGNED_DATA), + "output": (PREDICTIONS,), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): @@ -80,12 +82,13 @@ def get_hyperparameter_search_space(dataset_properties=None): # the smoothing parameter is a non-negative float # I will limit it to 100 and put it on a logarithmic scale. (SF) # Please adjust that, if you know a proper range, this is just a guess. - alpha = UniformFloatHyperparameter(name="alpha", lower=1e-2, upper=100, - default_value=1, log=True) + alpha = UniformFloatHyperparameter( + name="alpha", lower=1e-2, upper=100, default_value=1, log=True + ) - fit_prior = CategoricalHyperparameter(name="fit_prior", - choices=["True", "False"], - default_value="True") + fit_prior = CategoricalHyperparameter( + name="fit_prior", choices=["True", "False"], default_value="True" + ) cs.add_hyperparameters([alpha, fit_prior]) diff --git a/autosklearn/pipeline/components/classification/passive_aggressive.py b/autosklearn/pipeline/components/classification/passive_aggressive.py index 5fb1f1bbf7..494ea7db06 100644 --- a/autosklearn/pipeline/components/classification/passive_aggressive.py +++ b/autosklearn/pipeline/components/classification/passive_aggressive.py @@ -1,14 +1,16 @@ import numpy as np - from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import UniformFloatHyperparameter, \ - CategoricalHyperparameter, UnParametrizedHyperparameter +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformFloatHyperparameter, + UnParametrizedHyperparameter, +) from autosklearn.pipeline.components.base import ( AutoSklearnClassificationAlgorithm, IterativeComponentWithSampleWeight, ) -from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, PREDICTIONS +from autosklearn.pipeline.constants import DENSE, PREDICTIONS, SPARSE, UNSIGNED_DATA from autosklearn.pipeline.implementations.util import softmax from autosklearn.util.common import check_for_bool @@ -76,9 +78,11 @@ def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None): # Fallback for multilabel classification if len(y.shape) > 1 and y.shape[1] > 1: import sklearn.multiclass + self.estimator.max_iter = self.get_max_iter() self.estimator = sklearn.multiclass.OneVsRestClassifier( - self.estimator, n_jobs=1) + self.estimator, n_jobs=1 + ) self.estimator.fit(X, y) self.fully_fit_ = True else: @@ -91,7 +95,8 @@ def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None): self.estimator._validate_params() lr = "pa1" if self.estimator.loss == "hinge" else "pa2" self.estimator._partial_fit( - X, y, + X, + y, alpha=1.0, 
C=self.estimator.C, loss="hinge", @@ -100,12 +105,12 @@ def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None): classes=None, sample_weight=sample_weight, coef_init=None, - intercept_init=None + intercept_init=None, ) self.n_iter_ += self.estimator.n_iter_ if ( self.estimator.max_iter >= self.max_iter - or self.estimator.max_iter > self.n_iter_ + or self.estimator.max_iter > self.n_iter_ ): self.fully_fit_ = True @@ -114,7 +119,7 @@ def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None): def configuration_fully_fitted(self): if self.estimator is None: return False - elif not hasattr(self, 'fully_fit_'): + elif not hasattr(self, "fully_fit_"): return False else: return self.fully_fit_ @@ -133,16 +138,18 @@ def predict_proba(self, X): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'PassiveAggressive Classifier', - 'name': 'Passive Aggressive Classifier', - 'handles_regression': False, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': False, - 'is_deterministic': True, - 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,)} + return { + "shortname": "PassiveAggressive Classifier", + "name": "Passive Aggressive Classifier", + "handles_regression": False, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": False, + "is_deterministic": True, + "input": (DENSE, SPARSE, UNSIGNED_DATA), + "output": (PREDICTIONS,), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): @@ -152,11 +159,13 @@ def get_hyperparameter_search_space(dataset_properties=None): "loss", ["hinge", "squared_hinge"], default_value="hinge" ) - tol = UniformFloatHyperparameter("tol", 1e-5, 1e-1, default_value=1e-4, - log=True) + tol = UniformFloatHyperparameter( + "tol", 1e-5, 1e-1, default_value=1e-4, log=True + ) # Note: Average could also be an Integer if > 1 - average = CategoricalHyperparameter('average', ['False', 'True'], - default_value='False') + average = CategoricalHyperparameter( + "average", ["False", "True"], default_value="False" + ) cs = ConfigurationSpace() cs.add_hyperparameters([loss, fit_intercept, tol, C, average]) diff --git a/autosklearn/pipeline/components/classification/qda.py b/autosklearn/pipeline/components/classification/qda.py index 7405b21fae..7b25858392 100644 --- a/autosklearn/pipeline/components/classification/qda.py +++ b/autosklearn/pipeline/components/classification/qda.py @@ -1,16 +1,13 @@ +import numpy as np from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import UniformFloatHyperparameter -from autosklearn.pipeline.components.base import \ - AutoSklearnClassificationAlgorithm -from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA, PREDICTIONS +from autosklearn.pipeline.components.base import AutoSklearnClassificationAlgorithm +from autosklearn.pipeline.constants import DENSE, PREDICTIONS, UNSIGNED_DATA from autosklearn.pipeline.implementations.util import softmax -import numpy as np - class QDA(AutoSklearnClassificationAlgorithm): - def __init__(self, reg_param, random_state=None): self.reg_param = float(reg_param) self.estimator = None @@ -18,11 +15,13 @@ def __init__(self, reg_param, random_state=None): def fit(self, X, Y): import sklearn.discriminant_analysis - estimator = sklearn.discriminant_analysis.\ - QuadraticDiscriminantAnalysis(reg_param=self.reg_param) + estimator = 
sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis( + reg_param=self.reg_param + ) if len(Y.shape) == 2 and Y.shape[1] > 1: import sklearn.multiclass + self.estimator = sklearn.multiclass.OneVsRestClassifier(estimator, n_jobs=1) else: self.estimator = estimator @@ -32,16 +31,17 @@ def fit(self, X, Y): if len(Y.shape) == 2 and Y.shape[1] > 1: problems = [] for est in self.estimator.estimators_: - problem = np.any(np.any([np.any(s <= 0.0) for s in - est.scalings_])) + problem = np.any(np.any([np.any(s <= 0.0) for s in est.scalings_])) problems.append(problem) problem = np.any(problems) else: - problem = np.any(np.any([np.any(s <= 0.0) for s in - self.estimator.scalings_])) + problem = np.any( + np.any([np.any(s <= 0.0) for s in self.estimator.scalings_]) + ) if problem: - raise ValueError('Numerical problems in QDA. QDA.scalings_ ' - 'contains values <= 0.0') + raise ValueError( + "Numerical problems in QDA. QDA.scalings_ " "contains values <= 0.0" + ) return self def predict(self, X): @@ -58,21 +58,22 @@ def predict_proba(self, X): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'QDA', - 'name': 'Quadratic Discriminant Analysis', - 'handles_regression': False, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': False, - 'is_deterministic': True, - 'input': (DENSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,)} + return { + "shortname": "QDA", + "name": "Quadratic Discriminant Analysis", + "handles_regression": False, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": False, + "is_deterministic": True, + "input": (DENSE, UNSIGNED_DATA), + "output": (PREDICTIONS,), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): - reg_param = UniformFloatHyperparameter('reg_param', 0.0, 1.0, - default_value=0.0) + reg_param = UniformFloatHyperparameter("reg_param", 0.0, 1.0, default_value=0.0) cs = ConfigurationSpace() cs.add_hyperparameter(reg_param) return cs diff --git a/autosklearn/pipeline/components/classification/random_forest.py b/autosklearn/pipeline/components/classification/random_forest.py index c2f4e9779a..6ccd720b3a 100644 --- a/autosklearn/pipeline/components/classification/random_forest.py +++ b/autosklearn/pipeline/components/classification/random_forest.py @@ -1,13 +1,19 @@ from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import UniformFloatHyperparameter, \ - UniformIntegerHyperparameter, CategoricalHyperparameter, UnParametrizedHyperparameter +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformFloatHyperparameter, + UniformIntegerHyperparameter, + UnParametrizedHyperparameter, +) from autosklearn.pipeline.components.base import ( AutoSklearnClassificationAlgorithm, IterativeComponentWithSampleWeight, ) -from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA, PREDICTIONS, SPARSE -from autosklearn.pipeline.implementations.util import convert_multioutput_multiclass_to_multilabel +from autosklearn.pipeline.constants import DENSE, PREDICTIONS, SPARSE, UNSIGNED_DATA +from autosklearn.pipeline.implementations.util import ( + convert_multioutput_multiclass_to_multilabel, +) from autosklearn.util.common import check_for_bool, check_none @@ -15,11 +21,21 @@ class RandomForest( IterativeComponentWithSampleWeight, AutoSklearnClassificationAlgorithm, ): - def __init__(self, criterion, max_features, - max_depth, 
min_samples_split, min_samples_leaf, - min_weight_fraction_leaf, bootstrap, max_leaf_nodes, - min_impurity_decrease, random_state=None, n_jobs=1, - class_weight=None): + def __init__( + self, + criterion, + max_features, + max_depth, + min_samples_split, + min_samples_leaf, + min_weight_fraction_leaf, + bootstrap, + max_leaf_nodes, + min_impurity_decrease, + random_state=None, + n_jobs=1, + class_weight=None, + ): self.n_estimators = self.get_max_iter() self.criterion = criterion self.max_features = max_features @@ -88,11 +104,13 @@ def iterative_fit(self, X, y, sample_weight=None, n_iter=1, refit=False): random_state=self.random_state, n_jobs=self.n_jobs, class_weight=self.class_weight, - warm_start=True) + warm_start=True, + ) else: self.estimator.n_estimators += n_iter - self.estimator.n_estimators = min(self.estimator.n_estimators, - self.n_estimators) + self.estimator.n_estimators = min( + self.estimator.n_estimators, self.n_estimators + ) self.estimator.fit(X, y, sample_weight=sample_weight) return self @@ -117,42 +135,63 @@ def predict_proba(self, X): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'RF', - 'name': 'Random Forest Classifier', - 'handles_regression': False, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': False, - 'is_deterministic': True, - 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,)} + return { + "shortname": "RF", + "name": "Random Forest Classifier", + "handles_regression": False, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": False, + "is_deterministic": True, + "input": (DENSE, SPARSE, UNSIGNED_DATA), + "output": (PREDICTIONS,), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): cs = ConfigurationSpace() criterion = CategoricalHyperparameter( - "criterion", ["gini", "entropy"], default_value="gini") - - # The maximum number of features used in the forest is calculated as m^max_features, where - # m is the total number of features, and max_features is the hyperparameter specified below. - # The default is 0.5, which yields sqrt(m) features as max_features in the estimator. This - # corresponds with Geurts' heuristic. + "criterion", ["gini", "entropy"], default_value="gini" + ) + + # The maximum number of features used in the forest is calculated as + # m^max_features, where m is the total number of features, and max_features + # is the hyperparameter specified below. The default is 0.5, which yields + # sqrt(m) features as max_features in the estimator. + # This corresponds with Geurts' heuristic. max_features = UniformFloatHyperparameter( - "max_features", 0., 1., default_value=0.5) + "max_features", 0.0, 1.0, default_value=0.5 + ) max_depth = UnParametrizedHyperparameter("max_depth", "None") min_samples_split = UniformIntegerHyperparameter( - "min_samples_split", 2, 20, default_value=2) + "min_samples_split", 2, 20, default_value=2 + ) min_samples_leaf = UniformIntegerHyperparameter( - "min_samples_leaf", 1, 20, default_value=1) - min_weight_fraction_leaf = UnParametrizedHyperparameter("min_weight_fraction_leaf", 0.) 
+ "min_samples_leaf", 1, 20, default_value=1 + ) + min_weight_fraction_leaf = UnParametrizedHyperparameter( + "min_weight_fraction_leaf", 0.0 + ) max_leaf_nodes = UnParametrizedHyperparameter("max_leaf_nodes", "None") - min_impurity_decrease = UnParametrizedHyperparameter('min_impurity_decrease', 0.0) + min_impurity_decrease = UnParametrizedHyperparameter( + "min_impurity_decrease", 0.0 + ) bootstrap = CategoricalHyperparameter( - "bootstrap", ["True", "False"], default_value="True") - cs.add_hyperparameters([criterion, max_features, - max_depth, min_samples_split, min_samples_leaf, - min_weight_fraction_leaf, max_leaf_nodes, - bootstrap, min_impurity_decrease]) + "bootstrap", ["True", "False"], default_value="True" + ) + cs.add_hyperparameters( + [ + criterion, + max_features, + max_depth, + min_samples_split, + min_samples_leaf, + min_weight_fraction_leaf, + max_leaf_nodes, + bootstrap, + min_impurity_decrease, + ] + ) return cs diff --git a/autosklearn/pipeline/components/classification/sgd.py b/autosklearn/pipeline/components/classification/sgd.py index 6875541824..469c2605dd 100644 --- a/autosklearn/pipeline/components/classification/sgd.py +++ b/autosklearn/pipeline/components/classification/sgd.py @@ -1,13 +1,16 @@ -from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import UniformFloatHyperparameter, \ - CategoricalHyperparameter, UnParametrizedHyperparameter from ConfigSpace.conditions import EqualsCondition, InCondition +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformFloatHyperparameter, + UnParametrizedHyperparameter, +) from autosklearn.pipeline.components.base import ( AutoSklearnClassificationAlgorithm, IterativeComponentWithSampleWeight, ) -from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA, PREDICTIONS, SPARSE +from autosklearn.pipeline.constants import DENSE, PREDICTIONS, SPARSE, UNSIGNED_DATA from autosklearn.pipeline.implementations.util import softmax from autosklearn.util.common import check_for_bool @@ -16,9 +19,21 @@ class SGD( IterativeComponentWithSampleWeight, AutoSklearnClassificationAlgorithm, ): - def __init__(self, loss, penalty, alpha, fit_intercept, tol, - learning_rate, l1_ratio=0.15, epsilon=0.1, - eta0=0.01, power_t=0.5, average=False, random_state=None): + def __init__( + self, + loss, + penalty, + alpha, + fit_intercept, + tol, + learning_rate, + l1_ratio=0.15, + epsilon=0.1, + eta0=0.01, + power_t=0.5, + average=False, + random_state=None, + ): self.max_iter = self.get_max_iter() self.loss = loss self.penalty = penalty @@ -61,32 +76,31 @@ def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None): self.fully_fit_ = False self.alpha = float(self.alpha) - self.l1_ratio = float(self.l1_ratio) if self.l1_ratio is not None \ - else 0.15 - self.epsilon = float(self.epsilon) if self.epsilon is not None \ - else 0.1 + self.l1_ratio = float(self.l1_ratio) if self.l1_ratio is not None else 0.15 + self.epsilon = float(self.epsilon) if self.epsilon is not None else 0.1 self.eta0 = float(self.eta0) - self.power_t = float(self.power_t) if self.power_t is not None \ - else 0.5 + self.power_t = float(self.power_t) if self.power_t is not None else 0.5 self.average = check_for_bool(self.average) self.fit_intercept = check_for_bool(self.fit_intercept) self.tol = float(self.tol) - self.estimator = SGDClassifier(loss=self.loss, - penalty=self.penalty, - alpha=self.alpha, - fit_intercept=self.fit_intercept, 
- max_iter=n_iter, - tol=self.tol, - learning_rate=self.learning_rate, - l1_ratio=self.l1_ratio, - epsilon=self.epsilon, - eta0=self.eta0, - power_t=self.power_t, - shuffle=True, - average=self.average, - random_state=self.random_state, - warm_start=True) + self.estimator = SGDClassifier( + loss=self.loss, + penalty=self.penalty, + alpha=self.alpha, + fit_intercept=self.fit_intercept, + max_iter=n_iter, + tol=self.tol, + learning_rate=self.learning_rate, + l1_ratio=self.l1_ratio, + epsilon=self.epsilon, + eta0=self.eta0, + power_t=self.power_t, + shuffle=True, + average=self.average, + random_state=self.random_state, + warm_start=True, + ) self.estimator.fit(X, y, sample_weight=sample_weight) self.n_iter_ = self.estimator.n_iter_ else: @@ -94,7 +108,8 @@ def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None): self.estimator.max_iter = min(self.estimator.max_iter, self.max_iter) self.estimator._validate_params() self.estimator._partial_fit( - X, y, + X, + y, alpha=self.estimator.alpha, C=1.0, loss=self.estimator.loss, @@ -103,11 +118,14 @@ def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None): sample_weight=sample_weight, classes=None, coef_init=None, - intercept_init=None + intercept_init=None, ) self.n_iter_ += self.estimator.n_iter_ - if self.estimator.max_iter >= self.max_iter or self.estimator.max_iter > self.n_iter_: + if ( + self.estimator.max_iter >= self.max_iter + or self.estimator.max_iter > self.n_iter_ + ): self.fully_fit_ = True return self @@ -115,7 +133,7 @@ def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None): def configuration_fully_fitted(self): if self.estimator is None: return False - elif not hasattr(self, 'fully_fit_'): + elif not hasattr(self, "fully_fit_"): return False else: return self.fully_fit_ @@ -137,16 +155,18 @@ def predict_proba(self, X): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'SGD Classifier', - 'name': 'Stochastic Gradient Descent Classifier', - 'handles_regression': False, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': False, - 'handles_multioutput': False, - 'is_deterministic': True, - 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,)} + return { + "shortname": "SGD Classifier", + "name": "Stochastic Gradient Descent Classifier", + "handles_regression": False, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": False, + "handles_multioutput": False, + "is_deterministic": True, + "input": (DENSE, SPARSE, UNSIGNED_DATA), + "output": (PREDICTIONS,), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): @@ -156,44 +176,63 @@ def get_hyperparameter_search_space(dataset_properties=None): "loss", ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"], default_value="log", - ) + ) penalty = CategoricalHyperparameter( - "penalty", ["l1", "l2", "elasticnet"], default_value="l2") + "penalty", ["l1", "l2", "elasticnet"], default_value="l2" + ) alpha = UniformFloatHyperparameter( - "alpha", 1e-7, 1e-1, log=True, default_value=0.0001) + "alpha", 1e-7, 1e-1, log=True, default_value=0.0001 + ) l1_ratio = UniformFloatHyperparameter( - "l1_ratio", 1e-9, 1, log=True, default_value=0.15) + "l1_ratio", 1e-9, 1, log=True, default_value=0.15 + ) fit_intercept = UnParametrizedHyperparameter("fit_intercept", "True") - tol = UniformFloatHyperparameter("tol", 1e-5, 1e-1, log=True, - default_value=1e-4) + tol = UniformFloatHyperparameter( + "tol", 1e-5, 
1e-1, log=True, default_value=1e-4 + ) epsilon = UniformFloatHyperparameter( - "epsilon", 1e-5, 1e-1, default_value=1e-4, log=True) + "epsilon", 1e-5, 1e-1, default_value=1e-4, log=True + ) learning_rate = CategoricalHyperparameter( - "learning_rate", ["optimal", "invscaling", "constant"], - default_value="invscaling") + "learning_rate", + ["optimal", "invscaling", "constant"], + default_value="invscaling", + ) eta0 = UniformFloatHyperparameter( - "eta0", 1e-7, 1e-1, default_value=0.01, log=True) - power_t = UniformFloatHyperparameter("power_t", 1e-5, 1, - default_value=0.5) + "eta0", 1e-7, 1e-1, default_value=0.01, log=True + ) + power_t = UniformFloatHyperparameter("power_t", 1e-5, 1, default_value=0.5) average = CategoricalHyperparameter( - "average", ["False", "True"], default_value="False") - cs.add_hyperparameters([loss, penalty, alpha, l1_ratio, fit_intercept, - tol, epsilon, learning_rate, eta0, power_t, - average]) + "average", ["False", "True"], default_value="False" + ) + cs.add_hyperparameters( + [ + loss, + penalty, + alpha, + l1_ratio, + fit_intercept, + tol, + epsilon, + learning_rate, + eta0, + power_t, + average, + ] + ) # TODO add passive/aggressive here, although not properly documented? elasticnet = EqualsCondition(l1_ratio, penalty, "elasticnet") epsilon_condition = EqualsCondition(epsilon, loss, "modified_huber") - power_t_condition = EqualsCondition(power_t, learning_rate, - "invscaling") + power_t_condition = EqualsCondition(power_t, learning_rate, "invscaling") # eta0 is only relevant if learning_rate!='optimal' according to code # https://github.com/scikit-learn/scikit-learn/blob/0.19.X/sklearn/ # linear_model/sgd_fast.pyx#L603 - eta0_in_inv_con = InCondition(eta0, learning_rate, ["invscaling", - "constant"]) - cs.add_conditions([elasticnet, epsilon_condition, power_t_condition, - eta0_in_inv_con]) + eta0_in_inv_con = InCondition(eta0, learning_rate, ["invscaling", "constant"]) + cs.add_conditions( + [elasticnet, epsilon_condition, power_t_condition, eta0_in_inv_con] + ) return cs diff --git a/autosklearn/pipeline/components/data_preprocessing/__init__.py b/autosklearn/pipeline/components/data_preprocessing/__init__.py index 3ba5981965..5693efd441 100644 --- a/autosklearn/pipeline/components/data_preprocessing/__init__.py +++ b/autosklearn/pipeline/components/data_preprocessing/__init__.py @@ -1,18 +1,24 @@ +from typing import Dict, Optional, Type + import os from collections import OrderedDict -from typing import Dict, Optional, Type from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import CategoricalHyperparameter + from autosklearn.pipeline.base import PIPELINE_DATA_DTYPE -from ..base import find_components, \ - ThirdPartyComponents, AutoSklearnChoice, AutoSklearnPreprocessingAlgorithm +from ..base import ( + AutoSklearnChoice, + AutoSklearnPreprocessingAlgorithm, + ThirdPartyComponents, + find_components, +) classifier_directory = os.path.split(__file__)[0] -_preprocessors = find_components(__package__, - classifier_directory, - AutoSklearnPreprocessingAlgorithm) +_preprocessors = find_components( + __package__, classifier_directory, AutoSklearnPreprocessingAlgorithm +) _addons = ThirdPartyComponents(AutoSklearnPreprocessingAlgorithm) @@ -21,7 +27,6 @@ def add_preprocessor(preprocessor: Type[AutoSklearnPreprocessingAlgorithm]) -> N class DataPreprocessorChoice(AutoSklearnChoice): - @classmethod def get_components(cls) -> OrderedDict: components: OrderedDict = OrderedDict() @@ -29,23 +34,28 @@ def 
get_components(cls) -> OrderedDict: components.update(_addons.components) return components - def get_available_components(self, dataset_properties: Optional[Dict] = None, - include: Optional[Dict] = None, - exclude: Optional[Dict] = None) -> OrderedDict: + def get_available_components( + self, + dataset_properties: Optional[Dict] = None, + include: Optional[Dict] = None, + exclude: Optional[Dict] = None, + ) -> OrderedDict: if dataset_properties is None: dataset_properties = {} if include is not None and exclude is not None: raise ValueError( - "The argument include and exclude cannot be used together.") + "The argument include and exclude cannot be used together." + ) available_comp = self.get_components() if include is not None: for incl in include: if incl not in available_comp: - raise ValueError("Trying to include unknown component: " - "%s" % incl) + raise ValueError( + "Trying to include unknown component: " "%s" % incl + ) # TODO check for task type classification and/or regression! @@ -59,38 +69,47 @@ def get_available_components(self, dataset_properties: Optional[Dict] = None, entry = available_comp[name] # Exclude itself to avoid infinite loop - if entry == DataPreprocessorChoice or hasattr(entry, 'get_components'): + if entry == DataPreprocessorChoice or hasattr(entry, "get_components"): continue - target_type = dataset_properties['target_type'] - if target_type == 'classification': - if entry.get_properties()['handles_classification'] is False: + target_type = dataset_properties["target_type"] + if target_type == "classification": + if entry.get_properties()["handles_classification"] is False: continue - if dataset_properties.get('multiclass') is True and \ - entry.get_properties()['handles_multiclass'] is False: + if ( + dataset_properties.get("multiclass") is True + and entry.get_properties()["handles_multiclass"] is False + ): continue - if dataset_properties.get('multilabel') is True and \ - entry.get_properties()['handles_multilabel'] is False: + if ( + dataset_properties.get("multilabel") is True + and entry.get_properties()["handles_multilabel"] is False + ): continue - elif target_type == 'regression': - if entry.get_properties()['handles_regression'] is False: + elif target_type == "regression": + if entry.get_properties()["handles_regression"] is False: continue - if dataset_properties.get('multioutput') is True and \ - entry.get_properties()['handles_multioutput'] is False: + if ( + dataset_properties.get("multioutput") is True + and entry.get_properties()["handles_multioutput"] is False + ): continue else: - raise ValueError('Unknown target type %s' % target_type) + raise ValueError("Unknown target type %s" % target_type) components_dict[name] = entry return components_dict - def get_hyperparameter_search_space(self, dataset_properties: Optional[Dict] = None, - default: str = None, - include: Optional[Dict] = None, - exclude: Optional[Dict] = None) -> ConfigurationSpace: + def get_hyperparameter_search_space( + self, + dataset_properties: Optional[Dict] = None, + default: str = None, + include: Optional[Dict] = None, + exclude: Optional[Dict] = None, + ) -> ConfigurationSpace: cs = ConfigurationSpace() if dataset_properties is None: @@ -98,12 +117,11 @@ def get_hyperparameter_search_space(self, dataset_properties: Optional[Dict] = N # Compile a list of legal preprocessors for this problem available_preprocessors = self.get_available_components( - dataset_properties=dataset_properties, - include=include, exclude=exclude) + dataset_properties=dataset_properties, 
include=include, exclude=exclude + ) if len(available_preprocessors) == 0: - raise ValueError( - "No preprocessors found, please add NoPreprocessing") + raise ValueError("No preprocessors found, please add NoPreprocessing") if default is None: defaults = ["feature_type"] @@ -112,43 +130,48 @@ def get_hyperparameter_search_space(self, dataset_properties: Optional[Dict] = N default = default_ break - preprocessor = CategoricalHyperparameter('__choice__', - list(available_preprocessors.keys()), - default_value=default) + preprocessor = CategoricalHyperparameter( + "__choice__", list(available_preprocessors.keys()), default_value=default + ) cs.add_hyperparameter(preprocessor) for name in available_preprocessors: preprocessor_configuration_space = available_preprocessors[name]( - dataset_properties=dataset_properties). \ - get_hyperparameter_search_space(dataset_properties) - parent_hyperparameter = {'parent': preprocessor, 'value': name} - cs.add_configuration_space(name, preprocessor_configuration_space, - parent_hyperparameter=parent_hyperparameter) + dataset_properties=dataset_properties + ).get_hyperparameter_search_space(dataset_properties) + parent_hyperparameter = {"parent": preprocessor, "value": name} + cs.add_configuration_space( + name, + preprocessor_configuration_space, + parent_hyperparameter=parent_hyperparameter, + ) return cs def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE: return self.choice.transform(X) - def set_hyperparameters(self, configuration: ConfigurationSpace, - init_params: Optional[Dict] = None) -> 'DataPreprocessorChoice': + def set_hyperparameters( + self, configuration: ConfigurationSpace, init_params: Optional[Dict] = None + ) -> "DataPreprocessorChoice": config = {} params = configuration.get_dictionary() - choice = params['__choice__'] - del params['__choice__'] + choice = params["__choice__"] + del params["__choice__"] for param, value in params.items(): - param = param.replace(choice, '').split(':', 1)[1] + param = param.replace(choice, "").split(":", 1)[1] config[param] = value new_params = {} feat_type = None if init_params is not None: for param, value in init_params.items(): - param = param.replace(choice, '').split(':', 1)[-1] + param = param.replace(choice, "").split(":", 1)[-1] if "feat_type" in param: feat_type = value else: new_params[param] = value - self.choice = self.get_components()[choice](config=config, init_params=new_params, - feat_type=feat_type) + self.choice = self.get_components()[choice]( + config=config, init_params=new_params, feat_type=feat_type + ) return self diff --git a/autosklearn/pipeline/components/data_preprocessing/balancing/balancing.py b/autosklearn/pipeline/components/data_preprocessing/balancing/balancing.py index 7850a1665b..721fe63fc5 100644 --- a/autosklearn/pipeline/components/data_preprocessing/balancing/balancing.py +++ b/autosklearn/pipeline/components/data_preprocessing/balancing/balancing.py @@ -1,38 +1,47 @@ -from typing import Any, List, Dict, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union import numpy as np - from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import CategoricalHyperparameter - from sklearn.base import BaseEstimator from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE -from autosklearn.pipeline.components.base import \ - AutoSklearnPreprocessingAlgorithm -from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, SIGNED_DATA, INPUT +from 
autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm +from autosklearn.pipeline.constants import ( + DENSE, + INPUT, + SIGNED_DATA, + SPARSE, + UNSIGNED_DATA, +) class Balancing(AutoSklearnPreprocessingAlgorithm): def __init__( self, - strategy: str = 'none', + strategy: str = "none", random_state: Optional[Union[int, np.random.RandomState]] = None, ) -> None: self.strategy = strategy self.random_state = random_state - def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None) -> 'Balancing': + def fit( + self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None + ) -> "Balancing": self.fitted_ = True return self def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE: return X - def get_weights(self, Y: PIPELINE_DATA_DTYPE, - classifier: BaseEstimator, preprocessor: BaseEstimator, - init_params: Optional[Dict[str, Any]], fit_params: Optional[Dict[str, Any]], - ) -> Tuple[Optional[Dict[str, Any]], Optional[Dict[str, Any]]]: + def get_weights( + self, + Y: PIPELINE_DATA_DTYPE, + classifier: BaseEstimator, + preprocessor: BaseEstimator, + init_params: Optional[Dict[str, Any]], + fit_params: Optional[Dict[str, Any]], + ) -> Tuple[Optional[Dict[str, Any]], Optional[Dict[str, Any]]]: if init_params is None: init_params = {} @@ -45,12 +54,18 @@ def get_weights(self, Y: PIPELINE_DATA_DTYPE, # https://github.com/scikit-learn/scikit-learn/blob/0.15.X/sklearn/ensemble/weight_boosting.py#L121 # Have RF and ET in here because they emit a warning if class_weights # are used together with warmstarts - clf_ = ['adaboost', 'random_forest', 'extra_trees', 'sgd', 'passive_aggressive', - 'gradient_boosting'] + clf_ = [ + "adaboost", + "random_forest", + "extra_trees", + "sgd", + "passive_aggressive", + "gradient_boosting", + ] pre_: List[str] = [] if classifier in clf_ or preprocessor in pre_: if len(Y.shape) > 1: - offsets = [2 ** i for i in range(Y.shape[1])] + offsets = [2**i for i in range(Y.shape[1])] Y_ = np.sum(Y * offsets, axis=1) else: Y_ = Y @@ -68,65 +83,68 @@ def get_weights(self, Y: PIPELINE_DATA_DTYPE, sample_weights[mask] *= cw[i] if classifier in clf_: - fit_params['classifier:sample_weight'] = sample_weights + fit_params["classifier:sample_weight"] = sample_weights if preprocessor in pre_: - fit_params['feature_preprocessor:sample_weight'] = sample_weights + fit_params["feature_preprocessor:sample_weight"] = sample_weights # Classifiers which can adjust sample weights themselves via the # argument `class_weight` - clf_ = ['decision_tree', 'liblinear_svc', - 'libsvm_svc'] - pre_ = ['liblinear_svc_preprocessor', - 'extra_trees_preproc_for_classification'] + clf_ = ["decision_tree", "liblinear_svc", "libsvm_svc"] + pre_ = ["liblinear_svc_preprocessor", "extra_trees_preproc_for_classification"] if classifier in clf_: - init_params['classifier:class_weight'] = 'balanced' + init_params["classifier:class_weight"] = "balanced" if preprocessor in pre_: - init_params['feature_preprocessor:class_weight'] = 'balanced' + init_params["feature_preprocessor:class_weight"] = "balanced" - clf_ = ['ridge'] + clf_ = ["ridge"] if classifier in clf_: class_weights = {} unique, counts = np.unique(Y, return_counts=True) - cw = 1. 
/ counts + cw = 1.0 / counts cw = cw / np.mean(cw) for i, ue in enumerate(unique): class_weights[ue] = cw[i] if classifier in clf_: - init_params['classifier:class_weight'] = class_weights + init_params["classifier:class_weight"] = class_weights return init_params, fit_params @staticmethod - def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None - ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: - return {'shortname': 'Balancing', - 'name': 'Balancing Imbalanced Class Distributions', - 'handles_missing_values': True, - 'handles_nominal_values': True, - 'handles_numerical_features': True, - 'prefers_data_scaled': False, - 'prefers_data_normalized': False, - 'handles_regression': False, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': False, - 'is_deterministic': True, - 'handles_sparse': True, - 'handles_dense': True, - 'input': (DENSE, SPARSE, UNSIGNED_DATA, SIGNED_DATA), - 'output': (INPUT,), - 'preferred_dtype': None} + def get_properties( + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: + return { + "shortname": "Balancing", + "name": "Balancing Imbalanced Class Distributions", + "handles_missing_values": True, + "handles_nominal_values": True, + "handles_numerical_features": True, + "prefers_data_scaled": False, + "prefers_data_normalized": False, + "handles_regression": False, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": False, + "is_deterministic": True, + "handles_sparse": True, + "handles_dense": True, + "input": (DENSE, SPARSE, UNSIGNED_DATA, SIGNED_DATA), + "output": (INPUT,), + "preferred_dtype": None, + } @staticmethod - def get_hyperparameter_search_space(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None - ) -> ConfigurationSpace: + def get_hyperparameter_search_space( + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> ConfigurationSpace: # TODO add replace by zero! 
strategy = CategoricalHyperparameter( - "strategy", ["none", "weighting"], default_value="none") + "strategy", ["none", "weighting"], default_value="none" + ) cs = ConfigurationSpace() cs.add_hyperparameter(strategy) return cs diff --git a/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/__init__.py b/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/__init__.py index c4d34ab306..5d1647b24a 100644 --- a/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/__init__.py +++ b/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/__init__.py @@ -1,33 +1,34 @@ -from collections import OrderedDict -import os - from typing import Any, Dict, Optional +import os +from collections import OrderedDict + from ConfigSpace import Configuration from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import CategoricalHyperparameter - from sklearn.base import BaseEstimator -from ...base import AutoSklearnPreprocessingAlgorithm, find_components, \ - ThirdPartyComponents, AutoSklearnChoice, _addons - from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE +from ...base import ( + AutoSklearnChoice, + AutoSklearnPreprocessingAlgorithm, + ThirdPartyComponents, + _addons, + find_components, +) + ohe_directory = os.path.split(__file__)[0] -_ohes = find_components(__package__, - ohe_directory, - AutoSklearnPreprocessingAlgorithm) +_ohes = find_components(__package__, ohe_directory, AutoSklearnPreprocessingAlgorithm) additional_components = ThirdPartyComponents(AutoSklearnPreprocessingAlgorithm) -_addons['data_preprocessing.categorical_encoding'] = additional_components +_addons["data_preprocessing.categorical_encoding"] = additional_components -def add_ohe(ohe: 'OHEChoice') -> None: +def add_ohe(ohe: "OHEChoice") -> None: additional_components.add_component(ohe) class OHEChoice(AutoSklearnChoice): - @classmethod def get_components(cls: BaseEstimator) -> Dict[str, BaseEstimator]: components: Dict[str, BaseEstimator] = OrderedDict() @@ -49,48 +50,52 @@ def get_hyperparameter_search_space( # Compile a list of legal preprocessors for this problem available_preprocessors = self.get_available_components( - dataset_properties=dataset_properties, - include=include, exclude=exclude) + dataset_properties=dataset_properties, include=include, exclude=exclude + ) if len(available_preprocessors) == 0: raise ValueError( "No ohe hot encoders found, please add any one hot encoder " - "component.") + "component." + ) if default is None: - defaults = ['one_hot_encoding', 'no_encoding'] + defaults = ["one_hot_encoding", "no_encoding"] for default_ in defaults: if default_ in available_preprocessors: default = default_ break - preprocessor = CategoricalHyperparameter('__choice__', - list( - available_preprocessors.keys()), - default_value=default) + preprocessor = CategoricalHyperparameter( + "__choice__", list(available_preprocessors.keys()), default_value=default + ) cs.add_hyperparameter(preprocessor) for name in available_preprocessors: - preprocessor_configuration_space = available_preprocessors[name]. 
\ - get_hyperparameter_search_space(dataset_properties) - parent_hyperparameter = {'parent': preprocessor, 'value': name} - cs.add_configuration_space(name, preprocessor_configuration_space, - parent_hyperparameter=parent_hyperparameter) + preprocessor_configuration_space = available_preprocessors[ + name + ].get_hyperparameter_search_space(dataset_properties) + parent_hyperparameter = {"parent": preprocessor, "value": name} + cs.add_configuration_space( + name, + preprocessor_configuration_space, + parent_hyperparameter=parent_hyperparameter, + ) self.configuration_space = cs self.dataset_properties = dataset_properties return cs - def set_hyperparameters(self, configuration: Configuration, - init_params: Optional[Dict[str, Any]] = None - ) -> 'OHEChoice': + def set_hyperparameters( + self, configuration: Configuration, init_params: Optional[Dict[str, Any]] = None + ) -> "OHEChoice": new_params = {} params = configuration.get_dictionary() - choice = params['__choice__'] - del params['__choice__'] + choice = params["__choice__"] + del params["__choice__"] for param, value in params.items(): - param = param.replace(choice, '').replace(':', '') + param = param.replace(choice, "").replace(":", "") new_params[param] = value if init_params is not None: @@ -100,10 +105,10 @@ def set_hyperparameters(self, configuration: Configuration, # in order to not pass it to the no encoding if choice not in param: continue - param = param.replace(choice, '').replace(':', '') + param = param.replace(choice, "").replace(":", "") new_params[param] = value - new_params['random_state'] = self.random_state + new_params["random_state"] = self.random_state self.new_params = new_params self.choice = self.get_components()[choice](**new_params) diff --git a/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/encoding.py b/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/encoding.py index 3ebb411457..43d578219f 100644 --- a/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/encoding.py +++ b/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/encoding.py @@ -1,30 +1,29 @@ from typing import Dict, Optional, Tuple, Union import numpy as np - -from ConfigSpace.configuration_space import ConfigurationSpace - import scipy.sparse - +from ConfigSpace.configuration_space import ConfigurationSpace from sklearn.preprocessing import OrdinalEncoder from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm -from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT +from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA class OrdinalEncoding(AutoSklearnPreprocessingAlgorithm): def __init__( - self, - random_state: Optional[Union[int, np.random.RandomState]] = None + self, random_state: Optional[Union[int, np.random.RandomState]] = None ) -> None: self.random_state = random_state - def fit(self, X: PIPELINE_DATA_DTYPE, - y: Optional[PIPELINE_DATA_DTYPE] = None) -> 'OrdinalEncoding': + def fit( + self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None + ) -> "OrdinalEncoding": if not scipy.sparse.issparse(X): self.preprocessor = OrdinalEncoder( - categories='auto', handle_unknown='use_encoded_value', unknown_value=-1, + categories="auto", + handle_unknown="use_encoded_value", + unknown_value=-1, ) self.preprocessor.fit(X, y) return self @@ -50,20 +49,23 @@ def transform(self, X: 
PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE: return self.preprocessor.transform(X) + 1 @staticmethod - def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None - ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: - return {'shortname': 'OrdinalEncoder', - 'name': 'Ordinal Encoder', - 'handles_regression': True, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': True, - # TODO find out of this is right! - 'handles_sparse': True, - 'handles_dense': True, - 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (INPUT,), } + def get_properties( + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: + return { + "shortname": "OrdinalEncoder", + "name": "Ordinal Encoder", + "handles_regression": True, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": True, + # TODO find out if this is right! + "handles_sparse": True, + "handles_dense": True, + "input": (DENSE, SPARSE, UNSIGNED_DATA), + "output": (INPUT,), + } @staticmethod def get_hyperparameter_search_space( diff --git a/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/no_encoding.py b/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/no_encoding.py index ab196396ed..028a4fb9c1 100644 --- a/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/no_encoding.py +++ b/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/no_encoding.py @@ -1,24 +1,23 @@ from typing import Dict, Optional, Tuple, Union -import numpy as np +import numpy as np from ConfigSpace.configuration_space import ConfigurationSpace from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE -from autosklearn.pipeline.components.base import \ - AutoSklearnPreprocessingAlgorithm -from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT +from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm +from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA class NoEncoding(AutoSklearnPreprocessingAlgorithm): def __init__( - self, - random_state: Optional[Union[int, np.random.RandomState]] = None + self, random_state: Optional[Union[int, np.random.RandomState]] = None ) -> None: pass - def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None - ) -> 'NoEncoding': - self.preprocessor = 'passthrough' + def fit( + self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None + ) -> "NoEncoding": + self.preprocessor = "passthrough" + self.fitted_ = True return self @@ -26,22 +25,26 @@ def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE: return X @staticmethod - def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None - ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: - return {'shortname': 'no encoding', - 'name': 'No categorical variable encoding', - 'handles_regression': True, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': True, - 'handles_sparse': True, - 'handles_dense': True, - 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (INPUT,)} + def get_properties( + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: + return { + "shortname": "no encoding", + "name": "No categorical 
variable encoding", + "handles_regression": True, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": True, + "handles_sparse": True, + "handles_dense": True, + "input": (DENSE, SPARSE, UNSIGNED_DATA), + "output": (INPUT,), + } @staticmethod - def get_hyperparameter_search_space(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None - ) -> ConfigurationSpace: + def get_hyperparameter_search_space( + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> ConfigurationSpace: cs = ConfigurationSpace() return cs diff --git a/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/one_hot_encoding.py b/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/one_hot_encoding.py index 8973d1979f..9b9ee87c81 100644 --- a/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/one_hot_encoding.py +++ b/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/one_hot_encoding.py @@ -1,33 +1,31 @@ from typing import Dict, Optional, Tuple, Union -from ConfigSpace.configuration_space import ConfigurationSpace - +import numpy as np import scipy.sparse - +from ConfigSpace.configuration_space import ConfigurationSpace from sklearn.preprocessing import OneHotEncoder as DenseOneHotEncoder -import numpy as np - from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE -from autosklearn.pipeline.implementations.SparseOneHotEncoder import SparseOneHotEncoder from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm -from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT +from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA +from autosklearn.pipeline.implementations.SparseOneHotEncoder import SparseOneHotEncoder class OneHotEncoder(AutoSklearnPreprocessingAlgorithm): def __init__( - self, - random_state: Optional[Union[int, np.random.RandomState]] = None + self, random_state: Optional[Union[int, np.random.RandomState]] = None ) -> None: self.random_state = random_state - def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None - ) -> 'OneHotEncoder': + def fit( + self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None + ) -> "OneHotEncoder": if scipy.sparse.issparse(X): self.preprocessor = SparseOneHotEncoder() else: self.preprocessor = DenseOneHotEncoder( - sparse=False, categories='auto', handle_unknown='ignore') + sparse=False, categories="auto", handle_unknown="ignore" + ) self.preprocessor.fit(X, y) return self @@ -37,22 +35,26 @@ def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE: return self.preprocessor.transform(X) @staticmethod - def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None - ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: - return {'shortname': '1Hot', - 'name': 'One Hot Encoder', - 'handles_regression': True, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': True, - # TODO find out of this is right! 
- 'handles_sparse': True, - 'handles_dense': True, - 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (INPUT,), } + def get_properties( + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: + return { + "shortname": "1Hot", + "name": "One Hot Encoder", + "handles_regression": True, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": True, + # TODO find out if this is right! + "handles_sparse": True, + "handles_dense": True, + "input": (DENSE, SPARSE, UNSIGNED_DATA), + "output": (INPUT,), + } @staticmethod - def get_hyperparameter_search_space(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None - ) -> ConfigurationSpace: + def get_hyperparameter_search_space( + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> ConfigurationSpace: return ConfigurationSpace() diff --git a/autosklearn/pipeline/components/data_preprocessing/category_shift/category_shift.py b/autosklearn/pipeline/components/data_preprocessing/category_shift/category_shift.py index 318a84ee2d..f2dc2bf304 100644 --- a/autosklearn/pipeline/components/data_preprocessing/category_shift/category_shift.py +++ b/autosklearn/pipeline/components/data_preprocessing/category_shift/category_shift.py @@ -1,32 +1,32 @@ from typing import Dict, Optional, Tuple, Union -from ConfigSpace.configuration_space import ConfigurationSpace - import numpy as np +from ConfigSpace.configuration_space import ConfigurationSpace -from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE import autosklearn.pipeline.implementations.CategoryShift +from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm -from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT +from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA class CategoryShift(AutoSklearnPreprocessingAlgorithm): - """ Add 3 to every category. + """Add 3 to every category. Down in the pipeline, category 2 will be attribute to missing values, category 1 will be assigned to low occurence categories, and category 0 is not used, so to provide compatibility with sparse matrices. 
""" def __init__( - self, - random_state: Optional[Union[int, np.random.RandomState]] = None + self, random_state: Optional[Union[int, np.random.RandomState]] = None ) -> None: self.random_state = random_state - def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None - ) -> 'CategoryShift': - self.preprocessor = autosklearn.pipeline.implementations.CategoryShift\ - .CategoryShift() + def fit( + self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None + ) -> "CategoryShift": + self.preprocessor = ( + autosklearn.pipeline.implementations.CategoryShift.CategoryShift() + ) self.preprocessor.fit(X, y) return self @@ -36,29 +36,33 @@ def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE: return self.preprocessor.transform(X) @staticmethod - def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None - ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: - return {'shortname': 'CategShift', - 'name': 'Category Shift', - 'handles_missing_values': True, - 'handles_nominal_values': True, - 'handles_numerical_features': True, - 'prefers_data_scaled': False, - 'prefers_data_normalized': False, - 'handles_regression': True, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': True, - 'is_deterministic': True, - # TODO find out of this is right! - 'handles_sparse': True, - 'handles_dense': True, - 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (INPUT,), - 'preferred_dtype': None} + def get_properties( + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: + return { + "shortname": "CategShift", + "name": "Category Shift", + "handles_missing_values": True, + "handles_nominal_values": True, + "handles_numerical_features": True, + "prefers_data_scaled": False, + "prefers_data_normalized": False, + "handles_regression": True, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": True, + "is_deterministic": True, + # TODO find out of this is right! 
+ "handles_sparse": True, + "handles_dense": True, + "input": (DENSE, SPARSE, UNSIGNED_DATA), + "output": (INPUT,), + "preferred_dtype": None, + } @staticmethod - def get_hyperparameter_search_space(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None - ) -> ConfigurationSpace: + def get_hyperparameter_search_space( + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> ConfigurationSpace: return ConfigurationSpace() diff --git a/autosklearn/pipeline/components/data_preprocessing/feature_reduction/truncated_svd.py b/autosklearn/pipeline/components/data_preprocessing/feature_reduction/truncated_svd.py index 3bc4e7c002..c104a18fd7 100644 --- a/autosklearn/pipeline/components/data_preprocessing/feature_reduction/truncated_svd.py +++ b/autosklearn/pipeline/components/data_preprocessing/feature_reduction/truncated_svd.py @@ -1,15 +1,13 @@ from typing import Dict, Optional, Tuple, Union -from ConfigSpace.configuration_space import ConfigurationSpace import ConfigSpace.hyperparameters as CSH - import numpy as np +from ConfigSpace.configuration_space import ConfigurationSpace +from sklearn.decomposition import TruncatedSVD from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm -from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT - -from sklearn.decomposition import TruncatedSVD +from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA class FeatureReduction(AutoSklearnPreprocessingAlgorithm): @@ -20,22 +18,27 @@ class FeatureReduction(AutoSklearnPreprocessingAlgorithm): def __init__( self, n_components: Optional[int] = None, - random_state: Optional[Union[int, np.random.RandomState]] = None + random_state: Optional[Union[int, np.random.RandomState]] = None, ) -> None: self.n_components = n_components self.random_state = random_state - def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None - ) -> 'FeatureReduction': + def fit( + self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None + ) -> "FeatureReduction": if X.shape[1] > self.n_components: - self.preprocessor = TruncatedSVD(n_components=self.n_components, - random_state=self.random_state) + self.preprocessor = TruncatedSVD( + n_components=self.n_components, random_state=self.random_state + ) elif X.shape[1] <= self.n_components and X.shape[1] != 1: - self.preprocessor = TruncatedSVD(n_components=X.shape[1] - 1, - random_state=self.random_state) + self.preprocessor = TruncatedSVD( + n_components=X.shape[1] - 1, random_state=self.random_state + ) else: - raise ValueError("The text embedding consists only of a single dimension.\n" - "Are you sure that your text data is necessary?") + raise ValueError( + "The text embedding consists only of a single dimension.\n" + "Are you sure that your text data is necessary?" 
+ ) self.preprocessor.fit(X) return self @@ -45,32 +48,38 @@ def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE: return self.preprocessor.transform(X) @staticmethod - def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None - ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: - return {'shortname': 'TextFeatureReduction', - 'name': 'TextFeatureReduction', - 'handles_missing_values': True, - 'handles_nominal_values': True, - 'handles_numerical_features': True, - 'prefers_data_scaled': False, - 'prefers_data_normalized': False, - 'handles_regression': True, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': True, - 'is_deterministic': True, - 'handles_sparse': True, - 'handles_dense': True, - 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (INPUT,), - 'preferred_dtype': None} + def get_properties( + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: + return { + "shortname": "TextFeatureReduction", + "name": "TextFeatureReduction", + "handles_missing_values": True, + "handles_nominal_values": True, + "handles_numerical_features": True, + "prefers_data_scaled": False, + "prefers_data_normalized": False, + "handles_regression": True, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": True, + "is_deterministic": True, + "handles_sparse": True, + "handles_dense": True, + "input": (DENSE, SPARSE, UNSIGNED_DATA), + "output": (INPUT,), + "preferred_dtype": None, + } @staticmethod - def get_hyperparameter_search_space(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None - ) -> ConfigurationSpace: + def get_hyperparameter_search_space( + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> ConfigurationSpace: cs = ConfigurationSpace() cs.add_hyperparameter( - CSH.UniformIntegerHyperparameter("n_components", lower=1, upper=10000, - default_value=100, log=True)) + CSH.UniformIntegerHyperparameter( + "n_components", lower=1, upper=10000, default_value=100, log=True + ) + ) return cs diff --git a/autosklearn/pipeline/components/data_preprocessing/feature_type.py b/autosklearn/pipeline/components/data_preprocessing/feature_type.py index cfd31e8c3a..5c37e4cb98 100644 --- a/autosklearn/pipeline/components/data_preprocessing/feature_type.py +++ b/autosklearn/pipeline/components/data_preprocessing/feature_type.py @@ -1,38 +1,40 @@ -from typing import Any, List, Dict, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union +import numpy as np import sklearn.compose -from scipy import sparse - from ConfigSpace import Configuration from ConfigSpace.configuration_space import ConfigurationSpace - -import numpy as np - +from scipy import sparse from sklearn.base import BaseEstimator +from autosklearn.data.validation import SUPPORTED_FEAT_TYPES, SUPPORTED_TARGET_TYPES from autosklearn.pipeline.base import ( - BasePipeline, - DATASET_PROPERTIES_TYPE, - PIPELINE_DATA_DTYPE, - ) -from autosklearn.pipeline.components.data_preprocessing.feature_type_categorical \ - import CategoricalPreprocessingPipeline -from autosklearn.pipeline.components.data_preprocessing.feature_type_numerical \ - import NumericalPreprocessingPipeline -from autosklearn.pipeline.components.data_preprocessing.feature_type_text \ - import TextPreprocessingPipeline -from autosklearn.pipeline.components.base import AutoSklearnComponent, 
AutoSklearnChoice, \ - AutoSklearnPreprocessingAlgorithm -from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT -from autosklearn.data.validation import ( - SUPPORTED_FEAT_TYPES, - SUPPORTED_TARGET_TYPES, + DATASET_PROPERTIES_TYPE, + PIPELINE_DATA_DTYPE, + BasePipeline, +) +from autosklearn.pipeline.components.base import ( + AutoSklearnChoice, + AutoSklearnComponent, + AutoSklearnPreprocessingAlgorithm, +) +from autosklearn.pipeline.components.data_preprocessing.feature_type_categorical import ( # noqa : E501 + CategoricalPreprocessingPipeline, ) +from autosklearn.pipeline.components.data_preprocessing.feature_type_numerical import ( + NumericalPreprocessingPipeline, +) +from autosklearn.pipeline.components.data_preprocessing.feature_type_text import ( + TextPreprocessingPipeline, +) +from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA class FeatTypeSplit(AutoSklearnPreprocessingAlgorithm): - """ This component is used to apply distinct transformations to categorical, - numerical and text features of a dataset. It is built on top of sklearn's ColumnTransformer. + """ + This component is used to apply distinct transformations to categorical, + numerical and text features of a dataset. It is built on top of sklearn's + ColumnTransformer. """ def __init__( @@ -70,9 +72,14 @@ def __init__( # TODO: Extract the child configuration space from the FeatTypeSplit to the # pipeline if needed self.categ_ppl = CategoricalPreprocessingPipeline( - config=None, steps=pipeline, dataset_properties=dataset_properties, - include=include, exclude=exclude, random_state=random_state, - init_params=init_params) + config=None, + steps=pipeline, + dataset_properties=dataset_properties, + include=include, + exclude=exclude, + random_state=random_state, + init_params=init_params, + ) # The pipeline that will be applied to the numerical features (i.e. columns) # of the dataset # Configuration of the data-preprocessor is different from the configuration of @@ -81,9 +88,14 @@ def __init__( # TODO: Extract the child configuration space from the FeatTypeSplit to the # pipeline if needed self.numer_ppl = NumericalPreprocessingPipeline( - config=None, steps=pipeline, dataset_properties=dataset_properties, - include=include, exclude=exclude, random_state=random_state, - init_params=init_params) + config=None, + steps=pipeline, + dataset_properties=dataset_properties, + include=include, + exclude=exclude, + random_state=random_state, + init_params=init_params, + ) # The pipeline that will be applied to the text features (i.e. 
columns) # of the dataset @@ -93,9 +105,14 @@ def __init__( # TODO: Extract the child configuration space from the FeatTypeSplit to the # pipeline if needed self.txt_ppl = TextPreprocessingPipeline( - config=None, steps=pipeline, dataset_properties=dataset_properties, - include=include, exclude=exclude, random_state=random_state, - init_params=init_params) + config=None, + steps=pipeline, + dataset_properties=dataset_properties, + include=include, + exclude=exclude, + random_state=random_state, + init_params=init_params, + ) self._transformers: List[Tuple[str, AutoSklearnComponent]] = [ ("categorical_transformer", self.categ_ppl), @@ -106,8 +123,9 @@ def __init__( self.set_hyperparameters(self.config, init_params=init_params) self.column_transformer = column_transformer - def fit(self, X: SUPPORTED_FEAT_TYPES, y: Optional[SUPPORTED_TARGET_TYPES] = None - ) -> 'FeatTypeSplit': + def fit( + self, X: SUPPORTED_FEAT_TYPES, y: Optional[SUPPORTED_TARGET_TYPES] = None + ) -> "FeatTypeSplit": n_feats = X.shape[1] categorical_features = [] @@ -116,78 +134,99 @@ def fit(self, X: SUPPORTED_FEAT_TYPES, y: Optional[SUPPORTED_TARGET_TYPES] = Non if self.feat_type is not None: # Make sure that we are not missing any column! expected = set(self.feat_type.keys()) - if hasattr(X, 'columns'): + if hasattr(X, "columns"): columns = set(X.columns) else: columns = set(range(n_feats)) if expected != columns: - raise ValueError(f"Train data has columns={expected} yet the" - f" feat_types are feat={columns}") - categorical_features = [key for key, value in self.feat_type.items() - if value.lower() == 'categorical'] - numerical_features = [key for key, value in self.feat_type.items() - if value.lower() == 'numerical'] - text_features = [key for key, value in self.feat_type.items() - if value.lower() == "string"] + raise ValueError( + f"Train data has columns={expected} yet the" + f" feat_types are feat={columns}" + ) + categorical_features = [ + key + for key, value in self.feat_type.items() + if value.lower() == "categorical" + ] + numerical_features = [ + key + for key, value in self.feat_type.items() + if value.lower() == "numerical" + ] + text_features = [ + key + for key, value in self.feat_type.items() + if value.lower() == "string" + ] sklearn_transf_spec = [ (name, transformer, feature_columns) - for name, transformer, feature_columns - in [ + for name, transformer, feature_columns in [ ("text_transformer", self.txt_ppl, text_features), ("categorical_transformer", self.categ_ppl, categorical_features), - ("numerical_transformer", self.numer_ppl, numerical_features) + ("numerical_transformer", self.numer_ppl, numerical_features), ] if len(feature_columns) > 0 ] else: # self.feature_type == None assumes numerical case - sklearn_transf_spec = [("numerical_transformer", self.numer_ppl, [True]*n_feats)] + sklearn_transf_spec = [ + ("numerical_transformer", self.numer_ppl, [True] * n_feats) + ] # And one last check in case feat type is None # And to make sure the final specification has all the columns # considered in the column transformer - total_columns = sum([len(features) for name, ppl, features in sklearn_transf_spec]) + total_columns = sum( + [len(features) for name, ppl, features in sklearn_transf_spec] + ) if total_columns != n_feats: - raise ValueError("Missing columns in the specification of the data validator" - f" for train data={np.shape(X)} and spec={sklearn_transf_spec}") + raise ValueError( + "Missing columns in the specification of the data validator" + f" for train data={np.shape(X)} and 
spec={sklearn_transf_spec}" + ) self.sparse_ = sparse.issparse(X) or self.force_sparse_output self.column_transformer = sklearn.compose.ColumnTransformer( transformers=sklearn_transf_spec, sparse_threshold=float(self.sparse_), - ) + ) self.column_transformer.fit(X, y) return self def transform(self, X: SUPPORTED_FEAT_TYPES) -> PIPELINE_DATA_DTYPE: if self.column_transformer is None: - raise ValueError("Cannot call transform on a Datapreprocessor that has not" - "yet been fit. Please check the log files for errors " - "while trying to fit the model." - ) + raise ValueError( + "Cannot call transform on a Datapreprocessor that has not" + "yet been fit. Please check the log files for errors " + "while trying to fit the model." + ) return self.column_transformer.transform(X) @staticmethod - def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None - ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: - return {'shortname': 'FeatTypeSplit', - 'name': 'Feature Type Splitter', - 'handles_regression': True, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': True, - # TODO find out of this is right! - 'handles_sparse': True, - 'handles_dense': True, - 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (INPUT,), } - - def set_hyperparameters(self, configuration: Configuration, - init_params: Optional[Dict[str, Any]] = None) -> 'FeatTypeSplit': - if init_params is not None and 'feat_type' in init_params.keys(): - self.feat_type = init_params['feat_type'] + def get_properties( + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: + return { + "shortname": "FeatTypeSplit", + "name": "Feature Type Splitter", + "handles_regression": True, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": True, + # TODO find out of this is right! 
+ "handles_sparse": True, + "handles_dense": True, + "input": (DENSE, SPARSE, UNSIGNED_DATA), + "output": (INPUT,), + } + + def set_hyperparameters( + self, configuration: Configuration, init_params: Optional[Dict[str, Any]] = None + ) -> "FeatTypeSplit": + if init_params is not None and "feat_type" in init_params.keys(): + self.feat_type = init_params["feat_type"] self.config = configuration @@ -197,29 +236,32 @@ def set_hyperparameters(self, configuration: Configuration, ) sub_config_dict = {} for param in configuration: - if param.startswith('%s:' % transf_name): + if param.startswith("%s:" % transf_name): value = configuration[param] - new_name = param.replace('%s:' % transf_name, '', 1) + new_name = param.replace("%s:" % transf_name, "", 1) sub_config_dict[new_name] = value - sub_configuration = Configuration(sub_configuration_space, - values=sub_config_dict) + sub_configuration = Configuration( + sub_configuration_space, values=sub_config_dict + ) sub_init_params_dict: Optional[Dict[str, Any]] = None if init_params is not None: sub_init_params_dict = {} for param in init_params: - if param.startswith('%s:' % transf_name): + if param.startswith("%s:" % transf_name): value = init_params[param] - new_name = param.replace('%s:' % transf_name, '', 1) + new_name = param.replace("%s:" % transf_name, "", 1) sub_init_params_dict[new_name] = value - if isinstance(transf_op, ( - AutoSklearnChoice, AutoSklearnComponent, BasePipeline)): + if isinstance( + transf_op, (AutoSklearnChoice, AutoSklearnComponent, BasePipeline) + ): transf_op.set_hyperparameters( - configuration=sub_configuration, init_params=sub_init_params_dict) + configuration=sub_configuration, init_params=sub_init_params_dict + ) else: - raise NotImplementedError('Not supported yet!') + raise NotImplementedError("Not supported yet!") return self @@ -230,7 +272,8 @@ def get_hyperparameter_search_space( self.dataset_properties = dataset_properties cs = ConfigurationSpace() cs = FeatTypeSplit._get_hyperparameter_search_space_recursevely( - dataset_properties, cs, self._transformers) + dataset_properties, cs, self._transformers + ) return cs @staticmethod @@ -243,8 +286,10 @@ def _get_hyperparameter_search_space_recursevely( if hasattr(st_operation, "get_hyperparameter_search_space"): cs.add_configuration_space( st_name, - st_operation.get_hyperparameter_search_space(dataset_properties)) + st_operation.get_hyperparameter_search_space(dataset_properties), + ) else: return FeatTypeSplit._get_hyperparameter_search_space_recursevely( - dataset_properties, cs, st_operation) + dataset_properties, cs, st_operation + ) return cs diff --git a/autosklearn/pipeline/components/data_preprocessing/feature_type_categorical.py b/autosklearn/pipeline/components/data_preprocessing/feature_type_categorical.py index f8430aa978..0a46887799 100644 --- a/autosklearn/pipeline/components/data_preprocessing/feature_type_categorical.py +++ b/autosklearn/pipeline/components/data_preprocessing/feature_type_categorical.py @@ -1,27 +1,26 @@ -from typing import Any, List, Dict, Optional, Tuple, Union - -from ConfigSpace.configuration_space import Configuration, ConfigurationSpace +from typing import Any, Dict, List, Optional, Tuple, Union import numpy as np - +from ConfigSpace.configuration_space import Configuration, ConfigurationSpace from sklearn.base import BaseEstimator -from autosklearn.pipeline.components.data_preprocessing.category_shift.\ - category_shift import CategoryShift -from autosklearn.pipeline.components.data_preprocessing.imputation.\ - 
categorical_imputation import CategoricalImputation -from autosklearn.pipeline.components.data_preprocessing.minority_coalescense \ - import CoalescenseChoice -from autosklearn.pipeline.components.data_preprocessing.categorical_encoding \ - import OHEChoice -from autosklearn.pipeline.components.data_preprocessing.categorical_encoding.encoding import ( - OrdinalEncoding +from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, BasePipeline +from autosklearn.pipeline.components.data_preprocessing.categorical_encoding import ( # noqa: E501 + OHEChoice, +) +from autosklearn.pipeline.components.data_preprocessing.categorical_encoding.encoding import ( # noqa: E501 + OrdinalEncoding, +) +from autosklearn.pipeline.components.data_preprocessing.category_shift.category_shift import ( # noqa: E501 + CategoryShift, ) -from autosklearn.pipeline.base import ( - BasePipeline, - DATASET_PROPERTIES_TYPE, +from autosklearn.pipeline.components.data_preprocessing.imputation.categorical_imputation import ( # noqa: E501 + CategoricalImputation, ) -from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT +from autosklearn.pipeline.components.data_preprocessing.minority_coalescense import ( + CoalescenseChoice, +) +from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA class CategoricalPreprocessingPipeline(BasePipeline): @@ -53,35 +52,43 @@ def __init__( include: Optional[Dict[str, str]] = None, exclude: Optional[Dict[str, str]] = None, random_state: Optional[Union[int, np.random.RandomState]] = None, - init_params: Optional[Dict[str, Any]] = None + init_params: Optional[Dict[str, Any]] = None, ) -> None: self._output_dtype = np.int32 super().__init__( - config, steps, dataset_properties, include, exclude, - random_state, init_params + config, + steps, + dataset_properties, + include, + exclude, + random_state, + init_params, ) @staticmethod - def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None - ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: - return {'shortname': 'cat_datapreproc', - 'name': 'categorical data preprocessing', - 'handles_missing_values': True, - 'handles_nominal_values': True, - 'handles_numerical_features': True, - 'prefers_data_scaled': False, - 'prefers_data_normalized': False, - 'handles_regression': True, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'is_deterministic': True, - # TODO find out if this is right! - 'handles_sparse': True, - 'handles_dense': True, - 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (INPUT,), - 'preferred_dtype': None} + def get_properties( + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: + return { + "shortname": "cat_datapreproc", + "name": "categorical data preprocessing", + "handles_missing_values": True, + "handles_nominal_values": True, + "handles_numerical_features": True, + "prefers_data_scaled": False, + "prefers_data_normalized": False, + "handles_regression": True, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "is_deterministic": True, + # TODO find out if this is right! 
+ "handles_sparse": True, + "handles_dense": True, + "input": (DENSE, SPARSE, UNSIGNED_DATA), + "output": (INPUT,), + "preferred_dtype": None, + } def _get_hyperparameter_search_space( self, @@ -102,27 +109,34 @@ def _get_hyperparameter_search_space( dataset_properties = dict() cs = self._get_base_search_space( - cs=cs, dataset_properties=dataset_properties, - exclude=exclude, include=include, pipeline=self.steps) + cs=cs, + dataset_properties=dataset_properties, + exclude=exclude, + include=include, + pipeline=self.steps, + ) return cs - def _get_pipeline_steps(self, - dataset_properties: Optional[Dict[str, str]] = None, - ) -> List[Tuple[str, BaseEstimator]]: + def _get_pipeline_steps( + self, + dataset_properties: Optional[Dict[str, str]] = None, + ) -> List[Tuple[str, BaseEstimator]]: steps = [] default_dataset_properties = {} if dataset_properties is not None and isinstance(dataset_properties, dict): default_dataset_properties.update(dataset_properties) - steps.extend([ - ("imputation", CategoricalImputation()), - ("encoding", OrdinalEncoding()), - ("category_shift", CategoryShift()), - ("category_coalescence", CoalescenseChoice(default_dataset_properties)), - ("categorical_encoding", OHEChoice(default_dataset_properties)), - ]) + steps.extend( + [ + ("imputation", CategoricalImputation()), + ("encoding", OrdinalEncoding()), + ("category_shift", CategoryShift()), + ("category_coalescence", CoalescenseChoice(default_dataset_properties)), + ("categorical_encoding", OHEChoice(default_dataset_properties)), + ] + ) return steps diff --git a/autosklearn/pipeline/components/data_preprocessing/feature_type_numerical.py b/autosklearn/pipeline/components/data_preprocessing/feature_type_numerical.py index 5ef47e2699..18008378ab 100644 --- a/autosklearn/pipeline/components/data_preprocessing/feature_type_numerical.py +++ b/autosklearn/pipeline/components/data_preprocessing/feature_type_numerical.py @@ -1,23 +1,20 @@ -from typing import Any, List, Dict, Optional, Tuple, Union - -from ConfigSpace.configuration_space import Configuration, ConfigurationSpace +from typing import Any, Dict, List, Optional, Tuple, Union import numpy as np - +from ConfigSpace.configuration_space import Configuration, ConfigurationSpace from sklearn.base import BaseEstimator -from autosklearn.pipeline.components.data_preprocessing import rescaling as \ - rescaling_components -from autosklearn.pipeline.components.data_preprocessing.imputation.numerical_imputation \ - import NumericalImputation -from autosklearn.pipeline.components.data_preprocessing.variance_threshold\ - .variance_threshold import VarianceThreshold - -from autosklearn.pipeline.base import ( - BasePipeline, - DATASET_PROPERTIES_TYPE, +from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, BasePipeline +from autosklearn.pipeline.components.data_preprocessing import ( + rescaling as rescaling_components, +) +from autosklearn.pipeline.components.data_preprocessing.imputation.numerical_imputation import ( # noqa: E501 + NumericalImputation, ) -from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT +from autosklearn.pipeline.components.data_preprocessing.variance_threshold.variance_threshold import ( # noqa: E501 + VarianceThreshold, +) +from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA class NumericalPreprocessingPipeline(BasePipeline): @@ -48,35 +45,43 @@ def __init__( include: Optional[Dict[str, str]] = None, exclude: Optional[Dict[str, str]] = None, random_state: Optional[Union[int, 
np.random.RandomState]] = None, - init_params: Optional[Dict[str, Any]] = None + init_params: Optional[Dict[str, Any]] = None, ) -> None: self._output_dtype = np.int32 super().__init__( - config, steps, dataset_properties, include, exclude, - random_state, init_params + config, + steps, + dataset_properties, + include, + exclude, + random_state, + init_params, ) @staticmethod - def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None - ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: - return {'shortname': 'num_datapreproc', - 'name': 'numeric data preprocessing', - 'handles_missing_values': True, - 'handles_nominal_values': True, - 'handles_numerical_features': True, - 'prefers_data_scaled': False, - 'prefers_data_normalized': False, - 'handles_regression': True, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'is_deterministic': True, - # TODO find out if this is right! - 'handles_sparse': True, - 'handles_dense': True, - 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (INPUT,), - 'preferred_dtype': None} + def get_properties( + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: + return { + "shortname": "num_datapreproc", + "name": "numeric data preprocessing", + "handles_missing_values": True, + "handles_nominal_values": True, + "handles_numerical_features": True, + "prefers_data_scaled": False, + "prefers_data_normalized": False, + "handles_regression": True, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "is_deterministic": True, + # TODO find out if this is right! + "handles_sparse": True, + "handles_dense": True, + "input": (DENSE, SPARSE, UNSIGNED_DATA), + "output": (INPUT,), + "preferred_dtype": None, + } def _get_hyperparameter_search_space( self, @@ -100,25 +105,35 @@ def _get_hyperparameter_search_space( dataset_properties = dict() cs = self._get_base_search_space( - cs=cs, dataset_properties=dataset_properties, - exclude=exclude, include=include, pipeline=self.steps) + cs=cs, + dataset_properties=dataset_properties, + exclude=exclude, + include=include, + pipeline=self.steps, + ) return cs - def _get_pipeline_steps(self, - dataset_properties: Optional[Dict[str, str]] = None, - ) -> List[Tuple[str, BaseEstimator]]: + def _get_pipeline_steps( + self, + dataset_properties: Optional[Dict[str, str]] = None, + ) -> List[Tuple[str, BaseEstimator]]: steps = [] default_dataset_properties = {} if dataset_properties is not None and isinstance(dataset_properties, dict): default_dataset_properties.update(dataset_properties) - steps.extend([ - ("imputation", NumericalImputation()), - ("variance_threshold", VarianceThreshold()), - ("rescaling", rescaling_components.RescalingChoice(default_dataset_properties)), - ]) + steps.extend( + [ + ("imputation", NumericalImputation()), + ("variance_threshold", VarianceThreshold()), + ( + "rescaling", + rescaling_components.RescalingChoice(default_dataset_properties), + ), + ] + ) return steps diff --git a/autosklearn/pipeline/components/data_preprocessing/feature_type_text.py b/autosklearn/pipeline/components/data_preprocessing/feature_type_text.py index a21980f000..6030460ee1 100644 --- a/autosklearn/pipeline/components/data_preprocessing/feature_type_text.py +++ b/autosklearn/pipeline/components/data_preprocessing/feature_type_text.py @@ -1,20 +1,17 @@ -from typing import Any, List, Dict, Optional, Tuple, Union - -from ConfigSpace.configuration_space 
import Configuration, ConfigurationSpace +from typing import Any, Dict, List, Optional, Tuple, Union import numpy as np - +from ConfigSpace.configuration_space import Configuration, ConfigurationSpace from sklearn.base import BaseEstimator -from autosklearn.pipeline.components.data_preprocessing.text_encoding \ - import BagOfWordChoice -from autosklearn.pipeline.components.data_preprocessing.feature_reduction.truncated_svd import \ - FeatureReduction -from autosklearn.pipeline.base import ( - BasePipeline, - DATASET_PROPERTIES_TYPE, +from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, BasePipeline +from autosklearn.pipeline.components.data_preprocessing.feature_reduction.truncated_svd import ( # noqa: 501 + FeatureReduction, +) +from autosklearn.pipeline.components.data_preprocessing.text_encoding import ( + BagOfWordChoice, ) -from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT +from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA class TextPreprocessingPipeline(BasePipeline): @@ -43,34 +40,42 @@ def __init__( include: Optional[Dict[str, str]] = None, exclude: Optional[Dict[str, str]] = None, random_state: Optional[Union[int, np.random.RandomState]] = None, - init_params: Optional[Dict[str, Any]] = None + init_params: Optional[Dict[str, Any]] = None, ) -> None: self._output_dtype = np.int32 super().__init__( - config, steps, dataset_properties, include, exclude, - random_state, init_params + config, + steps, + dataset_properties, + include, + exclude, + random_state, + init_params, ) @staticmethod - def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None - ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: - return {'shortname': 'txt_datapreproc', - 'name': 'text data preprocessing', - 'handles_missing_values': True, - 'handles_nominal_values': False, - 'handles_numerical_features': False, - 'prefers_data_scaled': False, - 'prefers_data_normalized': False, - 'handles_regression': True, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'is_deterministic': True, - 'handles_sparse': True, - 'handles_dense': True, - 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (INPUT,), - 'preferred_dtype': None} + def get_properties( + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: + return { + "shortname": "txt_datapreproc", + "name": "text data preprocessing", + "handles_missing_values": True, + "handles_nominal_values": False, + "handles_numerical_features": False, + "prefers_data_scaled": False, + "prefers_data_normalized": False, + "handles_regression": True, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "is_deterministic": True, + "handles_sparse": True, + "handles_dense": True, + "input": (DENSE, SPARSE, UNSIGNED_DATA), + "output": (INPUT,), + "preferred_dtype": None, + } def _get_hyperparameter_search_space( self, @@ -94,25 +99,36 @@ def _get_hyperparameter_search_space( dataset_properties = dict() cs = self._get_base_search_space( - cs=cs, dataset_properties=dataset_properties, - exclude=exclude, include=include, pipeline=self.steps) + cs=cs, + dataset_properties=dataset_properties, + exclude=exclude, + include=include, + pipeline=self.steps, + ) return cs - def _get_pipeline_steps(self, - dataset_properties: Optional[Dict[str, str]] = None, - ) -> List[Tuple[str, BaseEstimator]]: + def _get_pipeline_steps( + self, + 
dataset_properties: Optional[Dict[str, str]] = None, + ) -> List[Tuple[str, BaseEstimator]]: steps = [] default_dataset_properties = {} if dataset_properties is not None and isinstance(dataset_properties, dict): default_dataset_properties.update(dataset_properties) - steps.extend([ - ("text_encoding", BagOfWordChoice(default_dataset_properties, - random_state=self.random_state)), - ("feature_reduction", FeatureReduction(random_state=self.random_state)) - ]) + steps.extend( + [ + ( + "text_encoding", + BagOfWordChoice( + default_dataset_properties, random_state=self.random_state + ), + ), + ("feature_reduction", FeatureReduction(random_state=self.random_state)), + ] + ) return steps def _get_estimator_hyperparameter_name(self) -> str: diff --git a/autosklearn/pipeline/components/data_preprocessing/imputation/categorical_imputation.py b/autosklearn/pipeline/components/data_preprocessing/imputation/categorical_imputation.py index 519155ea20..00b627daed 100644 --- a/autosklearn/pipeline/components/data_preprocessing/imputation/categorical_imputation.py +++ b/autosklearn/pipeline/components/data_preprocessing/imputation/categorical_imputation.py @@ -1,13 +1,12 @@ from typing import Dict, Optional, Tuple, Union -from ConfigSpace.configuration_space import ConfigurationSpace - import numpy as np +from ConfigSpace.configuration_space import ConfigurationSpace from scipy.sparse import spmatrix from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm -from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT +from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA class CategoricalImputation(AutoSklearnPreprocessingAlgorithm): @@ -20,16 +19,16 @@ class CategoricalImputation(AutoSklearnPreprocessingAlgorithm): """ def __init__( - self, - random_state: Optional[Union[int, np.random.RandomState]] = None + self, random_state: Optional[Union[int, np.random.RandomState]] = None ) -> None: self.random_state = random_state - def fit(self, X: PIPELINE_DATA_DTYPE, - y: Optional[PIPELINE_DATA_DTYPE] = None) -> 'CategoricalImputation': + def fit( + self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None + ) -> "CategoricalImputation": import sklearn.impute - if hasattr(X, 'columns'): + if hasattr(X, "columns"): kind = X[X.columns[-1]].dtype.kind else: # Series, sparse and numpy have dtype @@ -53,7 +52,7 @@ def fit(self, X: PIPELINE_DATA_DTYPE, fill_value = min(np.unique(X)) - 1 self.preprocessor = sklearn.impute.SimpleImputer( - strategy='constant', copy=False, fill_value=fill_value + strategy="constant", copy=False, fill_value=fill_value ) self.preprocessor.fit(X) return self @@ -65,29 +64,33 @@ def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE: return X @staticmethod - def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None - ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: - return {'shortname': 'CategoricalImputation', - 'name': 'Categorical Imputation', - 'handles_missing_values': True, - 'handles_nominal_values': True, - 'handles_numerical_features': True, - 'prefers_data_scaled': False, - 'prefers_data_normalized': False, - 'handles_regression': True, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': True, - 'is_deterministic': True, - # TODO find out of this is right! 
- 'handles_sparse': True, - 'handles_dense': True, - 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (INPUT,), - 'preferred_dtype': None} + def get_properties( + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: + return { + "shortname": "CategoricalImputation", + "name": "Categorical Imputation", + "handles_missing_values": True, + "handles_nominal_values": True, + "handles_numerical_features": True, + "prefers_data_scaled": False, + "prefers_data_normalized": False, + "handles_regression": True, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": True, + "is_deterministic": True, + # TODO find out of this is right! + "handles_sparse": True, + "handles_dense": True, + "input": (DENSE, SPARSE, UNSIGNED_DATA), + "output": (INPUT,), + "preferred_dtype": None, + } @staticmethod - def get_hyperparameter_search_space(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None - ) -> ConfigurationSpace: + def get_hyperparameter_search_space( + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> ConfigurationSpace: return ConfigurationSpace() diff --git a/autosklearn/pipeline/components/data_preprocessing/imputation/numerical_imputation.py b/autosklearn/pipeline/components/data_preprocessing/imputation/numerical_imputation.py index e730718032..d7d6a645ab 100644 --- a/autosklearn/pipeline/components/data_preprocessing/imputation/numerical_imputation.py +++ b/autosklearn/pipeline/components/data_preprocessing/imputation/numerical_imputation.py @@ -1,31 +1,31 @@ from typing import Dict, Optional, Tuple, Union +import numpy as np from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import CategoricalHyperparameter -import numpy as np - from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm -from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT +from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA class NumericalImputation(AutoSklearnPreprocessingAlgorithm): - def __init__( self, - strategy: str = 'mean', - random_state: Optional[Union[int, np.random.RandomState]] = None + strategy: str = "mean", + random_state: Optional[Union[int, np.random.RandomState]] = None, ) -> None: self.strategy = strategy self.random_state = random_state - def fit(self, X: PIPELINE_DATA_DTYPE, - y: Optional[PIPELINE_DATA_DTYPE] = None) -> 'NumericalImputation': + def fit( + self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None + ) -> "NumericalImputation": import sklearn.impute self.preprocessor = sklearn.impute.SimpleImputer( - strategy=self.strategy, copy=False) + strategy=self.strategy, copy=False + ) self.preprocessor.fit(X) return self @@ -35,34 +35,39 @@ def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE: return self.preprocessor.transform(X) @staticmethod - def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None - ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: - return {'shortname': 'NumericalImputation', - 'name': 'Numerical Imputation', - 'handles_missing_values': True, - 'handles_nominal_values': True, - 'handles_numerical_features': True, - 'prefers_data_scaled': False, - 'prefers_data_normalized': False, - 'handles_regression': True, - 'handles_classification': True, - 'handles_multiclass': 
True, - 'handles_multilabel': True, - 'handles_multioutput': True, - 'is_deterministic': True, - # TODO find out if this is right! - 'handles_sparse': True, - 'handles_dense': True, - 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (INPUT,), - 'preferred_dtype': None} + def get_properties( + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: + return { + "shortname": "NumericalImputation", + "name": "Numerical Imputation", + "handles_missing_values": True, + "handles_nominal_values": True, + "handles_numerical_features": True, + "prefers_data_scaled": False, + "prefers_data_normalized": False, + "handles_regression": True, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": True, + "is_deterministic": True, + # TODO find out if this is right! + "handles_sparse": True, + "handles_dense": True, + "input": (DENSE, SPARSE, UNSIGNED_DATA), + "output": (INPUT,), + "preferred_dtype": None, + } @staticmethod - def get_hyperparameter_search_space(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None - ) -> ConfigurationSpace: + def get_hyperparameter_search_space( + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> ConfigurationSpace: # TODO add replace by zero! strategy = CategoricalHyperparameter( - "strategy", ["mean", "median", "most_frequent"], default_value="mean") + "strategy", ["mean", "median", "most_frequent"], default_value="mean" + ) cs = ConfigurationSpace() cs.add_hyperparameter(strategy) return cs diff --git a/autosklearn/pipeline/components/data_preprocessing/minority_coalescense/__init__.py b/autosklearn/pipeline/components/data_preprocessing/minority_coalescense/__init__.py index 0db0955cb5..fbf999761c 100644 --- a/autosklearn/pipeline/components/data_preprocessing/minority_coalescense/__init__.py +++ b/autosklearn/pipeline/components/data_preprocessing/minority_coalescense/__init__.py @@ -1,24 +1,27 @@ -from collections import OrderedDict -import os - from typing import Any, Dict, Optional +import os +from collections import OrderedDict + from ConfigSpace import Configuration from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import CategoricalHyperparameter - -from ...base import AutoSklearnPreprocessingAlgorithm, find_components, \ - ThirdPartyComponents, AutoSklearnChoice, _addons - from sklearn.base import BaseEstimator from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE +from ...base import ( + AutoSklearnChoice, + AutoSklearnPreprocessingAlgorithm, + ThirdPartyComponents, + _addons, + find_components, +) + mc_directory = os.path.split(__file__)[0] -_mcs = find_components( - __package__, mc_directory, AutoSklearnPreprocessingAlgorithm) +_mcs = find_components(__package__, mc_directory, AutoSklearnPreprocessingAlgorithm) additional_components = ThirdPartyComponents(AutoSklearnPreprocessingAlgorithm) -_addons['data_preprocessing.minority_coalescense'] = additional_components +_addons["data_preprocessing.minority_coalescense"] = additional_components def add_mc(mc: BaseEstimator) -> None: @@ -26,7 +29,6 @@ def add_mc(mc: BaseEstimator) -> None: class CoalescenseChoice(AutoSklearnChoice): - @classmethod def get_components(cls: BaseEstimator) -> Dict[str, BaseEstimator]: components: Dict[str, BaseEstimator] = OrderedDict() @@ -48,46 +50,52 @@ def get_hyperparameter_search_space( # Compile a list of legal preprocessors for this problem 
available_preprocessors = self.get_available_components( - dataset_properties=dataset_properties, - include=include, exclude=exclude) + dataset_properties=dataset_properties, include=include, exclude=exclude + ) if len(available_preprocessors) == 0: raise ValueError( "No minority coalescers found, please add any one minority coalescer" - "component.") + "component." + ) if default is None: - defaults = ['minority_coalescer', 'no_coalescense'] + defaults = ["minority_coalescer", "no_coalescense"] for default_ in defaults: if default_ in available_preprocessors: default = default_ break preprocessor = CategoricalHyperparameter( - '__choice__', list(available_preprocessors.keys()), default_value=default) + "__choice__", list(available_preprocessors.keys()), default_value=default + ) cs.add_hyperparameter(preprocessor) for name in available_preprocessors: - preprocessor_configuration_space = available_preprocessors[name]. \ - get_hyperparameter_search_space(dataset_properties) - parent_hyperparameter = {'parent': preprocessor, 'value': name} - cs.add_configuration_space(name, preprocessor_configuration_space, - parent_hyperparameter=parent_hyperparameter) + preprocessor_configuration_space = available_preprocessors[ + name + ].get_hyperparameter_search_space(dataset_properties) + parent_hyperparameter = {"parent": preprocessor, "value": name} + cs.add_configuration_space( + name, + preprocessor_configuration_space, + parent_hyperparameter=parent_hyperparameter, + ) self.configuration_space = cs self.dataset_properties = dataset_properties return cs - def set_hyperparameters(self, configuration: Configuration, - init_params: Optional[Dict[str, Any]] = None - ) -> 'CoalescenseChoice': + def set_hyperparameters( + self, configuration: Configuration, init_params: Optional[Dict[str, Any]] = None + ) -> "CoalescenseChoice": new_params = {} params = configuration.get_dictionary() - choice = params['__choice__'] - del params['__choice__'] + choice = params["__choice__"] + del params["__choice__"] for param, value in params.items(): - param = param.replace(choice, '').replace(':', '') + param = param.replace(choice, "").replace(":", "") new_params[param] = value if init_params is not None: @@ -97,10 +105,10 @@ def set_hyperparameters(self, configuration: Configuration, # in order to not pass it to the no encoding if choice not in param: continue - param = param.replace(choice, '').replace(':', '') + param = param.replace(choice, "").replace(":", "") new_params[param] = value - new_params['random_state'] = self.random_state + new_params["random_state"] = self.random_state self.new_params = new_params self.choice = self.get_components()[choice](**new_params) diff --git a/autosklearn/pipeline/components/data_preprocessing/minority_coalescense/minority_coalescer.py b/autosklearn/pipeline/components/data_preprocessing/minority_coalescense/minority_coalescer.py index 5b3b66caa7..278cf0bfb9 100644 --- a/autosklearn/pipeline/components/data_preprocessing/minority_coalescense/minority_coalescer.py +++ b/autosklearn/pipeline/components/data_preprocessing/minority_coalescense/minority_coalescer.py @@ -1,34 +1,35 @@ from typing import Dict, Optional, Tuple, Union - +import numpy as np from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import UniformFloatHyperparameter -import numpy as np - import autosklearn.pipeline.implementations.MinorityCoalescer from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE from 
autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm -from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT +from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA class MinorityCoalescer(AutoSklearnPreprocessingAlgorithm): - """ Group together categories which occurence is less than a specified minimum fraction. - """ + """Group categories whose occurence is less than a specified minimum fraction.""" def __init__( self, minimum_fraction: float = 0.01, - random_state: Optional[Union[int, np.random.RandomState]] = None + random_state: Optional[Union[int, np.random.RandomState]] = None, ) -> None: self.minimum_fraction = minimum_fraction - def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None - ) -> 'MinorityCoalescer': + def fit( + self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None + ) -> "MinorityCoalescer": self.minimum_fraction = float(self.minimum_fraction) - self.preprocessor = autosklearn.pipeline.implementations.MinorityCoalescer\ - .MinorityCoalescer(minimum_fraction=self.minimum_fraction) + self.preprocessor = ( + autosklearn.pipeline.implementations.MinorityCoalescer.MinorityCoalescer( + minimum_fraction=self.minimum_fraction + ) + ) self.preprocessor.fit(X, y) return self @@ -38,26 +39,31 @@ def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE: return self.preprocessor.transform(X) @staticmethod - def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None - ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: - return {'shortname': 'coalescer', - 'name': 'Categorical minority coalescer', - 'handles_regression': True, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': True, - # TODO find out of this is right! - 'handles_sparse': True, - 'handles_dense': True, - 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (INPUT,), } + def get_properties( + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: + return { + "shortname": "coalescer", + "name": "Categorical minority coalescer", + "handles_regression": True, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": True, + # TODO find out of this is right! 
+ "handles_sparse": True, + "handles_dense": True, + "input": (DENSE, SPARSE, UNSIGNED_DATA), + "output": (INPUT,), + } @staticmethod - def get_hyperparameter_search_space(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None - ) -> ConfigurationSpace: + def get_hyperparameter_search_space( + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> ConfigurationSpace: cs = ConfigurationSpace() minimum_fraction = UniformFloatHyperparameter( - "minimum_fraction", lower=.0001, upper=0.5, default_value=0.01, log=True) + "minimum_fraction", lower=0.0001, upper=0.5, default_value=0.01, log=True + ) cs.add_hyperparameter(minimum_fraction) return cs diff --git a/autosklearn/pipeline/components/data_preprocessing/minority_coalescense/no_coalescense.py b/autosklearn/pipeline/components/data_preprocessing/minority_coalescense/no_coalescense.py index d252821ccc..d05c146d98 100644 --- a/autosklearn/pipeline/components/data_preprocessing/minority_coalescense/no_coalescense.py +++ b/autosklearn/pipeline/components/data_preprocessing/minority_coalescense/no_coalescense.py @@ -1,47 +1,49 @@ from typing import Dict, Optional, Tuple, Union - +import numpy as np from ConfigSpace.configuration_space import ConfigurationSpace -import numpy as np from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE -from autosklearn.pipeline.components.base import \ - AutoSklearnPreprocessingAlgorithm -from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT +from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm +from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA class NoCoalescence(AutoSklearnPreprocessingAlgorithm): def __init__( - self, - random_state: Optional[Union[int, np.random.RandomState]] = None + self, random_state: Optional[Union[int, np.random.RandomState]] = None ) -> None: pass - def fit(self, X: np.array, y: Optional[PIPELINE_DATA_DTYPE] = None - ) -> PIPELINE_DATA_DTYPE: - self.preprocessor = 'passthrough' + def fit( + self, X: np.array, y: Optional[PIPELINE_DATA_DTYPE] = None + ) -> PIPELINE_DATA_DTYPE: + self.preprocessor = "passthrough" return self def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE: return X @staticmethod - def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None - ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: - return {'shortname': 'no coalescence', - 'name': 'No categorical variable coalescence', - 'handles_regression': True, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': True, - 'handles_sparse': True, - 'handles_dense': True, - 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (INPUT,)} + def get_properties( + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: + return { + "shortname": "no coalescence", + "name": "No categorical variable coalescence", + "handles_regression": True, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": True, + "handles_sparse": True, + "handles_dense": True, + "input": (DENSE, SPARSE, UNSIGNED_DATA), + "output": (INPUT,), + } @staticmethod - def get_hyperparameter_search_space(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None - ) -> ConfigurationSpace: + def get_hyperparameter_search_space( + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> 
ConfigurationSpace: cs = ConfigurationSpace() return cs diff --git a/autosklearn/pipeline/components/data_preprocessing/rescaling/__init__.py b/autosklearn/pipeline/components/data_preprocessing/rescaling/__init__.py index b37ad3ce24..2a9fbdb842 100644 --- a/autosklearn/pipeline/components/data_preprocessing/rescaling/__init__.py +++ b/autosklearn/pipeline/components/data_preprocessing/rescaling/__init__.py @@ -1,26 +1,31 @@ -from collections import OrderedDict -import os - from typing import Dict, Optional +import os +from collections import OrderedDict + from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import CategoricalHyperparameter - from sklearn.base import BaseEstimator -from ...base import AutoSklearnPreprocessingAlgorithm, find_components, \ - ThirdPartyComponents, AutoSklearnChoice, _addons from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE -from autosklearn.pipeline.components.data_preprocessing.rescaling.abstract_rescaling import ( - Rescaling +from autosklearn.pipeline.components.data_preprocessing.rescaling.abstract_rescaling import ( # noqa: E501 + Rescaling, +) + +from ...base import ( + AutoSklearnChoice, + AutoSklearnPreprocessingAlgorithm, + ThirdPartyComponents, + _addons, + find_components, ) rescaling_directory = os.path.split(__file__)[0] -_rescalers = find_components(__package__, - rescaling_directory, - AutoSklearnPreprocessingAlgorithm) +_rescalers = find_components( + __package__, rescaling_directory, AutoSklearnPreprocessingAlgorithm +) additional_components = ThirdPartyComponents(AutoSklearnPreprocessingAlgorithm) -_addons['data_preprocessing.rescaling'] = additional_components +_addons["data_preprocessing.rescaling"] = additional_components def add_rescaler(rescaler: Rescaling) -> None: @@ -28,7 +33,6 @@ def add_rescaler(rescaler: Rescaling) -> None: class RescalingChoice(AutoSklearnChoice): - @classmethod def get_components(cls: BaseEstimator) -> Dict[str, BaseEstimator]: components: Dict[str, BaseEstimator] = OrderedDict() @@ -50,31 +54,33 @@ def get_hyperparameter_search_space( # Compile a list of legal preprocessors for this problem available_preprocessors = self.get_available_components( - dataset_properties=dataset_properties, - include=include, exclude=exclude) + dataset_properties=dataset_properties, include=include, exclude=exclude + ) if len(available_preprocessors) == 0: - raise ValueError( - "No rescalers found, please add any rescaling component.") + raise ValueError("No rescalers found, please add any rescaling component.") if default is None: - defaults = ['standardize', 'none', 'minmax', 'normalize'] + defaults = ["standardize", "none", "minmax", "normalize"] for default_ in defaults: if default_ in available_preprocessors: default = default_ break - preprocessor = CategoricalHyperparameter('__choice__', - list( - available_preprocessors.keys()), - default_value=default) + preprocessor = CategoricalHyperparameter( + "__choice__", list(available_preprocessors.keys()), default_value=default + ) cs.add_hyperparameter(preprocessor) for name in available_preprocessors: - preprocessor_configuration_space = available_preprocessors[name]. 
\ - get_hyperparameter_search_space(dataset_properties) - parent_hyperparameter = {'parent': preprocessor, 'value': name} - cs.add_configuration_space(name, preprocessor_configuration_space, - parent_hyperparameter=parent_hyperparameter) + preprocessor_configuration_space = available_preprocessors[ + name + ].get_hyperparameter_search_space(dataset_properties) + parent_hyperparameter = {"parent": preprocessor, "value": name} + cs.add_configuration_space( + name, + preprocessor_configuration_space, + parent_hyperparameter=parent_hyperparameter, + ) self.configuration_space = cs self.dataset_properties = dataset_properties diff --git a/autosklearn/pipeline/components/data_preprocessing/rescaling/abstract_rescaling.py b/autosklearn/pipeline/components/data_preprocessing/rescaling/abstract_rescaling.py index dc9c9c60ac..05e1a4e898 100644 --- a/autosklearn/pipeline/components/data_preprocessing/rescaling/abstract_rescaling.py +++ b/autosklearn/pipeline/components/data_preprocessing/rescaling/abstract_rescaling.py @@ -1,9 +1,7 @@ from typing import Optional, Union -from ConfigSpace.configuration_space import ConfigurationSpace - import numpy as np - +from ConfigSpace.configuration_space import ConfigurationSpace from sklearn.base import BaseEstimator from sklearn.exceptions import NotFittedError @@ -14,16 +12,13 @@ class Rescaling(object): # Rescaling does not support fit_transform (as of 0.19.1)! def __init__( - self, - random_state: Optional[Union[int, np.random.RandomState]] = None + self, random_state: Optional[Union[int, np.random.RandomState]] = None ) -> None: self.preprocessor: Optional[BaseEstimator] = None def fit( - self, - X: PIPELINE_DATA_DTYPE, - y: Optional[PIPELINE_DATA_DTYPE] = None - ) -> 'AutoSklearnPreprocessingAlgorithm': + self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None + ) -> "AutoSklearnPreprocessingAlgorithm": if self.preprocessor is None: raise NotFittedError() @@ -42,7 +37,8 @@ def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE: return transformed_X @staticmethod - def get_hyperparameter_search_space(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None - ) -> ConfigurationSpace: + def get_hyperparameter_search_space( + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> ConfigurationSpace: cs = ConfigurationSpace() return cs diff --git a/autosklearn/pipeline/components/data_preprocessing/rescaling/minmax.py b/autosklearn/pipeline/components/data_preprocessing/rescaling/minmax.py index 67650376e1..3663a23d35 100644 --- a/autosklearn/pipeline/components/data_preprocessing/rescaling/minmax.py +++ b/autosklearn/pipeline/components/data_preprocessing/rescaling/minmax.py @@ -3,36 +3,43 @@ import numpy as np from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE -from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA, SIGNED_DATA, INPUT -from autosklearn.pipeline.components.data_preprocessing.rescaling.abstract_rescaling \ - import Rescaling from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm +from autosklearn.pipeline.components.data_preprocessing.rescaling.abstract_rescaling import ( # noqa: E501 + Rescaling, +) +from autosklearn.pipeline.constants import DENSE, INPUT, SIGNED_DATA, UNSIGNED_DATA class MinMaxScalerComponent(Rescaling, AutoSklearnPreprocessingAlgorithm): - def __init__(self, random_state: Optional[Union[int, np.random.RandomState]] = None): + def __init__( + self, random_state: Optional[Union[int, np.random.RandomState]] = None + ): from 
sklearn.preprocessing import MinMaxScaler + self.preprocessor = MinMaxScaler(copy=False) @staticmethod - def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None - ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: - return {'shortname': 'MinMaxScaler', - 'name': 'MinMaxScaler', - 'handles_missing_values': False, - 'handles_nominal_values': False, - 'handles_numerical_features': True, - 'prefers_data_scaled': False, - 'prefers_data_normalized': False, - 'handles_regression': True, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': True, - 'is_deterministic': True, - # TODO find out if this is right! - 'handles_sparse': False, - 'handles_dense': True, - 'input': (DENSE, UNSIGNED_DATA), - 'output': (INPUT, SIGNED_DATA), - 'preferred_dtype': None} + def get_properties( + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: + return { + "shortname": "MinMaxScaler", + "name": "MinMaxScaler", + "handles_missing_values": False, + "handles_nominal_values": False, + "handles_numerical_features": True, + "prefers_data_scaled": False, + "prefers_data_normalized": False, + "handles_regression": True, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": True, + "is_deterministic": True, + # TODO find out if this is right! + "handles_sparse": False, + "handles_dense": True, + "input": (DENSE, UNSIGNED_DATA), + "output": (INPUT, SIGNED_DATA), + "preferred_dtype": None, + } diff --git a/autosklearn/pipeline/components/data_preprocessing/rescaling/none.py b/autosklearn/pipeline/components/data_preprocessing/rescaling/none.py index 83377e2544..ee94213f57 100644 --- a/autosklearn/pipeline/components/data_preprocessing/rescaling/none.py +++ b/autosklearn/pipeline/components/data_preprocessing/rescaling/none.py @@ -1,41 +1,45 @@ from typing import Dict, Optional, Tuple, Union from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE -from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA, INPUT, SPARSE -from autosklearn.pipeline.components.data_preprocessing.rescaling.abstract_rescaling \ - import Rescaling from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm +from autosklearn.pipeline.components.data_preprocessing.rescaling.abstract_rescaling import ( # noqa: E501 + Rescaling, +) +from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA class NoRescalingComponent(Rescaling, AutoSklearnPreprocessingAlgorithm): - - def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None - ) -> 'AutoSklearnPreprocessingAlgorithm': - self.preprocessor = 'passthrough' + def fit( + self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None + ) -> "AutoSklearnPreprocessingAlgorithm": + self.preprocessor = "passthrough" return self def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE: return X @staticmethod - def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None - ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: - return {'shortname': 'NoRescaling', - 'name': 'NoRescaling', - 'handles_missing_values': False, - 'handles_nominal_values': False, - 'handles_numerical_features': True, - 'prefers_data_scaled': False, - 'prefers_data_normalized': False, - 'handles_regression': True, - 'handles_classification': True, - 'handles_multiclass': True, - 
'handles_multilabel': True, - 'handles_multioutput': True, - 'is_deterministic': True, - # TODO find out if this is right! - 'handles_sparse': True, - 'handles_dense': True, - 'input': (SPARSE, DENSE, UNSIGNED_DATA), - 'output': (INPUT,), - 'preferred_dtype': None} + def get_properties( + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: + return { + "shortname": "NoRescaling", + "name": "NoRescaling", + "handles_missing_values": False, + "handles_nominal_values": False, + "handles_numerical_features": True, + "prefers_data_scaled": False, + "prefers_data_normalized": False, + "handles_regression": True, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": True, + "is_deterministic": True, + # TODO find out if this is right! + "handles_sparse": True, + "handles_dense": True, + "input": (SPARSE, DENSE, UNSIGNED_DATA), + "output": (INPUT,), + "preferred_dtype": None, + } diff --git a/autosklearn/pipeline/components/data_preprocessing/rescaling/normalize.py b/autosklearn/pipeline/components/data_preprocessing/rescaling/normalize.py index 036c75a4ef..00395833e9 100644 --- a/autosklearn/pipeline/components/data_preprocessing/rescaling/normalize.py +++ b/autosklearn/pipeline/components/data_preprocessing/rescaling/normalize.py @@ -3,41 +3,45 @@ import numpy as np from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE -from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA, INPUT, SPARSE -from autosklearn.pipeline.components.data_preprocessing.rescaling.abstract_rescaling \ - import Rescaling from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm +from autosklearn.pipeline.components.data_preprocessing.rescaling.abstract_rescaling import ( # noqa: E501 + Rescaling, +) +from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA class NormalizerComponent(Rescaling, AutoSklearnPreprocessingAlgorithm): def __init__( - self, - random_state: Optional[Union[int, np.random.RandomState]] = None + self, random_state: Optional[Union[int, np.random.RandomState]] = None ) -> None: # Use custom implementation because sklearn implementation cannot # handle float32 input matrix from sklearn.preprocessing import Normalizer + self.preprocessor = Normalizer(copy=False) @staticmethod - def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None - ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: - return {'shortname': 'Normalizer', - 'name': 'Normalizer', - 'handles_missing_values': False, - 'handles_nominal_values': False, - 'handles_numerical_features': True, - 'prefers_data_scaled': False, - 'prefers_data_normalized': False, - 'handles_regression': True, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': True, - 'is_deterministic': True, - # TODO find out if this is right! 
- 'handles_sparse': True, - 'handles_dense': True, - 'input': (SPARSE, DENSE, UNSIGNED_DATA), - 'output': (INPUT,), - 'preferred_dtype': None} + def get_properties( + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: + return { + "shortname": "Normalizer", + "name": "Normalizer", + "handles_missing_values": False, + "handles_nominal_values": False, + "handles_numerical_features": True, + "prefers_data_scaled": False, + "prefers_data_normalized": False, + "handles_regression": True, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": True, + "is_deterministic": True, + # TODO find out if this is right! + "handles_sparse": True, + "handles_dense": True, + "input": (SPARSE, DENSE, UNSIGNED_DATA), + "output": (INPUT,), + "preferred_dtype": None, + } diff --git a/autosklearn/pipeline/components/data_preprocessing/rescaling/power_transformer.py b/autosklearn/pipeline/components/data_preprocessing/rescaling/power_transformer.py index 759c921caa..dd9ab616ae 100644 --- a/autosklearn/pipeline/components/data_preprocessing/rescaling/power_transformer.py +++ b/autosklearn/pipeline/components/data_preprocessing/rescaling/power_transformer.py @@ -3,10 +3,11 @@ import numpy as np from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE -from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA, INPUT -from autosklearn.pipeline.components.data_preprocessing.rescaling.abstract_rescaling \ - import Rescaling from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm +from autosklearn.pipeline.components.data_preprocessing.rescaling.abstract_rescaling import ( # noqa: E501 + Rescaling, +) +from autosklearn.pipeline.constants import DENSE, INPUT, UNSIGNED_DATA class PowerTransformerComponent(Rescaling, AutoSklearnPreprocessingAlgorithm): @@ -15,27 +16,31 @@ def __init__( random_state: Optional[Union[int, np.random.RandomState]] = None, ) -> None: from sklearn.preprocessing import PowerTransformer + self.preprocessor = PowerTransformer(copy=False) @staticmethod - def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None - ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: - return {'shortname': 'PowerTransformer', - 'name': 'PowerTransformer', - 'handles_missing_values': False, - 'handles_nominal_values': False, - 'handles_numerical_features': True, - 'prefers_data_scaled': False, - 'prefers_data_normalized': False, - 'handles_regression': True, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': True, - 'is_deterministic': True, - # TODO find out of this is right! - 'handles_sparse': False, - 'handles_dense': True, - 'input': (DENSE, UNSIGNED_DATA), - 'output': (INPUT,), - 'preferred_dtype': None} + def get_properties( + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: + return { + "shortname": "PowerTransformer", + "name": "PowerTransformer", + "handles_missing_values": False, + "handles_nominal_values": False, + "handles_numerical_features": True, + "prefers_data_scaled": False, + "prefers_data_normalized": False, + "handles_regression": True, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": True, + "is_deterministic": True, + # TODO find out of this is right! 
+ "handles_sparse": False, + "handles_dense": True, + "input": (DENSE, UNSIGNED_DATA), + "output": (INPUT,), + "preferred_dtype": None, + } diff --git a/autosklearn/pipeline/components/data_preprocessing/rescaling/quantile_transformer.py b/autosklearn/pipeline/components/data_preprocessing/rescaling/quantile_transformer.py index b7206fbaaf..2611c0650d 100644 --- a/autosklearn/pipeline/components/data_preprocessing/rescaling/quantile_transformer.py +++ b/autosklearn/pipeline/components/data_preprocessing/rescaling/quantile_transformer.py @@ -1,17 +1,24 @@ from typing import Dict, Optional, Tuple, Union import numpy as np - -from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import UniformIntegerHyperparameter, \ - CategoricalHyperparameter +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformIntegerHyperparameter, +) -from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA, SIGNED_DATA, SPARSE, INPUT -from autosklearn.pipeline.components.data_preprocessing.rescaling.abstract_rescaling \ - import Rescaling -from autosklearn.pipeline.components.base import \ - AutoSklearnPreprocessingAlgorithm +from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE +from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm +from autosklearn.pipeline.components.data_preprocessing.rescaling.abstract_rescaling import ( # noqa: E501 + Rescaling, +) +from autosklearn.pipeline.constants import ( + DENSE, + INPUT, + SIGNED_DATA, + SPARSE, + UNSIGNED_DATA, +) class QuantileTransformerComponent(Rescaling, AutoSklearnPreprocessingAlgorithm): @@ -19,46 +26,51 @@ def __init__( self, n_quantiles: int, output_distribution: str, - random_state: Optional[Union[int, np.random.RandomState]] = None + random_state: Optional[Union[int, np.random.RandomState]] = None, ) -> None: from sklearn.preprocessing import QuantileTransformer + self.n_quantiles = n_quantiles self.output_distribution = output_distribution self.preprocessor = QuantileTransformer( n_quantiles=n_quantiles, output_distribution=output_distribution, copy=False, - random_state=random_state + random_state=random_state, ) @staticmethod - def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None - ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: - return {'shortname': 'QuantileTransformer', - 'name': 'QuantileTransformer', - 'handles_regression': True, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': True, - 'is_deterministic': True, - # TODO find out if this is right! - 'handles_sparse': True, - 'handles_dense': True, - 'input': (SPARSE, DENSE, UNSIGNED_DATA), - 'output': (INPUT, SIGNED_DATA), - 'preferred_dtype': None} + def get_properties( + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: + return { + "shortname": "QuantileTransformer", + "name": "QuantileTransformer", + "handles_regression": True, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": True, + "is_deterministic": True, + # TODO find out if this is right! 
+ "handles_sparse": True, + "handles_dense": True, + "input": (SPARSE, DENSE, UNSIGNED_DATA), + "output": (INPUT, SIGNED_DATA), + "preferred_dtype": None, + } @staticmethod - def get_hyperparameter_search_space(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None - ) -> ConfigurationSpace: + def get_hyperparameter_search_space( + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> ConfigurationSpace: cs = ConfigurationSpace() # TODO parametrize like the Random Forest as n_quantiles = n_features^param n_quantiles = UniformIntegerHyperparameter( - 'n_quantiles', lower=10, upper=2000, default_value=1000 + "n_quantiles", lower=10, upper=2000, default_value=1000 ) output_distribution = CategoricalHyperparameter( - 'output_distribution', ['normal', 'uniform'] + "output_distribution", ["normal", "uniform"] ) cs.add_hyperparameters((n_quantiles, output_distribution)) return cs diff --git a/autosklearn/pipeline/components/data_preprocessing/rescaling/robust_scaler.py b/autosklearn/pipeline/components/data_preprocessing/rescaling/robust_scaler.py index 614b79ee40..af3b4c0558 100644 --- a/autosklearn/pipeline/components/data_preprocessing/rescaling/robust_scaler.py +++ b/autosklearn/pipeline/components/data_preprocessing/rescaling/robust_scaler.py @@ -1,19 +1,23 @@ from typing import Dict, Optional, Tuple, Union import numpy as np - - -from scipy import sparse -from sklearn.exceptions import NotFittedError from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import UniformFloatHyperparameter +from scipy import sparse +from sklearn.exceptions import NotFittedError from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE -from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA, SIGNED_DATA, INPUT, SPARSE -from autosklearn.pipeline.components.data_preprocessing.rescaling.abstract_rescaling \ - import Rescaling -from autosklearn.pipeline.components.base import \ - AutoSklearnPreprocessingAlgorithm +from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm +from autosklearn.pipeline.components.data_preprocessing.rescaling.abstract_rescaling import ( # noqa: E501 + Rescaling, +) +from autosklearn.pipeline.constants import ( + DENSE, + INPUT, + SIGNED_DATA, + SPARSE, + UNSIGNED_DATA, +) class RobustScalerComponent(Rescaling, AutoSklearnPreprocessingAlgorithm): @@ -21,48 +25,51 @@ def __init__( self, q_min: float, q_max: float, - random_state: Optional[Union[int, np.random.RandomState]] = None + random_state: Optional[Union[int, np.random.RandomState]] = None, ) -> None: from sklearn.preprocessing import RobustScaler + self.q_min = q_min self.q_max = q_max self.preprocessor = RobustScaler( - quantile_range=(self.q_min, self.q_max), copy=False, + quantile_range=(self.q_min, self.q_max), + copy=False, ) @staticmethod - def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None - ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: - return {'shortname': 'RobustScaler', - 'name': 'RobustScaler', - 'handles_regression': True, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': True, - 'is_deterministic': True, - # TODO find out if this is right! 
- 'handles_sparse': True, - 'handles_dense': True, - 'input': (SPARSE, DENSE, UNSIGNED_DATA), - 'output': (INPUT, SIGNED_DATA), - 'preferred_dtype': None} + def get_properties( + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: + return { + "shortname": "RobustScaler", + "name": "RobustScaler", + "handles_regression": True, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": True, + "is_deterministic": True, + # TODO find out if this is right! + "handles_sparse": True, + "handles_dense": True, + "input": (SPARSE, DENSE, UNSIGNED_DATA), + "output": (INPUT, SIGNED_DATA), + "preferred_dtype": None, + } @staticmethod - def get_hyperparameter_search_space(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None - ) -> ConfigurationSpace: + def get_hyperparameter_search_space( + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> ConfigurationSpace: cs = ConfigurationSpace() - q_min = UniformFloatHyperparameter( - 'q_min', 0.001, 0.3, default_value=0.25 - ) - q_max = UniformFloatHyperparameter( - 'q_max', 0.7, 0.999, default_value=0.75 - ) + q_min = UniformFloatHyperparameter("q_min", 0.001, 0.3, default_value=0.25) + q_max = UniformFloatHyperparameter("q_max", 0.7, 0.999, default_value=0.75) cs.add_hyperparameters((q_min, q_max)) return cs - def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None - ) -> 'AutoSklearnPreprocessingAlgorithm': + def fit( + self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None + ) -> "AutoSklearnPreprocessingAlgorithm": if self.preprocessor is None: raise NotFittedError() if sparse.isspmatrix(X): diff --git a/autosklearn/pipeline/components/data_preprocessing/rescaling/standardize.py b/autosklearn/pipeline/components/data_preprocessing/rescaling/standardize.py index adb156ab93..a1da729907 100644 --- a/autosklearn/pipeline/components/data_preprocessing/rescaling/standardize.py +++ b/autosklearn/pipeline/components/data_preprocessing/rescaling/standardize.py @@ -1,52 +1,54 @@ from typing import Dict, Optional, Tuple, Union import numpy as np - from scipy import sparse - from sklearn.exceptions import NotFittedError from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE -from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT -from autosklearn.pipeline.components.data_preprocessing.rescaling.abstract_rescaling \ - import Rescaling -from autosklearn.pipeline.components.base import \ - AutoSklearnPreprocessingAlgorithm +from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm +from autosklearn.pipeline.components.data_preprocessing.rescaling.abstract_rescaling import ( # noqa: E501 + Rescaling, +) +from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA class StandardScalerComponent(Rescaling, AutoSklearnPreprocessingAlgorithm): def __init__( - self, - random_state: Optional[Union[int, np.random.RandomState]] = None + self, random_state: Optional[Union[int, np.random.RandomState]] = None ) -> None: from sklearn.preprocessing import StandardScaler + self.preprocessor = StandardScaler(copy=False) @staticmethod - def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None - ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: - return {'shortname': 'StandardScaler', - 'name': 'StandardScaler', - 'handles_missing_values': False, - 'handles_nominal_values': 
False, - 'handles_numerical_features': True, - 'prefers_data_scaled': False, - 'prefers_data_normalized': False, - 'handles_regression': True, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': True, - 'is_deterministic': True, - # TODO find out if this is right! - 'handles_sparse': True, - 'handles_dense': True, - 'input': (SPARSE, DENSE, UNSIGNED_DATA), - 'output': (INPUT,), - 'preferred_dtype': None} - - def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None - ) -> 'AutoSklearnPreprocessingAlgorithm': + def get_properties( + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: + return { + "shortname": "StandardScaler", + "name": "StandardScaler", + "handles_missing_values": False, + "handles_nominal_values": False, + "handles_numerical_features": True, + "prefers_data_scaled": False, + "prefers_data_normalized": False, + "handles_regression": True, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": True, + "is_deterministic": True, + # TODO find out if this is right! + "handles_sparse": True, + "handles_dense": True, + "input": (SPARSE, DENSE, UNSIGNED_DATA), + "output": (INPUT,), + "preferred_dtype": None, + } + + def fit( + self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None + ) -> "AutoSklearnPreprocessingAlgorithm": if self.preprocessor is None: raise NotFittedError() if sparse.isspmatrix(X): diff --git a/autosklearn/pipeline/components/data_preprocessing/text_encoding/__init__.py b/autosklearn/pipeline/components/data_preprocessing/text_encoding/__init__.py index 949ce83298..990ad579ca 100644 --- a/autosklearn/pipeline/components/data_preprocessing/text_encoding/__init__.py +++ b/autosklearn/pipeline/components/data_preprocessing/text_encoding/__init__.py @@ -1,34 +1,35 @@ -from collections import OrderedDict -import os - from typing import Any, Dict, Optional +import os +from collections import OrderedDict + from ConfigSpace import Configuration from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import CategoricalHyperparameter - from sklearn.base import BaseEstimator -from ...base import AutoSklearnPreprocessingAlgorithm, find_components,\ - ThirdPartyComponents, AutoSklearnChoice, _addons - from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE +from ...base import ( + AutoSklearnChoice, + AutoSklearnPreprocessingAlgorithm, + ThirdPartyComponents, + _addons, + find_components, +) + bow_directory = os.path.split(__file__)[0] -_bows = find_components(__package__, - bow_directory, - AutoSklearnPreprocessingAlgorithm) +_bows = find_components(__package__, bow_directory, AutoSklearnPreprocessingAlgorithm) additional_components = ThirdPartyComponents(AutoSklearnPreprocessingAlgorithm) -_addons['data_preprocessing.text_encoding'] = additional_components +_addons["data_preprocessing.text_encoding"] = additional_components -def add_bow(classifier: 'BagOfWordChoice') -> None: +def add_bow(classifier: "BagOfWordChoice") -> None: additional_components.add_component(classifier) class BagOfWordChoice(AutoSklearnChoice): - @classmethod def get_components(cls: BaseEstimator) -> Dict[str, BaseEstimator]: components: Dict[str, BaseEstimator] = OrderedDict() @@ -50,49 +51,53 @@ def get_hyperparameter_search_space( # Compile a list of legal preprocessors for this problem 
available_preprocessors = self.get_available_components( - dataset_properties=dataset_properties, - include=include, exclude=exclude) + dataset_properties=dataset_properties, include=include, exclude=exclude + ) if len(available_preprocessors) == 0: raise ValueError( "No bag of word encoders found, please add any bag of word encoder" - "component.") + "component." + ) if default is None: - defaults = ['bag_of_words_encoding'] + defaults = ["bag_of_words_encoding"] for default_ in defaults: if default_ in available_preprocessors: default = default_ break - preprocessor = CategoricalHyperparameter('__choice__', - list( - available_preprocessors.keys()), - default_value=default) + preprocessor = CategoricalHyperparameter( + "__choice__", list(available_preprocessors.keys()), default_value=default + ) cs.add_hyperparameter(preprocessor) for name in available_preprocessors: - preprocessor_configuration_space = available_preprocessors[name]. \ - get_hyperparameter_search_space(dataset_properties) - parent_hyperparameter = {'parent': preprocessor, 'value': name} - cs.add_configuration_space(name, preprocessor_configuration_space, - parent_hyperparameter=parent_hyperparameter) + preprocessor_configuration_space = available_preprocessors[ + name + ].get_hyperparameter_search_space(dataset_properties) + parent_hyperparameter = {"parent": preprocessor, "value": name} + cs.add_configuration_space( + name, + preprocessor_configuration_space, + parent_hyperparameter=parent_hyperparameter, + ) self.configuration_space = cs self.dataset_properties = dataset_properties return cs - def set_hyperparameters(self, configuration: Configuration, - init_params: Optional[Dict[str, Any]] = None - ) -> 'BagOfWordChoice': + def set_hyperparameters( + self, configuration: Configuration, init_params: Optional[Dict[str, Any]] = None + ) -> "BagOfWordChoice": new_params = {} params = configuration.get_dictionary() - choice = params['__choice__'] - del params['__choice__'] + choice = params["__choice__"] + del params["__choice__"] for param, value in params.items(): - param = param.replace(choice, '').replace(':', '') + param = param.replace(choice, "").replace(":", "") new_params[param] = value if init_params is not None: @@ -102,10 +107,10 @@ def set_hyperparameters(self, configuration: Configuration, # in order to not pass it to the no encoding if choice not in param: continue - param = param.replace(choice, '').replace(':', '') + param = param.replace(choice, "").replace(":", "") new_params[param] = value - new_params['random_state'] = self.random_state + new_params["random_state"] = self.random_state self.new_params = new_params self.choice = self.get_components()[choice](**new_params) diff --git a/autosklearn/pipeline/components/data_preprocessing/text_encoding/bag_of_word_encoding.py b/autosklearn/pipeline/components/data_preprocessing/text_encoding/bag_of_word_encoding.py index c66c67046b..47a80684f7 100644 --- a/autosklearn/pipeline/components/data_preprocessing/text_encoding/bag_of_word_encoding.py +++ b/autosklearn/pipeline/components/data_preprocessing/text_encoding/bag_of_word_encoding.py @@ -1,19 +1,17 @@ from typing import Dict, Optional, Tuple, Union -from ConfigSpace.configuration_space import ConfigurationSpace -import ConfigSpace.hyperparameters as CSH -from ConfigSpace import EqualsCondition +import itertools +import ConfigSpace.hyperparameters as CSH import numpy as np import pandas as pd -import itertools - +from ConfigSpace import EqualsCondition +from ConfigSpace.configuration_space import 
ConfigurationSpace
+from sklearn.feature_extraction.text import CountVectorizer

 from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE
 from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm
-from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT
-
-from sklearn.feature_extraction.text import CountVectorizer
+from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA


 class BagOfWordEncoder(AutoSklearnPreprocessingAlgorithm):
@@ -23,7 +21,7 @@ def __init__(
         min_df_choice: str = "min_df_absolute",
         min_df_absolute: int = 0,
         min_df_relative: float = 0.01,
-        random_state: Optional[Union[int, np.random.RandomState]] = None
+        random_state: Optional[Union[int, np.random.RandomState]] = None,
     ) -> None:
         self.ngram_range = ngram_range
         self.random_state = random_state
@@ -31,30 +29,38 @@ def __init__(
         self.min_df_absolute = min_df_absolute
         self.min_df_relative = min_df_relative

-    def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None
-            ) -> 'BagOfWordEncoder':
-
-        if isinstance(X, pd.DataFrame):
-            X.fillna("", inplace=True)
-            # define a CountVectorizer for every feature (implicitly defined by order of columns,
-            # maybe change the list
-            # to a dictionary with features as keys)
-            if self.min_df_choice == "min_df_absolute":
-                self.preprocessor = CountVectorizer(min_df=self.min_df_absolute,
-                                                    ngram_range=(1, self.ngram_range))
-            elif self.min_df_choice == "min_df_relative":
-                self.preprocessor = CountVectorizer(min_df=self.min_df_relative,
-                                                    ngram_range=(1, self.ngram_range))
-            else:
-                raise KeyError()
+    def fit(
+        self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None
+    ) -> "BagOfWordEncoder":
+
+        if not isinstance(X, pd.DataFrame):
+            raise ValueError(
+                "Your text data is not encoded in a pandas.DataFrame\n"
+                "Please make sure to use a pandas.DataFrame and ensure"
+                " that the text features are encoded as strings."
+ ) + + X.fillna("", inplace=True) + + # define a CountVectorizer for used on every feature + if self.min_df_choice == "min_df_absolute": + self.preprocessor = CountVectorizer( + min_df=self.min_df_absolute, + ngram_range=(1, self.ngram_range), + ) - all_text = itertools.chain.from_iterable(X[col] for col in X.columns) - self.preprocessor = self.preprocessor.fit(all_text) + elif self.min_df_choice == "min_df_relative": + self.preprocessor = CountVectorizer( + min_df=self.min_df_relative, + ngram_range=(1, self.ngram_range), + ) else: - raise ValueError("Your text data is not encoded in a pandas.DataFrame\n" - "Please make sure to use a pandas.DataFrame and ensure" - "that the text features are encoded as strings.") + raise KeyError() + + all_text = itertools.chain.from_iterable(X[col] for col in X.columns) + self.preprocessor = self.preprocessor.fit(all_text) + return self def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE: @@ -70,42 +76,55 @@ def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE: return X_transformed @staticmethod - def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None - ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: - return {'shortname': 'BOW', - 'name': 'Bag Of Word Encoder', - 'handles_regression': True, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': True, - 'handles_sparse': True, - 'handles_dense': True, - 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (INPUT,), } + def get_properties( + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: + return { + "shortname": "BOW", + "name": "Bag Of Word Encoder", + "handles_regression": True, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": True, + "handles_sparse": True, + "handles_dense": True, + "input": (DENSE, SPARSE, UNSIGNED_DATA), + "output": (INPUT,), + } @staticmethod - def get_hyperparameter_search_space(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None - ) -> ConfigurationSpace: + def get_hyperparameter_search_space( + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> ConfigurationSpace: cs = ConfigurationSpace() - hp_ngram_range = CSH.UniformIntegerHyperparameter(name="ngram_range", lower=1, upper=3, - default_value=1) - hp_min_df_choice_bow = CSH.CategoricalHyperparameter("min_df_choice", - choices=["min_df_absolute", - "min_df_relative"]) - hp_min_df_absolute_bow = CSH.UniformIntegerHyperparameter(name="min_df_absolute", lower=0, - upper=10, - default_value=0) - hp_min_df_relative_bow = CSH.UniformFloatHyperparameter(name="min_df_relative", lower=0.01, - upper=1.0, - default_value=0.01, log=True) + hp_ngram_range = CSH.UniformIntegerHyperparameter( + name="ngram_range", lower=1, upper=3, default_value=1 + ) + hp_min_df_choice_bow = CSH.CategoricalHyperparameter( + "min_df_choice", choices=["min_df_absolute", "min_df_relative"] + ) + hp_min_df_absolute_bow = CSH.UniformIntegerHyperparameter( + name="min_df_absolute", lower=0, upper=10, default_value=0 + ) + hp_min_df_relative_bow = CSH.UniformFloatHyperparameter( + name="min_df_relative", lower=0.01, upper=1.0, default_value=0.01, log=True + ) cs.add_hyperparameters( - [hp_ngram_range, hp_min_df_choice_bow, hp_min_df_absolute_bow, hp_min_df_relative_bow]) - - cond_min_df_absolute_bow = EqualsCondition(hp_min_df_absolute_bow, hp_min_df_choice_bow, - 
"min_df_absolute") - cond_min_df_relative_bow = EqualsCondition(hp_min_df_relative_bow, hp_min_df_choice_bow, - "min_df_relative") + [ + hp_ngram_range, + hp_min_df_choice_bow, + hp_min_df_absolute_bow, + hp_min_df_relative_bow, + ] + ) + + cond_min_df_absolute_bow = EqualsCondition( + hp_min_df_absolute_bow, hp_min_df_choice_bow, "min_df_absolute" + ) + cond_min_df_relative_bow = EqualsCondition( + hp_min_df_relative_bow, hp_min_df_choice_bow, "min_df_relative" + ) cs.add_conditions([cond_min_df_absolute_bow, cond_min_df_relative_bow]) # maybe add bigrams ... diff --git a/autosklearn/pipeline/components/data_preprocessing/text_encoding/bag_of_word_encoding_distinct.py b/autosklearn/pipeline/components/data_preprocessing/text_encoding/bag_of_word_encoding_distinct.py index 85851ca72e..22a0be1088 100644 --- a/autosklearn/pipeline/components/data_preprocessing/text_encoding/bag_of_word_encoding_distinct.py +++ b/autosklearn/pipeline/components/data_preprocessing/text_encoding/bag_of_word_encoding_distinct.py @@ -1,18 +1,16 @@ from typing import Dict, Optional, Tuple, Union -from ConfigSpace.configuration_space import ConfigurationSpace import ConfigSpace.hyperparameters as CSH -from ConfigSpace import EqualsCondition - import numpy as np import pandas as pd +from ConfigSpace import EqualsCondition +from ConfigSpace.configuration_space import ConfigurationSpace from scipy.sparse import hstack +from sklearn.feature_extraction.text import CountVectorizer from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm -from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT - -from sklearn.feature_extraction.text import CountVectorizer +from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA class BagOfWordEncoder(AutoSklearnPreprocessingAlgorithm): @@ -22,7 +20,7 @@ def __init__( min_df_choice: str = "min_df_absolute", min_df_absolute: int = 0, min_df_relative: float = 0.01, - random_state: Optional[Union[int, np.random.RandomState]] = None + random_state: Optional[Union[int, np.random.RandomState]] = None, ) -> None: self.ngram_range = ngram_range self.random_state = random_state @@ -30,8 +28,9 @@ def __init__( self.min_df_absolute = min_df_absolute self.min_df_relative = min_df_relative - def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None - ) -> 'BagOfWordEncoder': + def fit( + self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None + ) -> "BagOfWordEncoder": if isinstance(X, pd.DataFrame): X.fillna("", inplace=True) @@ -40,8 +39,9 @@ def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None self.preprocessor = {} for feature in X.columns: - vectorizer = CountVectorizer(min_df=self.min_df_absolute, - ngram_range=(1, self.ngram_range)).fit(X[feature]) + vectorizer = CountVectorizer( + min_df=self.min_df_absolute, ngram_range=(1, self.ngram_range) + ).fit(X[feature]) self.preprocessor[feature] = vectorizer elif self.min_df_choice == "min_df_relative": @@ -49,15 +49,18 @@ def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None self.preprocessor = {} for feature in X.columns: - vectorizer = CountVectorizer(min_df=self.min_df_relative, - ngram_range=(1, self.ngram_range)).fit(X[feature]) + vectorizer = CountVectorizer( + min_df=self.min_df_relative, ngram_range=(1, self.ngram_range) + ).fit(X[feature]) self.preprocessor[feature] = vectorizer else: raise KeyError() else: 
-            raise ValueError("Your text data is not encoded in a pandas.DataFrame\n"
-                             "Please make sure to use a pandas.DataFrame and ensure"
-                             "that the text features are encoded as strings.")
+            raise ValueError(
+                "Your text data is not encoded in a pandas.DataFrame\n"
+                "Please make sure to use a pandas.DataFrame and ensure"
+                " that the text features are encoded as strings."
+            )
         return self

     def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE:
@@ -77,42 +80,55 @@ def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE:
         return X_new

     @staticmethod
-    def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None
-                       ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]:
-        return {'shortname': 'BOW',
-                'name': 'Bag Of Word Encoder',
-                'handles_regression': True,
-                'handles_classification': True,
-                'handles_multiclass': True,
-                'handles_multilabel': True,
-                'handles_multioutput': True,
-                'handles_sparse': True,
-                'handles_dense': True,
-                'input': (DENSE, SPARSE, UNSIGNED_DATA),
-                'output': (INPUT,), }
+    def get_properties(
+        dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None,
+    ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]:
+        return {
+            "shortname": "BOW",
+            "name": "Bag Of Word Encoder",
+            "handles_regression": True,
+            "handles_classification": True,
+            "handles_multiclass": True,
+            "handles_multilabel": True,
+            "handles_multioutput": True,
+            "handles_sparse": True,
+            "handles_dense": True,
+            "input": (DENSE, SPARSE, UNSIGNED_DATA),
+            "output": (INPUT,),
+        }

     @staticmethod
-    def get_hyperparameter_search_space(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None
-                                        ) -> ConfigurationSpace:
+    def get_hyperparameter_search_space(
+        dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None,
+    ) -> ConfigurationSpace:
         cs = ConfigurationSpace()
-        hp_ngram_range = CSH.UniformIntegerHyperparameter(name="ngram_range", lower=1, upper=3,
-                                                          default_value=1)
-        hp_min_df_choice_bow = CSH.CategoricalHyperparameter("min_df_choice",
-                                                             choices=["min_df_absolute",
-                                                                      "min_df_relative"])
-        hp_min_df_absolute_bow = CSH.UniformIntegerHyperparameter(name="min_df_absolute", lower=0,
-                                                                  upper=10,
-                                                                  default_value=0)
-        hp_min_df_relative_bow = CSH.UniformFloatHyperparameter(name="min_df_relative", lower=0.01,
-                                                                upper=1.0,
-                                                                default_value=0.01, log=True)
+        hp_ngram_range = CSH.UniformIntegerHyperparameter(
+            name="ngram_range", lower=1, upper=3, default_value=1
+        )
+        hp_min_df_choice_bow = CSH.CategoricalHyperparameter(
+            "min_df_choice", choices=["min_df_absolute", "min_df_relative"]
+        )
+        hp_min_df_absolute_bow = CSH.UniformIntegerHyperparameter(
+            name="min_df_absolute", lower=0, upper=10, default_value=0
+        )
+        hp_min_df_relative_bow = CSH.UniformFloatHyperparameter(
+            name="min_df_relative", lower=0.01, upper=1.0, default_value=0.01, log=True
+        )
         cs.add_hyperparameters(
-            [hp_ngram_range, hp_min_df_choice_bow, hp_min_df_absolute_bow, hp_min_df_relative_bow])
-
-        cond_min_df_absolute_bow = EqualsCondition(hp_min_df_absolute_bow, hp_min_df_choice_bow,
-                                                   "min_df_absolute")
-        cond_min_df_relative_bow = EqualsCondition(hp_min_df_relative_bow, hp_min_df_choice_bow,
-                                                   "min_df_relative")
+            [
+                hp_ngram_range,
+                hp_min_df_choice_bow,
+                hp_min_df_absolute_bow,
+                hp_min_df_relative_bow,
+            ]
+        )
+
+        cond_min_df_absolute_bow = EqualsCondition(
+            hp_min_df_absolute_bow, hp_min_df_choice_bow, "min_df_absolute"
+        )
+        cond_min_df_relative_bow = EqualsCondition(
+            hp_min_df_relative_bow, hp_min_df_choice_bow, "min_df_relative"
+        )
         cs.add_conditions([cond_min_df_absolute_bow,
cond_min_df_relative_bow]) # maybe add bigrams ... diff --git a/autosklearn/pipeline/components/data_preprocessing/text_encoding/tfidf_encoding.py b/autosklearn/pipeline/components/data_preprocessing/text_encoding/tfidf_encoding.py index 0b7ef239f1..aea4a05906 100644 --- a/autosklearn/pipeline/components/data_preprocessing/text_encoding/tfidf_encoding.py +++ b/autosklearn/pipeline/components/data_preprocessing/text_encoding/tfidf_encoding.py @@ -1,18 +1,17 @@ from typing import Dict, Optional, Tuple, Union -from ConfigSpace.configuration_space import ConfigurationSpace -import ConfigSpace.hyperparameters as CSH -from ConfigSpace import EqualsCondition +import itertools +import ConfigSpace.hyperparameters as CSH import numpy as np import pandas as pd -import itertools +from ConfigSpace import EqualsCondition +from ConfigSpace.configuration_space import ConfigurationSpace +from sklearn.feature_extraction.text import TfidfVectorizer from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm -from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT - -from sklearn.feature_extraction.text import TfidfVectorizer +from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA class TfidfEncoder(AutoSklearnPreprocessingAlgorithm): @@ -23,7 +22,7 @@ def __init__( min_df_choice: str = "min_df_absolute", min_df_absolute: int = 0, min_df_relative: float = 0.01, - random_state: Optional[Union[int, np.random.RandomState]] = None + random_state: Optional[Union[int, np.random.RandomState]] = None, ) -> None: self.ngram_range = ngram_range self.random_state = random_state @@ -32,32 +31,41 @@ def __init__( self.min_df_absolute = min_df_absolute self.min_df_relative = min_df_relative - def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None - ) -> 'TfidfEncoder': - - if isinstance(X, pd.DataFrame): - X.fillna("", inplace=True) - # define a CountVectorizer for every feature (implicitly defined by order of columns, - # maybe change the list - # to a dictionary with features as keys) - if self.min_df_choice == "min_df_absolute": - self.preprocessor = TfidfVectorizer(min_df=self.min_df_absolute, - use_idf=self.use_idf, - ngram_range=(1, self.ngram_range)) - elif self.min_df_choice == "min_df_relative": - self.preprocessor = TfidfVectorizer(min_df=self.min_df_relative, - use_idf=self.use_idf, - ngram_range=(1, self.ngram_range)) - else: - raise KeyError() + def fit( + self, + X: PIPELINE_DATA_DTYPE, + y: Optional[PIPELINE_DATA_DTYPE] = None, + ) -> "TfidfEncoder": + + if not isinstance(X, pd.DataFrame): + raise ValueError( + "Your text data is not encoded in a pandas.DataFrame\n" + "Please make sure to use a pandas.DataFrame and ensure" + " that the text features are encoded as strings." 
+ ) + + X.fillna("", inplace=True) + + if self.min_df_choice == "min_df_absolute": + self.preprocessor = TfidfVectorizer( + min_df=self.min_df_absolute, + use_idf=self.use_idf, + ngram_range=(1, self.ngram_range), + ) - all_text = itertools.chain.from_iterable(X[col] for col in X.columns) - self.preprocessor = self.preprocessor.fit(all_text) + elif self.min_df_choice == "min_df_relative": + self.preprocessor = TfidfVectorizer( + min_df=self.min_df_relative, + use_idf=self.use_idf, + ngram_range=(1, self.ngram_range), + ) else: - raise ValueError("Your text data is not encoded in a pandas.DataFrame\n" - "Please make sure to use a pandas.DataFrame and ensure" - " that the text features are encoded as strings.") + raise KeyError() + + all_text = itertools.chain.from_iterable(X[col] for col in X.columns) + self.preprocessor = self.preprocessor.fit(all_text) + return self def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE: @@ -73,41 +81,57 @@ def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE: return X_transformed @staticmethod - def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None - ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: - return {'shortname': 'RBOW', - 'name': 'Relative Bag Of Word Encoder', - 'handles_regression': True, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': True, - 'handles_sparse': True, - 'handles_dense': True, - 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (INPUT,), } + def get_properties( + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: + return { + "shortname": "RBOW", + "name": "Relative Bag Of Word Encoder", + "handles_regression": True, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": True, + "handles_sparse": True, + "handles_dense": True, + "input": (DENSE, SPARSE, UNSIGNED_DATA), + "output": (INPUT,), + } @staticmethod - def get_hyperparameter_search_space(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None - ) -> ConfigurationSpace: + def get_hyperparameter_search_space( + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> ConfigurationSpace: cs = ConfigurationSpace() - hp_ngram_range = CSH.UniformIntegerHyperparameter(name="ngram_range", lower=1, upper=3, - default_value=1) + hp_ngram_range = CSH.UniformIntegerHyperparameter( + name="ngram_range", lower=1, upper=3, default_value=1 + ) hp_use_idf = CSH.CategoricalHyperparameter("use_idf", choices=[False, True]) - hp_min_df_choice = CSH.CategoricalHyperparameter("min_df_choice", - choices=["min_df_absolute", - "min_df_relative"]) - hp_min_df_absolute = CSH.UniformIntegerHyperparameter(name="min_df_absolute", lower=0, - upper=10, default_value=0) - hp_min_df_relative = CSH.UniformFloatHyperparameter(name="min_df_relative", lower=0.01, - upper=1.0, default_value=0.01, log=True) + hp_min_df_choice = CSH.CategoricalHyperparameter( + "min_df_choice", choices=["min_df_absolute", "min_df_relative"] + ) + hp_min_df_absolute = CSH.UniformIntegerHyperparameter( + name="min_df_absolute", lower=0, upper=10, default_value=0 + ) + hp_min_df_relative = CSH.UniformFloatHyperparameter( + name="min_df_relative", lower=0.01, upper=1.0, default_value=0.01, log=True + ) cs.add_hyperparameters( - [hp_ngram_range, hp_use_idf, hp_min_df_choice, hp_min_df_absolute, hp_min_df_relative]) - - cond_min_df_absolute = 
EqualsCondition(hp_min_df_absolute, hp_min_df_choice, - "min_df_absolute") - cond_min_df_relative = EqualsCondition(hp_min_df_relative, hp_min_df_choice, - "min_df_relative") + [ + hp_ngram_range, + hp_use_idf, + hp_min_df_choice, + hp_min_df_absolute, + hp_min_df_relative, + ] + ) + + cond_min_df_absolute = EqualsCondition( + hp_min_df_absolute, hp_min_df_choice, "min_df_absolute" + ) + cond_min_df_relative = EqualsCondition( + hp_min_df_relative, hp_min_df_choice, "min_df_relative" + ) cs.add_conditions([cond_min_df_absolute, cond_min_df_relative]) # maybe add bigrams ... diff --git a/autosklearn/pipeline/components/data_preprocessing/variance_threshold/variance_threshold.py b/autosklearn/pipeline/components/data_preprocessing/variance_threshold/variance_threshold.py index c6dc42a4df..365ae405a0 100644 --- a/autosklearn/pipeline/components/data_preprocessing/variance_threshold/variance_threshold.py +++ b/autosklearn/pipeline/components/data_preprocessing/variance_threshold/variance_threshold.py @@ -1,29 +1,25 @@ from typing import Dict, Optional, Tuple, Union -from ConfigSpace.configuration_space import ConfigurationSpace - import numpy as np +import sklearn.feature_selection +from ConfigSpace.configuration_space import ConfigurationSpace from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm -from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT - -import sklearn.feature_selection +from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA class VarianceThreshold(AutoSklearnPreprocessingAlgorithm): def __init__( - self, - random_state: Optional[Union[int, np.random.RandomState]] = None + self, random_state: Optional[Union[int, np.random.RandomState]] = None ) -> None: # VarianceThreshold does not support fit_transform (as of 0.19.1)! 
self.random_state = random_state - def fit(self, X: PIPELINE_DATA_DTYPE, - y: Optional[PIPELINE_DATA_DTYPE] = None) -> 'VarianceThreshold': - self.preprocessor = sklearn.feature_selection.VarianceThreshold( - threshold=0.0 - ) + def fit( + self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None + ) -> "VarianceThreshold": + self.preprocessor = sklearn.feature_selection.VarianceThreshold(threshold=0.0) self.preprocessor = self.preprocessor.fit(X) return self @@ -33,25 +29,27 @@ def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE: return self.preprocessor.transform(X) @staticmethod - def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None - ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: + def get_properties( + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: return { - 'shortname': 'Variance Threshold', - 'name': 'Variance Threshold (constant feature removal)', - 'handles_regression': True, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': True, - 'is_deterministic': True, - 'handles_sparse': True, - 'handles_dense': True, - 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (INPUT,), + "shortname": "Variance Threshold", + "name": "Variance Threshold (constant feature removal)", + "handles_regression": True, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": True, + "is_deterministic": True, + "handles_sparse": True, + "handles_dense": True, + "input": (DENSE, SPARSE, UNSIGNED_DATA), + "output": (INPUT,), } @staticmethod - def get_hyperparameter_search_space(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None - ) -> ConfigurationSpace: + def get_hyperparameter_search_space( + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> ConfigurationSpace: cs = ConfigurationSpace() return cs diff --git a/autosklearn/pipeline/components/feature_preprocessing/__init__.py b/autosklearn/pipeline/components/feature_preprocessing/__init__.py index e124d135d0..cd52d6ad34 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/__init__.py +++ b/autosklearn/pipeline/components/feature_preprocessing/__init__.py @@ -1,18 +1,25 @@ +from typing import Type + import os from collections import OrderedDict -from typing import Type -from ..base import AutoSklearnPreprocessingAlgorithm, find_components, \ - ThirdPartyComponents, AutoSklearnChoice, _addons from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import CategoricalHyperparameter +from ..base import ( + AutoSklearnChoice, + AutoSklearnPreprocessingAlgorithm, + ThirdPartyComponents, + _addons, + find_components, +) + classifier_directory = os.path.split(__file__)[0] -_preprocessors = find_components(__package__, - classifier_directory, - AutoSklearnPreprocessingAlgorithm) +_preprocessors = find_components( + __package__, classifier_directory, AutoSklearnPreprocessingAlgorithm +) additional_components = ThirdPartyComponents(AutoSklearnPreprocessingAlgorithm) -_addons['feature_preprocessing'] = additional_components +_addons["feature_preprocessing"] = additional_components def add_preprocessor(preprocessor: Type[AutoSklearnPreprocessingAlgorithm]) -> None: @@ -20,7 +27,6 @@ def add_preprocessor(preprocessor: Type[AutoSklearnPreprocessingAlgorithm]) -> N class FeaturePreprocessorChoice(AutoSklearnChoice): - 
@classmethod def get_components(cls): components = OrderedDict() @@ -28,23 +34,25 @@ def get_components(cls): components.update(additional_components.components) return components - def get_available_components(self, dataset_properties=None, - include=None, - exclude=None): + def get_available_components( + self, dataset_properties=None, include=None, exclude=None + ): if dataset_properties is None: dataset_properties = {} if include is not None and exclude is not None: raise ValueError( - "The argument include and exclude cannot be used together.") + "The argument include and exclude cannot be used together." + ) available_comp = self.get_components() if include is not None: for incl in include: if incl not in available_comp: - raise ValueError("Trying to include unknown component: " - "%s" % incl) + raise ValueError( + "Trying to include unknown component: " "%s" % incl + ) # TODO check for task type classification and/or regression! @@ -58,38 +66,43 @@ def get_available_components(self, dataset_properties=None, entry = available_comp[name] # Exclude itself to avoid infinite loop - if entry == FeaturePreprocessorChoice or hasattr(entry, 'get_components'): + if entry == FeaturePreprocessorChoice or hasattr(entry, "get_components"): continue - target_type = dataset_properties['target_type'] - if target_type == 'classification': - if entry.get_properties()['handles_classification'] is False: + target_type = dataset_properties["target_type"] + if target_type == "classification": + if entry.get_properties()["handles_classification"] is False: continue - if dataset_properties.get('multiclass') is True and \ - entry.get_properties()['handles_multiclass'] is False: + if ( + dataset_properties.get("multiclass") is True + and entry.get_properties()["handles_multiclass"] is False + ): continue - if dataset_properties.get('multilabel') is True and \ - entry.get_properties()['handles_multilabel'] is False: + if ( + dataset_properties.get("multilabel") is True + and entry.get_properties()["handles_multilabel"] is False + ): continue - elif target_type == 'regression': - if entry.get_properties()['handles_regression'] is False: + elif target_type == "regression": + if entry.get_properties()["handles_regression"] is False: continue - if dataset_properties.get('multioutput') is True and \ - entry.get_properties()['handles_multioutput'] is False: + if ( + dataset_properties.get("multioutput") is True + and entry.get_properties()["handles_multioutput"] is False + ): continue else: - raise ValueError('Unknown target type %s' % target_type) + raise ValueError("Unknown target type %s" % target_type) components_dict[name] = entry return components_dict - def get_hyperparameter_search_space(self, dataset_properties=None, - default=None, - include=None, - exclude=None): + def get_hyperparameter_search_space( + self, dataset_properties=None, default=None, include=None, exclude=None + ): cs = ConfigurationSpace() if dataset_properties is None: @@ -97,32 +110,33 @@ def get_hyperparameter_search_space(self, dataset_properties=None, # Compile a list of legal preprocessors for this problem available_preprocessors = self.get_available_components( - dataset_properties=dataset_properties, - include=include, exclude=exclude) + dataset_properties=dataset_properties, include=include, exclude=exclude + ) if len(available_preprocessors) == 0: - raise ValueError( - "No preprocessors found, please add NoPreprocessing") + raise ValueError("No preprocessors found, please add NoPreprocessing") if default is None: - defaults = 
['no_preprocessing', 'select_percentile', 'pca', - 'truncatedSVD'] + defaults = ["no_preprocessing", "select_percentile", "pca", "truncatedSVD"] for default_ in defaults: if default_ in available_preprocessors: default = default_ break - preprocessor = CategoricalHyperparameter('__choice__', - list( - available_preprocessors.keys()), - default_value=default) + preprocessor = CategoricalHyperparameter( + "__choice__", list(available_preprocessors.keys()), default_value=default + ) cs.add_hyperparameter(preprocessor) for name in available_preprocessors: - preprocessor_configuration_space = available_preprocessors[name]. \ - get_hyperparameter_search_space(dataset_properties) - parent_hyperparameter = {'parent': preprocessor, 'value': name} - cs.add_configuration_space(name, preprocessor_configuration_space, - parent_hyperparameter=parent_hyperparameter) + preprocessor_configuration_space = available_preprocessors[ + name + ].get_hyperparameter_search_space(dataset_properties) + parent_hyperparameter = {"parent": preprocessor, "value": name} + cs.add_configuration_space( + name, + preprocessor_configuration_space, + parent_hyperparameter=parent_hyperparameter, + ) return cs diff --git a/autosklearn/pipeline/components/feature_preprocessing/densifier.py b/autosklearn/pipeline/components/feature_preprocessing/densifier.py index 0f0732f298..f5c88ecadf 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/densifier.py +++ b/autosklearn/pipeline/components/feature_preprocessing/densifier.py @@ -1,7 +1,7 @@ from ConfigSpace.configuration_space import ConfigurationSpace from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm -from autosklearn.pipeline.constants import SPARSE, UNSIGNED_DATA, DENSE, INPUT +from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA class Densifier(AutoSklearnPreprocessingAlgorithm): @@ -14,6 +14,7 @@ def fit(self, X, y=None): def transform(self, X): from scipy import sparse + if sparse.issparse(X): return X.todense().getA() else: @@ -21,16 +22,18 @@ def transform(self, X): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'RandomTreesEmbedding', - 'name': 'Random Trees Embedding', - 'handles_regression': True, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': True, - 'is_deterministic': True, - 'input': (SPARSE, UNSIGNED_DATA), - 'output': (DENSE, INPUT)} + return { + "shortname": "RandomTreesEmbedding", + "name": "Random Trees Embedding", + "handles_regression": True, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": True, + "is_deterministic": True, + "input": (SPARSE, UNSIGNED_DATA), + "output": (DENSE, INPUT), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/components/feature_preprocessing/extra_trees_preproc_for_classification.py b/autosklearn/pipeline/components/feature_preprocessing/extra_trees_preproc_for_classification.py index 622180af8f..dad45795b8 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/extra_trees_preproc_for_classification.py +++ b/autosklearn/pipeline/components/feature_preprocessing/extra_trees_preproc_for_classification.py @@ -1,27 +1,43 @@ from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import UniformFloatHyperparameter, \ - UniformIntegerHyperparameter, CategoricalHyperparameter, \ - 
UnParametrizedHyperparameter, Constant - -from autosklearn.pipeline.components.base import \ - AutoSklearnPreprocessingAlgorithm -from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + Constant, + UniformFloatHyperparameter, + UniformIntegerHyperparameter, + UnParametrizedHyperparameter, +) + +from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm +from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA from autosklearn.util.common import check_for_bool, check_none class ExtraTreesPreprocessorClassification(AutoSklearnPreprocessingAlgorithm): - - def __init__(self, n_estimators, criterion, min_samples_leaf, - min_samples_split, max_features, bootstrap, max_leaf_nodes, - max_depth, min_weight_fraction_leaf, min_impurity_decrease, - oob_score=False, n_jobs=1, random_state=None, verbose=0, - class_weight=None): + def __init__( + self, + n_estimators, + criterion, + min_samples_leaf, + min_samples_split, + max_features, + bootstrap, + max_leaf_nodes, + max_depth, + min_weight_fraction_leaf, + min_impurity_decrease, + oob_score=False, + n_jobs=1, + random_state=None, + verbose=0, + class_weight=None, + ): self.n_estimators = n_estimators self.estimator_increment = 10 if criterion not in ("gini", "entropy"): - raise ValueError("'criterion' is not in ('gini', 'entropy'): " - "%s" % criterion) + raise ValueError( + "'criterion' is not in ('gini', 'entropy'): " "%s" % criterion + ) self.criterion = criterion self.min_samples_leaf = min_samples_leaf self.min_samples_split = min_samples_split @@ -78,11 +94,12 @@ def fit(self, X, Y, sample_weight=None): n_jobs=self.n_jobs, verbose=self.verbose, random_state=self.random_state, - class_weight=self.class_weight) + class_weight=self.class_weight, + ) estimator.fit(X, Y, sample_weight=sample_weight) - self.preprocessor = SelectFromModel(estimator=estimator, - threshold='mean', - prefit=True) + self.preprocessor = SelectFromModel( + estimator=estimator, threshold="mean", prefit=True + ) return self def transform(self, X): @@ -92,16 +109,18 @@ def transform(self, X): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'ETC', - 'name': 'Extra Trees Classifier Preprocessing', - 'handles_regression': False, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': False, - 'is_deterministic': True, - 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (INPUT,)} + return { + "shortname": "ETC", + "name": "Extra Trees Classifier Preprocessing", + "handles_regression": False, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": False, + "is_deterministic": True, + "input": (DENSE, SPARSE, UNSIGNED_DATA), + "output": (INPUT,), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): @@ -109,28 +128,45 @@ def get_hyperparameter_search_space(dataset_properties=None): n_estimators = Constant("n_estimators", 100) criterion = CategoricalHyperparameter( - "criterion", ["gini", "entropy"], default_value="gini") - max_features = UniformFloatHyperparameter("max_features", 0, 1, - default_value=0.5) + "criterion", ["gini", "entropy"], default_value="gini" + ) + max_features = UniformFloatHyperparameter( + "max_features", 0, 1, default_value=0.5 + ) max_depth = UnParametrizedHyperparameter(name="max_depth", value="None") max_leaf_nodes = 
UnParametrizedHyperparameter("max_leaf_nodes", "None") min_samples_split = UniformIntegerHyperparameter( - "min_samples_split", 2, 20, default_value=2) + "min_samples_split", 2, 20, default_value=2 + ) min_samples_leaf = UniformIntegerHyperparameter( - "min_samples_leaf", 1, 20, default_value=1) + "min_samples_leaf", 1, 20, default_value=1 + ) min_weight_fraction_leaf = UnParametrizedHyperparameter( - 'min_weight_fraction_leaf', 0.) + "min_weight_fraction_leaf", 0.0 + ) min_impurity_decrease = UnParametrizedHyperparameter( - 'min_impurity_decrease', 0.) + "min_impurity_decrease", 0.0 + ) bootstrap = CategoricalHyperparameter( - "bootstrap", ["True", "False"], default_value="False") - - cs.add_hyperparameters([n_estimators, criterion, max_features, - max_depth, max_leaf_nodes, min_samples_split, - min_samples_leaf, min_weight_fraction_leaf, - min_impurity_decrease, bootstrap]) + "bootstrap", ["True", "False"], default_value="False" + ) + + cs.add_hyperparameters( + [ + n_estimators, + criterion, + max_features, + max_depth, + max_leaf_nodes, + min_samples_split, + min_samples_leaf, + min_weight_fraction_leaf, + min_impurity_decrease, + bootstrap, + ] + ) return cs diff --git a/autosklearn/pipeline/components/feature_preprocessing/extra_trees_preproc_for_regression.py b/autosklearn/pipeline/components/feature_preprocessing/extra_trees_preproc_for_regression.py index e8e28a2736..3287b837c5 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/extra_trees_preproc_for_regression.py +++ b/autosklearn/pipeline/components/feature_preprocessing/extra_trees_preproc_for_regression.py @@ -1,29 +1,43 @@ import numpy as np - from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import UniformFloatHyperparameter, \ - UniformIntegerHyperparameter, CategoricalHyperparameter, \ - UnParametrizedHyperparameter, Constant - -from autosklearn.pipeline.components.base import \ - AutoSklearnPreprocessingAlgorithm -from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + Constant, + UniformFloatHyperparameter, + UniformIntegerHyperparameter, + UnParametrizedHyperparameter, +) + +from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm +from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA from autosklearn.util.common import check_for_bool, check_none class ExtraTreesPreprocessorRegression(AutoSklearnPreprocessingAlgorithm): - - def __init__(self, n_estimators, criterion, min_samples_leaf, - min_samples_split, max_features, - bootstrap=False, max_leaf_nodes=None, max_depth="None", - min_weight_fraction_leaf=0.0, - oob_score=False, n_jobs=1, random_state=None, verbose=0): + def __init__( + self, + n_estimators, + criterion, + min_samples_leaf, + min_samples_split, + max_features, + bootstrap=False, + max_leaf_nodes=None, + max_depth="None", + min_weight_fraction_leaf=0.0, + oob_score=False, + n_jobs=1, + random_state=None, + verbose=0, + ): self.n_estimators = n_estimators self.estimator_increment = 10 if criterion not in ("mse", "friedman_mse", "mae"): - raise ValueError("'criterion' is not in ('mse', 'friedman_mse', " - "'mae'): %s" % criterion) + raise ValueError( + "'criterion' is not in ('mse', 'friedman_mse', " + "'mae'): %s" % criterion + ) self.criterion = criterion self.min_samples_leaf = min_samples_leaf self.min_samples_split = min_samples_split @@ -64,23 +78,29 @@ def fit(self, X, Y): 
self.min_weight_fraction_leaf = float(self.min_weight_fraction_leaf) num_features = X.shape[1] - max_features = int( - float(self.max_features) * (np.log(num_features) + 1)) + max_features = int(float(self.max_features) * (np.log(num_features) + 1)) # Use at most half of the features max_features = max(1, min(int(X.shape[1] / 2), max_features)) estimator = ExtraTreesRegressor( - n_estimators=self.n_estimators, criterion=self.criterion, - max_depth=self.max_depth, min_samples_split=self.min_samples_split, - min_samples_leaf=self.min_samples_leaf, bootstrap=self.bootstrap, - max_features=max_features, max_leaf_nodes=self.max_leaf_nodes, - oob_score=self.oob_score, n_jobs=self.n_jobs, verbose=self.verbose, + n_estimators=self.n_estimators, + criterion=self.criterion, + max_depth=self.max_depth, + min_samples_split=self.min_samples_split, + min_samples_leaf=self.min_samples_leaf, + bootstrap=self.bootstrap, + max_features=max_features, + max_leaf_nodes=self.max_leaf_nodes, + oob_score=self.oob_score, + n_jobs=self.n_jobs, + verbose=self.verbose, min_weight_fraction_leaf=self.min_weight_fraction_leaf, - random_state=self.random_state) + random_state=self.random_state, + ) estimator.fit(X, Y) - self.preprocessor = SelectFromModel(estimator=estimator, - threshold='mean', - prefit=True) + self.preprocessor = SelectFromModel( + estimator=estimator, threshold="mean", prefit=True + ) return self @@ -91,42 +111,58 @@ def transform(self, X): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'ETR', - 'name': 'Extra Trees Regressor Preprocessing', - 'handles_regression': True, - 'handles_classification': False, - 'handles_multiclass': False, - 'handles_multilabel': False, - 'handles_multioutput': True, - 'is_deterministic': True, - 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (INPUT,)} + return { + "shortname": "ETR", + "name": "Extra Trees Regressor Preprocessing", + "handles_regression": True, + "handles_classification": False, + "handles_multiclass": False, + "handles_multilabel": False, + "handles_multioutput": True, + "is_deterministic": True, + "input": (DENSE, SPARSE, UNSIGNED_DATA), + "output": (INPUT,), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): cs = ConfigurationSpace() n_estimators = Constant("n_estimators", 100) - criterion = CategoricalHyperparameter("criterion", - ["mse", 'friedman_mse', 'mae']) + criterion = CategoricalHyperparameter( + "criterion", ["mse", "friedman_mse", "mae"] + ) max_features = UniformFloatHyperparameter( - "max_features", 0.1, 1.0, default_value=1.0) + "max_features", 0.1, 1.0, default_value=1.0 + ) max_depth = UnParametrizedHyperparameter(name="max_depth", value="None") max_leaf_nodes = UnParametrizedHyperparameter("max_leaf_nodes", "None") min_samples_split = UniformIntegerHyperparameter( - "min_samples_split", 2, 20, default_value=2) + "min_samples_split", 2, 20, default_value=2 + ) min_samples_leaf = UniformIntegerHyperparameter( - "min_samples_leaf", 1, 20, default_value=1) - min_weight_fraction_leaf = Constant('min_weight_fraction_leaf', 0.) 
+ "min_samples_leaf", 1, 20, default_value=1 + ) + min_weight_fraction_leaf = Constant("min_weight_fraction_leaf", 0.0) bootstrap = CategoricalHyperparameter( - "bootstrap", ["True", "False"], default_value="False") - - cs.add_hyperparameters([n_estimators, criterion, max_features, max_depth, - max_leaf_nodes, min_samples_split, - min_samples_leaf, min_weight_fraction_leaf, - bootstrap]) + "bootstrap", ["True", "False"], default_value="False" + ) + + cs.add_hyperparameters( + [ + n_estimators, + criterion, + max_features, + max_depth, + max_leaf_nodes, + min_samples_split, + min_samples_leaf, + min_weight_fraction_leaf, + bootstrap, + ] + ) return cs diff --git a/autosklearn/pipeline/components/feature_preprocessing/fast_ica.py b/autosklearn/pipeline/components/feature_preprocessing/fast_ica.py index 549d708506..695ff3c2cc 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/fast_ica.py +++ b/autosklearn/pipeline/components/feature_preprocessing/fast_ica.py @@ -1,19 +1,19 @@ import warnings -from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import CategoricalHyperparameter, \ - UniformIntegerHyperparameter from ConfigSpace.conditions import EqualsCondition +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformIntegerHyperparameter, +) -from autosklearn.pipeline.components.base import \ - AutoSklearnPreprocessingAlgorithm -from autosklearn.pipeline.constants import INPUT, UNSIGNED_DATA, DENSE +from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm +from autosklearn.pipeline.constants import DENSE, INPUT, UNSIGNED_DATA from autosklearn.util.common import check_for_bool, check_none class FastICA(AutoSklearnPreprocessingAlgorithm): - def __init__(self, algorithm, whiten, fun, n_components=None, - random_state=None): + def __init__(self, algorithm, whiten, fun, n_components=None, random_state=None): self.algorithm = algorithm self.whiten = whiten self.fun = fun @@ -31,18 +31,25 @@ def fit(self, X, Y=None): self.n_components = int(self.n_components) self.preprocessor = sklearn.decomposition.FastICA( - n_components=self.n_components, algorithm=self.algorithm, - fun=self.fun, whiten=self.whiten, random_state=self.random_state + n_components=self.n_components, + algorithm=self.algorithm, + fun=self.fun, + whiten=self.whiten, + random_state=self.random_state, ) # Make the RuntimeWarning an Exception! 
with warnings.catch_warnings(): - warnings.filterwarnings("error", message='array must not contain infs or NaNs') + warnings.filterwarnings( + "error", message="array must not contain infs or NaNs" + ) try: self.preprocessor.fit(X) except ValueError as e: - if 'array must not contain infs or NaNs' in e.args[0]: - raise ValueError("Bug in scikit-learn: " - "https://github.com/scikit-learn/scikit-learn/pull/2738") + if "array must not contain infs or NaNs" in e.args[0]: + raise ValueError( + "Bug in scikit-learn: " + "https://github.com/scikit-learn/scikit-learn/pull/2738" + ) return self @@ -53,25 +60,31 @@ def transform(self, X): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'FastICA', - 'name': 'Fast Independent Component Analysis', - 'handles_regression': True, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': True, - 'is_deterministic': False, - 'input': (DENSE, UNSIGNED_DATA), - 'output': (INPUT, UNSIGNED_DATA)} + return { + "shortname": "FastICA", + "name": "Fast Independent Component Analysis", + "handles_regression": True, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": True, + "is_deterministic": False, + "input": (DENSE, UNSIGNED_DATA), + "output": (INPUT, UNSIGNED_DATA), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): cs = ConfigurationSpace() - n_components = UniformIntegerHyperparameter("n_components", 10, 2000, default_value=100) - algorithm = CategoricalHyperparameter('algorithm', ['parallel', 'deflation'], 'parallel') - whiten = CategoricalHyperparameter('whiten', ['False', 'True'], 'False') - fun = CategoricalHyperparameter('fun', ['logcosh', 'exp', 'cube'], 'logcosh') + n_components = UniformIntegerHyperparameter( + "n_components", 10, 2000, default_value=100 + ) + algorithm = CategoricalHyperparameter( + "algorithm", ["parallel", "deflation"], "parallel" + ) + whiten = CategoricalHyperparameter("whiten", ["False", "True"], "False") + fun = CategoricalHyperparameter("fun", ["logcosh", "exp", "cube"], "logcosh") cs.add_hyperparameters([n_components, algorithm, whiten, fun]) cs.add_condition(EqualsCondition(n_components, whiten, "True")) diff --git a/autosklearn/pipeline/components/feature_preprocessing/feature_agglomeration.py b/autosklearn/pipeline/components/feature_preprocessing/feature_agglomeration.py index e23ff1b865..d51242de21 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/feature_agglomeration.py +++ b/autosklearn/pipeline/components/feature_preprocessing/feature_agglomeration.py @@ -1,28 +1,28 @@ import numpy as np - from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import CategoricalHyperparameter, \ - UniformIntegerHyperparameter -from ConfigSpace.forbidden import ForbiddenInClause, \ - ForbiddenAndConjunction, ForbiddenEqualsClause +from ConfigSpace.forbidden import ( + ForbiddenAndConjunction, + ForbiddenEqualsClause, + ForbiddenInClause, +) +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformIntegerHyperparameter, +) -from autosklearn.pipeline.components.base import \ - AutoSklearnPreprocessingAlgorithm -from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA, INPUT +from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm +from autosklearn.pipeline.constants import DENSE, INPUT, UNSIGNED_DATA 
class FeatureAgglomeration(AutoSklearnPreprocessingAlgorithm): - def __init__(self, n_clusters, affinity, linkage, pooling_func, - random_state=None): + def __init__(self, n_clusters, affinity, linkage, pooling_func, random_state=None): self.n_clusters = n_clusters self.affinity = affinity self.linkage = linkage self.pooling_func = pooling_func self.random_state = random_state - self.pooling_func_mapping = dict(mean=np.mean, - median=np.median, - max=np.max) + self.pooling_func_mapping = dict(mean=np.mean, median=np.median, max=np.max) def fit(self, X, Y=None): import sklearn.cluster @@ -34,8 +34,11 @@ def fit(self, X, Y=None): self.pooling_func = self.pooling_func_mapping[self.pooling_func] self.preprocessor = sklearn.cluster.FeatureAgglomeration( - n_clusters=n_clusters, affinity=self.affinity, - linkage=self.linkage, pooling_func=self.pooling_func) + n_clusters=n_clusters, + affinity=self.affinity, + linkage=self.linkage, + pooling_func=self.pooling_func, + ) self.preprocessor.fit(X) return self @@ -46,32 +49,38 @@ def transform(self, X): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'Feature Agglomeration', - 'name': 'Feature Agglomeration', - 'handles_regression': True, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': True, - 'is_deterministic': True, - 'input': (DENSE, UNSIGNED_DATA), - 'output': (INPUT,)} + return { + "shortname": "Feature Agglomeration", + "name": "Feature Agglomeration", + "handles_regression": True, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": True, + "is_deterministic": True, + "input": (DENSE, UNSIGNED_DATA), + "output": (INPUT,), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): cs = ConfigurationSpace() n_clusters = UniformIntegerHyperparameter("n_clusters", 2, 400, 25) affinity = CategoricalHyperparameter( - "affinity", ["euclidean", "manhattan", "cosine"], "euclidean") + "affinity", ["euclidean", "manhattan", "cosine"], "euclidean" + ) linkage = CategoricalHyperparameter( - "linkage", ["ward", "complete", "average"], "ward") + "linkage", ["ward", "complete", "average"], "ward" + ) pooling_func = CategoricalHyperparameter( - "pooling_func", ["mean", "median", "max"]) + "pooling_func", ["mean", "median", "max"] + ) cs.add_hyperparameters([n_clusters, affinity, linkage, pooling_func]) affinity_and_linkage = ForbiddenAndConjunction( ForbiddenInClause(affinity, ["manhattan", "cosine"]), - ForbiddenEqualsClause(linkage, "ward")) + ForbiddenEqualsClause(linkage, "ward"), + ) cs.add_forbidden_clause(affinity_and_linkage) return cs diff --git a/autosklearn/pipeline/components/feature_preprocessing/kernel_pca.py b/autosklearn/pipeline/components/feature_preprocessing/kernel_pca.py index 7ed0086248..4e96bfb1c2 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/kernel_pca.py +++ b/autosklearn/pipeline/components/feature_preprocessing/kernel_pca.py @@ -1,20 +1,22 @@ import warnings import numpy as np - -from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import CategoricalHyperparameter, \ - UniformIntegerHyperparameter, UniformFloatHyperparameter from ConfigSpace.conditions import EqualsCondition, InCondition +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformFloatHyperparameter, + UniformIntegerHyperparameter, 
+) -from autosklearn.pipeline.components.base import \ - AutoSklearnPreprocessingAlgorithm -from autosklearn.pipeline.constants import SPARSE, DENSE, UNSIGNED_DATA +from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm +from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA class KernelPCA(AutoSklearnPreprocessingAlgorithm): - def __init__(self, n_components, kernel, degree=3, gamma=0.25, coef0=0.0, - random_state=None): + def __init__( + self, n_components, kernel, degree=3, gamma=0.25, coef0=0.0, random_state=None + ): self.n_components = n_components self.kernel = kernel self.degree = degree @@ -32,9 +34,14 @@ def fit(self, X, Y=None): self.coef0 = float(self.coef0) self.preprocessor = sklearn.decomposition.KernelPCA( - n_components=self.n_components, kernel=self.kernel, - degree=self.degree, gamma=self.gamma, coef0=self.coef0, - remove_zero_eig=True, random_state=self.random_state) + n_components=self.n_components, + kernel=self.kernel, + degree=self.degree, + gamma=self.gamma, + coef0=self.coef0, + remove_zero_eig=True, + random_state=self.random_state, + ) if scipy.sparse.issparse(X): X = X.astype(np.float64) with warnings.catch_warnings(): @@ -43,7 +50,7 @@ def fit(self, X, Y=None): # Raise an informative error message, equation is based ~line 249 in # kernel_pca.py in scikit-learn if len(self.preprocessor.alphas_ / self.preprocessor.lambdas_) == 0: - raise ValueError('KernelPCA removed all features!') + raise ValueError("KernelPCA removed all features!") return self def transform(self, X): @@ -61,29 +68,35 @@ def transform(self, X): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'KernelPCA', - 'name': 'Kernel Principal Component Analysis', - 'handles_regression': True, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': True, - 'is_deterministic': False, - 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (DENSE, UNSIGNED_DATA)} + return { + "shortname": "KernelPCA", + "name": "Kernel Principal Component Analysis", + "handles_regression": True, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": True, + "is_deterministic": False, + "input": (DENSE, SPARSE, UNSIGNED_DATA), + "output": (DENSE, UNSIGNED_DATA), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): n_components = UniformIntegerHyperparameter( - "n_components", 10, 2000, default_value=100) - kernel = CategoricalHyperparameter('kernel', ['poly', 'rbf', 'sigmoid', 'cosine'], 'rbf') + "n_components", 10, 2000, default_value=100 + ) + kernel = CategoricalHyperparameter( + "kernel", ["poly", "rbf", "sigmoid", "cosine"], "rbf" + ) gamma = UniformFloatHyperparameter( "gamma", - 3.0517578125e-05, 8, + 3.0517578125e-05, + 8, log=True, default_value=0.01, ) - degree = UniformIntegerHyperparameter('degree', 2, 5, 3) + degree = UniformIntegerHyperparameter("degree", 2, 5, 3) coef0 = UniformFloatHyperparameter("coef0", -1, 1, default_value=0) cs = ConfigurationSpace() cs.add_hyperparameters([n_components, kernel, degree, gamma, coef0]) diff --git a/autosklearn/pipeline/components/feature_preprocessing/kitchen_sinks.py b/autosklearn/pipeline/components/feature_preprocessing/kitchen_sinks.py index 12ff57c21d..a81e9ddd78 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/kitchen_sinks.py +++ b/autosklearn/pipeline/components/feature_preprocessing/kitchen_sinks.py @@ -1,21 +1,22 @@ 
from typing import Optional, Union -from numpy.random import RandomState from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import UniformFloatHyperparameter, \ - UniformIntegerHyperparameter +from ConfigSpace.hyperparameters import ( + UniformFloatHyperparameter, + UniformIntegerHyperparameter, +) +from numpy.random import RandomState from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm -from autosklearn.pipeline.constants import SPARSE, DENSE, UNSIGNED_DATA, INPUT +from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA class RandomKitchenSinks(AutoSklearnPreprocessingAlgorithm): - def __init__( self, gamma: float, n_components: int, - random_state: Optional[Union[int, RandomState]] = None + random_state: Optional[Union[int, RandomState]] = None, ) -> None: """ Parameters @@ -42,7 +43,7 @@ def fit(self, X, Y=None): self.preprocessor = sklearn.kernel_approximation.RBFSampler( gamma=self.gamma, n_components=self.n_components, - random_state=self.random_state + random_state=self.random_state, ) self.preprocessor.fit(X) return self @@ -54,23 +55,27 @@ def transform(self, X): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'KitchenSink', - 'name': 'Random Kitchen Sinks', - 'handles_regression': True, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': True, - 'is_deterministic': True, - 'input': (SPARSE, DENSE, UNSIGNED_DATA), - 'output': (INPUT, UNSIGNED_DATA)} + return { + "shortname": "KitchenSink", + "name": "Random Kitchen Sinks", + "handles_regression": True, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": True, + "is_deterministic": True, + "input": (SPARSE, DENSE, UNSIGNED_DATA), + "output": (INPUT, UNSIGNED_DATA), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): gamma = UniformFloatHyperparameter( - "gamma", 3.0517578125e-05, 8, default_value=1.0, log=True) + "gamma", 3.0517578125e-05, 8, default_value=1.0, log=True + ) n_components = UniformIntegerHyperparameter( - "n_components", 50, 10000, default_value=100, log=True) + "n_components", 50, 10000, default_value=100, log=True + ) cs = ConfigurationSpace() cs.add_hyperparameters([gamma, n_components]) return cs diff --git a/autosklearn/pipeline/components/feature_preprocessing/liblinear_svc_preprocessor.py b/autosklearn/pipeline/components/feature_preprocessing/liblinear_svc_preprocessor.py index 6e6de1a998..546c8742ad 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/liblinear_svc_preprocessor.py +++ b/autosklearn/pipeline/components/feature_preprocessing/liblinear_svc_preprocessor.py @@ -1,19 +1,31 @@ from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import UniformFloatHyperparameter, \ - CategoricalHyperparameter, Constant -from ConfigSpace.forbidden import ForbiddenEqualsClause, \ - ForbiddenAndConjunction +from ConfigSpace.forbidden import ForbiddenAndConjunction, ForbiddenEqualsClause +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + Constant, + UniformFloatHyperparameter, +) from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm -from autosklearn.pipeline.constants import SPARSE, DENSE, UNSIGNED_DATA, INPUT +from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA from autosklearn.util.common 
import check_for_bool, check_none class LibLinear_Preprocessor(AutoSklearnPreprocessingAlgorithm): # Liblinear is not deterministic as it uses a RNG inside - def __init__(self, penalty, loss, dual, tol, C, multi_class, - fit_intercept, intercept_scaling, class_weight=None, - random_state=None): + def __init__( + self, + penalty, + loss, + dual, + tol, + C, + multi_class, + fit_intercept, + intercept_scaling, + class_weight=None, + random_state=None, + ): self.penalty = penalty self.loss = loss self.dual = dual @@ -39,21 +51,23 @@ def fit(self, X, Y): if check_none(self.class_weight): self.class_weight = None - estimator = sklearn.svm.LinearSVC(penalty=self.penalty, - loss=self.loss, - dual=self.dual, - tol=self.tol, - C=self.C, - class_weight=self.class_weight, - fit_intercept=self.fit_intercept, - intercept_scaling=self.intercept_scaling, - multi_class=self.multi_class, - random_state=self.random_state) + estimator = sklearn.svm.LinearSVC( + penalty=self.penalty, + loss=self.loss, + dual=self.dual, + tol=self.tol, + C=self.C, + class_weight=self.class_weight, + fit_intercept=self.fit_intercept, + intercept_scaling=self.intercept_scaling, + multi_class=self.multi_class, + random_state=self.random_state, + ) estimator.fit(X, Y) - self.preprocessor = SelectFromModel(estimator=estimator, - threshold='mean', - prefit=True) + self.preprocessor = SelectFromModel( + estimator=estimator, threshold="mean", prefit=True + ) return self @@ -64,15 +78,17 @@ def transform(self, X): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'LinearSVC Preprocessor', - 'name': 'Liblinear Support Vector Classification Preprocessing', - 'handles_regression': False, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': False, - 'handles_multioutput': False, - 'input': (SPARSE, DENSE, UNSIGNED_DATA), - 'output': (INPUT,)} + return { + "shortname": "LinearSVC Preprocessor", + "name": "Liblinear Support Vector Classification Preprocessing", + "handles_regression": False, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": False, + "handles_multioutput": False, + "input": (SPARSE, DENSE, UNSIGNED_DATA), + "output": (INPUT,), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): @@ -80,22 +96,25 @@ def get_hyperparameter_search_space(dataset_properties=None): penalty = Constant("penalty", "l1") loss = CategoricalHyperparameter( - "loss", ["hinge", "squared_hinge"], default_value="squared_hinge") + "loss", ["hinge", "squared_hinge"], default_value="squared_hinge" + ) dual = Constant("dual", "False") # This is set ad-hoc - tol = UniformFloatHyperparameter("tol", 1e-5, 1e-1, default_value=1e-4, log=True) + tol = UniformFloatHyperparameter( + "tol", 1e-5, 1e-1, default_value=1e-4, log=True + ) C = UniformFloatHyperparameter("C", 0.03125, 32768, log=True, default_value=1.0) multi_class = Constant("multi_class", "ovr") # These are set ad-hoc fit_intercept = Constant("fit_intercept", "True") intercept_scaling = Constant("intercept_scaling", 1) - cs.add_hyperparameters([penalty, loss, dual, tol, C, multi_class, - fit_intercept, intercept_scaling]) + cs.add_hyperparameters( + [penalty, loss, dual, tol, C, multi_class, fit_intercept, intercept_scaling] + ) penalty_and_loss = ForbiddenAndConjunction( - ForbiddenEqualsClause(penalty, "l1"), - ForbiddenEqualsClause(loss, "hinge") + ForbiddenEqualsClause(penalty, "l1"), ForbiddenEqualsClause(loss, "hinge") ) cs.add_forbidden_clause(penalty_and_loss) return cs 
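Editorial note, not part of the patch: the model-based feature selectors reformatted above (ExtraTreesPreprocessorClassification, ExtraTreesPreprocessorRegression, LibLinear_Preprocessor) all reduce to the same scikit-learn pattern — fit a supervised estimator, then wrap it in SelectFromModel(threshold="mean", prefit=True) so only features whose importance reaches the mean importance are kept. A minimal, self-contained sketch of that pattern follows; the dataset and estimator settings are illustrative placeholders, not values taken from this diff.

from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

# Toy data standing in for whatever X, Y the pipeline passes to fit()
X, y = make_classification(n_samples=200, n_features=20, random_state=0)

# Fit the embedded estimator first, exactly as the components above do
estimator = ExtraTreesClassifier(n_estimators=100, random_state=0).fit(X, y)

# prefit=True reuses the already-fitted estimator; threshold="mean" keeps
# only the columns whose feature_importances_ are >= the mean importance
selector = SelectFromModel(estimator=estimator, threshold="mean", prefit=True)
X_reduced = selector.transform(X)
print(X_reduced.shape)  # fewer columns than X

As the fit() bodies above show, each component stores this selector as self.preprocessor, and later transform() calls delegate to it.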
diff --git a/autosklearn/pipeline/components/feature_preprocessing/no_preprocessing.py b/autosklearn/pipeline/components/feature_preprocessing/no_preprocessing.py index 92e949b46d..550872d551 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/no_preprocessing.py +++ b/autosklearn/pipeline/components/feature_preprocessing/no_preprocessing.py @@ -1,16 +1,15 @@ from ConfigSpace.configuration_space import ConfigurationSpace from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm -from autosklearn.pipeline.constants import SPARSE, DENSE, UNSIGNED_DATA, INPUT +from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA class NoPreprocessing(AutoSklearnPreprocessingAlgorithm): - def __init__(self, random_state): - """ This preprocessors does not change the data """ + """This preprocessors does not change the data""" def fit(self, X, Y=None): - self.preprocessor = 'passthrough' + self.preprocessor = "passthrough" self.fitted_ = True return self @@ -21,16 +20,18 @@ def transform(self, X): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'no', - 'name': 'NoPreprocessing', - 'handles_regression': True, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': True, - 'is_deterministic': True, - 'input': (SPARSE, DENSE, UNSIGNED_DATA), - 'output': (INPUT,)} + return { + "shortname": "no", + "name": "NoPreprocessing", + "handles_regression": True, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": True, + "is_deterministic": True, + "input": (SPARSE, DENSE, UNSIGNED_DATA), + "output": (INPUT,), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/components/feature_preprocessing/nystroem_sampler.py b/autosklearn/pipeline/components/feature_preprocessing/nystroem_sampler.py index d450d8f09f..097f59e0f1 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/nystroem_sampler.py +++ b/autosklearn/pipeline/components/feature_preprocessing/nystroem_sampler.py @@ -1,17 +1,26 @@ import numpy as np - +from ConfigSpace.conditions import EqualsCondition, InCondition from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import UniformFloatHyperparameter, \ - UniformIntegerHyperparameter, CategoricalHyperparameter -from ConfigSpace.conditions import InCondition, EqualsCondition +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformFloatHyperparameter, + UniformIntegerHyperparameter, +) from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm -from autosklearn.pipeline.constants import SPARSE, DENSE, UNSIGNED_DATA, INPUT, SIGNED_DATA +from autosklearn.pipeline.constants import ( + DENSE, + INPUT, + SIGNED_DATA, + SPARSE, + UNSIGNED_DATA, +) class Nystroem(AutoSklearnPreprocessingAlgorithm): - def __init__(self, kernel, n_components, gamma=1.0, degree=3, - coef0=1, random_state=None): + def __init__( + self, kernel, n_components, gamma=1.0, degree=3, coef0=1, random_state=None + ): self.kernel = kernel self.n_components = n_components self.gamma = gamma @@ -29,13 +38,17 @@ def fit(self, X, Y=None): self.coef0 = float(self.coef0) self.preprocessor = sklearn.kernel_approximation.Nystroem( - kernel=self.kernel, n_components=self.n_components, - gamma=self.gamma, degree=self.degree, coef0=self.coef0, - random_state=self.random_state) + 
kernel=self.kernel, + n_components=self.n_components, + gamma=self.gamma, + degree=self.degree, + coef0=self.coef0, + random_state=self.random_state, + ) # Because the pipeline guarantees that each feature is positive, # clip all values below zero to zero - if self.kernel == 'chi2': + if self.kernel == "chi2": if scipy.sparse.issparse(X): X.data[X.data < 0] = 0.0 else: @@ -49,7 +62,7 @@ def transform(self, X): # Because the pipeline guarantees that each feature is positive, # clip all values below zero to zero - if self.kernel == 'chi2': + if self.kernel == "chi2": if scipy.sparse.issparse(X): X.data[X.data < 0] = 0.0 else: @@ -64,38 +77,43 @@ def get_properties(dataset_properties=None): data_type = UNSIGNED_DATA if dataset_properties is not None: - signed = dataset_properties.get('signed') + signed = dataset_properties.get("signed") if signed is not None: data_type = SIGNED_DATA if signed is True else UNSIGNED_DATA - return {'shortname': 'Nystroem', - 'name': 'Nystroem kernel approximation', - 'handles_regression': True, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': True, - 'is_deterministic': True, - 'input': (SPARSE, DENSE, data_type), - 'output': (INPUT, UNSIGNED_DATA)} + return { + "shortname": "Nystroem", + "name": "Nystroem kernel approximation", + "handles_regression": True, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": True, + "is_deterministic": True, + "input": (SPARSE, DENSE, data_type), + "output": (INPUT, UNSIGNED_DATA), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): - if dataset_properties is not None and \ - (dataset_properties.get("sparse") is True or - dataset_properties.get("signed") is False): + if dataset_properties is not None and ( + dataset_properties.get("sparse") is True + or dataset_properties.get("signed") is False + ): allow_chi2 = False else: allow_chi2 = True - possible_kernels = ['poly', 'rbf', 'sigmoid', 'cosine'] + possible_kernels = ["poly", "rbf", "sigmoid", "cosine"] if allow_chi2: possible_kernels.append("chi2") - kernel = CategoricalHyperparameter('kernel', possible_kernels, 'rbf') + kernel = CategoricalHyperparameter("kernel", possible_kernels, "rbf") n_components = UniformIntegerHyperparameter( - "n_components", 50, 10000, default_value=100, log=True) - gamma = UniformFloatHyperparameter("gamma", 3.0517578125e-05, 8, - log=True, default_value=0.1) - degree = UniformIntegerHyperparameter('degree', 2, 5, 3) + "n_components", 50, 10000, default_value=100, log=True + ) + gamma = UniformFloatHyperparameter( + "gamma", 3.0517578125e-05, 8, log=True, default_value=0.1 + ) + degree = UniformIntegerHyperparameter("degree", 2, 5, 3) coef0 = UniformFloatHyperparameter("coef0", -1, 1, default_value=0) cs = ConfigurationSpace() diff --git a/autosklearn/pipeline/components/feature_preprocessing/pca.py b/autosklearn/pipeline/components/feature_preprocessing/pca.py index ae992520fa..a1ad9f3981 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/pca.py +++ b/autosklearn/pipeline/components/feature_preprocessing/pca.py @@ -1,8 +1,9 @@ import numpy as np - from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import UniformFloatHyperparameter, \ - CategoricalHyperparameter +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformFloatHyperparameter, +) from autosklearn.pipeline.components.base import 
AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA @@ -18,12 +19,13 @@ def __init__(self, keep_variance, whiten, random_state=None): def fit(self, X, Y=None): import sklearn.decomposition + n_components = float(self.keep_variance) self.whiten = check_for_bool(self.whiten) - self.preprocessor = sklearn.decomposition.PCA(n_components=n_components, - whiten=self.whiten, - copy=True) + self.preprocessor = sklearn.decomposition.PCA( + n_components=n_components, whiten=self.whiten, copy=True + ) self.preprocessor.fit(X) if not np.isfinite(self.preprocessor.components_).all(): @@ -38,24 +40,28 @@ def transform(self, X): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'PCA', - 'name': 'Principle Component Analysis', - 'handles_regression': True, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': True, - # TODO document that we have to be very careful - 'is_deterministic': False, - 'input': (DENSE, UNSIGNED_DATA), - 'output': (DENSE, UNSIGNED_DATA)} + return { + "shortname": "PCA", + "name": "Principle Component Analysis", + "handles_regression": True, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": True, + # TODO document that we have to be very careful + "is_deterministic": False, + "input": (DENSE, UNSIGNED_DATA), + "output": (DENSE, UNSIGNED_DATA), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): keep_variance = UniformFloatHyperparameter( - "keep_variance", 0.5, 0.9999, default_value=0.9999) + "keep_variance", 0.5, 0.9999, default_value=0.9999 + ) whiten = CategoricalHyperparameter( - "whiten", ["False", "True"], default_value="False") + "whiten", ["False", "True"], default_value="False" + ) cs = ConfigurationSpace() cs.add_hyperparameters([keep_variance, whiten]) return cs diff --git a/autosklearn/pipeline/components/feature_preprocessing/polynomial.py b/autosklearn/pipeline/components/feature_preprocessing/polynomial.py index 478040c497..bd5312bba0 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/polynomial.py +++ b/autosklearn/pipeline/components/feature_preprocessing/polynomial.py @@ -1,10 +1,11 @@ from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import CategoricalHyperparameter, \ - UniformIntegerHyperparameter +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformIntegerHyperparameter, +) -from autosklearn.pipeline.components.base import \ - AutoSklearnPreprocessingAlgorithm -from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT +from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm +from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA from autosklearn.util.common import check_for_bool @@ -25,8 +26,10 @@ def fit(self, X, Y): self.include_bias = check_for_bool(self.include_bias) self.preprocessor = sklearn.preprocessing.PolynomialFeatures( - degree=self.degree, interaction_only=self.interaction_only, - include_bias=self.include_bias) + degree=self.degree, + interaction_only=self.interaction_only, + include_bias=self.include_bias, + ) self.preprocessor.fit(X, Y) return self @@ -37,25 +40,29 @@ def transform(self, X): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'PolynomialFeatures', - 'name': 'PolynomialFeatures', - 'handles_regression': 
True, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': True, - 'is_deterministic': True, - 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (INPUT,)} + return { + "shortname": "PolynomialFeatures", + "name": "PolynomialFeatures", + "handles_regression": True, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": True, + "is_deterministic": True, + "input": (DENSE, SPARSE, UNSIGNED_DATA), + "output": (INPUT,), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): # More than degree 3 is too expensive! degree = UniformIntegerHyperparameter("degree", 2, 3, 2) - interaction_only = CategoricalHyperparameter("interaction_only", - ["False", "True"], "False") - include_bias = CategoricalHyperparameter("include_bias", - ["True", "False"], "True") + interaction_only = CategoricalHyperparameter( + "interaction_only", ["False", "True"], "False" + ) + include_bias = CategoricalHyperparameter( + "include_bias", ["True", "False"], "True" + ) cs = ConfigurationSpace() cs.add_hyperparameters([degree, interaction_only, include_bias]) diff --git a/autosklearn/pipeline/components/feature_preprocessing/random_trees_embedding.py b/autosklearn/pipeline/components/feature_preprocessing/random_trees_embedding.py index a5e9ff1b8c..9daed1ae97 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/random_trees_embedding.py +++ b/autosklearn/pipeline/components/feature_preprocessing/random_trees_embedding.py @@ -1,17 +1,30 @@ from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import UniformIntegerHyperparameter, \ - UnParametrizedHyperparameter, Constant, CategoricalHyperparameter +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + Constant, + UniformIntegerHyperparameter, + UnParametrizedHyperparameter, +) from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm -from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, SIGNED_DATA -from autosklearn.util.common import check_none, check_for_bool +from autosklearn.pipeline.constants import DENSE, SIGNED_DATA, SPARSE, UNSIGNED_DATA +from autosklearn.util.common import check_for_bool, check_none class RandomTreesEmbedding(AutoSklearnPreprocessingAlgorithm): - - def __init__(self, n_estimators, max_depth, min_samples_split, - min_samples_leaf, min_weight_fraction_leaf, max_leaf_nodes, - bootstrap, sparse_output=True, n_jobs=1, random_state=None): + def __init__( + self, + n_estimators, + max_depth, + min_samples_split, + min_samples_leaf, + min_weight_fraction_leaf, + max_leaf_nodes, + bootstrap, + sparse_output=True, + n_jobs=1, + random_state=None, + ): self.n_estimators = n_estimators self.max_depth = max_depth self.min_samples_split = min_samples_split @@ -48,7 +61,7 @@ def _fit(self, X, Y=None): max_leaf_nodes=self.max_leaf_nodes, sparse_output=self.sparse_output, n_jobs=self.n_jobs, - random_state=self.random_state + random_state=self.random_state, ) self.preprocessor.fit(X, Y) return self @@ -67,37 +80,48 @@ def transform(self, X): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'RandomTreesEmbedding', - 'name': 'Random Trees Embedding', - 'handles_regression': True, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': True, - 'is_deterministic': True, - 'input': (DENSE, SPARSE, 
UNSIGNED_DATA), - 'output': (SPARSE, SIGNED_DATA)} + return { + "shortname": "RandomTreesEmbedding", + "name": "Random Trees Embedding", + "handles_regression": True, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": True, + "is_deterministic": True, + "input": (DENSE, SPARSE, UNSIGNED_DATA), + "output": (SPARSE, SIGNED_DATA), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): - n_estimators = UniformIntegerHyperparameter(name="n_estimators", - lower=10, upper=100, - default_value=10) - max_depth = UniformIntegerHyperparameter(name="max_depth", - lower=2, upper=10, - default_value=5) - min_samples_split = UniformIntegerHyperparameter(name="min_samples_split", - lower=2, upper=20, - default_value=2) - min_samples_leaf = UniformIntegerHyperparameter(name="min_samples_leaf", - lower=1, upper=20, - default_value=1) - min_weight_fraction_leaf = Constant('min_weight_fraction_leaf', 1.0) - max_leaf_nodes = UnParametrizedHyperparameter(name="max_leaf_nodes", - value="None") - bootstrap = CategoricalHyperparameter('bootstrap', ['True', 'False']) + n_estimators = UniformIntegerHyperparameter( + name="n_estimators", lower=10, upper=100, default_value=10 + ) + max_depth = UniformIntegerHyperparameter( + name="max_depth", lower=2, upper=10, default_value=5 + ) + min_samples_split = UniformIntegerHyperparameter( + name="min_samples_split", lower=2, upper=20, default_value=2 + ) + min_samples_leaf = UniformIntegerHyperparameter( + name="min_samples_leaf", lower=1, upper=20, default_value=1 + ) + min_weight_fraction_leaf = Constant("min_weight_fraction_leaf", 1.0) + max_leaf_nodes = UnParametrizedHyperparameter( + name="max_leaf_nodes", value="None" + ) + bootstrap = CategoricalHyperparameter("bootstrap", ["True", "False"]) cs = ConfigurationSpace() - cs.add_hyperparameters([n_estimators, max_depth, min_samples_split, - min_samples_leaf, min_weight_fraction_leaf, - max_leaf_nodes, bootstrap]) + cs.add_hyperparameters( + [ + n_estimators, + max_depth, + min_samples_split, + min_samples_leaf, + min_weight_fraction_leaf, + max_leaf_nodes, + bootstrap, + ] + ) return cs diff --git a/autosklearn/pipeline/components/feature_preprocessing/select_percentile.py b/autosklearn/pipeline/components/feature_preprocessing/select_percentile.py index c928e2f471..66f760bfb0 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/select_percentile.py +++ b/autosklearn/pipeline/components/feature_preprocessing/select_percentile.py @@ -1,11 +1,10 @@ class SelectPercentileBase(object): - def fit(self, X, y): import sklearn.feature_selection self.preprocessor = sklearn.feature_selection.SelectPercentile( - score_func=self.score_func, - percentile=self.percentile) + score_func=self.score_func, percentile=self.percentile + ) self.preprocessor.fit(X, y) return self diff --git a/autosklearn/pipeline/components/feature_preprocessing/select_percentile_classification.py b/autosklearn/pipeline/components/feature_preprocessing/select_percentile_classification.py index f6a3a1152c..3caa50b46d 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/select_percentile_classification.py +++ b/autosklearn/pipeline/components/feature_preprocessing/select_percentile_classification.py @@ -1,19 +1,30 @@ -from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import \ - UniformFloatHyperparameter, CategoricalHyperparameter, Constant from functools import partial -from 
autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm -from autosklearn.pipeline.components.feature_preprocessing.select_percentile import \ - SelectPercentileBase -from autosklearn.pipeline.constants import SPARSE, DENSE, INPUT, UNSIGNED_DATA, SIGNED_DATA - - -class SelectPercentileClassification(SelectPercentileBase, - AutoSklearnPreprocessingAlgorithm): +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + Constant, + UniformFloatHyperparameter, +) +from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm +from autosklearn.pipeline.components.feature_preprocessing.select_percentile import ( + SelectPercentileBase, +) +from autosklearn.pipeline.constants import ( + DENSE, + INPUT, + SIGNED_DATA, + SPARSE, + UNSIGNED_DATA, +) + + +class SelectPercentileClassification( + SelectPercentileBase, AutoSklearnPreprocessingAlgorithm +): def __init__(self, percentile, score_func="chi2", random_state=None): - """ Parameters: + """Parameters: random state : ignored score_func : callable, Function taking two arrays X and y, and @@ -28,11 +39,15 @@ def __init__(self, percentile, score_func="chi2", random_state=None): elif score_func == "f_classif": self.score_func = sklearn.feature_selection.f_classif elif score_func == "mutual_info": - self.score_func = partial(sklearn.feature_selection.mutual_info_classif, - random_state=self.random_state) + self.score_func = partial( + sklearn.feature_selection.mutual_info_classif, + random_state=self.random_state, + ) else: - raise ValueError("score_func must be in ('chi2, 'f_classif', 'mutual_info'), " - "but is: %s" % score_func) + raise ValueError( + "score_func must be in ('chi2, 'f_classif', 'mutual_info'), " + "but is: %s" % score_func + ) def fit(self, X, y): import scipy.sparse @@ -41,7 +56,7 @@ def fit(self, X, y): self.preprocessor = sklearn.feature_selection.SelectPercentile( score_func=self.score_func, percentile=self.percentile, - ) + ) # Because the pipeline guarantees that each feature is positive, # clip all values below zero to zero @@ -70,44 +85,45 @@ def transform(self, X): raise NotImplementedError() Xt = self.preprocessor.transform(X) if Xt.shape[1] == 0: - raise ValueError( - "%s removed all features." % self.__class__.__name__) + raise ValueError("%s removed all features." 
% self.__class__.__name__) return Xt @staticmethod def get_properties(dataset_properties=None): data_type = UNSIGNED_DATA if dataset_properties is not None: - signed = dataset_properties.get('signed') + signed = dataset_properties.get("signed") if signed is not None: data_type = SIGNED_DATA if signed is True else UNSIGNED_DATA - return {'shortname': 'SPC', - 'name': 'Select Percentile Classification', - 'handles_regression': False, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': False, - 'handles_multioutput': False, - 'is_deterministic': True, - 'input': (SPARSE, DENSE, data_type), - 'output': (INPUT,)} + return { + "shortname": "SPC", + "name": "Select Percentile Classification", + "handles_regression": False, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": False, + "handles_multioutput": False, + "is_deterministic": True, + "input": (SPARSE, DENSE, data_type), + "output": (INPUT,), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): percentile = UniformFloatHyperparameter( - name="percentile", lower=1, upper=99, default_value=50) + name="percentile", lower=1, upper=99, default_value=50 + ) score_func = CategoricalHyperparameter( name="score_func", choices=["chi2", "f_classif", "mutual_info"], - default_value="chi2" + default_value="chi2", ) if dataset_properties is not None: # Chi2 can handle sparse data, so we respect this - if 'sparse' in dataset_properties and dataset_properties['sparse']: - score_func = Constant( - name="score_func", value="chi2") + if "sparse" in dataset_properties and dataset_properties["sparse"]: + score_func = Constant(name="score_func", value="chi2") cs = ConfigurationSpace() cs.add_hyperparameters([percentile, score_func]) diff --git a/autosklearn/pipeline/components/feature_preprocessing/select_percentile_regression.py b/autosklearn/pipeline/components/feature_preprocessing/select_percentile_regression.py index 79b528c095..e9343fead4 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/select_percentile_regression.py +++ b/autosklearn/pipeline/components/feature_preprocessing/select_percentile_regression.py @@ -1,18 +1,23 @@ -from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import UniformFloatHyperparameter, CategoricalHyperparameter from functools import partial -from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm -from autosklearn.pipeline.components.feature_preprocessing.select_percentile import \ - SelectPercentileBase -from autosklearn.pipeline.constants import SPARSE, DENSE, UNSIGNED_DATA, INPUT +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformFloatHyperparameter, +) +from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm +from autosklearn.pipeline.components.feature_preprocessing.select_percentile import ( + SelectPercentileBase, +) +from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA -class SelectPercentileRegression(SelectPercentileBase, - AutoSklearnPreprocessingAlgorithm): +class SelectPercentileRegression( + SelectPercentileBase, AutoSklearnPreprocessingAlgorithm +): def __init__(self, percentile, score_func="f_regression", random_state=None): - """ Parameters: + """Parameters: random state : ignored score_func : callable, Function taking two arrays X and y, and @@ -25,31 +30,37 @@ def __init__(self, 
percentile, score_func="f_regression", random_state=None): if score_func == "f_regression": self.score_func = sklearn.feature_selection.f_regression elif score_func == "mutual_info": - self.score_func = partial(sklearn.feature_selection.mutual_info_regression, - random_state=self.random_state) + self.score_func = partial( + sklearn.feature_selection.mutual_info_regression, + random_state=self.random_state, + ) else: raise ValueError("Don't know this scoring function: %s" % score_func) @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'SPR', - 'name': 'Select Percentile Regression', - 'handles_regression': True, - 'handles_classification': False, - 'handles_multiclass': False, - 'handles_multilabel': False, - 'handles_multioutput': False, - 'is_deterministic': True, - 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (INPUT,)} + return { + "shortname": "SPR", + "name": "Select Percentile Regression", + "handles_regression": True, + "handles_classification": False, + "handles_multiclass": False, + "handles_multilabel": False, + "handles_multioutput": False, + "is_deterministic": True, + "input": (DENSE, SPARSE, UNSIGNED_DATA), + "output": (INPUT,), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): percentile = UniformFloatHyperparameter( - "percentile", lower=1, upper=99, default_value=50) + "percentile", lower=1, upper=99, default_value=50 + ) score_func = CategoricalHyperparameter( - name="score_func", choices=["f_regression", "mutual_info"]) + name="score_func", choices=["f_regression", "mutual_info"] + ) cs = ConfigurationSpace() cs.add_hyperparameters([percentile, score_func]) diff --git a/autosklearn/pipeline/components/feature_preprocessing/select_rates_classification.py b/autosklearn/pipeline/components/feature_preprocessing/select_rates_classification.py index de6e950f0b..0c4768d000 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/select_rates_classification.py +++ b/autosklearn/pipeline/components/feature_preprocessing/select_rates_classification.py @@ -1,17 +1,24 @@ -from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import UniformFloatHyperparameter, \ - CategoricalHyperparameter -from ConfigSpace import NotEqualsCondition from functools import partial -from autosklearn.pipeline.components.base import \ - AutoSklearnPreprocessingAlgorithm -from autosklearn.pipeline.constants import SIGNED_DATA, UNSIGNED_DATA, SPARSE, DENSE, INPUT +from ConfigSpace import NotEqualsCondition +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformFloatHyperparameter, +) + +from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm +from autosklearn.pipeline.constants import ( + DENSE, + INPUT, + SIGNED_DATA, + SPARSE, + UNSIGNED_DATA, +) class SelectClassificationRates(AutoSklearnPreprocessingAlgorithm): - def __init__(self, alpha, mode='fpr', - score_func="chi2", random_state=None): + def __init__(self, alpha, mode="fpr", score_func="chi2", random_state=None): import sklearn.feature_selection self.random_state = random_state # We don't use this @@ -23,14 +30,18 @@ def __init__(self, alpha, mode='fpr', elif score_func == "f_classif": self.score_func = sklearn.feature_selection.f_classif elif score_func == "mutual_info_classif": - self.score_func = partial(sklearn.feature_selection.mutual_info_classif, - random_state=self.random_state) + self.score_func = 
partial( + sklearn.feature_selection.mutual_info_classif, + random_state=self.random_state, + ) # mutual info classif constantly crashes without mode percentile - self.mode = 'percentile' + self.mode = "percentile" else: - raise ValueError("score_func must be in ('chi2, 'f_classif', 'mutual_info_classif') " - "for classification " - "but is: %s " % (score_func)) + raise ValueError( + "score_func must be in ('chi2, 'f_classif', 'mutual_info_classif') " + "for classification " + "but is: %s " % (score_func) + ) def fit(self, X, y): import scipy.sparse @@ -39,7 +50,8 @@ def fit(self, X, y): self.alpha = float(self.alpha) self.preprocessor = sklearn.feature_selection.GenericUnivariateSelect( - score_func=self.score_func, param=self.alpha, mode=self.mode) + score_func=self.score_func, param=self.alpha, mode=self.mode + ) # Because the pipeline guarantees that each feature is positive, # clip all values below zero to zero @@ -69,16 +81,16 @@ def transform(self, X): try: Xt = self.preprocessor.transform(X) except ValueError as e: - if "zero-size array to reduction operation maximum which has no " \ - "identity" in e.message: - raise ValueError( - "%s removed all features." % self.__class__.__name__) + if ( + "zero-size array to reduction operation maximum which has no " + "identity" in e.message + ): + raise ValueError("%s removed all features." % self.__class__.__name__) else: raise e if Xt.shape[1] == 0: - raise ValueError( - "%s removed all features." % self.__class__.__name__) + raise ValueError("%s removed all features." % self.__class__.__name__) return Xt @staticmethod @@ -86,37 +98,39 @@ def get_properties(dataset_properties=None): data_type = UNSIGNED_DATA if dataset_properties is not None: - signed = dataset_properties.get('signed') + signed = dataset_properties.get("signed") if signed is not None: data_type = SIGNED_DATA if signed is True else UNSIGNED_DATA - return {'shortname': 'SR', - 'name': 'Univariate Feature Selection based on rates', - 'handles_regression': False, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': False, - 'handles_multioutput': False, - 'is_deterministic': True, - 'input': (SPARSE, DENSE, data_type), - 'output': (INPUT,)} + return { + "shortname": "SR", + "name": "Univariate Feature Selection based on rates", + "handles_regression": False, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": False, + "handles_multioutput": False, + "is_deterministic": True, + "input": (SPARSE, DENSE, data_type), + "output": (INPUT,), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): alpha = UniformFloatHyperparameter( - name="alpha", lower=0.01, upper=0.5, default_value=0.1) + name="alpha", lower=0.01, upper=0.5, default_value=0.1 + ) - if dataset_properties is not None and dataset_properties.get('sparse'): - choices = ['chi2', 'mutual_info_classif'] + if dataset_properties is not None and dataset_properties.get("sparse"): + choices = ["chi2", "mutual_info_classif"] else: - choices = ['chi2', 'f_classif', 'mutual_info_classif'] + choices = ["chi2", "f_classif", "mutual_info_classif"] score_func = CategoricalHyperparameter( - name="score_func", - choices=choices, - default_value="chi2") + name="score_func", choices=choices, default_value="chi2" + ) - mode = CategoricalHyperparameter('mode', ['fpr', 'fdr', 'fwe'], 'fpr') + mode = CategoricalHyperparameter("mode", ["fpr", "fdr", "fwe"], "fpr") cs = ConfigurationSpace() cs.add_hyperparameter(alpha) @@ -125,7 +139,7 @@ def 
get_hyperparameter_search_space(dataset_properties=None): # mutual_info_classif constantly crashes if mode is not percentile # as a WA, fix the mode for this score - cond = NotEqualsCondition(mode, score_func, 'mutual_info_classif') + cond = NotEqualsCondition(mode, score_func, "mutual_info_classif") cs.add_condition(cond) return cs diff --git a/autosklearn/pipeline/components/feature_preprocessing/select_rates_regression.py b/autosklearn/pipeline/components/feature_preprocessing/select_rates_regression.py index b5bfd2a103..ffec19e6ec 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/select_rates_regression.py +++ b/autosklearn/pipeline/components/feature_preprocessing/select_rates_regression.py @@ -1,17 +1,20 @@ -from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import UniformFloatHyperparameter, \ - CategoricalHyperparameter -from ConfigSpace import NotEqualsCondition from functools import partial -from autosklearn.pipeline.components.base import \ - AutoSklearnPreprocessingAlgorithm -from autosklearn.pipeline.constants import UNSIGNED_DATA, SPARSE, DENSE, INPUT +from ConfigSpace import NotEqualsCondition +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformFloatHyperparameter, +) + +from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm +from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA class SelectRegressionRates(AutoSklearnPreprocessingAlgorithm): - def __init__(self, alpha, mode='percentile', - score_func="f_regression", random_state=None): + def __init__( + self, alpha, mode="percentile", score_func="f_regression", random_state=None + ): import sklearn.feature_selection self.random_state = random_state # We don't use this @@ -21,14 +24,18 @@ def __init__(self, alpha, mode='percentile', if score_func == "f_regression": self.score_func = sklearn.feature_selection.f_regression elif score_func == "mutual_info_regression": - self.score_func = partial(sklearn.feature_selection.mutual_info_regression, - random_state=self.random_state) + self.score_func = partial( + sklearn.feature_selection.mutual_info_regression, + random_state=self.random_state, + ) # Mutual info consistently crashes if percentile is not the mode - self.mode = 'percentile' + self.mode = "percentile" else: - raise ValueError("score_func must be in ('f_regression, 'mutual_info_regression') " - "for task=regression " - "but is: %s " % (score_func)) + raise ValueError( + "score_func must be in ('f_regression, 'mutual_info_regression') " + "for task=regression " + "but is: %s " % (score_func) + ) def fit(self, X, y): import sklearn.feature_selection @@ -36,7 +43,8 @@ def fit(self, X, y): self.alpha = float(self.alpha) self.preprocessor = sklearn.feature_selection.GenericUnivariateSelect( - score_func=self.score_func, param=self.alpha, mode=self.mode) + score_func=self.score_func, param=self.alpha, mode=self.mode + ) self.preprocessor.fit(X, y) return self @@ -48,47 +56,49 @@ def transform(self, X): try: Xt = self.preprocessor.transform(X) except ValueError as e: - if "zero-size array to reduction operation maximum which has no " \ - "identity" in e.message: - raise ValueError( - "%s removed all features." % self.__class__.__name__) + if ( + "zero-size array to reduction operation maximum which has no " + "identity" in e.message + ): + raise ValueError("%s removed all features." 
% self.__class__.__name__) else: raise e if Xt.shape[1] == 0: - raise ValueError( - "%s removed all features." % self.__class__.__name__) + raise ValueError("%s removed all features." % self.__class__.__name__) return Xt @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'SR', - 'name': 'Univariate Feature Selection based on rates', - 'handles_regression': True, - 'handles_classification': False, - 'handles_multiclass': True, - 'handles_multilabel': False, - 'handles_multioutput': False, - 'is_deterministic': True, - 'input': (SPARSE, DENSE, UNSIGNED_DATA), - 'output': (INPUT,)} + return { + "shortname": "SR", + "name": "Univariate Feature Selection based on rates", + "handles_regression": True, + "handles_classification": False, + "handles_multiclass": True, + "handles_multilabel": False, + "handles_multioutput": False, + "is_deterministic": True, + "input": (SPARSE, DENSE, UNSIGNED_DATA), + "output": (INPUT,), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): alpha = UniformFloatHyperparameter( - name="alpha", lower=0.01, upper=0.5, default_value=0.1) + name="alpha", lower=0.01, upper=0.5, default_value=0.1 + ) - if dataset_properties is not None and dataset_properties.get('sparse'): - choices = ['mutual_info_regression', 'f_regression'] + if dataset_properties is not None and dataset_properties.get("sparse"): + choices = ["mutual_info_regression", "f_regression"] else: - choices = ['f_regression'] + choices = ["f_regression"] score_func = CategoricalHyperparameter( - name="score_func", - choices=choices, - default_value="f_regression") + name="score_func", choices=choices, default_value="f_regression" + ) - mode = CategoricalHyperparameter('mode', ['fpr', 'fdr', 'fwe'], 'fpr') + mode = CategoricalHyperparameter("mode", ["fpr", "fdr", "fwe"], "fpr") cs = ConfigurationSpace() cs.add_hyperparameter(alpha) @@ -96,8 +106,8 @@ def get_hyperparameter_search_space(dataset_properties=None): cs.add_hyperparameter(mode) # Mutual info consistently crashes if percentile is not the mode - if 'mutual_info_regression' in choices: - cond = NotEqualsCondition(mode, score_func, 'mutual_info_regression') + if "mutual_info_regression" in choices: + cond = NotEqualsCondition(mode, score_func, "mutual_info_regression") cs.add_condition(cond) return cs diff --git a/autosklearn/pipeline/components/feature_preprocessing/truncatedSVD.py b/autosklearn/pipeline/components/feature_preprocessing/truncatedSVD.py index 078b2b4a2d..4d6f6b7ca9 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/truncatedSVD.py +++ b/autosklearn/pipeline/components/feature_preprocessing/truncatedSVD.py @@ -2,7 +2,7 @@ from ConfigSpace.hyperparameters import UniformIntegerHyperparameter from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm -from autosklearn.pipeline.constants import SPARSE, UNSIGNED_DATA, DENSE, INPUT +from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA class TruncatedSVD(AutoSklearnPreprocessingAlgorithm): @@ -17,7 +17,8 @@ def fit(self, X, Y): self.target_dim = int(self.target_dim) target_dim = min(self.target_dim, X.shape[1] - 1) self.preprocessor = sklearn.decomposition.TruncatedSVD( - target_dim, algorithm='randomized', random_state=self.random_state) + target_dim, algorithm="randomized", random_state=self.random_state + ) # TODO: remove when migrating to sklearn 0.16 # Circumvents a bug in sklearn # 
https://github.com/scikit-learn/scikit-learn/commit/f08b8c8e52663167819f242f605db39f3b5a6d0c @@ -33,21 +34,24 @@ def transform(self, X): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'TSVD', - 'name': 'Truncated Singular Value Decomposition', - 'handles_regression': True, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': True, - 'is_deterministic': True, - 'input': (SPARSE, UNSIGNED_DATA), - 'output': (DENSE, INPUT)} + return { + "shortname": "TSVD", + "name": "Truncated Singular Value Decomposition", + "handles_regression": True, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": True, + "is_deterministic": True, + "input": (SPARSE, UNSIGNED_DATA), + "output": (DENSE, INPUT), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): target_dim = UniformIntegerHyperparameter( - "target_dim", 10, 256, default_value=128) + "target_dim", 10, 256, default_value=128 + ) cs = ConfigurationSpace() cs.add_hyperparameter(target_dim) return cs diff --git a/autosklearn/pipeline/components/regression/__init__.py b/autosklearn/pipeline/components/regression/__init__.py index 651b49b602..73033467a7 100644 --- a/autosklearn/pipeline/components/regression/__init__.py +++ b/autosklearn/pipeline/components/regression/__init__.py @@ -1,18 +1,25 @@ -from collections import OrderedDict from typing import Type + import os +from collections import OrderedDict -from ..base import AutoSklearnRegressionAlgorithm, find_components, \ - ThirdPartyComponents, AutoSklearnChoice, _addons from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import CategoricalHyperparameter +from ..base import ( + AutoSklearnChoice, + AutoSklearnRegressionAlgorithm, + ThirdPartyComponents, + _addons, + find_components, +) + regressor_directory = os.path.split(__file__)[0] -_regressors = find_components(__package__, - regressor_directory, - AutoSklearnRegressionAlgorithm) +_regressors = find_components( + __package__, regressor_directory, AutoSklearnRegressionAlgorithm +) additional_components = ThirdPartyComponents(AutoSklearnRegressionAlgorithm) -_addons['regression'] = additional_components +_addons["regression"] = additional_components def add_regressor(regressor: Type[AutoSklearnRegressionAlgorithm]) -> None: @@ -20,7 +27,6 @@ def add_regressor(regressor: Type[AutoSklearnRegressionAlgorithm]) -> None: class RegressorChoice(AutoSklearnChoice): - @classmethod def get_components(cls): components = OrderedDict() @@ -29,10 +35,9 @@ def get_components(cls): return components @classmethod - def get_available_components(cls, - dataset_properties=None, - include=None, - exclude=None): + def get_available_components( + cls, dataset_properties=None, include=None, exclude=None + ): available_comp = cls.get_components() components_dict = OrderedDict() if dataset_properties is None: @@ -40,13 +45,15 @@ def get_available_components(cls, if include is not None and exclude is not None: raise ValueError( - "The argument include and exclude cannot be used together.") + "The argument include and exclude cannot be used together." 
+ ) if include is not None: for incl in include: if incl not in available_comp: - raise ValueError("Trying to include unknown component: " - "%s" % incl) + raise ValueError( + "Trying to include unknown component: " "%s" % incl + ) for name in available_comp: if include is not None and name not in include: @@ -60,36 +67,39 @@ def get_available_components(cls, if entry == RegressorChoice: continue - if entry.get_properties()['handles_regression'] is False: + if entry.get_properties()["handles_regression"] is False: continue - if dataset_properties.get('multioutput') is True and \ - entry.get_properties()['handles_multioutput'] is False: + if ( + dataset_properties.get("multioutput") is True + and entry.get_properties()["handles_multioutput"] is False + ): continue components_dict[name] = entry return components_dict - def get_hyperparameter_search_space(self, dataset_properties=None, - default=None, - include=None, - exclude=None): + def get_hyperparameter_search_space( + self, dataset_properties=None, default=None, include=None, exclude=None + ): if include is not None and exclude is not None: - raise ValueError("The argument include and exclude cannot be used together.") + raise ValueError( + "The argument include and exclude cannot be used together." + ) cs = ConfigurationSpace() # Compile a list of all estimator objects for this problem available_estimators = self.get_available_components( - dataset_properties=dataset_properties, - include=include, - exclude=exclude) + dataset_properties=dataset_properties, include=include, exclude=exclude + ) if len(available_estimators) == 0: raise ValueError("No regressors found") if default is None: - defaults = ['random_forest', 'support_vector_regression'] + \ - list(available_estimators.keys()) + defaults = ["random_forest", "support_vector_regression"] + list( + available_estimators.keys() + ) for default_ in defaults: if default_ in available_estimators: if include is not None and default_ not in include: @@ -99,21 +109,25 @@ def get_hyperparameter_search_space(self, dataset_properties=None, default = default_ break - estimator = CategoricalHyperparameter('__choice__', - list(available_estimators.keys()), - default_value=default) + estimator = CategoricalHyperparameter( + "__choice__", list(available_estimators.keys()), default_value=default + ) cs.add_hyperparameter(estimator) for estimator_name in available_estimators.keys(): - estimator_configuration_space = available_estimators[estimator_name].\ - get_hyperparameter_search_space(dataset_properties) - parent_hyperparameter = {'parent': estimator, 'value': estimator_name} - cs.add_configuration_space(estimator_name, estimator_configuration_space, - parent_hyperparameter=parent_hyperparameter) + estimator_configuration_space = available_estimators[ + estimator_name + ].get_hyperparameter_search_space(dataset_properties) + parent_hyperparameter = {"parent": estimator, "value": estimator_name} + cs.add_configuration_space( + estimator_name, + estimator_configuration_space, + parent_hyperparameter=parent_hyperparameter, + ) return cs def estimator_supports_iterative_fit(self): - return hasattr(self.choice, 'iterative_fit') + return hasattr(self.choice, "iterative_fit") def get_max_iter(self): if self.estimator_supports_iterative_fit(): diff --git a/autosklearn/pipeline/components/regression/adaboost.py b/autosklearn/pipeline/components/regression/adaboost.py index 2eb58ae2ea..e78a57e6a2 100644 --- a/autosklearn/pipeline/components/regression/adaboost.py +++ 
b/autosklearn/pipeline/components/regression/adaboost.py @@ -1,9 +1,12 @@ from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import UniformFloatHyperparameter, \ - UniformIntegerHyperparameter, CategoricalHyperparameter +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformFloatHyperparameter, + UniformIntegerHyperparameter, +) from autosklearn.pipeline.components.base import AutoSklearnRegressionAlgorithm -from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA, PREDICTIONS, SPARSE +from autosklearn.pipeline.constants import DENSE, PREDICTIONS, SPARSE, UNSIGNED_DATA class AdaboostRegressor(AutoSklearnRegressionAlgorithm): @@ -22,15 +25,14 @@ def fit(self, X, y): self.n_estimators = int(self.n_estimators) self.learning_rate = float(self.learning_rate) self.max_depth = int(self.max_depth) - base_estimator = sklearn.tree.DecisionTreeRegressor( - max_depth=self.max_depth) + base_estimator = sklearn.tree.DecisionTreeRegressor(max_depth=self.max_depth) self.estimator = sklearn.ensemble.AdaBoostRegressor( base_estimator=base_estimator, n_estimators=self.n_estimators, learning_rate=self.learning_rate, loss=self.loss, - random_state=self.random_state + random_state=self.random_state, ) if y.ndim == 2 and y.shape[1] == 1: @@ -46,16 +48,18 @@ def predict(self, X): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'AB', - 'name': 'AdaBoost Regressor', - 'handles_regression': True, - 'handles_classification': False, - 'handles_multiclass': False, - 'handles_multilabel': False, - 'handles_multioutput': False, - 'is_deterministic': True, - 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (PREDICTIONS, )} + return { + "shortname": "AB", + "name": "AdaBoost Regressor", + "handles_regression": True, + "handles_classification": False, + "handles_multiclass": False, + "handles_multilabel": False, + "handles_multioutput": False, + "is_deterministic": True, + "input": (DENSE, SPARSE, UNSIGNED_DATA), + "output": (PREDICTIONS,), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): @@ -63,16 +67,19 @@ def get_hyperparameter_search_space(dataset_properties=None): # base_estimator = Constant(name="base_estimator", value="None") n_estimators = UniformIntegerHyperparameter( - name="n_estimators", lower=50, upper=500, default_value=50, - log=False) + name="n_estimators", lower=50, upper=500, default_value=50, log=False + ) learning_rate = UniformFloatHyperparameter( - name="learning_rate", lower=0.01, upper=2, default_value=0.1, - log=True) + name="learning_rate", lower=0.01, upper=2, default_value=0.1, log=True + ) loss = CategoricalHyperparameter( - name="loss", choices=["linear", "square", "exponential"], - default_value="linear") + name="loss", + choices=["linear", "square", "exponential"], + default_value="linear", + ) max_depth = UniformIntegerHyperparameter( - name="max_depth", lower=1, upper=10, default_value=1, log=False) + name="max_depth", lower=1, upper=10, default_value=1, log=False + ) cs.add_hyperparameters([n_estimators, learning_rate, loss, max_depth]) return cs diff --git a/autosklearn/pipeline/components/regression/ard_regression.py b/autosklearn/pipeline/components/regression/ard_regression.py index 46dcac5d93..219cb775af 100644 --- a/autosklearn/pipeline/components/regression/ard_regression.py +++ b/autosklearn/pipeline/components/regression/ard_regression.py @@ -1,15 +1,27 @@ from ConfigSpace.configuration_space import ConfigurationSpace -from 
ConfigSpace.hyperparameters import UniformFloatHyperparameter, \ - UnParametrizedHyperparameter +from ConfigSpace.hyperparameters import ( + UniformFloatHyperparameter, + UnParametrizedHyperparameter, +) from autosklearn.pipeline.components.base import AutoSklearnRegressionAlgorithm -from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA, PREDICTIONS +from autosklearn.pipeline.constants import DENSE, PREDICTIONS, UNSIGNED_DATA from autosklearn.util.common import check_for_bool class ARDRegression(AutoSklearnRegressionAlgorithm): - def __init__(self, n_iter, tol, alpha_1, alpha_2, lambda_1, lambda_2, - threshold_lambda, fit_intercept, random_state=None): + def __init__( + self, + n_iter, + tol, + alpha_1, + alpha_2, + lambda_1, + lambda_2, + threshold_lambda, + fit_intercept, + random_state=None, + ): self.random_state = random_state self.estimator = None @@ -46,7 +58,7 @@ def fit(self, X, y): fit_intercept=True, normalize=False, copy_X=False, - verbose=False + verbose=False, ) if y.ndim == 2 and y.shape[1] == 1: @@ -62,43 +74,71 @@ def predict(self, X): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'ARD', - 'name': 'ARD Regression', - 'handles_regression': True, - 'handles_classification': False, - 'handles_multiclass': False, - 'handles_multilabel': False, - 'handles_multioutput': False, - 'prefers_data_normalized': True, - 'is_deterministic': True, - 'input': (DENSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,)} + return { + "shortname": "ARD", + "name": "ARD Regression", + "handles_regression": True, + "handles_classification": False, + "handles_multiclass": False, + "handles_multilabel": False, + "handles_multioutput": False, + "prefers_data_normalized": True, + "is_deterministic": True, + "input": (DENSE, UNSIGNED_DATA), + "output": (PREDICTIONS,), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): cs = ConfigurationSpace() n_iter = UnParametrizedHyperparameter("n_iter", value=300) - tol = UniformFloatHyperparameter("tol", 10 ** -5, 10 ** -1, - default_value=10 ** -3, log=True) - alpha_1 = UniformFloatHyperparameter(name="alpha_1", lower=10 ** -10, - upper=10 ** -3, default_value=10 ** -6) - alpha_2 = UniformFloatHyperparameter(name="alpha_2", log=True, - lower=10 ** -10, upper=10 ** -3, - default_value=10 ** -6) - lambda_1 = UniformFloatHyperparameter(name="lambda_1", log=True, - lower=10 ** -10, upper=10 ** -3, - default_value=10 ** -6) - lambda_2 = UniformFloatHyperparameter(name="lambda_2", log=True, - lower=10 ** -10, upper=10 ** -3, - default_value=10 ** -6) - threshold_lambda = UniformFloatHyperparameter(name="threshold_lambda", - log=True, - lower=10 ** 3, - upper=10 ** 5, - default_value=10 ** 4) + tol = UniformFloatHyperparameter( + "tol", 10**-5, 10**-1, default_value=10**-3, log=True + ) + alpha_1 = UniformFloatHyperparameter( + name="alpha_1", lower=10**-10, upper=10**-3, default_value=10**-6 + ) + alpha_2 = UniformFloatHyperparameter( + name="alpha_2", + log=True, + lower=10**-10, + upper=10**-3, + default_value=10**-6, + ) + lambda_1 = UniformFloatHyperparameter( + name="lambda_1", + log=True, + lower=10**-10, + upper=10**-3, + default_value=10**-6, + ) + lambda_2 = UniformFloatHyperparameter( + name="lambda_2", + log=True, + lower=10**-10, + upper=10**-3, + default_value=10**-6, + ) + threshold_lambda = UniformFloatHyperparameter( + name="threshold_lambda", + log=True, + lower=10**3, + upper=10**5, + default_value=10**4, + ) fit_intercept = UnParametrizedHyperparameter("fit_intercept", "True") - 
cs.add_hyperparameters([n_iter, tol, alpha_1, alpha_2, lambda_1, - lambda_2, threshold_lambda, fit_intercept]) + cs.add_hyperparameters( + [ + n_iter, + tol, + alpha_1, + alpha_2, + lambda_1, + lambda_2, + threshold_lambda, + fit_intercept, + ] + ) return cs diff --git a/autosklearn/pipeline/components/regression/decision_tree.py b/autosklearn/pipeline/components/regression/decision_tree.py index 5ecbd254be..db59767587 100644 --- a/autosklearn/pipeline/components/regression/decision_tree.py +++ b/autosklearn/pipeline/components/regression/decision_tree.py @@ -1,20 +1,31 @@ import numpy as np - from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import UniformFloatHyperparameter, \ - UniformIntegerHyperparameter, CategoricalHyperparameter, \ - UnParametrizedHyperparameter, Constant +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + Constant, + UniformFloatHyperparameter, + UniformIntegerHyperparameter, + UnParametrizedHyperparameter, +) -from autosklearn.pipeline.components.base import \ - AutoSklearnRegressionAlgorithm -from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA, PREDICTIONS, SPARSE +from autosklearn.pipeline.components.base import AutoSklearnRegressionAlgorithm +from autosklearn.pipeline.constants import DENSE, PREDICTIONS, SPARSE, UNSIGNED_DATA from autosklearn.util.common import check_none class DecisionTree(AutoSklearnRegressionAlgorithm): - def __init__(self, criterion, max_features, max_depth_factor, - min_samples_split, min_samples_leaf, min_weight_fraction_leaf, - max_leaf_nodes, min_impurity_decrease, random_state=None): + def __init__( + self, + criterion, + max_features, + max_depth_factor, + min_samples_split, + min_samples_leaf, + min_weight_fraction_leaf, + max_leaf_nodes, + min_impurity_decrease, + random_state=None, + ): self.criterion = criterion self.max_features = max_features self.max_depth_factor = max_depth_factor @@ -36,8 +47,8 @@ def fit(self, X, y, sample_weight=None): num_features = X.shape[1] self.max_depth_factor = int(self.max_depth_factor) max_depth_factor = max( - 1, - int(np.round(self.max_depth_factor * num_features, 0))) + 1, int(np.round(self.max_depth_factor * num_features, 0)) + ) self.min_samples_split = int(self.min_samples_split) self.min_samples_leaf = int(self.min_samples_leaf) if check_none(self.max_leaf_nodes): @@ -55,7 +66,8 @@ def fit(self, X, y, sample_weight=None): max_leaf_nodes=self.max_leaf_nodes, min_weight_fraction_leaf=self.min_weight_fraction_leaf, min_impurity_decrease=self.min_impurity_decrease, - random_state=self.random_state) + random_state=self.random_state, + ) if y.ndim == 2 and y.shape[1] == 1: y = y.flatten() @@ -70,38 +82,53 @@ def predict(self, X): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'DT', - 'name': 'Decision Tree Classifier', - 'handles_regression': True, - 'handles_classification': False, - 'handles_multiclass': False, - 'handles_multilabel': False, - 'handles_multioutput': True, - 'is_deterministic': False, - 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,)} + return { + "shortname": "DT", + "name": "Decision Tree Classifier", + "handles_regression": True, + "handles_classification": False, + "handles_multiclass": False, + "handles_multilabel": False, + "handles_multioutput": True, + "is_deterministic": False, + "input": (DENSE, SPARSE, UNSIGNED_DATA), + "output": (PREDICTIONS,), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): cs = 
ConfigurationSpace() - criterion = CategoricalHyperparameter('criterion', - ['mse', 'friedman_mse', 'mae']) - max_features = Constant('max_features', 1.0) + criterion = CategoricalHyperparameter( + "criterion", ["mse", "friedman_mse", "mae"] + ) + max_features = Constant("max_features", 1.0) max_depth_factor = UniformFloatHyperparameter( - 'max_depth_factor', 0., 2., default_value=0.5) + "max_depth_factor", 0.0, 2.0, default_value=0.5 + ) min_samples_split = UniformIntegerHyperparameter( - "min_samples_split", 2, 20, default_value=2) + "min_samples_split", 2, 20, default_value=2 + ) min_samples_leaf = UniformIntegerHyperparameter( - "min_samples_leaf", 1, 20, default_value=1) + "min_samples_leaf", 1, 20, default_value=1 + ) min_weight_fraction_leaf = Constant("min_weight_fraction_leaf", 0.0) max_leaf_nodes = UnParametrizedHyperparameter("max_leaf_nodes", "None") min_impurity_decrease = UnParametrizedHyperparameter( - 'min_impurity_decrease', 0.0) + "min_impurity_decrease", 0.0 + ) - cs.add_hyperparameters([criterion, max_features, max_depth_factor, - min_samples_split, min_samples_leaf, - min_weight_fraction_leaf, max_leaf_nodes, - min_impurity_decrease]) + cs.add_hyperparameters( + [ + criterion, + max_features, + max_depth_factor, + min_samples_split, + min_samples_leaf, + min_weight_fraction_leaf, + max_leaf_nodes, + min_impurity_decrease, + ] + ) return cs diff --git a/autosklearn/pipeline/components/regression/extra_trees.py b/autosklearn/pipeline/components/regression/extra_trees.py index a676f0483d..c4646a2709 100644 --- a/autosklearn/pipeline/components/regression/extra_trees.py +++ b/autosklearn/pipeline/components/regression/extra_trees.py @@ -1,12 +1,16 @@ from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import UniformFloatHyperparameter, \ - UniformIntegerHyperparameter, CategoricalHyperparameter, UnParametrizedHyperparameter +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformFloatHyperparameter, + UniformIntegerHyperparameter, + UnParametrizedHyperparameter, +) from autosklearn.pipeline.components.base import ( AutoSklearnRegressionAlgorithm, IterativeComponent, ) -from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA, PREDICTIONS, SPARSE +from autosklearn.pipeline.constants import DENSE, PREDICTIONS, SPARSE, UNSIGNED_DATA from autosklearn.util.common import check_for_bool, check_none @@ -14,10 +18,22 @@ class ExtraTreesRegressor( IterativeComponent, AutoSklearnRegressionAlgorithm, ): - def __init__(self, criterion, min_samples_leaf, - min_samples_split, max_features, bootstrap, max_leaf_nodes, - max_depth, min_weight_fraction_leaf, min_impurity_decrease, - oob_score=False, n_jobs=1, random_state=None, verbose=0): + def __init__( + self, + criterion, + min_samples_leaf, + min_samples_split, + max_features, + bootstrap, + max_leaf_nodes, + max_depth, + min_weight_fraction_leaf, + min_impurity_decrease, + oob_score=False, + n_jobs=1, + random_state=None, + verbose=0, + ): self.n_estimators = self.get_max_iter() self.criterion = criterion @@ -53,7 +69,8 @@ def iterative_fit(self, X, y, n_iter=1, refit=False): if self.criterion not in ("mse", "friedman_mse", "mae"): raise ValueError( "'criterion' is not in ('mse', 'friedman_mse', " - "'mae): %s" % self.criterion) + "'mae): %s" % self.criterion + ) if check_none(self.max_depth): self.max_depth = None @@ -75,25 +92,28 @@ def iterative_fit(self, X, y, n_iter=1, refit=False): self.n_jobs = int(self.n_jobs) self.verbose = int(self.verbose) - 
self.estimator = ETR(n_estimators=n_iter, - criterion=self.criterion, - max_depth=self.max_depth, - min_samples_split=self.min_samples_split, - min_samples_leaf=self.min_samples_leaf, - bootstrap=self.bootstrap, - max_features=self.max_features, - max_leaf_nodes=self.max_leaf_nodes, - min_weight_fraction_leaf=self.min_weight_fraction_leaf, - min_impurity_decrease=self.min_impurity_decrease, - oob_score=self.oob_score, - n_jobs=self.n_jobs, - verbose=self.verbose, - random_state=self.random_state, - warm_start=True) + self.estimator = ETR( + n_estimators=n_iter, + criterion=self.criterion, + max_depth=self.max_depth, + min_samples_split=self.min_samples_split, + min_samples_leaf=self.min_samples_leaf, + bootstrap=self.bootstrap, + max_features=self.max_features, + max_leaf_nodes=self.max_leaf_nodes, + min_weight_fraction_leaf=self.min_weight_fraction_leaf, + min_impurity_decrease=self.min_impurity_decrease, + oob_score=self.oob_score, + n_jobs=self.n_jobs, + verbose=self.verbose, + random_state=self.random_state, + warm_start=True, + ) else: self.estimator.n_estimators += n_iter - self.estimator.n_estimators = min(self.estimator.n_estimators, - self.n_estimators) + self.estimator.n_estimators = min( + self.estimator.n_estimators, self.n_estimators + ) if y.ndim == 2 and y.shape[1] == 1: y = y.flatten() @@ -114,44 +134,62 @@ def predict(self, X): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'ET', - 'name': 'Extra Trees Regressor', - 'handles_regression': True, - 'handles_classification': False, - 'handles_multiclass': False, - 'handles_multilabel': False, - 'handles_multioutput': True, - 'is_deterministic': True, - 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,)} + return { + "shortname": "ET", + "name": "Extra Trees Regressor", + "handles_regression": True, + "handles_classification": False, + "handles_multiclass": False, + "handles_multilabel": False, + "handles_multioutput": True, + "is_deterministic": True, + "input": (DENSE, SPARSE, UNSIGNED_DATA), + "output": (PREDICTIONS,), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): cs = ConfigurationSpace() - criterion = CategoricalHyperparameter("criterion", - ['mse', 'friedman_mse', 'mae']) + criterion = CategoricalHyperparameter( + "criterion", ["mse", "friedman_mse", "mae"] + ) max_features = UniformFloatHyperparameter( - "max_features", 0.1, 1.0, default_value=1) + "max_features", 0.1, 1.0, default_value=1 + ) max_depth = UnParametrizedHyperparameter(name="max_depth", value="None") - min_weight_fraction_leaf = UnParametrizedHyperparameter('min_weight_fraction_leaf', 0.) 
+ min_weight_fraction_leaf = UnParametrizedHyperparameter( + "min_weight_fraction_leaf", 0.0 + ) max_leaf_nodes = UnParametrizedHyperparameter("max_leaf_nodes", "None") min_samples_split = UniformIntegerHyperparameter( - "min_samples_split", 2, 20, default_value=2) + "min_samples_split", 2, 20, default_value=2 + ) min_samples_leaf = UniformIntegerHyperparameter( - "min_samples_leaf", 1, 20, default_value=1) + "min_samples_leaf", 1, 20, default_value=1 + ) min_impurity_decrease = UnParametrizedHyperparameter( - 'min_impurity_decrease', 0.0 + "min_impurity_decrease", 0.0 ) bootstrap = CategoricalHyperparameter( - "bootstrap", ["True", "False"], default_value="False") + "bootstrap", ["True", "False"], default_value="False" + ) - cs.add_hyperparameters([criterion, max_features, - max_depth, max_leaf_nodes, min_samples_split, - min_samples_leaf, min_impurity_decrease, min_weight_fraction_leaf, - bootstrap]) + cs.add_hyperparameters( + [ + criterion, + max_features, + max_depth, + max_leaf_nodes, + min_samples_split, + min_samples_leaf, + min_impurity_decrease, + min_weight_fraction_leaf, + bootstrap, + ] + ) return cs diff --git a/autosklearn/pipeline/components/regression/gaussian_process.py b/autosklearn/pipeline/components/regression/gaussian_process.py index c587b13b0e..1acf238cd1 100644 --- a/autosklearn/pipeline/components/regression/gaussian_process.py +++ b/autosklearn/pipeline/components/regression/gaussian_process.py @@ -2,7 +2,7 @@ from ConfigSpace.hyperparameters import UniformFloatHyperparameter from autosklearn.pipeline.components.base import AutoSklearnRegressionAlgorithm -from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA, PREDICTIONS +from autosklearn.pipeline.constants import DENSE, PREDICTIONS, UNSIGNED_DATA class GaussianProcess(AutoSklearnRegressionAlgorithm): @@ -22,19 +22,19 @@ def fit(self, X, y): n_features = X.shape[1] kernel = sklearn.gaussian_process.kernels.RBF( - length_scale=[1.0]*n_features, - length_scale_bounds=[(self.thetaL, self.thetaU)]*n_features + length_scale=[1.0] * n_features, + length_scale_bounds=[(self.thetaL, self.thetaU)] * n_features, ) # Instanciate a Gaussian Process model self.estimator = sklearn.gaussian_process.GaussianProcessRegressor( kernel=kernel, n_restarts_optimizer=10, - optimizer='fmin_l_bfgs_b', + optimizer="fmin_l_bfgs_b", alpha=self.alpha, copy_X_train=True, random_state=self.random_state, - normalize_y=True + normalize_y=True, ) if y.ndim == 2 and y.shape[1] == 1: @@ -51,25 +51,30 @@ def predict(self, X): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'GP', - 'name': 'Gaussian Process', - 'handles_regression': True, - 'handles_classification': False, - 'handles_multiclass': False, - 'handles_multilabel': False, - 'handles_multioutput': True, - 'is_deterministic': True, - 'input': (DENSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,)} + return { + "shortname": "GP", + "name": "Gaussian Process", + "handles_regression": True, + "handles_classification": False, + "handles_multiclass": False, + "handles_multilabel": False, + "handles_multioutput": True, + "is_deterministic": True, + "input": (DENSE, UNSIGNED_DATA), + "output": (PREDICTIONS,), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): alpha = UniformFloatHyperparameter( - name="alpha", lower=1e-14, upper=1.0, default_value=1e-8, log=True) + name="alpha", lower=1e-14, upper=1.0, default_value=1e-8, log=True + ) thetaL = UniformFloatHyperparameter( - name="thetaL", lower=1e-10, upper=1e-3, 
default_value=1e-6, log=True) + name="thetaL", lower=1e-10, upper=1e-3, default_value=1e-6, log=True + ) thetaU = UniformFloatHyperparameter( - name="thetaU", lower=1.0, upper=100000, default_value=100000.0, log=True) + name="thetaU", lower=1.0, upper=100000, default_value=100000.0, log=True + ) cs = ConfigurationSpace() cs.add_hyperparameters([alpha, thetaL, thetaU]) diff --git a/autosklearn/pipeline/components/regression/gradient_boosting.py b/autosklearn/pipeline/components/regression/gradient_boosting.py index ad57596b9a..b7503f5fd0 100644 --- a/autosklearn/pipeline/components/regression/gradient_boosting.py +++ b/autosklearn/pipeline/components/regression/gradient_boosting.py @@ -1,16 +1,19 @@ import numpy as np - -from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import UniformFloatHyperparameter, \ - UniformIntegerHyperparameter, CategoricalHyperparameter, Constant, \ - UnParametrizedHyperparameter from ConfigSpace.conditions import EqualsCondition, InCondition +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + Constant, + UniformFloatHyperparameter, + UniformIntegerHyperparameter, + UnParametrizedHyperparameter, +) from autosklearn.pipeline.components.base import ( AutoSklearnRegressionAlgorithm, IterativeComponent, ) -from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA, PREDICTIONS +from autosklearn.pipeline.constants import DENSE, PREDICTIONS, UNSIGNED_DATA from autosklearn.util.common import check_none @@ -18,10 +21,23 @@ class GradientBoosting( IterativeComponent, AutoSklearnRegressionAlgorithm, ): - def __init__(self, loss, learning_rate, min_samples_leaf, max_depth, - max_leaf_nodes, max_bins, l2_regularization, early_stop, tol, scoring, - n_iter_no_change=0, validation_fraction=None, random_state=None, - verbose=0): + def __init__( + self, + loss, + learning_rate, + min_samples_leaf, + max_depth, + max_leaf_nodes, + max_bins, + l2_regularization, + early_stop, + tol, + scoring, + n_iter_no_change=0, + validation_fraction=None, + random_state=None, + verbose=0, + ): self.loss = loss self.learning_rate = learning_rate self.max_iter = self.get_max_iter() @@ -48,7 +64,7 @@ def get_current_iter(self): return self.estimator.n_iter_ def iterative_fit(self, X, y, n_iter=2, refit=False): - """ Set n_iter=2 for the same reason as for SGD """ + """Set n_iter=2 for the same reason as for SGD""" import sklearn.ensemble from sklearn.experimental import enable_hist_gradient_boosting # noqa @@ -106,8 +122,7 @@ def iterative_fit(self, X, y, n_iter=2, refit=False): ) else: self.estimator.max_iter += n_iter - self.estimator.max_iter = min(self.estimator.max_iter, - self.max_iter) + self.estimator.max_iter = min(self.estimator.max_iter, self.max_iter) if y.ndim == 2 and y.shape[1] == 1: y = y.flatten() @@ -125,7 +140,7 @@ def iterative_fit(self, X, y, n_iter=2, refit=False): def configuration_fully_fitted(self): if self.estimator is None: return False - elif not hasattr(self, 'fully_fit_'): + elif not hasattr(self, "fully_fit_"): return False else: return self.fully_fit_ @@ -137,54 +152,79 @@ def predict(self, X): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'GB', - 'name': 'Gradient Boosting Regressor', - 'handles_regression': True, - 'handles_classification': False, - 'handles_multiclass': False, - 'handles_multilabel': False, - 'handles_multioutput': False, - 'is_deterministic': True, - 'input': (DENSE, 
UNSIGNED_DATA), - 'output': (PREDICTIONS,)} + return { + "shortname": "GB", + "name": "Gradient Boosting Regressor", + "handles_regression": True, + "handles_classification": False, + "handles_multiclass": False, + "handles_multilabel": False, + "handles_multioutput": False, + "is_deterministic": True, + "input": (DENSE, UNSIGNED_DATA), + "output": (PREDICTIONS,), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): cs = ConfigurationSpace() loss = CategoricalHyperparameter( - "loss", ["least_squares"], default_value="least_squares") + "loss", ["least_squares"], default_value="least_squares" + ) learning_rate = UniformFloatHyperparameter( - name="learning_rate", lower=0.01, upper=1, default_value=0.1, log=True) + name="learning_rate", lower=0.01, upper=1, default_value=0.1, log=True + ) min_samples_leaf = UniformIntegerHyperparameter( - name="min_samples_leaf", lower=1, upper=200, default_value=20, log=True) - max_depth = UnParametrizedHyperparameter( - name="max_depth", value="None") + name="min_samples_leaf", lower=1, upper=200, default_value=20, log=True + ) + max_depth = UnParametrizedHyperparameter(name="max_depth", value="None") max_leaf_nodes = UniformIntegerHyperparameter( - name="max_leaf_nodes", lower=3, upper=2047, default_value=31, log=True) + name="max_leaf_nodes", lower=3, upper=2047, default_value=31, log=True + ) max_bins = Constant("max_bins", 255) l2_regularization = UniformFloatHyperparameter( - name="l2_regularization", lower=1E-10, upper=1, default_value=1E-10, log=True) + name="l2_regularization", + lower=1e-10, + upper=1, + default_value=1e-10, + log=True, + ) early_stop = CategoricalHyperparameter( - name="early_stop", choices=["off", "valid", "train"], default_value="off") - tol = UnParametrizedHyperparameter( - name="tol", value=1e-7) - scoring = UnParametrizedHyperparameter( - name="scoring", value="loss") + name="early_stop", choices=["off", "valid", "train"], default_value="off" + ) + tol = UnParametrizedHyperparameter(name="tol", value=1e-7) + scoring = UnParametrizedHyperparameter(name="scoring", value="loss") n_iter_no_change = UniformIntegerHyperparameter( - name="n_iter_no_change", lower=1, upper=20, default_value=10) + name="n_iter_no_change", lower=1, upper=20, default_value=10 + ) validation_fraction = UniformFloatHyperparameter( - name="validation_fraction", lower=0.01, upper=0.4, default_value=0.1) - - cs.add_hyperparameters([loss, learning_rate, min_samples_leaf, - max_depth, max_leaf_nodes, max_bins, l2_regularization, - early_stop, tol, scoring, n_iter_no_change, - validation_fraction]) + name="validation_fraction", lower=0.01, upper=0.4, default_value=0.1 + ) + + cs.add_hyperparameters( + [ + loss, + learning_rate, + min_samples_leaf, + max_depth, + max_leaf_nodes, + max_bins, + l2_regularization, + early_stop, + tol, + scoring, + n_iter_no_change, + validation_fraction, + ] + ) n_iter_no_change_cond = InCondition( - n_iter_no_change, early_stop, ["valid", "train"]) + n_iter_no_change, early_stop, ["valid", "train"] + ) validation_fraction_cond = EqualsCondition( - validation_fraction, early_stop, "valid") + validation_fraction, early_stop, "valid" + ) cs.add_conditions([n_iter_no_change_cond, validation_fraction_cond]) diff --git a/autosklearn/pipeline/components/regression/k_nearest_neighbors.py b/autosklearn/pipeline/components/regression/k_nearest_neighbors.py index e4943e2ca5..83c13cd191 100644 --- a/autosklearn/pipeline/components/regression/k_nearest_neighbors.py +++ 
b/autosklearn/pipeline/components/regression/k_nearest_neighbors.py @@ -1,9 +1,11 @@ from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import CategoricalHyperparameter, \ - UniformIntegerHyperparameter +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformIntegerHyperparameter, +) from autosklearn.pipeline.components.base import AutoSklearnRegressionAlgorithm -from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA, PREDICTIONS, SPARSE +from autosklearn.pipeline.constants import DENSE, PREDICTIONS, SPARSE, UNSIGNED_DATA class KNearestNeighborsRegressor(AutoSklearnRegressionAlgorithm): @@ -19,11 +21,9 @@ def fit(self, X, y): self.n_neighbors = int(self.n_neighbors) self.p = int(self.p) - self.estimator = \ - sklearn.neighbors.KNeighborsRegressor( - n_neighbors=self.n_neighbors, - weights=self.weights, - p=self.p) + self.estimator = sklearn.neighbors.KNeighborsRegressor( + n_neighbors=self.n_neighbors, weights=self.weights, p=self.p + ) if y.ndim == 2 and y.shape[1] == 1: y = y.flatten() @@ -38,25 +38,29 @@ def predict(self, X): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'KNN', - 'name': 'K-Nearest Neighbor Classification', - 'handles_regression': True, - 'handles_classification': False, - 'handles_multiclass': False, - 'handles_multilabel': False, - 'handles_multioutput': True, - 'is_deterministic': True, - 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,)} + return { + "shortname": "KNN", + "name": "K-Nearest Neighbor Classification", + "handles_regression": True, + "handles_classification": False, + "handles_multiclass": False, + "handles_multilabel": False, + "handles_multioutput": True, + "is_deterministic": True, + "input": (DENSE, SPARSE, UNSIGNED_DATA), + "output": (PREDICTIONS,), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): cs = ConfigurationSpace() n_neighbors = UniformIntegerHyperparameter( - name="n_neighbors", lower=1, upper=100, log=True, default_value=1) + name="n_neighbors", lower=1, upper=100, log=True, default_value=1 + ) weights = CategoricalHyperparameter( - name="weights", choices=["uniform", "distance"], default_value="uniform") + name="weights", choices=["uniform", "distance"], default_value="uniform" + ) p = CategoricalHyperparameter(name="p", choices=[1, 2], default_value=2) cs.add_hyperparameters([n_neighbors, weights, p]) diff --git a/autosklearn/pipeline/components/regression/liblinear_svr.py b/autosklearn/pipeline/components/regression/liblinear_svr.py index 73c1550ff3..e129331298 100644 --- a/autosklearn/pipeline/components/regression/liblinear_svr.py +++ b/autosklearn/pipeline/components/regression/liblinear_svr.py @@ -1,18 +1,29 @@ from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import UniformFloatHyperparameter, \ - CategoricalHyperparameter, Constant -from ConfigSpace.forbidden import ForbiddenEqualsClause, \ - ForbiddenAndConjunction +from ConfigSpace.forbidden import ForbiddenAndConjunction, ForbiddenEqualsClause +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + Constant, + UniformFloatHyperparameter, +) from autosklearn.pipeline.components.base import AutoSklearnRegressionAlgorithm -from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA, PREDICTIONS, SPARSE +from autosklearn.pipeline.constants import DENSE, PREDICTIONS, SPARSE, UNSIGNED_DATA from autosklearn.util.common import check_for_bool class 
LibLinear_SVR(AutoSklearnRegressionAlgorithm): # Liblinear is not deterministic as it uses a RNG inside - def __init__(self, loss, epsilon, dual, tol, C, fit_intercept, - intercept_scaling, random_state=None): + def __init__( + self, + loss, + epsilon, + dual, + tol, + C, + fit_intercept, + intercept_scaling, + random_state=None, + ): self.epsilon = epsilon self.loss = loss self.dual = dual @@ -34,14 +45,16 @@ def fit(self, X, y): self.fit_intercept = check_for_bool(self.fit_intercept) self.intercept_scaling = float(self.intercept_scaling) - self.estimator = sklearn.svm.LinearSVR(epsilon=self.epsilon, - loss=self.loss, - dual=self.dual, - tol=self.tol, - C=self.C, - fit_intercept=self.fit_intercept, - intercept_scaling=self.intercept_scaling, - random_state=self.random_state) + self.estimator = sklearn.svm.LinearSVR( + epsilon=self.epsilon, + loss=self.loss, + dual=self.dual, + tol=self.tol, + C=self.C, + fit_intercept=self.fit_intercept, + intercept_scaling=self.intercept_scaling, + random_state=self.random_state, + ) if y.ndim == 2 and y.shape[1] == 1: y = y.flatten() @@ -56,41 +69,47 @@ def predict(self, X): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'Liblinear-SVR', - 'name': 'Liblinear Support Vector Regression', - 'handles_regression': True, - 'handles_classification': False, - 'handles_multiclass': False, - 'handles_multilabel': False, - 'handles_multioutput': False, - 'is_deterministic': False, - 'input': (SPARSE, DENSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,)} + return { + "shortname": "Liblinear-SVR", + "name": "Liblinear Support Vector Regression", + "handles_regression": True, + "handles_classification": False, + "handles_multiclass": False, + "handles_multilabel": False, + "handles_multioutput": False, + "is_deterministic": False, + "input": (SPARSE, DENSE, UNSIGNED_DATA), + "output": (PREDICTIONS,), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): cs = ConfigurationSpace() - C = UniformFloatHyperparameter( - "C", 0.03125, 32768, log=True, default_value=1.0) + C = UniformFloatHyperparameter("C", 0.03125, 32768, log=True, default_value=1.0) loss = CategoricalHyperparameter( - "loss", ["epsilon_insensitive", "squared_epsilon_insensitive"], - default_value="squared_epsilon_insensitive") + "loss", + ["epsilon_insensitive", "squared_epsilon_insensitive"], + default_value="squared_epsilon_insensitive", + ) # Random Guess epsilon = UniformFloatHyperparameter( - name="epsilon", lower=0.001, upper=1, default_value=0.1, log=True) + name="epsilon", lower=0.001, upper=1, default_value=0.1, log=True + ) dual = Constant("dual", "False") # These are set ad-hoc tol = UniformFloatHyperparameter( - "tol", 1e-5, 1e-1, default_value=1e-4, log=True) + "tol", 1e-5, 1e-1, default_value=1e-4, log=True + ) fit_intercept = Constant("fit_intercept", "True") intercept_scaling = Constant("intercept_scaling", 1) - cs.add_hyperparameters([C, loss, epsilon, dual, tol, fit_intercept, - intercept_scaling]) + cs.add_hyperparameters( + [C, loss, epsilon, dual, tol, fit_intercept, intercept_scaling] + ) dual_and_loss = ForbiddenAndConjunction( ForbiddenEqualsClause(dual, "False"), - ForbiddenEqualsClause(loss, "epsilon_insensitive") + ForbiddenEqualsClause(loss, "epsilon_insensitive"), ) cs.add_forbidden_clause(dual_and_loss) diff --git a/autosklearn/pipeline/components/regression/libsvm_svr.py b/autosklearn/pipeline/components/regression/libsvm_svr.py index 6b6c70415c..d4173d7f01 100644 --- 
a/autosklearn/pipeline/components/regression/libsvm_svr.py +++ b/autosklearn/pipeline/components/regression/libsvm_svr.py @@ -1,20 +1,35 @@ import resource import sys +from ConfigSpace.conditions import EqualsCondition, InCondition from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.conditions import InCondition, EqualsCondition -from ConfigSpace.hyperparameters import UniformFloatHyperparameter, \ - UniformIntegerHyperparameter, CategoricalHyperparameter, \ - UnParametrizedHyperparameter +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformFloatHyperparameter, + UniformIntegerHyperparameter, + UnParametrizedHyperparameter, +) + from autosklearn.pipeline.components.base import AutoSklearnRegressionAlgorithm -from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA, PREDICTIONS, SPARSE +from autosklearn.pipeline.constants import DENSE, PREDICTIONS, SPARSE, UNSIGNED_DATA from autosklearn.util.common import check_for_bool, check_none class LibSVM_SVR(AutoSklearnRegressionAlgorithm): - def __init__(self, kernel, C, epsilon, tol, shrinking, gamma=0.1, - degree=3, coef0=0.0, verbose=False, - max_iter=-1, random_state=None): + def __init__( + self, + kernel, + C, + epsilon, + tol, + shrinking, + gamma=0.1, + degree=3, + coef0=0.0, + verbose=False, + max_iter=-1, + random_state=None, + ): self.kernel = kernel self.C = C self.epsilon = epsilon @@ -31,9 +46,9 @@ def __init__(self, kernel, C, epsilon, tol, shrinking, gamma=0.1, def fit(self, X, y): import sklearn.svm - # Calculate the size of the kernel cache (in MB) for sklearn's LibSVM. The cache size is - # calculated as 2/3 of the available memory (which is calculated as the memory limit minus - # the used memory) + # Calculate the size of the kernel cache (in MB) for sklearn's LibSVM. 
+ # The cache size is calculated as 2/3 of the available memory + # (which is calculated as the memory limit minus the used memory) try: # Retrieve memory limits imposed on the process soft, hard = resource.getrlimit(resource.RLIMIT_AS) @@ -45,9 +60,9 @@ def fit(self, X, y): # Retrieve memory used by this process maxrss = resource.getrusage(resource.RUSAGE_SELF)[2] / 1024 - # In MacOS, the MaxRSS output of resource.getrusage in bytes; on other platforms, - # it's in kilobytes - if sys.platform == 'darwin': + # In MacOS, the MaxRSS output of resource.getrusage in bytes; + # on other platforms, it's in kilobytes + if sys.platform == "darwin": maxrss = maxrss / 1024 cache_size = (soft - maxrss) / 1.5 @@ -83,7 +98,7 @@ def fit(self, X, y): coef0=self.coef0, cache_size=cache_size, verbose=self.verbose, - max_iter=self.max_iter + max_iter=self.max_iter, ) self.scaler = sklearn.preprocessing.StandardScaler(copy=True) @@ -119,56 +134,70 @@ def predict(self, X): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'SVR', - 'name': 'Support Vector Regression', - 'handles_regression': True, - 'handles_classification': False, - 'handles_multiclass': False, - 'handles_multilabel': False, - 'handles_multioutput': False, - 'prefers_data_normalized': True, - 'is_deterministic': True, - 'input': (SPARSE, DENSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,)} + return { + "shortname": "SVR", + "name": "Support Vector Regression", + "handles_regression": True, + "handles_classification": False, + "handles_multiclass": False, + "handles_multilabel": False, + "handles_multioutput": False, + "prefers_data_normalized": True, + "is_deterministic": True, + "input": (SPARSE, DENSE, UNSIGNED_DATA), + "output": (PREDICTIONS,), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): C = UniformFloatHyperparameter( - name="C", lower=0.03125, upper=32768, log=True, default_value=1.0) + name="C", lower=0.03125, upper=32768, log=True, default_value=1.0 + ) # Random Guess - epsilon = UniformFloatHyperparameter(name="epsilon", lower=0.001, - upper=1, default_value=0.1, - log=True) + epsilon = UniformFloatHyperparameter( + name="epsilon", lower=0.001, upper=1, default_value=0.1, log=True + ) kernel = CategoricalHyperparameter( - name="kernel", choices=['linear', 'poly', 'rbf', 'sigmoid'], - default_value="rbf") + name="kernel", + choices=["linear", "poly", "rbf", "sigmoid"], + default_value="rbf", + ) degree = UniformIntegerHyperparameter( - name="degree", lower=2, upper=5, default_value=3) + name="degree", lower=2, upper=5, default_value=3 + ) gamma = UniformFloatHyperparameter( - name="gamma", lower=3.0517578125e-05, upper=8, log=True, default_value=0.1) + name="gamma", lower=3.0517578125e-05, upper=8, log=True, default_value=0.1 + ) # TODO this is totally ad-hoc coef0 = UniformFloatHyperparameter( - name="coef0", lower=-1, upper=1, default_value=0) + name="coef0", lower=-1, upper=1, default_value=0 + ) # probability is no hyperparameter, but an argument to the SVM algo shrinking = CategoricalHyperparameter( - name="shrinking", choices=["True", "False"], default_value="True") + name="shrinking", choices=["True", "False"], default_value="True" + ) tol = UniformFloatHyperparameter( - name="tol", lower=1e-5, upper=1e-1, default_value=1e-3, log=True) + name="tol", lower=1e-5, upper=1e-1, default_value=1e-3, log=True + ) max_iter = UnParametrizedHyperparameter("max_iter", -1) cs = ConfigurationSpace() - cs.add_hyperparameters([C, kernel, degree, gamma, coef0, shrinking, - tol, 
max_iter, epsilon]) + cs.add_hyperparameters( + [C, kernel, degree, gamma, coef0, shrinking, tol, max_iter, epsilon] + ) degree_depends_on_poly = EqualsCondition(degree, kernel, "poly") - gamma_depends_on_kernel = InCondition(child=gamma, parent=kernel, - values=('poly', 'rbf')) - coef0_depends_on_kernel = InCondition(child=coef0, parent=kernel, - values=('poly', 'sigmoid')) - cs.add_conditions([degree_depends_on_poly, gamma_depends_on_kernel, - coef0_depends_on_kernel]) + gamma_depends_on_kernel = InCondition( + child=gamma, parent=kernel, values=("poly", "rbf") + ) + coef0_depends_on_kernel = InCondition( + child=coef0, parent=kernel, values=("poly", "sigmoid") + ) + cs.add_conditions( + [degree_depends_on_poly, gamma_depends_on_kernel, coef0_depends_on_kernel] + ) return cs diff --git a/autosklearn/pipeline/components/regression/mlp.py b/autosklearn/pipeline/components/regression/mlp.py index 8eec40a2cc..645c29403a 100644 --- a/autosklearn/pipeline/components/regression/mlp.py +++ b/autosklearn/pipeline/components/regression/mlp.py @@ -1,27 +1,43 @@ import numpy as np - -from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import UniformFloatHyperparameter, \ - UniformIntegerHyperparameter, UnParametrizedHyperparameter, Constant, \ - CategoricalHyperparameter from ConfigSpace.conditions import InCondition +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + Constant, + UniformFloatHyperparameter, + UniformIntegerHyperparameter, + UnParametrizedHyperparameter, +) from autosklearn.pipeline.components.base import ( AutoSklearnRegressionAlgorithm, IterativeComponent, ) -from autosklearn.pipeline.constants import SPARSE, DENSE, UNSIGNED_DATA, PREDICTIONS +from autosklearn.pipeline.constants import DENSE, PREDICTIONS, SPARSE, UNSIGNED_DATA from autosklearn.util.common import check_for_bool -class MLPRegressor( - IterativeComponent, - AutoSklearnRegressionAlgorithm -): - def __init__(self, hidden_layer_depth, num_nodes_per_layer, activation, alpha, - learning_rate_init, early_stopping, solver, batch_size, - n_iter_no_change, tol, shuffle, beta_1, beta_2, epsilon, - validation_fraction=None, random_state=None, verbose=0): +class MLPRegressor(IterativeComponent, AutoSklearnRegressionAlgorithm): + def __init__( + self, + hidden_layer_depth, + num_nodes_per_layer, + activation, + alpha, + learning_rate_init, + early_stopping, + solver, + batch_size, + n_iter_no_change, + tol, + shuffle, + beta_1, + beta_2, + epsilon, + validation_fraction=None, + random_state=None, + verbose=0, + ): self.hidden_layer_depth = hidden_layer_depth self.num_nodes_per_layer = num_nodes_per_layer self.max_iter = self.get_max_iter() @@ -52,11 +68,10 @@ def get_current_iter(self): return self.estimator.n_iter_ def iterative_fit(self, X, y, n_iter=2, refit=False): - """ - Set n_iter=2 for the same reason as for SGD - """ - from sklearn.neural_network import MLPRegressor + """Set n_iter=2 for the same reason as for SGD""" import sklearn.preprocessing + from sklearn.neural_network import MLPRegressor + n_iter = max(n_iter, 2) if refit: @@ -69,8 +84,9 @@ def iterative_fit(self, X, y, n_iter=2, refit=False): self.max_iter = int(self.max_iter) self.hidden_layer_depth = int(self.hidden_layer_depth) self.num_nodes_per_layer = int(self.num_nodes_per_layer) - self.hidden_layer_sizes = tuple(self.num_nodes_per_layer - for i in range(self.hidden_layer_depth)) + self.hidden_layer_sizes = tuple( + 
self.num_nodes_per_layer for i in range(self.hidden_layer_depth) + ) self.activation = str(self.activation) self.alpha = float(self.alpha) self.learning_rate_init = float(self.learning_rate_init) @@ -86,7 +102,9 @@ def iterative_fit(self, X, y, n_iter=2, refit=False): self.n_iter_no_change = int(self.n_iter_no_change) self.early_stopping_val = True else: - raise ValueError("Set early stopping to unknown value %s" % self.early_stopping) + raise ValueError( + "Set early stopping to unknown value %s" % self.early_stopping + ) # elif self.early_stopping == "off": # self.validation_fraction = 0 # self.tol = 10000 @@ -172,7 +190,7 @@ def iterative_fit(self, X, y, n_iter=2, refit=False): def configuration_fully_fitted(self): if self.estimator is None: return False - elif not hasattr(self, '_fully_fit'): + elif not hasattr(self, "_fully_fit"): return False else: return self._fully_fit @@ -193,43 +211,56 @@ def predict(self, X): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'MLP', - 'name': 'Multilayer Percepton', - 'handles_regression': True, - 'handles_classification': False, - 'handles_multiclass': False, - 'handles_multilabel': False, - 'handles_multioutput': False, - 'is_deterministic': True, - 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,)} + return { + "shortname": "MLP", + "name": "Multilayer Percepton", + "handles_regression": True, + "handles_classification": False, + "handles_multiclass": False, + "handles_multilabel": False, + "handles_multioutput": False, + "is_deterministic": True, + "input": (DENSE, SPARSE, UNSIGNED_DATA), + "output": (PREDICTIONS,), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): cs = ConfigurationSpace() - hidden_layer_depth = UniformIntegerHyperparameter(name="hidden_layer_depth", - lower=1, upper=3, default_value=1) - num_nodes_per_layer = UniformIntegerHyperparameter(name="num_nodes_per_layer", - lower=16, upper=264, default_value=32, - log=True) - activation = CategoricalHyperparameter(name="activation", choices=['tanh', 'relu'], - default_value='tanh') - alpha = UniformFloatHyperparameter(name="alpha", lower=1e-7, upper=1e-1, default_value=1e-4, - log=True) - - learning_rate_init = UniformFloatHyperparameter(name="learning_rate_init", - lower=1e-4, upper=0.5, default_value=1e-3, - log=True) + hidden_layer_depth = UniformIntegerHyperparameter( + name="hidden_layer_depth", lower=1, upper=3, default_value=1 + ) + num_nodes_per_layer = UniformIntegerHyperparameter( + name="num_nodes_per_layer", lower=16, upper=264, default_value=32, log=True + ) + activation = CategoricalHyperparameter( + name="activation", choices=["tanh", "relu"], default_value="tanh" + ) + alpha = UniformFloatHyperparameter( + name="alpha", lower=1e-7, upper=1e-1, default_value=1e-4, log=True + ) + + learning_rate_init = UniformFloatHyperparameter( + name="learning_rate_init", + lower=1e-4, + upper=0.5, + default_value=1e-3, + log=True, + ) # Not allowing to turn off early stopping - early_stopping = CategoricalHyperparameter(name="early_stopping", - choices=["valid", "train"], # , "off"], - default_value="valid") + early_stopping = CategoricalHyperparameter( + name="early_stopping", + choices=["valid", "train"], # , "off"], + default_value="valid", + ) # Constants - n_iter_no_change = Constant(name="n_iter_no_change", value=32) # default=10 is too low + n_iter_no_change = Constant( + name="n_iter_no_change", value=32 + ) # default=10 is too low validation_fraction = Constant(name="validation_fraction", 
value=0.1) tol = UnParametrizedHyperparameter(name="tol", value=1e-4) - solver = Constant(name="solver", value='adam') + solver = Constant(name="solver", value="adam") # Relying on sklearn defaults for now batch_size = UnParametrizedHyperparameter(name="batch_size", value="auto") @@ -247,17 +278,33 @@ def get_hyperparameter_search_space(dataset_properties=None): # max_fun --> only used when solver=lbfgs # activation=["identity", "logistic"] --> not useful for classification - cs.add_hyperparameters([hidden_layer_depth, num_nodes_per_layer, - activation, alpha, - learning_rate_init, early_stopping, - n_iter_no_change, validation_fraction, tol, - solver, batch_size, shuffle, - beta_1, beta_2, epsilon]) - - validation_fraction_cond = InCondition(validation_fraction, early_stopping, ["valid"]) + cs.add_hyperparameters( + [ + hidden_layer_depth, + num_nodes_per_layer, + activation, + alpha, + learning_rate_init, + early_stopping, + n_iter_no_change, + validation_fraction, + tol, + solver, + batch_size, + shuffle, + beta_1, + beta_2, + epsilon, + ] + ) + + validation_fraction_cond = InCondition( + validation_fraction, early_stopping, ["valid"] + ) cs.add_conditions([validation_fraction_cond]) # We always use early stopping - # n_iter_no_change_cond = InCondition(n_iter_no_change, early_stopping, ["valid", "train"]) + # n_iter_no_change_cond = \ + # InCondition(n_iter_no_change, early_stopping, ["valid", "train"]) # tol_cond = InCondition(n_iter_no_change, early_stopping, ["valid", "train"]) # cs.add_conditions([n_iter_no_change_cond, tol_cond]) diff --git a/autosklearn/pipeline/components/regression/random_forest.py b/autosklearn/pipeline/components/regression/random_forest.py index eeaddb9e1a..128113fc43 100644 --- a/autosklearn/pipeline/components/regression/random_forest.py +++ b/autosklearn/pipeline/components/regression/random_forest.py @@ -1,12 +1,16 @@ from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import UniformFloatHyperparameter, \ - UniformIntegerHyperparameter, CategoricalHyperparameter, UnParametrizedHyperparameter +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformFloatHyperparameter, + UniformIntegerHyperparameter, + UnParametrizedHyperparameter, +) from autosklearn.pipeline.components.base import ( AutoSklearnRegressionAlgorithm, IterativeComponent, ) -from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA, PREDICTIONS, SPARSE +from autosklearn.pipeline.constants import DENSE, PREDICTIONS, SPARSE, UNSIGNED_DATA from autosklearn.util.common import check_for_bool, check_none @@ -14,10 +18,20 @@ class RandomForest( IterativeComponent, AutoSklearnRegressionAlgorithm, ): - def __init__(self, criterion, max_features, - max_depth, min_samples_split, min_samples_leaf, - min_weight_fraction_leaf, bootstrap, max_leaf_nodes, - min_impurity_decrease, random_state=None, n_jobs=1): + def __init__( + self, + criterion, + max_features, + max_depth, + min_samples_split, + min_samples_leaf, + min_weight_fraction_leaf, + bootstrap, + max_leaf_nodes, + min_impurity_decrease, + random_state=None, + n_jobs=1, + ): self.n_estimators = self.get_max_iter() self.criterion = criterion self.max_features = max_features @@ -79,11 +93,13 @@ def iterative_fit(self, X, y, n_iter=1, refit=False): min_impurity_decrease=self.min_impurity_decrease, random_state=self.random_state, n_jobs=self.n_jobs, - warm_start=True) + warm_start=True, + ) else: self.estimator.n_estimators += n_iter - self.estimator.n_estimators = 
min(self.estimator.n_estimators, - self.n_estimators) + self.estimator.n_estimators = min( + self.estimator.n_estimators, self.n_estimators + ) if y.ndim == 2 and y.shape[1] == 1: y = y.flatten() @@ -104,45 +120,63 @@ def predict(self, X): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'RF', - 'name': 'Random Forest Regressor', - 'handles_regression': True, - 'handles_classification': False, - 'handles_multiclass': False, - 'handles_multilabel': False, - 'handles_multioutput': True, - 'prefers_data_normalized': False, - 'is_deterministic': True, - 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,)} + return { + "shortname": "RF", + "name": "Random Forest Regressor", + "handles_regression": True, + "handles_classification": False, + "handles_multiclass": False, + "handles_multilabel": False, + "handles_multioutput": True, + "prefers_data_normalized": False, + "is_deterministic": True, + "input": (DENSE, SPARSE, UNSIGNED_DATA), + "output": (PREDICTIONS,), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): cs = ConfigurationSpace() - criterion = CategoricalHyperparameter("criterion", - ['mse', 'friedman_mse', 'mae']) + criterion = CategoricalHyperparameter( + "criterion", ["mse", "friedman_mse", "mae"] + ) # In contrast to the random forest classifier we want to use more max_features # and therefore have this not on a sqrt scale max_features = UniformFloatHyperparameter( - "max_features", 0.1, 1.0, default_value=1.0) + "max_features", 0.1, 1.0, default_value=1.0 + ) max_depth = UnParametrizedHyperparameter("max_depth", "None") min_samples_split = UniformIntegerHyperparameter( - "min_samples_split", 2, 20, default_value=2) + "min_samples_split", 2, 20, default_value=2 + ) min_samples_leaf = UniformIntegerHyperparameter( - "min_samples_leaf", 1, 20, default_value=1) - min_weight_fraction_leaf = \ - UnParametrizedHyperparameter("min_weight_fraction_leaf", 0.) 
+ "min_samples_leaf", 1, 20, default_value=1 + ) + min_weight_fraction_leaf = UnParametrizedHyperparameter( + "min_weight_fraction_leaf", 0.0 + ) max_leaf_nodes = UnParametrizedHyperparameter("max_leaf_nodes", "None") min_impurity_decrease = UnParametrizedHyperparameter( - 'min_impurity_decrease', 0.0) + "min_impurity_decrease", 0.0 + ) bootstrap = CategoricalHyperparameter( - "bootstrap", ["True", "False"], default_value="True") - - cs.add_hyperparameters([criterion, max_features, - max_depth, min_samples_split, min_samples_leaf, - min_weight_fraction_leaf, max_leaf_nodes, - min_impurity_decrease, bootstrap]) + "bootstrap", ["True", "False"], default_value="True" + ) + + cs.add_hyperparameters( + [ + criterion, + max_features, + max_depth, + min_samples_split, + min_samples_leaf, + min_weight_fraction_leaf, + max_leaf_nodes, + min_impurity_decrease, + bootstrap, + ] + ) return cs diff --git a/autosklearn/pipeline/components/regression/sgd.py b/autosklearn/pipeline/components/regression/sgd.py index 8b3e7dbd34..3b3f939fa8 100644 --- a/autosklearn/pipeline/components/regression/sgd.py +++ b/autosklearn/pipeline/components/regression/sgd.py @@ -1,13 +1,16 @@ +from ConfigSpace.conditions import EqualsCondition, InCondition from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import UniformFloatHyperparameter, \ - CategoricalHyperparameter, UnParametrizedHyperparameter -from ConfigSpace.conditions import InCondition, EqualsCondition +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformFloatHyperparameter, + UnParametrizedHyperparameter, +) from autosklearn.pipeline.components.base import ( AutoSklearnRegressionAlgorithm, IterativeComponent, ) -from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA, PREDICTIONS, SPARSE +from autosklearn.pipeline.constants import DENSE, PREDICTIONS, SPARSE, UNSIGNED_DATA from autosklearn.util.common import check_for_bool @@ -15,9 +18,21 @@ class SGD( IterativeComponent, AutoSklearnRegressionAlgorithm, ): - def __init__(self, loss, penalty, alpha, fit_intercept, tol, - learning_rate, l1_ratio=0.15, epsilon=0.1, - eta0=0.01, power_t=0.5, average=False, random_state=None): + def __init__( + self, + loss, + penalty, + alpha, + fit_intercept, + tol, + learning_rate, + l1_ratio=0.15, + epsilon=0.1, + eta0=0.01, + power_t=0.5, + average=False, + random_state=None, + ): self.max_iter = self.get_max_iter() self.loss = loss self.penalty = penalty @@ -43,8 +58,8 @@ def get_current_iter(self): return self.n_iter_ def iterative_fit(self, X, y, n_iter=2, refit=False): - from sklearn.linear_model import SGDRegressor import sklearn.preprocessing + from sklearn.linear_model import SGDRegressor # Need to fit at least two iterations, otherwise early stopping will not # work because we cannot determine whether the algorithm actually @@ -62,32 +77,31 @@ def iterative_fit(self, X, y, n_iter=2, refit=False): self.fully_fit_ = False self.alpha = float(self.alpha) - self.l1_ratio = float( - self.l1_ratio) if self.l1_ratio is not None else 0.15 - self.epsilon = float( - self.epsilon) if self.epsilon is not None else 0.1 + self.l1_ratio = float(self.l1_ratio) if self.l1_ratio is not None else 0.15 + self.epsilon = float(self.epsilon) if self.epsilon is not None else 0.1 self.eta0 = float(self.eta0) - self.power_t = float( - self.power_t) if self.power_t is not None else 0.25 + self.power_t = float(self.power_t) if self.power_t is not None else 0.25 self.average = check_for_bool(self.average) 
self.fit_intercept = check_for_bool(self.fit_intercept) self.tol = float(self.tol) - self.estimator = SGDRegressor(loss=self.loss, - penalty=self.penalty, - alpha=self.alpha, - fit_intercept=self.fit_intercept, - max_iter=n_iter, - tol=self.tol, - learning_rate=self.learning_rate, - l1_ratio=self.l1_ratio, - epsilon=self.epsilon, - eta0=self.eta0, - power_t=self.power_t, - shuffle=True, - average=self.average, - random_state=self.random_state, - warm_start=True) + self.estimator = SGDRegressor( + loss=self.loss, + penalty=self.penalty, + alpha=self.alpha, + fit_intercept=self.fit_intercept, + max_iter=n_iter, + tol=self.tol, + learning_rate=self.learning_rate, + l1_ratio=self.l1_ratio, + epsilon=self.epsilon, + eta0=self.eta0, + power_t=self.power_t, + shuffle=True, + average=self.average, + random_state=self.random_state, + warm_start=True, + ) self.scaler = sklearn.preprocessing.StandardScaler(copy=True) @@ -119,7 +133,8 @@ def iterative_fit(self, X, y, n_iter=2, refit=False): self.estimator._validate_params() self.estimator._partial_fit( - X, y_scaled, + X, + y_scaled, alpha=self.estimator.alpha, C=1.0, loss=self.estimator.loss, @@ -127,11 +142,14 @@ def iterative_fit(self, X, y, n_iter=2, refit=False): max_iter=n_iter, sample_weight=None, coef_init=None, - intercept_init=None + intercept_init=None, ) self.n_iter_ += self.estimator.n_iter_ - if self.estimator.max_iter >= self.max_iter or self.estimator.max_iter > self.n_iter_: + if ( + self.estimator.max_iter >= self.max_iter + or self.estimator.max_iter > self.n_iter_ + ): self.fully_fit_ = True return self @@ -139,7 +157,7 @@ def iterative_fit(self, X, y, n_iter=2, refit=False): def configuration_fully_fitted(self): if self.estimator is None: return False - elif not hasattr(self, 'fully_fit_'): + elif not hasattr(self, "fully_fit_"): return False else: return self.fully_fit_ @@ -152,18 +170,19 @@ def predict(self, X): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'SGD Regressor', - 'name': 'Stochastic Gradient Descent Regressor', - 'handles_regression': True, - 'handles_classification': False, - 'handles_multiclass': False, - 'handles_multilabel': False, - 'handles_multioutput': False, - 'is_deterministic': True, - 'handles_sparse': True, - 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,), - } + return { + "shortname": "SGD Regressor", + "name": "Stochastic Gradient Descent Regressor", + "handles_regression": True, + "handles_classification": False, + "handles_multiclass": False, + "handles_multilabel": False, + "handles_multioutput": False, + "is_deterministic": True, + "handles_sparse": True, + "input": (DENSE, SPARSE, UNSIGNED_DATA), + "output": (PREDICTIONS,), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): @@ -171,34 +190,58 @@ def get_hyperparameter_search_space(dataset_properties=None): loss = CategoricalHyperparameter( "loss", - ["squared_loss", "huber", "epsilon_insensitive", "squared_epsilon_insensitive"], + [ + "squared_loss", + "huber", + "epsilon_insensitive", + "squared_epsilon_insensitive", + ], default_value="squared_loss", - ) + ) penalty = CategoricalHyperparameter( - "penalty", ["l1", "l2", "elasticnet"], default_value="l2") + "penalty", ["l1", "l2", "elasticnet"], default_value="l2" + ) alpha = UniformFloatHyperparameter( - "alpha", 1e-7, 1e-1, log=True, default_value=0.0001) + "alpha", 1e-7, 1e-1, log=True, default_value=0.0001 + ) l1_ratio = UniformFloatHyperparameter( - "l1_ratio", 1e-9, 1., log=True, default_value=0.15) - 
fit_intercept = UnParametrizedHyperparameter( - "fit_intercept", "True") + "l1_ratio", 1e-9, 1.0, log=True, default_value=0.15 + ) + fit_intercept = UnParametrizedHyperparameter("fit_intercept", "True") tol = UniformFloatHyperparameter( - "tol", 1e-5, 1e-1, default_value=1e-4, log=True) + "tol", 1e-5, 1e-1, default_value=1e-4, log=True + ) epsilon = UniformFloatHyperparameter( - "epsilon", 1e-5, 1e-1, default_value=0.1, log=True) + "epsilon", 1e-5, 1e-1, default_value=0.1, log=True + ) learning_rate = CategoricalHyperparameter( - "learning_rate", ["optimal", "invscaling", "constant"], - default_value="invscaling") + "learning_rate", + ["optimal", "invscaling", "constant"], + default_value="invscaling", + ) eta0 = UniformFloatHyperparameter( - "eta0", 1e-7, 1e-1, default_value=0.01, log=True) - power_t = UniformFloatHyperparameter( - "power_t", 1e-5, 1, default_value=0.25) + "eta0", 1e-7, 1e-1, default_value=0.01, log=True + ) + power_t = UniformFloatHyperparameter("power_t", 1e-5, 1, default_value=0.25) average = CategoricalHyperparameter( - "average", ["False", "True"], default_value="False") - - cs.add_hyperparameters([loss, penalty, alpha, l1_ratio, fit_intercept, - tol, epsilon, learning_rate, eta0, - power_t, average]) + "average", ["False", "True"], default_value="False" + ) + + cs.add_hyperparameters( + [ + loss, + penalty, + alpha, + l1_ratio, + fit_intercept, + tol, + epsilon, + learning_rate, + eta0, + power_t, + average, + ] + ) # TODO add passive/aggressive here, although not properly documented? elasticnet = EqualsCondition(l1_ratio, penalty, "elasticnet") @@ -206,17 +249,16 @@ def get_hyperparameter_search_space(dataset_properties=None): epsilon, loss, ["huber", "epsilon_insensitive", "squared_epsilon_insensitive"], - ) + ) # eta0 is only relevant if learning_rate!='optimal' according to code # https://github.com/scikit-learn/scikit-learn/blob/0.19.X/sklearn/ # linear_model/sgd_fast.pyx#L603 - eta0_in_inv_con = InCondition(eta0, learning_rate, ["invscaling", - "constant"]) - power_t_condition = EqualsCondition(power_t, learning_rate, - "invscaling") + eta0_in_inv_con = InCondition(eta0, learning_rate, ["invscaling", "constant"]) + power_t_condition = EqualsCondition(power_t, learning_rate, "invscaling") - cs.add_conditions([elasticnet, epsilon_condition, power_t_condition, - eta0_in_inv_con]) + cs.add_conditions( + [elasticnet, epsilon_condition, power_t_condition, eta0_in_inv_con] + ) return cs diff --git a/autosklearn/pipeline/constants.py b/autosklearn/pipeline/constants.py index 924baa185a..9dea960847 100644 --- a/autosklearn/pipeline/constants.py +++ b/autosklearn/pipeline/constants.py @@ -7,24 +7,29 @@ MULTIOUTPUT_REGRESSION = 5 REGRESSION_TASKS = [REGRESSION, MULTIOUTPUT_REGRESSION] -CLASSIFICATION_TASKS = [BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION, - MULTILABEL_CLASSIFICATION] +CLASSIFICATION_TASKS = [ + BINARY_CLASSIFICATION, + MULTICLASS_CLASSIFICATION, + MULTILABEL_CLASSIFICATION, +] TASK_TYPES = REGRESSION_TASKS + CLASSIFICATION_TASKS -TASK_TYPES_TO_STRING = \ - {BINARY_CLASSIFICATION: "binary.classification", - MULTICLASS_CLASSIFICATION: "multiclass.classification", - MULTILABEL_CLASSIFICATION: "multilabel.classification", - REGRESSION: "regression", - MULTIOUTPUT_REGRESSION: "multioutput.regression"} +TASK_TYPES_TO_STRING = { + BINARY_CLASSIFICATION: "binary.classification", + MULTICLASS_CLASSIFICATION: "multiclass.classification", + MULTILABEL_CLASSIFICATION: "multilabel.classification", + REGRESSION: "regression", + 
MULTIOUTPUT_REGRESSION: "multioutput.regression", +} -STRING_TO_TASK_TYPES = \ - {"binary.classification": BINARY_CLASSIFICATION, - "multiclass.classification": MULTICLASS_CLASSIFICATION, - "multilabel.classification": MULTILABEL_CLASSIFICATION, - "regression": REGRESSION, - "multioutput.regression": MULTIOUTPUT_REGRESSION} +STRING_TO_TASK_TYPES = { + "binary.classification": BINARY_CLASSIFICATION, + "multiclass.classification": MULTICLASS_CLASSIFICATION, + "multilabel.classification": MULTILABEL_CLASSIFICATION, + "regression": REGRESSION, + "multioutput.regression": MULTIOUTPUT_REGRESSION, +} DENSE = 6 SPARSE = 7 @@ -34,10 +39,11 @@ SIGNED_DATA = 10 UNSIGNED_DATA = 11 -DATASET_PROPERTIES_TO_STRING = \ - {DENSE: 'dense', - SPARSE: 'sparse', - PREDICTIONS: 'predictions', - INPUT: 'input', - SIGNED_DATA: 'signed data', - UNSIGNED_DATA: 'unsigned data'} +DATASET_PROPERTIES_TO_STRING = { + DENSE: "dense", + SPARSE: "sparse", + PREDICTIONS: "predictions", + INPUT: "input", + SIGNED_DATA: "signed data", + UNSIGNED_DATA: "unsigned data", +} diff --git a/autosklearn/pipeline/create_searchspace_util.py b/autosklearn/pipeline/create_searchspace_util.py index 8c2ac33f0f..dff69acc6e 100644 --- a/autosklearn/pipeline/create_searchspace_util.py +++ b/autosklearn/pipeline/create_searchspace_util.py @@ -1,18 +1,21 @@ import itertools import numpy as np +from ConfigSpace.forbidden import ForbiddenAndConjunction, ForbiddenEqualsClause -from ConfigSpace.forbidden import ForbiddenAndConjunction -from ConfigSpace.forbidden import ForbiddenEqualsClause +from autosklearn.pipeline.constants import ( + DENSE, + INPUT, + PREDICTIONS, + SIGNED_DATA, + SPARSE, + UNSIGNED_DATA, +) -from autosklearn.pipeline.constants import \ - SIGNED_DATA, UNSIGNED_DATA, PREDICTIONS, INPUT, DENSE, SPARSE - -def get_match_array(pipeline, dataset_properties, - include=None, exclude=None): - sparse = dataset_properties.get('sparse') - signed = dataset_properties.get('signed') +def get_match_array(pipeline, dataset_properties, include=None, exclude=None): + sparse = dataset_properties.get("sparse") + signed = dataset_properties.get("signed") # Duck typing, not sure if it's good... 
node_i_is_choice = [] @@ -24,18 +27,24 @@ def get_match_array(pipeline, dataset_properties, is_choice = hasattr(node, "get_available_components") node_i_is_choice.append(is_choice) - node_include = include.get( - node_name) if include is not None else None - node_exclude = exclude.get( - node_name) if exclude is not None else None + node_include = include.get(node_name) if include is not None else None + node_exclude = exclude.get(node_name) if exclude is not None else None if is_choice: - node_i_choices_names.append(list(node.get_available_components( - dataset_properties, include=node_include, - exclude=node_exclude).keys())) - node_i_choices.append(list(node.get_available_components( - dataset_properties, include=node_include, - exclude=node_exclude).values())) + node_i_choices_names.append( + list( + node.get_available_components( + dataset_properties, include=node_include, exclude=node_exclude + ).keys() + ) + ) + node_i_choices.append( + list( + node.get_available_components( + dataset_properties, include=node_include, exclude=node_exclude + ).values() + ) + ) else: node_i_choices.append([node]) @@ -47,20 +56,24 @@ def get_match_array(pipeline, dataset_properties, pipeline_idxs = [range(dim) for dim in matches_dimensions] for pipeline_instantiation_idxs in itertools.product(*pipeline_idxs): - pipeline_instantiation = [node_i_choices[i][idx] for i, idx in - enumerate(pipeline_instantiation_idxs)] + pipeline_instantiation = [ + node_i_choices[i][idx] for i, idx in enumerate(pipeline_instantiation_idxs) + ] data_is_sparse = sparse dataset_is_signed = signed for node in pipeline_instantiation: - node_input = node.get_properties()['input'] - node_output = node.get_properties()['output'] + node_input = node.get_properties()["input"] + node_output = node.get_properties()["output"] # First check if these two instantiations of this node can work # together. 
Do this in multiple if statements to maintain # readability - if (data_is_sparse and SPARSE not in node_input) or \ - not data_is_sparse and DENSE not in node_input: + if ( + (data_is_sparse and SPARSE not in node_input) + or not data_is_sparse + and DENSE not in node_input + ): matches[pipeline_instantiation_idxs] = 0 break # No need to check if the node can handle SIGNED_DATA; this is @@ -69,10 +82,16 @@ def get_match_array(pipeline, dataset_properties, matches[pipeline_instantiation_idxs] = 0 break - if (INPUT in node_output and DENSE not in node_output and SPARSE not in node_output) \ - or PREDICTIONS in node_output \ - or (not data_is_sparse and DENSE in node_input and DENSE in node_output) \ - or (data_is_sparse and SPARSE in node_input and SPARSE in node_output): + if ( + ( + INPUT in node_output + and DENSE not in node_output + and SPARSE not in node_output + ) + or PREDICTIONS in node_output + or (not data_is_sparse and DENSE in node_input and DENSE in node_output) + or (data_is_sparse and SPARSE in node_input and SPARSE in node_output) + ): # Don't change the data_is_sparse flag pass elif data_is_sparse and DENSE in node_output: @@ -87,8 +106,11 @@ def get_match_array(pipeline, dataset_properties, if PREDICTIONS in node_output: pass - elif (INPUT in node_output and SIGNED_DATA not in node_output and - UNSIGNED_DATA not in node_output): + elif ( + INPUT in node_output + and SIGNED_DATA not in node_output + and UNSIGNED_DATA not in node_output + ): pass elif SIGNED_DATA in node_output: dataset_is_signed = True @@ -103,27 +125,32 @@ def get_match_array(pipeline, dataset_properties, return matches -def find_active_choices(matches, node, node_idx, dataset_properties, include=None, exclude=None): +def find_active_choices( + matches, node, node_idx, dataset_properties, include=None, exclude=None +): if not hasattr(node, "get_available_components"): raise ValueError() - available_components = node.get_available_components(dataset_properties, - include=include, - exclude=exclude) - assert matches.shape[node_idx] == len(available_components), \ - (matches.shape[node_idx], len(available_components)) + available_components = node.get_available_components( + dataset_properties, include=include, exclude=exclude + ) + assert matches.shape[node_idx] == len(available_components), ( + matches.shape[node_idx], + len(available_components), + ) choices = [] for c_idx, component in enumerate(available_components): - slices = tuple(slice(None) if idx != node_idx else slice(c_idx, c_idx+1) - for idx in range(len(matches.shape))) + slices = tuple( + slice(None) if idx != node_idx else slice(c_idx, c_idx + 1) + for idx in range(len(matches.shape)) + ) if np.sum(matches[slices]) > 0: choices.append(component) return choices -def add_forbidden(conf_space, pipeline, matches, dataset_properties, - include, exclude): +def add_forbidden(conf_space, pipeline, matches, dataset_properties, include, exclude): # Not sure if this works for 3D node_i_is_choice = [] node_i_choices_names = [] @@ -134,18 +161,20 @@ def add_forbidden(conf_space, pipeline, matches, dataset_properties, is_choice = hasattr(node, "get_available_components") node_i_is_choice.append(is_choice) - node_include = include.get( - node_name) if include is not None else None - node_exclude = exclude.get( - node_name) if exclude is not None else None + node_include = include.get(node_name) if include is not None else None + node_exclude = exclude.get(node_name) if exclude is not None else None if is_choice: - 
node_i_choices_names.append(node.get_available_components( - dataset_properties, include=node_include, - exclude=node_exclude).keys()) - node_i_choices.append(node.get_available_components( - dataset_properties, include=node_include, - exclude=node_exclude).values()) + node_i_choices_names.append( + node.get_available_components( + dataset_properties, include=node_include, exclude=node_exclude + ).keys() + ) + node_i_choices.append( + node.get_available_components( + dataset_properties, include=node_include, exclude=node_exclude + ).values() + ) else: node_i_choices_names.append([node_name]) @@ -185,8 +214,8 @@ def add_forbidden(conf_space, pipeline, matches, dataset_properties, for idx in indices: node = all_nodes[idx] available_components = node.get_available_components( - dataset_properties, - include=node_i_choices_names[idx]) + dataset_properties, include=node_i_choices_names[idx] + ) assert len(available_components) > 0, len(available_components) skip_array_shape.append(len(available_components)) num_node_choices.append(range(len(available_components))) @@ -198,9 +227,11 @@ def add_forbidden(conf_space, pipeline, matches, dataset_properties, for node_idx, choice_idx in enumerate(product): node_idx += start_idx slices_ = tuple( - slice(None) if idx != node_idx else - slice(choice_idx, choice_idx + 1) for idx in - range(len(matches.shape))) + slice(None) + if idx != node_idx + else slice(choice_idx, choice_idx + 1) + for idx in range(len(matches.shape)) + ) if np.sum(matches[slices_]) == 0: skip_array[product] = 1 @@ -210,10 +241,13 @@ def add_forbidden(conf_space, pipeline, matches, dataset_properties, continue slices = tuple( - slice(None) if idx not in indices else - slice(product[idx - start_idx], - product[idx - start_idx] + 1) for idx in - range(len(matches.shape))) + slice(None) + if idx not in indices + else slice( + product[idx - start_idx], product[idx - start_idx] + 1 + ) + for idx in range(len(matches.shape)) + ) # This prints the affected nodes # print [node_choice_names[i][product[i]] @@ -221,9 +255,12 @@ def add_forbidden(conf_space, pipeline, matches, dataset_properties, # np.sum(matches[slices]) if np.sum(matches[slices]) == 0: - constraint = tuple([(node_names[i], - node_choice_names[i][product[i]]) - for i in range(len(product))]) + constraint = tuple( + [ + (node_names[i], node_choice_names[i][product[i]]) + for i in range(len(product)) + ] + ) # Check if a more general constraint/forbidden clause # was already added @@ -231,8 +268,12 @@ def add_forbidden(conf_space, pipeline, matches, dataset_properties, for constraint_length in range(2, len(constraint)): constr_starts = len(constraint) - constraint_length + 1 for constraint_start_idx in range(constr_starts): - constraint_end_idx = constraint_start_idx + constraint_length - sub_constraint = constraint[constraint_start_idx:constraint_end_idx] + constraint_end_idx = ( + constraint_start_idx + constraint_length + ) + sub_constraint = constraint[ + constraint_start_idx:constraint_end_idx + ] if sub_constraint in constraints: continue_ = True break @@ -246,9 +287,13 @@ def add_forbidden(conf_space, pipeline, matches, dataset_properties, forbiddens = [] for i in range(len(product)): forbiddens.append( - ForbiddenEqualsClause(conf_space.get_hyperparameter( - node_names[i] + ":__choice__"), - node_choice_names[i][product[i]])) + ForbiddenEqualsClause( + conf_space.get_hyperparameter( + node_names[i] + ":__choice__" + ), + node_choice_names[i][product[i]], + ) + ) forbidden = ForbiddenAndConjunction(*forbiddens) 
conf_space.add_forbidden_clause(forbidden) diff --git a/autosklearn/pipeline/implementations/CategoryShift.py b/autosklearn/pipeline/implementations/CategoryShift.py index 3eee659d3f..4c504cf666 100644 --- a/autosklearn/pipeline/implementations/CategoryShift.py +++ b/autosklearn/pipeline/implementations/CategoryShift.py @@ -1,12 +1,11 @@ import numpy as np from scipy import sparse -from sklearn.utils import check_array from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.utils import check_array class CategoryShift(BaseEstimator, TransformerMixin): - """ Add 3 to every category. - """ + """Add 3 to every category.""" def __init__(self, random_state=None): self.random_state = random_state @@ -15,14 +14,16 @@ def _convert_and_check_X(self, X): X_data = X.data if sparse.issparse(X) else X # Check if data is numeric and positive - if X_data.dtype.kind not in set('buif') or np.nanmin(X_data) < 0: - raise ValueError('Categories should be non-negative numbers. ' - 'NOTE: floats will be casted to integers.') + if X_data.dtype.kind not in set("buif") or np.nanmin(X_data) < 0: + raise ValueError( + "Categories should be non-negative numbers. " + "NOTE: floats will be casted to integers." + ) # Use check_array to make sure we are using the right kind of sparse array # Notice that we cannot convert the array to integer right now. That would get # rid of the np.nans and we need them later on for the imputation. - X = check_array(X, accept_sparse='csc', force_all_finite=False, copy=True) + X = check_array(X, accept_sparse="csc", force_all_finite=False, copy=True) return X def fit(self, X, y=None): diff --git a/autosklearn/pipeline/implementations/MinorityCoalescer.py b/autosklearn/pipeline/implementations/MinorityCoalescer.py index 989cb4a9c9..e9bed0349e 100644 --- a/autosklearn/pipeline/implementations/MinorityCoalescer.py +++ b/autosklearn/pipeline/implementations/MinorityCoalescer.py @@ -4,7 +4,7 @@ class MinorityCoalescer(BaseEstimator, TransformerMixin): - """ Group together categories which occurence is less than a specified + """Group together categories which occurence is less than a specified minimum fraction. Coalesced categories get index of one. 
""" @@ -31,7 +31,8 @@ def fit(self, X, y=None): indptr_start = X.indptr[column] indptr_end = X.indptr[column + 1] unique, counts = np.unique( - X.data[indptr_start:indptr_end], return_counts=True) + X.data[indptr_start:indptr_end], return_counts=True + ) colsize = indptr_end - indptr_start else: unique, counts = np.unique(X[:, column], return_counts=True) @@ -61,11 +62,15 @@ def transform(self, X): indptr_start = X.indptr[column] indptr_end = X.indptr[column + 1] X.data[indptr_start:indptr_end][ - X.data[indptr_start:indptr_end] == unique_value] = 1 + X.data[indptr_start:indptr_end] == unique_value + ] = 1 else: unique = np.unique(X[:, column]) - unique_values = [unique_value for unique_value in unique - if unique_value not in self.do_not_coalesce_[column]] + unique_values = [ + unique_value + for unique_value in unique + if unique_value not in self.do_not_coalesce_[column] + ] mask = np.isin(X[:, column], unique_values) X[mask, column] = 1 return X diff --git a/autosklearn/pipeline/implementations/SparseOneHotEncoder.py b/autosklearn/pipeline/implementations/SparseOneHotEncoder.py index beee99261b..18d491596c 100644 --- a/autosklearn/pipeline/implementations/SparseOneHotEncoder.py +++ b/autosklearn/pipeline/implementations/SparseOneHotEncoder.py @@ -43,8 +43,7 @@ def fit(self, X, y=None): def _check_X(self, X): if not sparse.issparse(X): raise TypeError("SparseOneHotEncoder requires X to be sparse") - X = check_array(X, accept_sparse='csc', force_all_finite=False, - dtype=np.int32) + X = check_array(X, accept_sparse="csc", force_all_finite=False, dtype=np.int32) if X.min() < 0: raise ValueError("X needs to contain only non-negative integers.") @@ -63,15 +62,17 @@ def fit_transform(self, X, y=None): row_indices = X.indices column_indices = [] for i in range(len(X.indptr) - 1): - nbr = X.indptr[i+1] - X.indptr[i] + nbr = X.indptr[i + 1] - X.indptr[i] column_indices_ = [indices[i]] * nbr - column_indices_ += X.data[X.indptr[i]:X.indptr[i+1]] + column_indices_ += X.data[X.indptr[i] : X.indptr[i + 1]] column_indices.extend(column_indices_) data = np.ones(X.data.size) - out = sparse.coo_matrix((data, (row_indices, column_indices)), - shape=(n_samples, indices[-1]), - dtype=np.int32).tocsc() + out = sparse.coo_matrix( + (data, (row_indices, column_indices)), + shape=(n_samples, indices[-1]), + dtype=np.int32, + ).tocsc() mask = np.array(out.sum(axis=0)).ravel() != 0 active_features = np.where(mask)[0] @@ -85,9 +86,10 @@ def transform(self, X): n_samples, n_features = X.shape indices = self.feature_indices_ if n_features != indices.shape[0] - 1: - raise ValueError("X has different shape than during fitting." - " Expected %d, got %d." - % (indices.shape[0] - 1, n_features)) + raise ValueError( + "X has different shape than during fitting." + " Expected %d, got %d." 
% (indices.shape[0] - 1, n_features) + ) n_values_check = X.max(axis=0).toarray().flatten() + 1 @@ -99,7 +101,7 @@ def transform(self, X): for i, n_value_check in enumerate(n_values_check): if (n_value_check - 1) >= self.n_values_[i]: indptr_start = X.indptr[i] - indptr_end = X.indptr[i+1] + indptr_end = X.indptr[i + 1] zeros_mask = X.data[indptr_start:indptr_end] >= self.n_values_[i] X.data[indptr_start:indptr_end][zeros_mask] = 0 @@ -108,13 +110,15 @@ def transform(self, X): for i in range(len(X.indptr) - 1): nbr = X.indptr[i + 1] - X.indptr[i] column_indices_ = [indices[i]] * nbr - column_indices_ += X.data[X.indptr[i]:X.indptr[i + 1]] + column_indices_ += X.data[X.indptr[i] : X.indptr[i + 1]] column_indices.extend(column_indices_) data = np.ones(X.data.size) - out = sparse.coo_matrix((data, (row_indices, column_indices)), - shape=(n_samples, indices[-1]), - dtype=np.int32).tocsc() + out = sparse.coo_matrix( + (data, (row_indices, column_indices)), + shape=(n_samples, indices[-1]), + dtype=np.int32, + ).tocsc() out = out[:, self.active_features_] return out.tocsr() diff --git a/autosklearn/pipeline/implementations/__init__.py b/autosklearn/pipeline/implementations/__init__.py index 8f0ce6cb7c..92bf78f389 100644 --- a/autosklearn/pipeline/implementations/__init__.py +++ b/autosklearn/pipeline/implementations/__init__.py @@ -1 +1 @@ -__author__ = 'feurerm' +__author__ = "feurerm" diff --git a/autosklearn/pipeline/implementations/util.py b/autosklearn/pipeline/implementations/util.py index cacf3a9b5d..4ebc01ad39 100644 --- a/autosklearn/pipeline/implementations/util.py +++ b/autosklearn/pipeline/implementations/util.py @@ -18,7 +18,7 @@ def softmax(df): def convert_multioutput_multiclass_to_multilabel(probas): - """ Converts the model predicted probabilities to useable format. + """Converts the model predicted probabilities to useable format. In some cases, models predicted_proba can output an array of shape (2, n_samples, n_labels) where the 2 stands for the probability of positive @@ -55,8 +55,10 @@ def convert_multioutput_multiclass_to_multilabel(probas): # In case multioutput-multiclass input was used, where we have # a probability for each class elif n_probabilities > 2: - raise ValueError('Multioutput-Multiclass supported by ' - 'scikit-learn, but not by auto-sklearn!') + raise ValueError( + "Multioutput-Multiclass supported by " + "scikit-learn, but not by auto-sklearn!" 
+ ) else: RuntimeError(f"Unkown predict_proba output={probas}") @@ -64,7 +66,7 @@ def convert_multioutput_multiclass_to_multilabel(probas): elif isinstance(probas, np.ndarray): if len(probas.shape) > 2: - raise ValueError('New unsupported sklearn output!') + raise ValueError("New unsupported sklearn output!") else: return probas diff --git a/autosklearn/pipeline/regression.py b/autosklearn/pipeline/regression.py index af2885be4d..638f8ae3cb 100644 --- a/autosklearn/pipeline/regression.py +++ b/autosklearn/pipeline/regression.py @@ -1,21 +1,19 @@ +from typing import Optional, Union + import copy from itertools import product -from typing import Optional, Union import numpy as np +from ConfigSpace.configuration_space import Configuration, ConfigurationSpace +from ConfigSpace.forbidden import ForbiddenAndConjunction, ForbiddenEqualsClause from sklearn.base import RegressorMixin -from ConfigSpace.forbidden import ForbiddenEqualsClause, ForbiddenAndConjunction - -from autosklearn.pipeline.components.data_preprocessing import DataPreprocessorChoice - - -from ConfigSpace.configuration_space import ConfigurationSpace, Configuration -from autosklearn.pipeline.components import regression as \ - regression_components -from autosklearn.pipeline.components import feature_preprocessing as \ - feature_preprocessing_components from autosklearn.pipeline.base import BasePipeline +from autosklearn.pipeline.components import ( + feature_preprocessing as feature_preprocessing_components, +) +from autosklearn.pipeline.components import regression as regression_components +from autosklearn.pipeline.components.data_preprocessing import DataPreprocessorChoice from autosklearn.pipeline.constants import SPARSE @@ -66,6 +64,7 @@ class SimpleRegressionPipeline(RegressorMixin, BasePipeline): -------- """ + def __init__( self, config: Optional[Configuration] = None, @@ -74,30 +73,34 @@ def __init__( include=None, exclude=None, random_state: Optional[Union[int, np.random.RandomState]] = None, - init_params=None + init_params=None, ): self._output_dtype = np.float32 if dataset_properties is None: dataset_properties = dict() - if 'target_type' not in dataset_properties: - dataset_properties['target_type'] = 'regression' + if "target_type" not in dataset_properties: + dataset_properties["target_type"] = "regression" super().__init__( - config=config, steps=steps, + config=config, + steps=steps, dataset_properties=dataset_properties, - include=include, exclude=exclude, random_state=random_state, - init_params=init_params) + include=include, + exclude=exclude, + random_state=random_state, + init_params=init_params, + ) def fit_estimator(self, X, y, **fit_params): self.y_max_ = np.nanmax(y) self.y_min_ = np.nanmin(y) - return super(SimpleRegressionPipeline, self).fit_estimator( - X, y, **fit_params) + return super(SimpleRegressionPipeline, self).fit_estimator(X, y, **fit_params) def iterative_fit(self, X, y, n_iter=1, **fit_params): self.y_max_ = np.nanmax(y) self.y_min_ = np.nanmin(y) return super(SimpleRegressionPipeline, self).iterative_fit( - X, y, n_iter=n_iter, **fit_params) + X, y, n_iter=n_iter, **fit_params + ) def predict(self, X, batch_size=None): y = super().predict(X, batch_size=batch_size) @@ -108,8 +111,9 @@ def predict(self, X, batch_size=None): y[y < (0.5 * self.y_min_)] = 0.5 * self.y_min_ return y - def _get_hyperparameter_search_space(self, include=None, exclude=None, - dataset_properties=None): + def _get_hyperparameter_search_space( + self, include=None, exclude=None, dataset_properties=None + ): 
"""Return the configuration space for the CASH problem. Parameters @@ -134,43 +138,57 @@ def _get_hyperparameter_search_space(self, include=None, exclude=None, if dataset_properties is None or not isinstance(dataset_properties, dict): dataset_properties = dict() - if 'target_type' not in dataset_properties: - dataset_properties['target_type'] = 'regression' - if dataset_properties['target_type'] != 'regression': - dataset_properties['target_type'] = 'regression' + if "target_type" not in dataset_properties: + dataset_properties["target_type"] = "regression" + if dataset_properties["target_type"] != "regression": + dataset_properties["target_type"] = "regression" - if 'sparse' not in dataset_properties: + if "sparse" not in dataset_properties: # This dataset is probably dense - dataset_properties['sparse'] = False + dataset_properties["sparse"] = False cs = self._get_base_search_space( - cs=cs, dataset_properties=dataset_properties, - exclude=exclude, include=include, pipeline=self.steps) + cs=cs, + dataset_properties=dataset_properties, + exclude=exclude, + include=include, + pipeline=self.steps, + ) - regressors = cs.get_hyperparameter('regressor:__choice__').choices - preprocessors = cs.get_hyperparameter('feature_preprocessor:__choice__').choices + regressors = cs.get_hyperparameter("regressor:__choice__").choices + preprocessors = cs.get_hyperparameter("feature_preprocessor:__choice__").choices available_regressors = self._final_estimator.get_available_components( - dataset_properties) + dataset_properties + ) - possible_default_regressor = copy.copy(list( - available_regressors.keys())) - default = cs.get_hyperparameter('regressor:__choice__').default_value - del possible_default_regressor[ - possible_default_regressor.index(default)] + possible_default_regressor = copy.copy(list(available_regressors.keys())) + default = cs.get_hyperparameter("regressor:__choice__").default_value + del possible_default_regressor[possible_default_regressor.index(default)] # A regressor which can handle sparse data after the densifier is # forbidden for memory issues for key in regressors: - if SPARSE in available_regressors[key].get_properties(dataset_properties=None)['input']: - if 'densifier' in preprocessors: + if ( + SPARSE + in available_regressors[key].get_properties(dataset_properties=None)[ + "input" + ] + ): + if "densifier" in preprocessors: while True: try: forb_reg = ForbiddenEqualsClause( - cs.get_hyperparameter('regressor:__choice__'), key) - forb_fpp = ForbiddenEqualsClause(cs.get_hyperparameter( - 'feature_preprocessor:__choice__'), 'densifier') + cs.get_hyperparameter("regressor:__choice__"), key + ) + forb_fpp = ForbiddenEqualsClause( + cs.get_hyperparameter( + "feature_preprocessor:__choice__" + ), + "densifier", + ) cs.add_forbidden_clause( - ForbiddenAndConjunction(forb_reg, forb_fpp)) + ForbiddenAndConjunction(forb_reg, forb_fpp) + ) # Success break except ValueError: @@ -179,16 +197,25 @@ def _get_hyperparameter_search_space(self, include=None, exclude=None, default = possible_default_regressor.pop() except IndexError: raise ValueError( - "Cannot find a legal default configuration.") + "Cannot find a legal default configuration." 
+ ) cs.get_hyperparameter( - 'regressor:__choice__').default_value = default + "regressor:__choice__" + ).default_value = default # which would take too long # Combinations of tree-based models with feature learning: regressors_ = [ - "adaboost", "ard_regression", "decision_tree", - "extra_trees", "gaussian_process", "gradient_boosting", - "k_nearest_neighbors", "libsvm_svr", "mlp", "random_forest" + "adaboost", + "ard_regression", + "decision_tree", + "extra_trees", + "gaussian_process", + "gradient_boosting", + "k_nearest_neighbors", + "libsvm_svr", + "mlp", + "random_forest", ] feature_learning_ = ["kitchen_sinks", "kernel_pca", "nystroem_sampler"] @@ -199,11 +226,19 @@ def _get_hyperparameter_search_space(self, include=None, exclude=None, continue while True: try: - cs.add_forbidden_clause(ForbiddenAndConjunction( - ForbiddenEqualsClause(cs.get_hyperparameter( - "regressor:__choice__"), r), - ForbiddenEqualsClause(cs.get_hyperparameter( - "feature_preprocessor:__choice__"), f))) + cs.add_forbidden_clause( + ForbiddenAndConjunction( + ForbiddenEqualsClause( + cs.get_hyperparameter("regressor:__choice__"), r + ), + ForbiddenEqualsClause( + cs.get_hyperparameter( + "feature_preprocessor:__choice__" + ), + f, + ), + ) + ) break except KeyError: break @@ -212,10 +247,10 @@ def _get_hyperparameter_search_space(self, include=None, exclude=None, try: default = possible_default_regressor.pop() except IndexError: - raise ValueError( - "Cannot find a legal default configuration.") + raise ValueError("Cannot find a legal default configuration.") cs.get_hyperparameter( - 'regressor:__choice__').default_value = default + "regressor:__choice__" + ).default_value = default self.configuration_space = cs self.dataset_properties = dataset_properties @@ -227,33 +262,34 @@ def _get_estimator_components(self): def _get_pipeline_steps(self, dataset_properties, init_params=None): steps = [] - default_dataset_properties = {'target_type': 'regression'} + default_dataset_properties = {"target_type": "regression"} if dataset_properties is not None and isinstance(dataset_properties, dict): default_dataset_properties.update(dataset_properties) - steps.extend([ - [ - 'data_preprocessor', - DataPreprocessorChoice( - dataset_properties=default_dataset_properties, - random_state=self.random_state - ) - ], - [ - 'feature_preprocessor', - feature_preprocessing_components.FeaturePreprocessorChoice( - dataset_properties=default_dataset_properties, - random_state=self.random_state - ) - ], + steps.extend( [ - 'regressor', - regression_components.RegressorChoice( - default_dataset_properties, - random_state=self.random_state - ) + [ + "data_preprocessor", + DataPreprocessorChoice( + dataset_properties=default_dataset_properties, + random_state=self.random_state, + ), + ], + [ + "feature_preprocessor", + feature_preprocessing_components.FeaturePreprocessorChoice( + dataset_properties=default_dataset_properties, + random_state=self.random_state, + ), + ], + [ + "regressor", + regression_components.RegressorChoice( + default_dataset_properties, random_state=self.random_state + ), + ], ] - ]) + ) return steps diff --git a/autosklearn/pipeline/util.py b/autosklearn/pipeline/util.py index 1a70deb30b..228c31357d 100644 --- a/autosklearn/pipeline/util.py +++ b/autosklearn/pipeline/util.py @@ -18,8 +18,7 @@ def find_sklearn_classes(class_): for root, dirs, files in os.walk(sklearn_path): all_subdirectories.append(root) - for module_loader, module_name, ispkg in \ - pkgutil.iter_modules(all_subdirectories): + for module_loader, 
module_name, ispkg in pkgutil.iter_modules(all_subdirectories): # Work around some issues... if module_name in ["hmm", "mixture"]: @@ -33,8 +32,7 @@ def find_sklearn_classes(class_): pkg = importlib.import_module(full_module_name) for member_name, obj in inspect.getmembers(pkg): - if inspect.isclass(obj) and \ - issubclass(obj, class_): + if inspect.isclass(obj) and issubclass(obj, class_): classifier = obj # print member_name, obj classifiers.add(classifier) @@ -44,15 +42,20 @@ def find_sklearn_classes(class_): print(classifier) -def get_dataset(dataset='iris', make_sparse=False, add_NaNs=False, - train_size_maximum=150, make_multilabel=False, - make_binary=False): +def get_dataset( + dataset="iris", + make_sparse=False, + add_NaNs=False, + train_size_maximum=150, + make_multilabel=False, + make_binary=False, +): iris = getattr(sklearn.datasets, "load_%s" % dataset)() X = iris.data.astype(np.float32) Y = iris.target rs = np.random.RandomState(42) indices = np.arange(X.shape[0]) - train_size = min(int(len(indices) / 3. * 2.), train_size_maximum) + train_size = min(int(len(indices) / 3.0 * 2.0), train_size_maximum) rs.shuffle(indices) X = X[indices] Y = Y[indices] @@ -76,8 +79,10 @@ def get_dataset(dataset='iris', make_sparse=False, add_NaNs=False, X_test.eliminate_zeros() if make_binary and make_multilabel: - raise ValueError('Can convert dataset only to one of the two ' - 'options binary or multilabel!') + raise ValueError( + "Can convert dataset only to one of the two " + "options binary or multilabel!" + ) if make_binary: Y_train[Y_train > 1] = 1 @@ -97,23 +102,31 @@ def get_dataset(dataset='iris', make_sparse=False, add_NaNs=False, return X_train, Y_train, X_test, Y_test -def _test_classifier(classifier, dataset='iris', sparse=False, - train_size_maximum=150, make_multilabel=False, - make_binary=False): - X_train, Y_train, X_test, Y_test = get_dataset(dataset=dataset, - make_sparse=sparse, - train_size_maximum=train_size_maximum, - make_multilabel=make_multilabel, - make_binary=make_binary) +def _test_classifier( + classifier, + dataset="iris", + sparse=False, + train_size_maximum=150, + make_multilabel=False, + make_binary=False, +): + X_train, Y_train, X_test, Y_test = get_dataset( + dataset=dataset, + make_sparse=sparse, + train_size_maximum=train_size_maximum, + make_multilabel=make_multilabel, + make_binary=make_binary, + ) configuration_space = classifier.get_hyperparameter_search_space( - dataset_properties={'sparse': sparse} + dataset_properties={"sparse": sparse} ) default_config = configuration_space.get_default_configuration() classifier = classifier(random_state=0, **default_config) - if hasattr(classifier, 'iterative_fit'): + if hasattr(classifier, "iterative_fit"): + class counter(object): def __init__(self, func): self.n_calls = 0 @@ -122,11 +135,12 @@ def __init__(self, func): def __call__(self, *args, **kwargs): self.n_calls += 1 return self.func(*args, **kwargs) + classifier.iterative_fit = counter(classifier.iterative_fit) predictor = classifier.fit(X_train, Y_train) - if hasattr(classifier, 'iterative_fit'): + if hasattr(classifier, "iterative_fit"): n_calls = classifier.iterative_fit.n_calls else: n_calls = None @@ -135,11 +149,10 @@ def __call__(self, *args, **kwargs): return predictions, Y_test, n_calls -def _test_classifier_iterative_fit(classifier, dataset='iris', sparse=False): - X_train, Y_train, X_test, Y_test = get_dataset(dataset=dataset, - make_sparse=sparse) +def _test_classifier_iterative_fit(classifier, dataset="iris", sparse=False): + X_train, 
Y_train, X_test, Y_test = get_dataset(dataset=dataset, make_sparse=sparse) configuration_space = classifier.get_hyperparameter_search_space( - dataset_properties={'sparse': sparse} + dataset_properties={"sparse": sparse} ) default_config = configuration_space.get_default_configuration() @@ -148,7 +161,7 @@ def _test_classifier_iterative_fit(classifier, dataset='iris', sparse=False): iteration = 2 while not classifier.configuration_fully_fitted(): - n_iter = int(2 ** iteration / 2) + n_iter = int(2**iteration / 2) classifier.iterative_fit(X_train, Y_train, n_iter=n_iter) iteration += 1 @@ -156,15 +169,21 @@ def _test_classifier_iterative_fit(classifier, dataset='iris', sparse=False): return predictions, Y_test, classifier -def _test_classifier_predict_proba(classifier, dataset='iris', sparse=False, - train_size_maximum=150, - make_multilabel=False, - make_binary=False): - X_train, Y_train, X_test, Y_test = get_dataset(dataset=dataset, - make_sparse=sparse, - train_size_maximum=train_size_maximum, - make_multilabel=make_multilabel, - make_binary=make_binary) +def _test_classifier_predict_proba( + classifier, + dataset="iris", + sparse=False, + train_size_maximum=150, + make_multilabel=False, + make_binary=False, +): + X_train, Y_train, X_test, Y_test = get_dataset( + dataset=dataset, + make_sparse=sparse, + train_size_maximum=train_size_maximum, + make_multilabel=make_multilabel, + make_binary=make_binary, + ) configuration_space = classifier.get_hyperparameter_search_space() default_config = configuration_space.get_default_configuration() @@ -175,11 +194,12 @@ def _test_classifier_predict_proba(classifier, dataset='iris', sparse=False, return predictions, Y_test -def _test_preprocessing(Preprocessor, dataset='iris', make_sparse=False, - train_size_maximum=150): - X_train, Y_train, X_test, Y_test = get_dataset(dataset=dataset, - make_sparse=make_sparse, - train_size_maximum=train_size_maximum) +def _test_preprocessing( + Preprocessor, dataset="iris", make_sparse=False, train_size_maximum=150 +): + X_train, Y_train, X_test, Y_test = get_dataset( + dataset=dataset, make_sparse=make_sparse, train_size_maximum=train_size_maximum + ) original_X_train = X_train.copy() configuration_space = Preprocessor.get_hyperparameter_search_space() default_config = configuration_space.get_default_configuration() @@ -191,8 +211,9 @@ def _test_preprocessing(Preprocessor, dataset='iris', make_sparse=False, class PreprocessingTestCase(unittest.TestCase): - def _test_preprocessing_dtype(self, Preprocessor, add_NaNs=False, - test_sparse=True, dataset='iris'): + def _test_preprocessing_dtype( + self, Preprocessor, add_NaNs=False, test_sparse=True, dataset="iris" + ): # Dense # np.float32 X_train, Y_train, X_test, Y_test = get_dataset(dataset, add_NaNs=add_NaNs) @@ -220,8 +241,9 @@ def _test_preprocessing_dtype(self, Preprocessor, add_NaNs=False, if test_sparse is True: # Sparse # np.float32 - X_train, Y_train, X_test, Y_test = get_dataset(dataset, make_sparse=True, - add_NaNs=add_NaNs) + X_train, Y_train, X_test, Y_test = get_dataset( + dataset, make_sparse=True, add_NaNs=add_NaNs + ) self.assertEqual(X_train.dtype, np.float32) configuration_space = Preprocessor.get_hyperparameter_search_space() default_config = configuration_space.get_default_configuration() @@ -232,9 +254,9 @@ def _test_preprocessing_dtype(self, Preprocessor, add_NaNs=False, # self.assertEqual(Xt.dtype, np.float32) # np.float64 - X_train, Y_train, X_test, Y_test = get_dataset(dataset, - make_sparse=True, - add_NaNs=add_NaNs) + X_train, Y_train, 
X_test, Y_test = get_dataset( + dataset, make_sparse=True, add_NaNs=add_NaNs + ) X_train = X_train.astype(np.float64) configuration_space = Preprocessor.get_hyperparameter_search_space() default_config = configuration_space.get_default_configuration() @@ -245,9 +267,8 @@ def _test_preprocessing_dtype(self, Preprocessor, add_NaNs=False, # self.assertEqual(Xt.dtype, np.float64) -def _test_regressor(Regressor, dataset='diabetes', sparse=False): - X_train, Y_train, X_test, Y_test = get_dataset(dataset=dataset, - make_sparse=sparse) +def _test_regressor(Regressor, dataset="diabetes", sparse=False): + X_train, Y_train, X_test, Y_test = get_dataset(dataset=dataset, make_sparse=sparse) configuration_space = Regressor.get_hyperparameter_search_space() default_config = configuration_space.get_default_configuration() @@ -258,7 +279,8 @@ def _test_regressor(Regressor, dataset='diabetes', sparse=False): X_test_hash = hash(str(X_test)) Y_train_hash = hash(str(Y_train)) - if hasattr(regressor, 'iterative_fit'): + if hasattr(regressor, "iterative_fit"): + class counter(object): def __init__(self, func): self.n_calls = 0 @@ -272,24 +294,25 @@ def __call__(self, *args, **kwargs): predictor = regressor.fit(X_train, Y_train) - if hasattr(regressor, 'iterative_fit'): + if hasattr(regressor, "iterative_fit"): n_calls = regressor.iterative_fit.n_calls else: n_calls = None predictions = predictor.predict(X_test) - if X_train_hash != hash(str(X_train)) or \ - X_test_hash != hash(str(X_test)) or \ - Y_train_hash != hash(str(Y_train)): + if ( + X_train_hash != hash(str(X_train)) + or X_test_hash != hash(str(X_test)) + or Y_train_hash != hash(str(Y_train)) + ): raise ValueError("Model modified data") return predictions, Y_test, n_calls -def _test_regressor_iterative_fit(Regressor, dataset='diabetes', sparse=False): - X_train, Y_train, X_test, Y_test = get_dataset(dataset=dataset, - make_sparse=sparse) +def _test_regressor_iterative_fit(Regressor, dataset="diabetes", sparse=False): + X_train, Y_train, X_test, Y_test = get_dataset(dataset=dataset, make_sparse=sparse) configuration_space = Regressor.get_hyperparameter_search_space( - dataset_properties={'sparse': sparse} + dataset_properties={"sparse": sparse} ) default_config = configuration_space.get_default_configuration() regressor = Regressor(random_state=0, **default_config) @@ -297,7 +320,7 @@ def _test_regressor_iterative_fit(Regressor, dataset='diabetes', sparse=False): regressor.iterative_fit(X_train, Y_train, n_iter=2, refit=True) iteration = 2 while not regressor.configuration_fully_fitted(): - n_iter = int(2 ** iteration / 2) + n_iter = int(2**iteration / 2) regressor.iterative_fit(X_train, Y_train, n_iter=n_iter) iteration += 1 diff --git a/autosklearn/smbo.py b/autosklearn/smbo.py index 3cb823f2ff..b179efa8d0 100644 --- a/autosklearn/smbo.py +++ b/autosklearn/smbo.py @@ -1,4 +1,6 @@ +import typing from typing import Dict, List, Optional + import copy import json import logging @@ -6,151 +8,191 @@ import os import time import traceback -import typing import warnings import dask.distributed import pynisher - +from smac.callbacks import IncorporateRunResultCallback from smac.facade.smac_ac_facade import SMAC4AC -from smac.intensification.simple_intensifier import SimpleIntensifier from smac.intensification.intensification import Intensifier +from smac.intensification.simple_intensifier import SimpleIntensifier from smac.runhistory.runhistory2epm import RunHistory2EPM4LogCost from smac.scenario.scenario import Scenario -from smac.tae.serial_runner import 
SerialRunner from smac.tae.dask_runner import DaskParallelRunner -from smac.callbacks import IncorporateRunResultCallback - +from smac.tae.serial_runner import SerialRunner import autosklearn.metalearning -from autosklearn.constants import MULTILABEL_CLASSIFICATION, \ - BINARY_CLASSIFICATION, TASK_TYPES_TO_STRING, CLASSIFICATION_TASKS, \ - MULTICLASS_CLASSIFICATION, REGRESSION, MULTIOUTPUT_REGRESSION -from autosklearn.ensemble_builder import EnsembleBuilderManager -from autosklearn.metalearning.mismbo import suggest_via_metalearning +from autosklearn.constants import ( + BINARY_CLASSIFICATION, + CLASSIFICATION_TASKS, + MULTICLASS_CLASSIFICATION, + MULTILABEL_CLASSIFICATION, + MULTIOUTPUT_REGRESSION, + REGRESSION, + TASK_TYPES_TO_STRING, +) from autosklearn.data.abstract_data_manager import AbstractDataManager +from autosklearn.ensemble_builder import EnsembleBuilderManager from autosklearn.evaluation import ExecuteTaFuncWithQueue, get_cost_of_crash +from autosklearn.metalearning.metafeatures.metafeatures import ( + calculate_all_metafeatures_encoded_labels, + calculate_all_metafeatures_with_labels, +) +from autosklearn.metalearning.metalearning.meta_base import MetaBase +from autosklearn.metalearning.mismbo import suggest_via_metalearning from autosklearn.util.logging_ import get_named_client_logger from autosklearn.util.parallel import preload_modules -from autosklearn.metalearning.metalearning.meta_base import MetaBase -from autosklearn.metalearning.metafeatures.metafeatures import \ - calculate_all_metafeatures_with_labels, calculate_all_metafeatures_encoded_labels EXCLUDE_META_FEATURES_CLASSIFICATION = { - 'Landmark1NN', - 'LandmarkDecisionNodeLearner', - 'LandmarkDecisionTree', - 'LandmarkLDA', - 'LandmarkNaiveBayes', - 'LandmarkRandomNodeLearner', - 'PCAFractionOfComponentsFor95PercentVariance', - 'PCAKurtosisFirstPC', - 'PCASkewnessFirstPC', - 'PCA', + "Landmark1NN", + "LandmarkDecisionNodeLearner", + "LandmarkDecisionTree", + "LandmarkLDA", + "LandmarkNaiveBayes", + "LandmarkRandomNodeLearner", + "PCAFractionOfComponentsFor95PercentVariance", + "PCAKurtosisFirstPC", + "PCASkewnessFirstPC", + "PCA", } EXCLUDE_META_FEATURES_REGRESSION = { - 'Landmark1NN', - 'LandmarkDecisionNodeLearner', - 'LandmarkDecisionTree', - 'LandmarkLDA', - 'LandmarkNaiveBayes', - 'PCAFractionOfComponentsFor95PercentVariance', - 'PCAKurtosisFirstPC', - 'PCASkewnessFirstPC', - 'NumberOfClasses', - 'ClassOccurences', - 'ClassProbabilityMin', - 'ClassProbabilityMax', - 'ClassProbabilityMean', - 'ClassProbabilitySTD', - 'ClassEntropy', - 'LandmarkRandomNodeLearner', - 'PCA', + "Landmark1NN", + "LandmarkDecisionNodeLearner", + "LandmarkDecisionTree", + "LandmarkLDA", + "LandmarkNaiveBayes", + "PCAFractionOfComponentsFor95PercentVariance", + "PCAKurtosisFirstPC", + "PCASkewnessFirstPC", + "NumberOfClasses", + "ClassOccurences", + "ClassProbabilityMin", + "ClassProbabilityMax", + "ClassProbabilityMean", + "ClassProbabilitySTD", + "ClassEntropy", + "LandmarkRandomNodeLearner", + "PCA", } def get_send_warnings_to_logger(logger): def _send_warnings_to_log(message, category, filename, lineno, file, line): - logger.debug('%s:%s: %s:%s', filename, lineno, category.__name__, message) + logger.debug("%s:%s: %s:%s", filename, lineno, category.__name__, message) + return _send_warnings_to_log # metalearning helpers -def _calculate_metafeatures(data_feat_type, data_info_task, basename, - x_train, y_train, watcher, logger_): +def _calculate_metafeatures( + data_feat_type, data_info_task, basename, x_train, y_train, watcher, 
logger_ +): with warnings.catch_warnings(): warnings.showwarning = get_send_warnings_to_logger(logger_) # == Calculate metafeatures - task_name = 'CalculateMetafeatures' + task_name = "CalculateMetafeatures" watcher.start_task(task_name) - categorical = {col: True if feat_type.lower() == 'categorical' else False - for col, feat_type in data_feat_type.items()} + categorical = { + col: True if feat_type.lower() == "categorical" else False + for col, feat_type in data_feat_type.items() + } - EXCLUDE_META_FEATURES = EXCLUDE_META_FEATURES_CLASSIFICATION \ - if data_info_task in CLASSIFICATION_TASKS else EXCLUDE_META_FEATURES_REGRESSION + EXCLUDE_META_FEATURES = ( + EXCLUDE_META_FEATURES_CLASSIFICATION + if data_info_task in CLASSIFICATION_TASKS + else EXCLUDE_META_FEATURES_REGRESSION + ) - if data_info_task in [MULTICLASS_CLASSIFICATION, BINARY_CLASSIFICATION, - MULTILABEL_CLASSIFICATION, REGRESSION, - MULTIOUTPUT_REGRESSION]: - logger_.info('Start calculating metafeatures for %s', basename) + if data_info_task in [ + MULTICLASS_CLASSIFICATION, + BINARY_CLASSIFICATION, + MULTILABEL_CLASSIFICATION, + REGRESSION, + MULTIOUTPUT_REGRESSION, + ]: + logger_.info("Start calculating metafeatures for %s", basename) result = calculate_all_metafeatures_with_labels( - x_train, y_train, categorical=categorical, + x_train, + y_train, + categorical=categorical, dataset_name=basename, - dont_calculate=EXCLUDE_META_FEATURES, logger=logger_) + dont_calculate=EXCLUDE_META_FEATURES, + logger=logger_, + ) for key in list(result.metafeature_values.keys()): - if result.metafeature_values[key].type_ != 'METAFEATURE': + if result.metafeature_values[key].type_ != "METAFEATURE": del result.metafeature_values[key] else: result = None - logger_.info('Metafeatures not calculated') + logger_.info("Metafeatures not calculated") watcher.stop_task(task_name) logger_.info( - 'Calculating Metafeatures (categorical attributes) took %5.2f', - watcher.wall_elapsed(task_name)) + "Calculating Metafeatures (categorical attributes) took %5.2f", + watcher.wall_elapsed(task_name), + ) return result -def _calculate_metafeatures_encoded(data_feat_type, basename, x_train, y_train, watcher, - task, logger_): +def _calculate_metafeatures_encoded( + data_feat_type, basename, x_train, y_train, watcher, task, logger_ +): with warnings.catch_warnings(): warnings.showwarning = get_send_warnings_to_logger(logger_) - EXCLUDE_META_FEATURES = EXCLUDE_META_FEATURES_CLASSIFICATION \ - if task in CLASSIFICATION_TASKS else EXCLUDE_META_FEATURES_REGRESSION + EXCLUDE_META_FEATURES = ( + EXCLUDE_META_FEATURES_CLASSIFICATION + if task in CLASSIFICATION_TASKS + else EXCLUDE_META_FEATURES_REGRESSION + ) - task_name = 'CalculateMetafeaturesEncoded' + task_name = "CalculateMetafeaturesEncoded" watcher.start_task(task_name) - categorical = {col: True if feat_type.lower() == 'categorical' else False - for col, feat_type in data_feat_type.items()} + categorical = { + col: True if feat_type.lower() == "categorical" else False + for col, feat_type in data_feat_type.items() + } result = calculate_all_metafeatures_encoded_labels( - x_train, y_train, categorical=categorical, - dataset_name=basename, dont_calculate=EXCLUDE_META_FEATURES, logger=logger_) + x_train, + y_train, + categorical=categorical, + dataset_name=basename, + dont_calculate=EXCLUDE_META_FEATURES, + logger=logger_, + ) for key in list(result.metafeature_values.keys()): - if result.metafeature_values[key].type_ != 'METAFEATURE': + if result.metafeature_values[key].type_ != "METAFEATURE": del 
result.metafeature_values[key] watcher.stop_task(task_name) logger_.info( - 'Calculating Metafeatures (encoded attributes) took %5.2fsec', - watcher.wall_elapsed(task_name)) + "Calculating Metafeatures (encoded attributes) took %5.2fsec", + watcher.wall_elapsed(task_name), + ) return result -def _get_metalearning_configurations(meta_base, basename, metric, - configuration_space, - task, - initial_configurations_via_metalearning, - is_sparse, - watcher, logger): - task_name = 'InitialConfigurations' +def _get_metalearning_configurations( + meta_base, + basename, + metric, + configuration_space, + task, + initial_configurations_via_metalearning, + is_sparse, + watcher, + logger, +): + task_name = "InitialConfigurations" watcher.start_task(task_name) try: metalearning_configurations = suggest_via_metalearning( - meta_base, basename, metric, + meta_base, + basename, + metric, task, is_sparse == 1, initial_configurations_via_metalearning, @@ -165,16 +207,21 @@ def _get_metalearning_configurations(meta_base, basename, metric, return metalearning_configurations -def _print_debug_info_of_init_configuration(initial_configurations, basename, - time_for_task, logger, watcher): - logger.debug('Initial Configurations: (%d)' % len(initial_configurations)) +def _print_debug_info_of_init_configuration( + initial_configurations, basename, time_for_task, logger, watcher +): + logger.debug("Initial Configurations: (%d)" % len(initial_configurations)) for initial_configuration in initial_configurations: logger.debug(initial_configuration) - logger.debug('Looking for initial configurations took %5.2fsec', - watcher.wall_elapsed('InitialConfigurations')) + logger.debug( + "Looking for initial configurations took %5.2fsec", + watcher.wall_elapsed("InitialConfigurations"), + ) logger.info( - 'Time left for %s after finding initial configurations: %5.2fsec', - basename, time_for_task - watcher.wall_elapsed(basename)) + "Time left for %s after finding initial configurations: %5.2fsec", + basename, + time_for_task - watcher.wall_elapsed(basename), + ) def get_smac_object( @@ -186,7 +233,7 @@ def get_smac_object( n_jobs, dask_client, ): - if len(scenario_dict['instances']) > 1: + if len(scenario_dict["instances"]) > 1: intensifier = Intensifier else: intensifier = SimpleIntensifier @@ -213,35 +260,37 @@ def get_smac_object( class AutoMLSMBO(object): - - def __init__(self, config_space, dataset_name, - backend, - total_walltime_limit, - func_eval_time_limit, - memory_limit, - metric, - watcher, - n_jobs, - dask_client: dask.distributed.Client, - port: int, - start_num_run=1, - data_memory_limit=None, - num_metalearning_cfgs=25, - config_file=None, - seed=1, - metadata_directory=None, - resampling_strategy='holdout', - resampling_strategy_args=None, - include: Optional[Dict[str, List[str]]] = None, - exclude: Optional[Dict[str, List[str]]] = None, - disable_file_output=False, - smac_scenario_args=None, - get_smac_object_callback=None, - scoring_functions=None, - pynisher_context='spawn', - ensemble_callback: typing.Optional[EnsembleBuilderManager] = None, - trials_callback: typing.Optional[IncorporateRunResultCallback] = None - ): + def __init__( + self, + config_space, + dataset_name, + backend, + total_walltime_limit, + func_eval_time_limit, + memory_limit, + metric, + watcher, + n_jobs, + dask_client: dask.distributed.Client, + port: int, + start_num_run=1, + data_memory_limit=None, + num_metalearning_cfgs=25, + config_file=None, + seed=1, + metadata_directory=None, + resampling_strategy="holdout", + 
resampling_strategy_args=None, + include: Optional[Dict[str, List[str]]] = None, + exclude: Optional[Dict[str, List[str]]] = None, + disable_file_output=False, + smac_scenario_args=None, + get_smac_object_callback=None, + scoring_functions=None, + pynisher_context="spawn", + ensemble_callback: typing.Optional[EnsembleBuilderManager] = None, + trials_callback: typing.Optional[IncorporateRunResultCallback] = None, + ): super(AutoMLSMBO, self).__init__() # data related self.dataset_name = dataset_name @@ -289,7 +338,11 @@ def __init__(self, config_space, dataset_name, self.trials_callback = trials_callback dataset_name_ = "" if dataset_name is None else dataset_name - logger_name = '%s(%d):%s' % (self.__class__.__name__, self.seed, ":" + dataset_name_) + logger_name = "%s(%d):%s" % ( + self.__class__.__name__, + self.seed, + ":" + dataset_name_, + ) if port is None: self.logger = logging.getLogger(__name__) else: @@ -308,7 +361,7 @@ def reset_data_manager(self, max_mem=None): else: self.datamanager = self.backend.load_datamanager() - self.task = self.datamanager.info['task'] + self.task = self.datamanager.info["task"] def collect_metalearning_suggestions(self, meta_base): metalearning_configurations = _get_metalearning_configurations( @@ -317,16 +370,18 @@ def collect_metalearning_suggestions(self, meta_base): metric=self.metric, configuration_space=self.config_space, task=self.task, - is_sparse=self.datamanager.info['is_sparse'], + is_sparse=self.datamanager.info["is_sparse"], initial_configurations_via_metalearning=self.num_metalearning_cfgs, watcher=self.watcher, - logger=self.logger) + logger=self.logger, + ) _print_debug_info_of_init_configuration( metalearning_configurations, self.dataset_name, self.total_walltime_limit, self.logger, - self.watcher) + self.watcher, + ) return metalearning_configurations @@ -336,23 +391,24 @@ def _calculate_metafeatures_with_limits(self, time_limit): try: context = multiprocessing.get_context(self.pynisher_context) preload_modules(context) - safe_mf = pynisher.enforce_limits(mem_in_mb=self.memory_limit, - wall_time_in_s=int(time_limit), - grace_period_in_s=30, - context=context, - logger=self.logger)( - _calculate_metafeatures) + safe_mf = pynisher.enforce_limits( + mem_in_mb=self.memory_limit, + wall_time_in_s=int(time_limit), + grace_period_in_s=30, + context=context, + logger=self.logger, + )(_calculate_metafeatures) res = safe_mf( data_feat_type=self.datamanager.feat_type, - data_info_task=self.datamanager.info['task'], - x_train=self.datamanager.data['X_train'], - y_train=self.datamanager.data['Y_train'], + data_info_task=self.datamanager.info["task"], + x_train=self.datamanager.data["X_train"], + y_train=self.datamanager.data["Y_train"], basename=self.dataset_name, watcher=self.watcher, - logger_=self.logger + logger_=self.logger, ) except Exception as e: - self.logger.error('Error getting metafeatures: %s', str(e)) + self.logger.error("Error getting metafeatures: %s", str(e)) return res @@ -362,30 +418,30 @@ def _calculate_metafeatures_encoded_with_limits(self, time_limit): try: context = multiprocessing.get_context(self.pynisher_context) preload_modules(context) - safe_mf = pynisher.enforce_limits(mem_in_mb=self.memory_limit, - wall_time_in_s=int(time_limit), - grace_period_in_s=30, - context=context, - logger=self.logger)( - _calculate_metafeatures_encoded) + safe_mf = pynisher.enforce_limits( + mem_in_mb=self.memory_limit, + wall_time_in_s=int(time_limit), + grace_period_in_s=30, + context=context, + logger=self.logger, + 
)(_calculate_metafeatures_encoded) res = safe_mf( data_feat_type=self.datamanager.feat_type, - task=self.datamanager.info['task'], - x_train=self.datamanager.data['X_train'], - y_train=self.datamanager.data['Y_train'], + task=self.datamanager.info["task"], + x_train=self.datamanager.data["X_train"], + y_train=self.datamanager.data["Y_train"], basename=self.dataset_name, watcher=self.watcher, - logger_=self.logger + logger_=self.logger, ) except Exception as e: - self.logger.error('Error getting metafeatures (encoded) : %s', - str(e)) + self.logger.error("Error getting metafeatures (encoded) : %s", str(e)) return res def run_smbo(self): - self.watcher.start_task('SMBO') + self.watcher.start_task("SMBO") # == first things first: load the datamanager self.reset_data_manager() @@ -401,14 +457,14 @@ def run_smbo(self): metalearning_configurations = self.get_metalearning_suggestions() - if self.resampling_strategy in ['partial-cv', - 'partial-cv-iterative-fit']: - num_folds = self.resampling_strategy_args['folds'] - instances = [[json.dumps({'task_id': self.dataset_name, - 'fold': fold_number})] - for fold_number in range(num_folds)] + if self.resampling_strategy in ["partial-cv", "partial-cv-iterative-fit"]: + num_folds = self.resampling_strategy_args["folds"] + instances = [ + [json.dumps({"task_id": self.dataset_name, "fold": fold_number})] + for fold_number in range(num_folds) + ] else: - instances = [[json.dumps({'task_id': self.dataset_name})]] + instances = [[json.dumps({"task_id": self.dataset_name})]] # TODO rebuild target algorithm to be it's own target algorithm # evaluator, which takes into account that a run can be killed prior @@ -428,62 +484,64 @@ def run_smbo(self): scoring_functions=self.scoring_functions, port=self.port, pynisher_context=self.pynisher_context, - **self.resampling_strategy_args + **self.resampling_strategy_args, ) ta = ExecuteTaFuncWithQueue startup_time = self.watcher.wall_elapsed(self.dataset_name) total_walltime_limit = self.total_walltime_limit - startup_time - 5 scenario_dict = { - 'abort_on_first_run_crash': False, - 'save-results-instantly': True, - 'cs': self.config_space, - 'cutoff_time': self.func_eval_time_limit, - 'deterministic': 'true', - 'instances': instances, - 'memory_limit': self.memory_limit, - 'output-dir': self.backend.get_smac_output_directory(), - 'run_obj': 'quality', - 'wallclock_limit': total_walltime_limit, - 'cost_for_crash': self.worst_possible_result, + "abort_on_first_run_crash": False, + "save-results-instantly": True, + "cs": self.config_space, + "cutoff_time": self.func_eval_time_limit, + "deterministic": "true", + "instances": instances, + "memory_limit": self.memory_limit, + "output-dir": self.backend.get_smac_output_directory(), + "run_obj": "quality", + "wallclock_limit": total_walltime_limit, + "cost_for_crash": self.worst_possible_result, } if self.smac_scenario_args is not None: for arg in [ - 'abort_on_first_run_crash', - 'cs', - 'deterministic', - 'instances', - 'output-dir', - 'run_obj', - 'shared-model', - 'cost_for_crash', + "abort_on_first_run_crash", + "cs", + "deterministic", + "instances", + "output-dir", + "run_obj", + "shared-model", + "cost_for_crash", ]: if arg in self.smac_scenario_args: - self.logger.warning('Cannot override scenario argument %s, ' - 'will ignore this.', arg) + self.logger.warning( + "Cannot override scenario argument %s, " "will ignore this.", + arg, + ) del self.smac_scenario_args[arg] for arg in [ - 'cutoff_time', - 'memory_limit', - 'wallclock_limit', + "cutoff_time", + "memory_limit", 
+ "wallclock_limit", ]: if arg in self.smac_scenario_args: self.logger.warning( - 'Overriding scenario argument %s: %s with value %s', + "Overriding scenario argument %s: %s with value %s", arg, scenario_dict[arg], - self.smac_scenario_args[arg] + self.smac_scenario_args[arg], ) scenario_dict.update(self.smac_scenario_args) smac_args = { - 'scenario_dict': scenario_dict, - 'seed': seed, - 'ta': ta, - 'ta_kwargs': ta_kwargs, - 'metalearning_configurations': metalearning_configurations, - 'n_jobs': self.n_jobs, - 'dask_client': self.dask_client, + "scenario_dict": scenario_dict, + "seed": seed, + "ta": ta, + "ta_kwargs": ta_kwargs, + "metalearning_configurations": metalearning_configurations, + "n_jobs": self.n_jobs, + "dask_client": self.dask_client, } if self.get_smac_object_callback is not None: smac = self.get_smac_object_callback(**smac_args) @@ -516,25 +574,33 @@ def get_metalearning_suggestions(self): # If metadata directory is None, use default if self.metadata_directory is None: metalearning_directory = os.path.dirname( - autosklearn.metalearning.__file__) + autosklearn.metalearning.__file__ + ) # There is no multilabel data in OpenML if self.task == MULTILABEL_CLASSIFICATION: meta_task = BINARY_CLASSIFICATION else: meta_task = self.task metadata_directory = os.path.join( - metalearning_directory, 'files', - '%s_%s_%s' % (self.metric, TASK_TYPES_TO_STRING[meta_task], - 'sparse' if self.datamanager.info['is_sparse'] - else 'dense')) + metalearning_directory, + "files", + "%s_%s_%s" + % ( + self.metric, + TASK_TYPES_TO_STRING[meta_task], + "sparse" if self.datamanager.info["is_sparse"] else "dense", + ), + ) self.metadata_directory = metadata_directory # If metadata directory is specified by user, # then verify that it exists. else: if not os.path.exists(self.metadata_directory): - raise ValueError('The specified metadata directory \'%s\' ' - 'does not exist!' % self.metadata_directory) + raise ValueError( + "The specified metadata directory '%s' " + "does not exist!" % self.metadata_directory + ) else: # There is no multilabel data in OpenML @@ -545,51 +611,66 @@ def get_metalearning_suggestions(self): metadata_directory = os.path.join( self.metadata_directory, - '%s_%s_%s' % (self.metric, TASK_TYPES_TO_STRING[meta_task], - 'sparse' if self.datamanager.info['is_sparse'] - else 'dense')) + "%s_%s_%s" + % ( + self.metric, + TASK_TYPES_TO_STRING[meta_task], + "sparse" if self.datamanager.info["is_sparse"] else "dense", + ), + ) # Check that the metadata directory has the correct # subdirectory needed for this dataset. 
- if os.path.basename(metadata_directory) not in \ - os.listdir(self.metadata_directory): - raise ValueError('The specified metadata directory ' - '\'%s\' does not have the correct ' - 'subdirectory \'%s\'' % - (self.metadata_directory, - os.path.basename(metadata_directory)) - ) + if os.path.basename(metadata_directory) not in os.listdir( + self.metadata_directory + ): + raise ValueError( + "The specified metadata directory " + "'%s' does not have the correct " + "subdirectory '%s'" + % ( + self.metadata_directory, + os.path.basename(metadata_directory), + ) + ) self.metadata_directory = metadata_directory if os.path.exists(self.metadata_directory): - self.logger.info('Metadata directory: %s', - self.metadata_directory) - meta_base = MetaBase(self.config_space, self.metadata_directory, self.logger) + self.logger.info("Metadata directory: %s", self.metadata_directory) + meta_base = MetaBase( + self.config_space, self.metadata_directory, self.logger + ) - metafeature_calculation_time_limit = int( - self.total_walltime_limit / 4) + metafeature_calculation_time_limit = int(self.total_walltime_limit / 4) metafeature_calculation_start_time = time.time() meta_features = self._calculate_metafeatures_with_limits( - metafeature_calculation_time_limit) + metafeature_calculation_time_limit + ) metafeature_calculation_end_time = time.time() - metafeature_calculation_time_limit = \ - metafeature_calculation_time_limit - ( - metafeature_calculation_end_time - - metafeature_calculation_start_time) + metafeature_calculation_time_limit = ( + metafeature_calculation_time_limit + - ( + metafeature_calculation_end_time + - metafeature_calculation_start_time + ) + ) if metafeature_calculation_time_limit < 1: self.logger.warning( - 'Time limit for metafeature calculation less ' - 'than 1 seconds (%f). Skipping calculation ' - 'of metafeatures for encoded dataset.', - metafeature_calculation_time_limit) + "Time limit for metafeature calculation less " + "than 1 seconds (%f). Skipping calculation " + "of metafeatures for encoded dataset.", + metafeature_calculation_time_limit, + ) meta_features_encoded = None else: with warnings.catch_warnings(): warnings.showwarning = get_send_warnings_to_logger(self.logger) - meta_features_encoded = \ + meta_features_encoded = ( self._calculate_metafeatures_encoded_with_limits( - metafeature_calculation_time_limit) + metafeature_calculation_time_limit + ) + ) # In case there is a problem calculating the encoded meta-features if meta_features is None: @@ -598,26 +679,28 @@ def get_metalearning_suggestions(self): else: if meta_features_encoded is not None: meta_features.metafeature_values.update( - meta_features_encoded.metafeature_values) + meta_features_encoded.metafeature_values + ) if meta_features is not None: meta_base.add_dataset(self.dataset_name, meta_features) # Do mean imputation of the meta-features - should be done specific # for each prediction model! 
                 all_metafeatures = meta_base.get_metafeatures(
-                    features=list(meta_features.keys()))
-                all_metafeatures.fillna(all_metafeatures.mean(),
-                                        inplace=True)
+                    features=list(meta_features.keys())
+                )
+                all_metafeatures.fillna(all_metafeatures.mean(), inplace=True)
                 with warnings.catch_warnings():
                     warnings.showwarning = get_send_warnings_to_logger(self.logger)
-                    metalearning_configurations = self.collect_metalearning_suggestions(
-                        meta_base)
+                    metalearning_configurations = (
+                        self.collect_metalearning_suggestions(meta_base)
+                    )
                 if metalearning_configurations is None:
                     metalearning_configurations = []
                 self.reset_data_manager()
-                self.logger.info('%s', meta_features)
+                self.logger.info("%s", meta_features)
                 # Convert meta-features into a dictionary because the scenario
                 # expects a dictionary
@@ -627,13 +710,15 @@ def get_metalearning_suggestions(self):
                 meta_features_list = []
                 for meta_feature_name in all_metafeatures.columns:
                     meta_features_list.append(
-                        meta_features[meta_feature_name].value)
+                        meta_features[meta_feature_name].value
+                    )
                 self.logger.info(list(meta_features_dict.keys()))
             else:
                 meta_features = None
-                self.logger.warning('Could not find meta-data directory %s' %
-                                    metadata_directory)
+                self.logger.warning(
+                    "Could not find meta-data directory %s" % metadata_directory
+                )
         else:
             meta_features = None
diff --git a/autosklearn/util/__init__.py b/autosklearn/util/__init__.py
index 6eeff57a7d..9f2d05ccd5 100644
--- a/autosklearn/util/__init__.py
+++ b/autosklearn/util/__init__.py
@@ -1,7 +1,7 @@
 # -*- encoding: utf-8 -*-
 import re
-
-SUBPATTERN = r'((?P<operation%d>==|>=|>|<)(?P<version%d>(\d+)?(\.[a-zA-Z0-9]+)?(\.[a-zA-Z0-9]+)?))'
+SUBPATTERN = r"((?P<operation%d>==|>=|>|<)(?P<version%d>(\d+)?(\.[a-zA-Z0-9]+)?(\.[a-zA-Z0-9]+)?))"  # noqa: E501
 RE_PATTERN = re.compile(
-    r'^(?P<name>[\w\-]+)%s?(,%s)?$' % (SUBPATTERN % (1, 1), SUBPATTERN % (2, 2)))
+    r"^(?P<name>[\w\-]+)%s?(,%s)?$" % (SUBPATTERN % (1, 1), SUBPATTERN % (2, 2))
+)
diff --git a/autosklearn/util/common.py b/autosklearn/util/common.py
index 4905d0eaa8..ddee4dc1ab 100644
--- a/autosklearn/util/common.py
+++ b/autosklearn/util/common.py
@@ -5,22 +5,21 @@
 import numpy as np
-__all__ = [
-    'check_pid',
-    'warn_if_not_float'
-]
+__all__ = ["check_pid", "warn_if_not_float"]
-def warn_if_not_float(X: np.ndarray, estimator: str = 'This algorithm') -> bool:
+def warn_if_not_float(X: np.ndarray, estimator: str = "This algorithm") -> bool:
     """Warning utility function to check that data type is floating point.
     Returns True if a warning was raised (i.e. the input is not float)
     and False otherwise, for easier input validation.
""" if not isinstance(estimator, str): estimator = estimator.__class__.__name__ - if X.dtype.kind != 'f': - warnings.warn("%s assumes floating point values as input, " - "got %s" % (estimator, X.dtype)) + if X.dtype.kind != "f": + warnings.warn( + "%s assumes floating point values as input, " + "got %s" % (estimator, X.dtype) + ) return True return False diff --git a/autosklearn/util/data.py b/autosklearn/util/data.py index ff6eb3c337..bdd4cf31b2 100644 --- a/autosklearn/util/data.py +++ b/autosklearn/util/data.py @@ -1,4 +1,3 @@ -import warnings from typing import ( Any, Dict, @@ -10,21 +9,18 @@ Tuple, Type, Union, - cast + cast, ) -import numpy as np +import warnings +import numpy as np import pandas as pd - from scipy.sparse import spmatrix - from sklearn.model_selection import train_test_split -from autosklearn.data.validation import SUPPORTED_FEAT_TYPES from autosklearn.evaluation.splitter import CustomStratifiedShuffleSplit - # TODO: TypedDict with python 3.8 # # When upgrading to python 3.8 as minimum version, this should be a TypedDict @@ -32,15 +28,14 @@ DatasetCompressionSpec = Dict[str, Union[float, List[str]]] # Default specification for arg `dataset_compression` -default_dataset_compression_arg: DatasetCompressionSpec = { +default_dataset_compression_arg: DatasetCompressionSpec = { "memory_allocation": 0.1, - "methods": ["precision", "subsample"] + "methods": ["precision", "subsample"], } def validate_dataset_compression_arg( - dataset_compression: Mapping[str, Any], - memory_limit: int + dataset_compression: Mapping[str, Any], memory_limit: int ) -> DatasetCompressionSpec: """Validates and return a correct dataset_compression argument @@ -58,22 +53,24 @@ def validate_dataset_compression_arg( """ if isinstance(dataset_compression, Mapping): # Fill with defaults if they don't exist - dataset_compression = { - **default_dataset_compression_arg, - **dataset_compression - } + dataset_compression = {**default_dataset_compression_arg, **dataset_compression} + + parsed_keys = set(dataset_compression.keys()) + default_keys = set(default_dataset_compression_arg.keys()) # Must contain known keys - if set(dataset_compression.keys()) != set(default_dataset_compression_arg.keys()): + if parsed_keys != default_keys: raise ValueError( - f"Unknown key in dataset_compression, {list(dataset_compression.keys())}." - f"\nPossible keys are {list(default_dataset_compression_arg.keys())}" + f"Unknown key(s) in ``dataset_compression``, {parsed_keys}." 
+                f"\nPossible keys are {default_keys}"
             )
         memory_allocation = dataset_compression["memory_allocation"]
         # "memory_allocation" must be float or int
-        if not (isinstance(memory_allocation, float) or isinstance(memory_allocation, int)):
+        if not (
+            isinstance(memory_allocation, float) or isinstance(memory_allocation, int)
+        ):
             raise ValueError(
                 "key 'memory_allocation' must be an `int` or `float`"
                 f"\ntype = {memory_allocation}"
@@ -89,9 +86,11 @@ def validate_dataset_compression_arg(
             )
         # "memory_allocation" if absolute, should be > 0 and < memory_limit
-        if isinstance(memory_allocation, int) and not (0 < memory_allocation < memory_limit):
+        if isinstance(memory_allocation, int) and not (
+            0 < memory_allocation < memory_limit
+        ):
             raise ValueError(
-                f"key 'memory_allocation' if int must be in (0, memory_limit={memory_limit})"
+                f"key 'memory_allocation' if int must be in (0, {memory_limit})"
                 f"\nmemory_allocation = {memory_allocation}"
                 f"\ndataset_compression = {dataset_compression}"
             )
@@ -109,11 +108,13 @@ def validate_dataset_compression_arg(
         # "methods" must contain known methods
         if any(
-            method not in cast(Sequence, default_dataset_compression_arg["methods"])  # mypy
+            method
+            not in cast(Sequence, default_dataset_compression_arg["methods"])  # mypy
             for method in dataset_compression["methods"]
         ):
+            valid_methods = default_dataset_compression_arg["methods"]
             raise ValueError(
-                f"key 'methods' can only contain {default_dataset_compression_arg['methods']}"
+                f"key 'methods' can only contain {valid_methods}"
                 f"\nmethods = {dataset_compression['methods']}"
                 f"\ndataset_compression = {dataset_compression}"
             )
@@ -160,10 +161,10 @@ class _DtypeReductionMapping(Mapping):
     # provide only as much precision as np.longdouble,
     # that is, 80 bits on most x86 machines and 64 bits
     # in standard Windows builds.
-    if hasattr(np, 'float96'):
+    if hasattr(np, "float96"):
         _mapping[np.float96] = np.float64
-    if hasattr(np, 'float128'):
+    if hasattr(np, "float128"):
         _mapping[np.float128] = np.float64
     @classmethod
@@ -191,8 +192,10 @@ def binarization(array: Union[List, np.ndarray]) -> np.ndarray:
     # into 1 and the min into 0
     array = np.array(array, dtype=float)  # conversion needed to use np.inf
     if len(np.unique(array)) > 2:
-        raise ValueError('The argument must be a binary-class datafile. '
-                         '{} classes detected'.format(len(np.unique(array))))
+        raise ValueError(
+            "The argument must be a binary-class datafile. "
+            "{} classes detected".format(len(np.unique(array)))
+        )
     # manipulation which aims at avoid error in data
     # with for example classes '1' and '2'.
@@ -252,8 +255,8 @@ def subsample(
     is_classification: bool,
     sample_size: Union[float, int],
     random_state: Optional[Union[int, np.random.RandomState]] = None,
-) -> Tuple[SUPPORTED_FEAT_TYPES, Union[List, np.ndarray, pd.DataFrame, pd.Series]]:
-    """ Subsamples data returning the same type as it recieved.
+) -> Tuple[Union[np.ndarray, spmatrix], np.ndarray]:
+    """Subsamples data returning the same type as it received.
     If `is_classification`, we split using a stratified shuffle split which
     preserves unique labels in the training set.
@@ -298,8 +301,7 @@ def subsample( if is_classification: splitter = CustomStratifiedShuffleSplit( - train_size=sample_size, - random_state=random_state + train_size=sample_size, random_state=random_state ) left_idxs, _ = next(splitter.split(X=X, y=y)) @@ -319,7 +321,8 @@ def subsample( else: X, _, y, _ = train_test_split( # type: ignore - X, y, + X, + y, train_size=sample_size, random_state=random_state, ) @@ -330,7 +333,7 @@ def subsample( def reduce_precision( X: Union[np.ndarray, spmatrix] ) -> Tuple[Union[np.ndarray, spmatrix], Type]: - """ Reduces the precision of a np.ndarray or spmatrix containing floats + """Reduces the precision of a np.ndarray or spmatrix containing floats Parameters ---------- @@ -343,8 +346,10 @@ def reduce_precision( Returns the reduced data X along with the dtype it was reduced to. """ if X.dtype not in supported_precision_reductions: - raise ValueError(f"X.dtype = {X.dtype} not equal to any supported" - f" {supported_precision_reductions}") + raise ValueError( + f"X.dtype = {X.dtype} not equal to any supported" + f" {supported_precision_reductions}" + ) precision = reduction_mapping[X.dtype] return X.astype(precision), precision @@ -356,10 +361,10 @@ def reduce_dataset_size_if_too_large( memory_limit: int, is_classification: bool, random_state: Union[int, np.random.RandomState] = None, - operations: List[str] = ['precision', 'subsample'], + operations: List[str] = ["precision", "subsample"], memory_allocation: Union[int, float] = 0.1, ) -> Tuple[Union[np.ndarray, spmatrix], np.ndarray]: - f""" Reduces the size of the dataset if it's too close to the memory limit. + f"""Reduces the size of the dataset if it's too close to the memory limit. Follows the order of the operations passed in and retains the type of its input. @@ -408,8 +413,8 @@ def reduce_dataset_size_if_too_large( **subsample** - Reduce the amount of samples of the dataset such that it fits into the allocated memory. - Ensures stratification and that unique labels are present + Reduce the amount of samples of the dataset such that it fits into the allocated + memory. Ensures stratification and that unique labels are present memory_allocation: Union[int, float] = 0.1 The amount of memory to allocate to the dataset. 
A float specifys that the @@ -437,27 +442,31 @@ def reduce_dataset_size_if_too_large( allocated_memory = memory_allocation else: - raise ValueError(f"Unknown type for `memory_allocation` {type(memory_allocation)}") + raise ValueError( + f"Unknown type for `memory_allocation` {type(memory_allocation)}" + ) - if 'precision' in operations and X.dtype not in supported_precision_reductions: + if "precision" in operations and X.dtype not in supported_precision_reductions: raise ValueError(f"Unsupported type `{X.dtype}` for precision reduction") def megabytes(arr: Union[np.ndarray, spmatrix]) -> float: - return (arr.nbytes if isinstance(X, np.ndarray) else arr.data.nbytes) / (2**20) + return (arr.nbytes if isinstance(X, np.ndarray) else arr.data.nbytes) / ( + 2**20 + ) for operation in operations: - if operation == 'precision': + if operation == "precision": # If the dataset is too big for the allocated memory, # we then try to reduce the precision if it's a high precision dataset if megabytes(X) > allocated_memory: X, precision = reduce_precision(X) warnings.warn( - f'Dataset too large for allocated memory {allocated_memory}MB, ' - f'reduced the precision from {X.dtype} to {precision}', + f"Dataset too large for allocated memory {allocated_memory}MB, " + f"reduced the precision from {X.dtype} to {precision}", ) - elif operation == 'subsample': + elif operation == "subsample": # If the dataset is still too big such that we couldn't fit # into the allocated memory, we subsample it so that it does if megabytes(X) > allocated_memory: @@ -470,16 +479,18 @@ def megabytes(arr: Union[np.ndarray, spmatrix]) -> float: # Tried the generic `def subsample(X: T) -> T` approach but it was # failing elsewhere, keeping it simple for now X, y = subsample( # type: ignore - X, y, + X, + y, sample_size=sample_percentage, is_classification=is_classification, - random_state=random_state + random_state=random_state, ) n_samples_after = X.shape[0] warnings.warn( - f"Dataset too large for allocated memory {allocated_memory}MB, reduced" - f" number of samples from {n_samples_before} to {n_samples_after}." + f"Dataset too large for allocated memory {allocated_memory}MB," + f" reduced number of samples from {n_samples_before} to" + f" {n_samples_after}." 
) else: diff --git a/autosklearn/util/dependencies.py b/autosklearn/util/dependencies.py index d213000871..fb9037450b 100644 --- a/autosklearn/util/dependencies.py +++ b/autosklearn/util/dependencies.py @@ -1,6 +1,7 @@ +from typing import List, Optional, Union, no_type_check + import importlib from distutils.version import LooseVersion -from typing import List, Optional, Union, no_type_check import pkg_resources @@ -19,12 +20,12 @@ def verify_packages(packages: Optional[Union[str, List[str]]]) -> None: match = RE_PATTERN.match(package) if match: - name = match.group('name') - operation = match.group('operation1') - version = match.group('version1') + name = match.group("name") + operation = match.group("operation1") + version = match.group("version1") _verify_package(name, operation, version) else: - raise ValueError('Unable to read requirement: %s' % package) + raise ValueError("Unable to read requirement: %s" % package) # Module has no attribute __version__ wa @@ -45,48 +46,56 @@ def _verify_package(name: str, operation: Optional[str], version: str) -> None: required_version = LooseVersion(version) - if operation == '==': + if operation == "==": check = required_version == installed_version - elif operation == '>': + elif operation == ">": check = installed_version > required_version - elif operation == '<': + elif operation == "<": check = installed_version < required_version - elif operation == '>=': - check = installed_version > required_version or \ - installed_version == required_version + elif operation == ">=": + check = ( + installed_version > required_version + or installed_version == required_version + ) else: - raise NotImplementedError( - 'operation \'%s\' is not supported' % operation) + raise NotImplementedError("operation '%s' is not supported" % operation) if not check: - raise IncorrectPackageVersionError(name, installed_version, operation, - required_version) + raise IncorrectPackageVersionError( + name, installed_version, operation, required_version + ) class MissingPackageError(Exception): - error_message = 'Mandatory package \'{name}\' not found!' + error_message = "Mandatory package '{name}' not found!" 
def __init__(self, package_name: str): self.package_name = package_name super(MissingPackageError, self).__init__( - self.error_message.format(name=package_name)) + self.error_message.format(name=package_name) + ) class IncorrectPackageVersionError(Exception): - error_message = "found '{name}' version {installed_version} but requires {name} version " \ - "{operation}{required_version}" - - def __init__(self, - package_name: str, - installed_version: Union[str, LooseVersion], - operation: Optional[str], - required_version: Union[str, LooseVersion] - ): + error_message = ( + "found '{name}' version {installed_version} but requires {name} version " + "{operation}{required_version}" + ) + + def __init__( + self, + package_name: str, + installed_version: Union[str, LooseVersion], + operation: Optional[str], + required_version: Union[str, LooseVersion], + ): self.package_name = package_name self.installed_version = installed_version self.operation = operation self.required_version = required_version - message = self.error_message.format(name=package_name, - installed_version=installed_version, - operation=operation, - required_version=required_version) + message = self.error_message.format( + name=package_name, + installed_version=installed_version, + operation=operation, + required_version=required_version, + ) super(IncorrectPackageVersionError, self).__init__(message) diff --git a/autosklearn/util/logging_.py b/autosklearn/util/logging_.py index fc298cd053..a85e4a80d6 100644 --- a/autosklearn/util/logging_.py +++ b/autosklearn/util/logging_.py @@ -1,4 +1,6 @@ # -*- encoding: utf-8 -*- +from typing import Any, Dict, Iterator, Optional, TextIO, Type, cast + import logging import logging.config import logging.handlers @@ -12,7 +14,6 @@ import threading import warnings from contextlib import contextmanager -from typing import Any, Dict, Iterator, Optional, TextIO, Type, cast import yaml @@ -26,41 +27,43 @@ def setup_logger( # logging_config must be a dictionary object specifying the configuration # for the loggers to be used in auto-sklearn. 
if logging_config is None: - with open(os.path.join(os.path.dirname(__file__), 'logging.yaml'), 'r') as fh: + with open(os.path.join(os.path.dirname(__file__), "logging.yaml"), "r") as fh: logging_config = yaml.safe_load(fh) # Make sure we have a filename handler - if 'handlers' not in logging_config: - logging_config['handlers'] = {} - if 'file_handler' not in logging_config['handlers']: - logging_config['handlers']['file_handler'] = { - 'class': 'logging.FileHandler', - 'level': 'DEBUG', - 'filename': 'autosklearn.log' + if "handlers" not in logging_config: + logging_config["handlers"] = {} + if "file_handler" not in logging_config["handlers"]: + logging_config["handlers"]["file_handler"] = { + "class": "logging.FileHandler", + "level": "DEBUG", + "filename": "autosklearn.log", } - if 'distributed_logfile' not in logging_config['handlers']: + if "distributed_logfile" not in logging_config["handlers"]: # We have to create a file handler - logging_config['handlers']['distributed_logfile'] = { - 'class': 'logging.FileHandler', - 'level': 'DEBUG', - 'filename': 'distributed.log' + logging_config["handlers"]["distributed_logfile"] = { + "class": "logging.FileHandler", + "level": "DEBUG", + "filename": "distributed.log", } if filename is None: - filename = logging_config['handlers']['file_handler']['filename'] + filename = logging_config["handlers"]["file_handler"]["filename"] if distributedlog_filename is None: - distributedlog_filename = logging_config['handlers']['distributed_logfile']['filename'] + distributedlog_filename = logging_config["handlers"]["distributed_logfile"][ + "filename" + ] # Make path absolute only if required # This is needed because this function might be called multiple times with the same # dict, and we don't want /path/path/.log but rather just /path/.log - if os.path.sep not in logging_config['handlers']['file_handler']['filename']: - logging_config['handlers']['file_handler']['filename'] = os.path.join( + if os.path.sep not in logging_config["handlers"]["file_handler"]["filename"]: + logging_config["handlers"]["file_handler"]["filename"] = os.path.join( output_dir, filename ) - if os.path.sep not in logging_config['handlers']['distributed_logfile']['filename']: - logging_config['handlers']['distributed_logfile']['filename'] = os.path.join( + if os.path.sep not in logging_config["handlers"]["distributed_logfile"]["filename"]: + logging_config["handlers"]["distributed_logfile"]["filename"] = os.path.join( output_dir, distributedlog_filename ) logging.config.dictConfig(logging_config) @@ -70,13 +73,12 @@ def _create_logger(name: str) -> logging.Logger: return logging.getLogger(name) -def get_logger(name: str) -> 'PickableLoggerAdapter': +def get_logger(name: str) -> "PickableLoggerAdapter": logger = PickableLoggerAdapter(name) return logger class PickableLoggerAdapter(object): - def __init__(self, name: str): self.name = name self.logger = _create_logger(name) @@ -90,7 +92,7 @@ def __getstate__(self) -> Dict[str, Any]: Dictionary, representing the object state to be pickled. Ignores the self.logger field and only returns the logger name. """ - return {'name': self.name} + return {"name": self.name} def __setstate__(self, state: Dict[str, Any]) -> None: """ @@ -102,7 +104,7 @@ def __setstate__(self, state: Dict[str, Any]) -> None: state - dictionary, containing the logger name. 
""" - self.name = state['name'] + self.name = state["name"] self.logger = _create_logger(self.name) def debug(self, msg: str, *args: Any, **kwargs: Any) -> None: @@ -132,20 +134,16 @@ def isEnabledFor(self, level: int) -> bool: def get_named_client_logger( name: str, - host: str = 'localhost', + host: str = "localhost", port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, -) -> 'PicklableClientLogger': - logger = PicklableClientLogger( - name=name, - host=host, - port=port - ) +) -> "PicklableClientLogger": + logger = PicklableClientLogger(name=name, host=host, port=port) return logger def _get_named_client_logger( name: str, - host: str = 'localhost', + host: str = "localhost", port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, ) -> logging.Logger: """ @@ -176,7 +174,7 @@ def _get_named_client_logger( # We add client not only to identify that this is the client # communication part of the logger, but to make sure we have # a new singleton with the desired socket handlers - local_logger = _create_logger('Client-' + name) + local_logger = _create_logger("Client-" + name) local_logger.propagate = False local_logger.setLevel(logging.DEBUG) @@ -184,8 +182,9 @@ def _get_named_client_logger( # Ignore mypy logging.handlers.SocketHandler has no attribute port # This is not the case clearly, yet MyPy assumes this is not the case # Even when using direct casting or getattr - ports = [getattr(handler, 'port', None - ) for handler in local_logger.handlers] # type: ignore[attr-defined] + ports = [ + getattr(handler, "port", None) for handler in local_logger.handlers + ] # type: ignore[attr-defined] except AttributeError: # We do not want to log twice but adding multiple times the same # handler. So we check to what ports we communicate to @@ -201,16 +200,11 @@ def _get_named_client_logger( class PicklableClientLogger(PickableLoggerAdapter): - def __init__(self, name: str, host: str, port: int): self.name = name self.host = host self.port = port - self.logger = _get_named_client_logger( - name=name, - host=host, - port=port - ) + self.logger = _get_named_client_logger(name=name, host=host, port=port) def __getstate__(self) -> Dict[str, Any]: """ @@ -222,9 +216,9 @@ def __getstate__(self) -> Dict[str, Any]: the self.logger field and only returns the logger name. """ return { - 'name': self.name, - 'host': self.host, - 'port': self.port, + "name": self.name, + "host": self.host, + "port": self.port, } def __setstate__(self, state: Dict[str, Any]) -> None: @@ -237,9 +231,9 @@ def __setstate__(self, state: Dict[str, Any]) -> None: state - dictionary, containing the logger name. 
""" - self.name = state['name'] - self.host = state['host'] - self.port = state['port'] + self.name = state["name"] + self.host = state["host"] + self.port = state["port"] self.logger = _get_named_client_logger( name=self.name, host=self.host, @@ -264,7 +258,7 @@ def handle(self) -> None: chunk = self.connection.recv(4) # type: ignore[attr-defined] if len(chunk) < 4: break - slen = struct.unpack('>L', chunk)[0] + slen = struct.unpack(">L", chunk)[0] chunk = self.connection.recv(slen) # type: ignore[attr-defined] while len(chunk) < slen: chunk = chunk + self.connection.recv(slen - len(chunk)) # type: ignore[attr-defined] # noqa: E501 @@ -301,9 +295,9 @@ def start_log_server( logging_config: Dict, output_dir: str, ) -> None: - setup_logger(filename=filename, - logging_config=logging_config, - output_dir=output_dir) + setup_logger( + filename=filename, logging_config=logging_config, output_dir=output_dir + ) while True: # Loop until we find a valid port @@ -334,7 +328,7 @@ class LogRecordSocketReceiver(socketserver.ThreadingTCPServer): def __init__( self, - host: str = 'localhost', + host: str = "localhost", port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, handler: Type[LogRecordStreamHandler] = LogRecordStreamHandler, logname: Optional[str] = None, @@ -347,9 +341,7 @@ def __init__( def serve_until_stopped(self) -> None: while True: - rd, wr, ex = select.select([self.socket.fileno()], - [], [], - self.timeout) + rd, wr, ex = select.select([self.socket.fileno()], [], [], self.timeout) if rd: self.handle_request() if self.event is not None and self.event.is_set(): @@ -358,7 +350,7 @@ def serve_until_stopped(self) -> None: @contextmanager def warnings_to(logger: Optional[PicklableClientLogger] = None) -> Iterator[None]: - """ A context manager to catch warnings and send them to the logger + """A context manager to catch warnings and send them to the logger If no logger is passed, warnings propogate as they normally would. 
@@ -371,6 +363,7 @@ def warnings_to(logger: Optional[PicklableClientLogger] = None) -> Iterator[None if logger: with warnings.catch_warnings(): + def to_log( logger: PicklableClientLogger, message: str, @@ -378,15 +371,16 @@ def to_log( filename: str, lineno: int, file: Optional[TextIO] = None, - line: Optional[str] = None + line: Optional[str] = None, ) -> None: logger.warning(f"{filename}:{lineno} {category.__name__}:{message}") # Mypy was complaining that logger didn't exist in `to_log` see here: # https://mypy.readthedocs.io/en/stable/common_issues.html#narrowing-and-inner-functions # we explicitly pass it in and have to force it's type with `cast` - warnings.showwarning = lambda *args: \ - to_log(cast(PicklableClientLogger, logger), *args) + warnings.showwarning = lambda *args: to_log( + cast(PicklableClientLogger, logger), *args + ) yield # Else do nothing, warnings go to wherever they would without this context diff --git a/autosklearn/util/parallel.py b/autosklearn/util/parallel.py index 2f0ea6b016..0804588a61 100644 --- a/autosklearn/util/parallel.py +++ b/autosklearn/util/parallel.py @@ -5,16 +5,19 @@ def preload_modules(context: multiprocessing.context.BaseContext) -> None: all_loaded_modules = sys.modules.keys() preload = [ - loaded_module for loaded_module in all_loaded_modules - if loaded_module.split('.')[0] in ( - 'smac', - 'autosklearn', - 'numpy', - 'scipy', - 'pandas', - 'pynisher', - 'sklearn', - 'ConfigSpace', - ) and 'logging' not in loaded_module + loaded_module + for loaded_module in all_loaded_modules + if loaded_module.split(".")[0] + in ( + "smac", + "autosklearn", + "numpy", + "scipy", + "pandas", + "pynisher", + "sklearn", + "ConfigSpace", + ) + and "logging" not in loaded_module ] context.set_forkserver_preload(preload) diff --git a/autosklearn/util/pipeline.py b/autosklearn/util/pipeline.py index c1f5a2ca23..d3291069f5 100755 --- a/autosklearn/util/pipeline.py +++ b/autosklearn/util/pipeline.py @@ -1,29 +1,27 @@ # -*- encoding: utf-8 -*- from typing import Any, Dict, List, Optional, Union -from ConfigSpace.configuration_space import ConfigurationSpace - import numpy as np +from ConfigSpace.configuration_space import ConfigurationSpace from autosklearn.constants import ( BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION, MULTILABEL_CLASSIFICATION, MULTIOUTPUT_REGRESSION, - REGRESSION_TASKS + REGRESSION_TASKS, ) from autosklearn.pipeline.classification import SimpleClassificationPipeline from autosklearn.pipeline.regression import SimpleRegressionPipeline - -__all__ = ['get_configuration_space'] +__all__ = ["get_configuration_space"] def get_configuration_space( info: Dict[str, Any], include: Optional[Dict[str, List[str]]] = None, exclude: Optional[Dict[str, List[str]]] = None, - random_state: Optional[Union[int, np.random.RandomState]] = None + random_state: Optional[Union[int, np.random.RandomState]] = None, ) -> ConfigurationSpace: """Get the configuration of a pipeline given some dataset info @@ -46,17 +44,19 @@ def get_configuration_space( ConfigurationSpace The configuration space for the pipeline """ - if info['task'] in REGRESSION_TASKS: + if info["task"] in REGRESSION_TASKS: return _get_regression_configuration_space(info, include, exclude, random_state) else: - return _get_classification_configuration_space(info, include, exclude, random_state) + return _get_classification_configuration_space( + info, include, exclude, random_state + ) def _get_regression_configuration_space( info: Dict[str, Any], include: Optional[Dict[str, List[str]]], exclude: 
Optional[Dict[str, List[str]]], - random_state: Optional[Union[int, np.random.RandomState]] = None + random_state: Optional[Union[int, np.random.RandomState]] = None, ) -> ConfigurationSpace: """Get the configuration of a regression pipeline given some dataset info @@ -79,25 +79,22 @@ def _get_regression_configuration_space( ConfigurationSpace The configuration space for the regression pipeline """ - task_type = info['task'] + task_type = info["task"] sparse = False multioutput = False if task_type == MULTIOUTPUT_REGRESSION: multioutput = True - if info['is_sparse'] == 1: + if info["is_sparse"] == 1: sparse = True - dataset_properties = { - 'multioutput': multioutput, - 'sparse': sparse - } + dataset_properties = {"multioutput": multioutput, "sparse": sparse} configuration_space = SimpleRegressionPipeline( dataset_properties=dataset_properties, include=include, exclude=exclude, - random_state=random_state + random_state=random_state, ).get_hyperparameter_search_space() return configuration_space @@ -106,7 +103,7 @@ def _get_classification_configuration_space( info: Dict[str, Any], include: Optional[Dict[str, List[str]]], exclude: Optional[Dict[str, List[str]]], - random_state: Optional[Union[int, np.random.RandomState]] = None + random_state: Optional[Union[int, np.random.RandomState]] = None, ) -> ConfigurationSpace: """Get the configuration of a classification pipeline given some dataset info @@ -129,7 +126,7 @@ def _get_classification_configuration_space( ConfigurationSpace The configuration space for the classification pipeline """ - task_type = info['task'] + task_type = info["task"] multilabel = False multiclass = False @@ -142,18 +139,18 @@ def _get_classification_configuration_space( if task_type == BINARY_CLASSIFICATION: pass - if info['is_sparse'] == 1: + if info["is_sparse"] == 1: sparse = True dataset_properties = { - 'multilabel': multilabel, - 'multiclass': multiclass, - 'sparse': sparse + "multilabel": multilabel, + "multiclass": multiclass, + "sparse": sparse, } return SimpleClassificationPipeline( dataset_properties=dataset_properties, include=include, exclude=exclude, - random_state=random_state + random_state=random_state, ).get_hyperparameter_search_space() diff --git a/autosklearn/util/single_thread_client.py b/autosklearn/util/single_thread_client.py index 5cd7c653f4..057e29a25b 100644 --- a/autosklearn/util/single_thread_client.py +++ b/autosklearn/util/single_thread_client.py @@ -1,4 +1,5 @@ import typing + from pathlib import Path import dask.distributed @@ -9,6 +10,7 @@ class DummyFuture(dask.distributed.Future): A class that mimics a distributed Future, the outcome of performing submit on a distributed client. """ + def __init__(self, result: typing.Any) -> None: self._result = result # type: typing.Any @@ -33,13 +35,24 @@ class SingleThreadedClient(dask.distributed.Client): A class to Mock the Distributed Client class, in case Auto-Sklearn is meant to run in the current Thread. 
""" + def __init__(self) -> None: # Raise a not implemented error if using a method from Client - implemented_methods = ['submit', 'close', 'shutdown', 'write_scheduler_file', - '_get_scheduler_info', 'nthreads'] - method_list = [func for func in dir(dask.distributed.Client) if callable( - getattr(dask.distributed.Client, func)) and not func.startswith('__')] + implemented_methods = [ + "submit", + "close", + "shutdown", + "write_scheduler_file", + "_get_scheduler_info", + "nthreads", + ] + method_list = [ + func + for func in dir(dask.distributed.Client) + if callable(getattr(dask.distributed.Client, func)) + and not func.startswith("__") + ] for method in method_list: if method in implemented_methods: continue @@ -70,17 +83,17 @@ def write_scheduler_file(self, scheduler_file: str) -> None: def _get_scheduler_info(self) -> typing.Dict: return { - 'workers': ['127.0.0.1'], - 'type': 'Scheduler', + "workers": ["127.0.0.1"], + "type": "Scheduler", } def nthreads(self) -> typing.Dict: return { - '127.0.0.1': 1, + "127.0.0.1": 1, } def __repr__(self) -> str: - return 'SingleThreadedClient()' + return "SingleThreadedClient()" def __del__(self) -> None: pass diff --git a/autosklearn/util/stopwatch.py b/autosklearn/util/stopwatch.py index aff17a1acf..9ced028cd0 100644 --- a/autosklearn/util/stopwatch.py +++ b/autosklearn/util/stopwatch.py @@ -5,10 +5,11 @@ @project: AutoML2015 """ +from typing import Tuple + import sys import time from collections import OrderedDict -from typing import Tuple class TimingTask(object): @@ -31,7 +32,7 @@ def stop(self) -> None: self._cpu_dur = self._cpu_tac - self._cpu_tic self._wall_dur = self._wall_tac - self._wall_tic else: - sys.stdout.write('Task has already stopped\n') + sys.stdout.write("Task has already stopped\n") @property def name(self) -> str: @@ -72,7 +73,7 @@ class StopWatch: def __init__(self) -> None: self._tasks = OrderedDict() - self._tasks['stopwatch_time'] = TimingTask('stopwatch_time') + self._tasks["stopwatch_time"] = TimingTask("stopwatch_time") def insert_task(self, name: str, cpu_dur: float, wall_dur: float) -> None: if name not in self._tasks: @@ -109,20 +110,20 @@ def stop_task(self, name: str) -> None: try: self._tasks[name].stop() except KeyError: - sys.stderr.write('There is no such task: %s\n' % name) + sys.stderr.write("There is no such task: %s\n" % name) def get_cpu_dur(self, name: str) -> float: try: return self._tasks[name].cpu_dur except KeyError: - sys.stderr.write('There is no such task: %s\n' % name) + sys.stderr.write("There is no such task: %s\n" % name) return 0.0 def get_wall_dur(self, name: str) -> float: try: return self._tasks[name].wall_dur except KeyError: - sys.stderr.write('There is no such task: %s\n' % name) + sys.stderr.write("There is no such task: %s\n" % name) return 0.0 def cpu_sum(self) -> float: @@ -134,19 +135,27 @@ def wall_sum(self) -> float: return sum([max(0, self._tasks[tsk].wall_dur) for tsk in self._tasks]) def __repr__(self) -> str: - ret_str = '| %10s | %10s | %10s | %10s | %10s | %10s | %10s |\n' % \ - ('Name', 'CPUStart', 'CPUEnd', 'CPUDur', 'WallStart', - 'WallEnd', - 'WallDur') - ret_str += '+' + '------------+' * 7 + '\n' - offset = self._tasks['stopwatch_time'].wall_tic + ret_str = "| %10s | %10s | %10s | %10s | %10s | %10s | %10s |\n" % ( + "Name", + "CPUStart", + "CPUEnd", + "CPUDur", + "WallStart", + "WallEnd", + "WallDur", + ) + ret_str += "+" + "------------+" * 7 + "\n" + offset = self._tasks["stopwatch_time"].wall_tic for tsk in self._tasks: if self._tasks[tsk].wall_tac: wall_tac = 
self._tasks[tsk].wall_tac - offset - ret_str += '| %10s | %10.5f | %10.5f | %10.5f | %10s | %10s | %10s |\n' % \ - (tsk, self._tasks[tsk].cpu_tic, self._tasks[tsk].cpu_tac, - self.cpu_elapsed(tsk), - self._tasks[tsk].wall_tic - offset, - wall_tac if self._tasks[tsk].wall_tac else False, - self.wall_elapsed(tsk)) + ret_str += "| %10s | %10.5f | %10.5f | %10.5f | %10s | %10s | %10s |\n" % ( + tsk, + self._tasks[tsk].cpu_tic, + self._tasks[tsk].cpu_tac, + self.cpu_elapsed(tsk), + self._tasks[tsk].wall_tic - offset, + wall_tac if self._tasks[tsk].wall_tac else False, + self.wall_elapsed(tsk), + ) return ret_str diff --git a/doc/conf.py b/doc/conf.py index 5d114b3550..6efcd5c736 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -21,11 +21,12 @@ import sys import sphinx_bootstrap_theme import autosklearn + # Add the parent directory of this file to the PYTHONPATH import os current_directory = os.path.dirname(__file__) -parent_directory = os.path.join(current_directory, '..') +parent_directory = os.path.join(current_directory, "..") parent_directory = os.path.abspath(parent_directory) sys.path.append(parent_directory) @@ -39,15 +40,21 @@ # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. -extensions = ['sphinx.ext.autodoc', 'sphinx.ext.autosummary', - 'sphinx.ext.doctest', 'sphinx.ext.coverage', - 'sphinx.ext.mathjax', 'sphinx.ext.viewcode', - 'sphinx_gallery.gen_gallery', 'sphinx.ext.autosectionlabel', - 'sphinx_toolbox.collapse', - # sphinx.ext.autosexctionlabel raises duplicate label warnings - # because same section headers are used multiple times throughout - # the documentation. - 'numpydoc'] +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "sphinx.ext.doctest", + "sphinx.ext.coverage", + "sphinx.ext.mathjax", + "sphinx.ext.viewcode", + "sphinx_gallery.gen_gallery", + "sphinx.ext.autosectionlabel", + "sphinx_toolbox.collapse", + # sphinx.ext.autosexctionlabel raises duplicate label warnings + # because same section headers are used multiple times throughout + # the documentation. + "numpydoc", +] from sphinx_gallery.sorting import ExplicitOrder, FileNameSortKey @@ -63,8 +70,9 @@ # Sphinx-gallery configuration. # get current branch -binder_branch = 'master' +binder_branch = "master" import autosklearn + if "dev" in autosklearn.__version__: binder_branch = "development" @@ -72,7 +80,7 @@ # We do this by setting an evironment variable we check and modifying the python config # object. # We have this extra processing as it enters as a raw string and we need a boolean value -gallery_env_var ="SPHINX_GALLERY_PLOT" +gallery_env_var = "SPHINX_GALLERY_PLOT" sphinx_plot_gallery_flag = True if gallery_env_var in os.environ: @@ -82,52 +90,55 @@ elif value in ["True", "true", "1"]: sphinx_plot_gallery_flag = True else: - raise ValueError(f'Env variable {gallery_env_var} must be set to "false" or "true"') + raise ValueError( + f'Env variable {gallery_env_var} must be set to "false" or "true"' + ) sphinx_gallery_conf = { # path to the examples - 'examples_dirs': '../examples', + "examples_dirs": "../examples", # path where to save gallery generated examples - 'gallery_dirs': 'examples', - #TODO: fix back/forward references for the examples. + "gallery_dirs": "examples", + # TODO: fix back/forward references for the examples. 
#'doc_module': ('autosklearn'), #'reference_url': { # 'autosklearn': None - #}, - 'plot_gallery': sphinx_plot_gallery_flag, - 'backreferences_dir': None, - 'filename_pattern': 'example.*.py$', - 'ignore_pattern': r'custom_metrics\.py|__init__\.py|example_parallel_manual_spawning_python.py', - 'binder': { - # Required keys - 'org': 'automl', - 'repo': 'auto-sklearn', - 'branch': binder_branch, - 'binderhub_url': 'https://mybinder.org', - 'dependencies': ['../.binder/apt.txt', '../.binder/requirements.txt'], - #'filepath_prefix': '' # A prefix to prepend to any filepaths in Binder links. - # Jupyter notebooks for Binder will be copied to this directory (relative to built documentation root). - 'notebooks_dir': 'notebooks/', - 'use_jupyter_lab': True, # Whether Binder links should start Jupyter Lab instead of the Jupyter Notebook interface. - }, + # }, + "plot_gallery": sphinx_plot_gallery_flag, + "backreferences_dir": None, + "filename_pattern": "example.*.py$", + "ignore_pattern": r"custom_metrics\.py|__init__\.py|example_parallel_manual_spawning_python.py", + "binder": { + # Required keys + "org": "automl", + "repo": "auto-sklearn", + "branch": binder_branch, + "binderhub_url": "https://mybinder.org", + "dependencies": ["../.binder/apt.txt", "../.binder/requirements.txt"], + #'filepath_prefix': '' # A prefix to prepend to any filepaths in Binder links. + # Jupyter notebooks for Binder will be copied to this directory (relative to built documentation root). + "notebooks_dir": "notebooks/", + "use_jupyter_lab": True, # Whether Binder links should start Jupyter Lab instead of the Jupyter Notebook interface. + }, } # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix of source filenames. -source_suffix = '.rst' +source_suffix = ".rst" # The encoding of source files. # source_encoding = 'utf-8-sig' # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. -project = u'AutoSklearn' -copyright = u"2014-{}, Machine Learning Professorship Freiburg".format( - datetime.datetime.now().year) +project = "AutoSklearn" +copyright = "2014-{}, Machine Learning Professorship Freiburg".format( + datetime.datetime.now().year +) # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -150,7 +161,7 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. -exclude_patterns = ['_build', '_templates', '_static'] +exclude_patterns = ["_build", "_templates", "_static"] # The reST default role (used for this markup: `text`) to use for all # documents. @@ -168,7 +179,7 @@ # show_authors = False # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # A list of ignored prefixes for module index sorting. # modindex_common_prefix = [] @@ -180,44 +191,37 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. -html_theme = 'bootstrap' +html_theme = "bootstrap" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. html_theme_options = { # Navigation bar title. 
(Default: ``project`` value) - 'navbar_title': "auto-sklearn", - + "navbar_title": "auto-sklearn", # Tab name for entire site. (Default: "Site") # 'navbar_site_name': "Site", - # A list of tuples containting pages to link to. The value should # be in the form [(name, page), ..] - 'navbar_links': [ - ('Start', 'index'), - ('Releases', 'releases'), - ('Installation', 'installation'), - ('Manual', 'manual'), - ('Examples', 'examples/index'), - ('API', 'api'), - ('Extending', 'extending'), - ('FAQ', 'faq'), + "navbar_links": [ + ("Start", "index"), + ("Releases", "releases"), + ("Installation", "installation"), + ("Manual", "manual"), + ("Examples", "examples/index"), + ("API", "api"), + ("Extending", "extending"), + ("FAQ", "faq"), ], - # Render the next and previous page links in navbar. (Default: true) - 'navbar_sidebarrel': False, - + "navbar_sidebarrel": False, # Render the current pages TOC in the navbar. (Default: true) - 'navbar_pagenav': False, - + "navbar_pagenav": False, # Tab name for the current pages TOC. (Default: "Page") - 'navbar_pagenav_name': "On this page", - + "navbar_pagenav_name": "On this page", # Global TOC depth for "site" navbar tab. (Default: 1) # Switching to -1 shows all levels. - 'globaltoc_depth': 1, - + "globaltoc_depth": 1, # Include hidden TOCs in Site navbar? # # Note: If this is "false", you cannot have mixed ``:hidden:`` and @@ -225,29 +229,24 @@ # will break. # # Values: "true" (default) or "false" - 'globaltoc_includehidden': "false", - + "globaltoc_includehidden": "false", # HTML navbar class (Default: "navbar") to attach to
element. # For black navbar, do "navbar navbar-inverse" - 'navbar_class': "navbar", - + "navbar_class": "navbar", # Fix navigation bar to top of page? # Values: "true" (default) or "false" - 'navbar_fixed_top': "true", - + "navbar_fixed_top": "true", # Location of link to source. # Options are "nav" (default), "footer" or anything else to exclude. - 'source_link_position': "footer", - + "source_link_position": "footer", # Bootswatch (http://bootswatch.com/) theme. # # Options are nothing with "" (default) or the name of a valid theme # such as "amelia" or "cosmo". - 'bootswatch_theme': "cosmo", - + "bootswatch_theme": "cosmo", # Choose Bootstrap version. # Values: "3" (default) or "2" (in quotes) - 'bootstrap_version': "3", + "bootstrap_version": "3", } # Add any paths that contain custom themes here, relative to this directory. @@ -288,7 +287,7 @@ # html_use_smartypants = True # Custom sidebar templates, maps document names to template names. -html_sidebars = {'**': ['localtoc.html']} +html_sidebars = {"**": ["localtoc.html"]} # Additional templates that should be rendered to pages, maps page names to # template names. @@ -321,7 +320,7 @@ # html_file_suffix = None # Output file base name for HTML help builder. -htmlhelp_basename = 'AutoSklearndoc' +htmlhelp_basename = "AutoSklearndoc" # -- Options for LaTeX output --------------------------------------------- @@ -337,9 +336,15 @@ # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). -latex_documents = [('index', 'AutoSklearn.tex', u'AutoSklearn Documentation', - u'Matthias Feurer, Aaron Klein, Katharina Eggensperger', - 'manual'), ] +latex_documents = [ + ( + "index", + "AutoSklearn.tex", + "AutoSklearn Documentation", + "Matthias Feurer, Aaron Klein, Katharina Eggensperger", + "manual", + ), +] # The name of an image file (relative to this directory) to place at the top of # the title page. @@ -365,8 +370,15 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [('index', 'autosklearn', u'AutoSklearn Documentation', - [u'Matthias Feurer, Aaron Klein, Katharina Eggensperger'], 1)] +man_pages = [ + ( + "index", + "autosklearn", + "AutoSklearn Documentation", + ["Matthias Feurer, Aaron Klein, Katharina Eggensperger"], + 1, + ) +] # If true, show URL addresses after external links. # man_show_urls = False @@ -376,10 +388,17 @@ # Grouping the document tree into Texinfo files. List of tuples # (source start file, target name, title, author, # dir menu entry, description, category) -texinfo_documents = [('index', 'AutoSklearn', u'AutoSklearn Documentation', - u'Matthias Feurer, Aaron Klein, Katharina Eggensperger', - 'AutoSklearn', 'One line description of project.', - 'Miscellaneous'), ] +texinfo_documents = [ + ( + "index", + "AutoSklearn", + "AutoSklearn Documentation", + "Matthias Feurer, Aaron Klein, Katharina Eggensperger", + "AutoSklearn", + "One line description of project.", + "Miscellaneous", + ), +] # Documents to append as an appendix to all manuals. # texinfo_appendices = [] @@ -396,12 +415,12 @@ # Only the class’ docstring is inserted. This is the default. # You can still document __init__ as a separate method using automethod or # the members option to autoclass. -#"both" +# "both" # Both the class’ and the __init__ method’s docstring are concatenated and # inserted. # "init" # Only the __init__ method’s docstring is inserted. 
-autoclass_content = 'both' +autoclass_content = "both" def setup(app): diff --git a/examples/20_basic/example_classification.py b/examples/20_basic/example_classification.py index fcb99b65ef..621dcf4f86 100644 --- a/examples/20_basic/example_classification.py +++ b/examples/20_basic/example_classification.py @@ -20,8 +20,9 @@ # ============ X, y = sklearn.datasets.load_breast_cancer(return_X_y=True) -X_train, X_test, y_train, y_test = \ - sklearn.model_selection.train_test_split(X, y, random_state=1) +X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, y, random_state=1 +) ############################################################################ # Build and fit a classifier @@ -30,9 +31,9 @@ automl = autosklearn.classification.AutoSklearnClassifier( time_left_for_this_task=120, per_run_time_limit=30, - tmp_folder='/tmp/autosklearn_classification_example_tmp', + tmp_folder="/tmp/autosklearn_classification_example_tmp", ) -automl.fit(X_train, y_train, dataset_name='breast_cancer') +automl.fit(X_train, y_train, dataset_name="breast_cancer") ############################################################################ # View the models found by auto-sklearn @@ -52,4 +53,3 @@ predictions = automl.predict(X_test) print("Accuracy score:", sklearn.metrics.accuracy_score(y_test, predictions)) - diff --git a/examples/20_basic/example_multilabel_classification.py b/examples/20_basic/example_multilabel_classification.py index 835b110ea6..bedf974868 100644 --- a/examples/20_basic/example_multilabel_classification.py +++ b/examples/20_basic/example_multilabel_classification.py @@ -29,8 +29,8 @@ # This is to comply with Scikit-learn requirement: # "Positive classes are indicated with 1 and negative classes with 0 or -1." # More information on: https://scikit-learn.org/stable/modules/multiclass.html -y[y == 'TRUE'] = 1 -y[y == 'FALSE'] = 0 +y[y == "TRUE"] = 1 +y[y == "FALSE"] = 0 y = y.astype(int) # Using type of target is a good way to make sure your data @@ -51,9 +51,9 @@ # Bellow two flags are provided to speed up calculations # Not recommended for a real implementation initial_configurations_via_metalearning=0, - smac_scenario_args={'runcount_limit': 1}, + smac_scenario_args={"runcount_limit": 1}, ) -automl.fit(X_train, y_train, dataset_name='reuters') +automl.fit(X_train, y_train, dataset_name="reuters") ############################################################################ # View the models found by auto-sklearn diff --git a/examples/20_basic/example_multioutput_regression.py b/examples/20_basic/example_multioutput_regression.py index a2e345fcac..cb12643adb 100644 --- a/examples/20_basic/example_multioutput_regression.py +++ b/examples/20_basic/example_multioutput_regression.py @@ -32,9 +32,9 @@ automl = AutoSklearnRegressor( time_left_for_this_task=120, per_run_time_limit=30, - tmp_folder='/tmp/autosklearn_multioutput_regression_example_tmp', + tmp_folder="/tmp/autosklearn_multioutput_regression_example_tmp", ) -automl.fit(X_train, y_train, dataset_name='synthetic') +automl.fit(X_train, y_train, dataset_name="synthetic") ############################################################################ # View the models found by auto-sklearn diff --git a/examples/20_basic/example_regression.py b/examples/20_basic/example_regression.py index 6b47607db0..5ade1c2866 100644 --- a/examples/20_basic/example_regression.py +++ b/examples/20_basic/example_regression.py @@ -21,8 +21,9 @@ X, y = sklearn.datasets.load_diabetes(return_X_y=True) -X_train, X_test, y_train, y_test 
= \ - sklearn.model_selection.train_test_split(X, y, random_state=1) +X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, y, random_state=1 +) ########################### # Build and fit a regressor @@ -31,9 +32,9 @@ automl = autosklearn.regression.AutoSklearnRegressor( time_left_for_this_task=120, per_run_time_limit=30, - tmp_folder='/tmp/autosklearn_regression_example_tmp', + tmp_folder="/tmp/autosklearn_regression_example_tmp", ) -automl.fit(X_train, y_train, dataset_name='diabetes') +automl.fit(X_train, y_train, dataset_name="diabetes") ############################################################################ # View the models found by auto-sklearn @@ -69,12 +70,12 @@ # than the true value), points above the diagonal were underestimated (predicted value is lower than # the true value). -plt.scatter(train_predictions, y_train, label="Train samples", c='#d95f02') -plt.scatter(test_predictions, y_test, label="Test samples", c='#7570b3') +plt.scatter(train_predictions, y_train, label="Train samples", c="#d95f02") +plt.scatter(test_predictions, y_test, label="Test samples", c="#7570b3") plt.xlabel("Predicted value") plt.ylabel("True value") plt.legend() -plt.plot([30, 400], [30, 400], c='k', zorder=0) +plt.plot([30, 400], [30, 400], c="k", zorder=0) plt.xlim([30, 400]) plt.ylim([30, 400]) plt.tight_layout() diff --git a/examples/40_advanced/custom_metrics.py b/examples/40_advanced/custom_metrics.py index 6b548e5718..c6ad14efdd 100644 --- a/examples/40_advanced/custom_metrics.py +++ b/examples/40_advanced/custom_metrics.py @@ -9,6 +9,7 @@ # Custom metrics definition # ========================= + def accuracy(solution, prediction): # custom function defining accuracy return np.mean(solution == prediction) diff --git a/examples/40_advanced/example_calc_multiple_metrics.py b/examples/40_advanced/example_calc_multiple_metrics.py index c7a4e78503..fa4d17cc1e 100644 --- a/examples/40_advanced/example_calc_multiple_metrics.py +++ b/examples/40_advanced/example_calc_multiple_metrics.py @@ -25,9 +25,9 @@ def error(solution, prediction): def get_metric_result(cv_results): results = pd.DataFrame.from_dict(cv_results) - results = results[results['status'] == "Success"] - cols = ['rank_test_scores', 'param_classifier:__choice__', 'mean_test_score'] - cols.extend([key for key in cv_results.keys() if key.startswith('metric_')]) + results = results[results["status"] == "Success"] + cols = ["rank_test_scores", "param_classifier:__choice__", "mean_test_score"] + cols.extend([key for key in cv_results.keys() if key.startswith("metric_")]) return results[cols] @@ -36,25 +36,26 @@ def get_metric_result(cv_results): # ============ X, y = sklearn.datasets.load_breast_cancer(return_X_y=True) -X_train, X_test, y_train, y_test = \ - sklearn.model_selection.train_test_split(X, y, random_state=1) +X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, y, random_state=1 +) ############################################################################ # Build and fit a classifier # ========================== error_rate = autosklearn.metrics.make_scorer( - name='custom_error', + name="custom_error", score_func=error, optimum=0, greater_is_better=False, needs_proba=False, - needs_threshold=False + needs_threshold=False, ) cls = autosklearn.classification.AutoSklearnClassifier( time_left_for_this_task=120, per_run_time_limit=30, - scoring_functions=[balanced_accuracy, precision, recall, f1, error_rate] + scoring_functions=[balanced_accuracy, precision, recall, f1, 
error_rate], ) cls.fit(X_train, y_train, X_test, y_test) diff --git a/examples/40_advanced/example_debug_logging.py b/examples/40_advanced/example_debug_logging.py index 07e2e3ed99..664ce0b461 100644 --- a/examples/40_advanced/example_debug_logging.py +++ b/examples/40_advanced/example_debug_logging.py @@ -28,8 +28,9 @@ # Load kr-vs-kp dataset from https://www.openml.org/d/3 X, y = data = sklearn.datasets.fetch_openml(data_id=3, return_X_y=True, as_frame=True) -X_train, X_test, y_train, y_test = \ - sklearn.model_selection.train_test_split(X, y, random_state=1) +X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, y, random_state=1 +) ############################################################################ @@ -40,33 +41,31 @@ # We will instead create a custom one as follows: logging_config = { - 'version': 1, - 'disable_existing_loggers': True, - 'formatters': { - 'custom': { + "version": 1, + "disable_existing_loggers": True, + "formatters": { + "custom": { # More format options are available in the official # `documentation `_ - 'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s' + "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s" } }, - # Any INFO level msg will be printed to the console - 'handlers': { - 'console': { - 'level': 'INFO', - 'formatter': 'custom', - 'class': 'logging.StreamHandler', - 'stream': 'ext://sys.stdout', + "handlers": { + "console": { + "level": "INFO", + "formatter": "custom", + "class": "logging.StreamHandler", + "stream": "ext://sys.stdout", }, }, - - 'loggers': { - '': { # root logger - 'level': 'DEBUG', + "loggers": { + "": { # root logger + "level": "DEBUG", }, - 'Client-EnsembleBuilder': { - 'level': 'DEBUG', - 'handlers': ['console'], + "Client-EnsembleBuilder": { + "level": "DEBUG", + "handlers": ["console"], }, }, } @@ -80,11 +79,11 @@ # Bellow two flags are provided to speed up calculations # Not recommended for a real implementation initial_configurations_via_metalearning=0, - smac_scenario_args={'runcount_limit': 2}, + smac_scenario_args={"runcount_limit": 2}, # Pass the config file we created logging_config=logging_config, # *auto-sklearn* generates temporal files under tmp_folder - tmp_folder='./tmp_folder', + tmp_folder="./tmp_folder", # By default tmp_folder is deleted. We will preserve it # for debug purposes delete_tmp_folder_after_terminate=False, @@ -101,5 +100,5 @@ # * tmp_folder/smac3-output # Auto-sklearn always outputs to this log file # tmp_folder/AutoML*.log -for filename in pathlib.Path('./tmp_folder').glob('*'): +for filename in pathlib.Path("./tmp_folder").glob("*"): print(filename) diff --git a/examples/40_advanced/example_feature_types.py b/examples/40_advanced/example_feature_types.py index 6317eb5a46..7d22edd715 100644 --- a/examples/40_advanced/example_feature_types.py +++ b/examples/40_advanced/example_feature_types.py @@ -4,9 +4,10 @@ Feature Types ============= -In *auto-sklearn* it is possible to specify the feature types of a dataset when calling the method -:meth:`fit() ` by specifying the argument -``feat_type``. The following example demonstrates a way it can be done. +In *auto-sklearn* it is possible to specify the feature types of a dataset when calling +the method :meth:`fit() ` by +specifying the argument ``feat_type``. +The following example demonstrates a way it can be done. 
Additionally, you can provide a properly formatted pandas DataFrame, and the feature types will be automatically inferred, as demonstrated in @@ -26,11 +27,12 @@ # ============ # Load Australian dataset from https://www.openml.org/d/40981 bunch = data = sklearn.datasets.fetch_openml(data_id=40981, as_frame=True) -y = bunch['target'].to_numpy() -X = bunch['data'].to_numpy(np.float) +y = bunch["target"].to_numpy() +X = bunch["data"].to_numpy(np.float) -X_train, X_test, y_train, y_test = \ - sklearn.model_selection.train_test_split(X, y, random_state=1) +X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, y, random_state=1 +) # Auto-sklearn can automatically recognize categorical/numerical data from a pandas # DataFrame. This example highlights how the user can provide the feature types, @@ -38,8 +40,7 @@ # feat_type is a list that tags each column from a DataFrame/ numpy array / list # with the case-insensitive string categorical or numerical, accordingly. feat_type = [ - 'Categorical' if x.name == 'category' else 'Numerical' - for x in bunch['data'].dtypes + "Categorical" if x.name == "category" else "Numerical" for x in bunch["data"].dtypes ] ############################################################################ @@ -51,7 +52,7 @@ # Bellow two flags are provided to speed up calculations # Not recommended for a real implementation initial_configurations_via_metalearning=0, - smac_scenario_args={'runcount_limit': 1}, + smac_scenario_args={"runcount_limit": 1}, ) cls.fit(X_train, y_train, X_test, y_test, feat_type=feat_type) diff --git a/examples/40_advanced/example_get_pipeline_components.py b/examples/40_advanced/example_get_pipeline_components.py index f7a97ead27..80686889ac 100644 --- a/examples/40_advanced/example_get_pipeline_components.py +++ b/examples/40_advanced/example_get_pipeline_components.py @@ -27,8 +27,9 @@ # ============ X, y = sklearn.datasets.load_breast_cancer(return_X_y=True) -X_train, X_test, y_train, y_test = \ - sklearn.model_selection.train_test_split(X, y, random_state=1) +X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, y, random_state=1 +) ############################################################################ # Build and fit the classifier @@ -40,20 +41,16 @@ disable_evaluator_output=False, # To simplify querying the models in the final ensemble, we # restrict auto-sklearn to use only pca as a preprocessor - include={ - 'feature_preprocessor': ['pca'] - }, + include={"feature_preprocessor": ["pca"]}, ) -automl.fit(X_train, y_train, dataset_name='breast_cancer') +automl.fit(X_train, y_train, dataset_name="breast_cancer") ############################################################################ # Predict using the model # ======================= predictions = automl.predict(X_test) -print("Accuracy score:{}".format( - sklearn.metrics.accuracy_score(y_test, predictions)) -) +print("Accuracy score:{}".format(sklearn.metrics.accuracy_score(y_test, predictions))) ############################################################################ @@ -104,7 +101,7 @@ # Let's iterative over all entries for run_key in automl.automl_.runhistory_.data: - print('#########') + print("#########") print(run_key) print(automl.automl_.runhistory_.data[run_key]) @@ -166,7 +163,7 @@ print("Lowest loss:", losses_and_configurations[0][0]) print( "Best configuration:", - automl.automl_.runhistory_.ids_config[losses_and_configurations[0][1]] + automl.automl_.runhistory_.ids_config[losses_and_configurations[0][1]], ) 
############################################################################ @@ -188,7 +185,7 @@ # The explained variance ratio per stage for i, (weight, pipeline) in enumerate(automl.get_models_with_weights()): for stage_name, component in pipeline.named_steps.items(): - if 'feature_preprocessor' in stage_name: + if "feature_preprocessor" in stage_name: print( "The {}th pipeline has a explained variance of {}".format( i, @@ -196,6 +193,6 @@ # Access the sklearn object via the choice attribute # We want the explained variance attributed of # each principal component - component.choice.preprocessor.explained_variance_ratio_ + component.choice.preprocessor.explained_variance_ratio_, ) ) diff --git a/examples/40_advanced/example_inspect_predictions.py b/examples/40_advanced/example_inspect_predictions.py index 24e149a37b..cf6de2476f 100644 --- a/examples/40_advanced/example_inspect_predictions.py +++ b/examples/40_advanced/example_inspect_predictions.py @@ -36,9 +36,9 @@ automl = autosklearn.classification.AutoSklearnClassifier( time_left_for_this_task=120, per_run_time_limit=30, - tmp_folder='/tmp/autosklearn_inspect_predictions_example_tmp', + tmp_folder="/tmp/autosklearn_inspect_predictions_example_tmp", ) -automl.fit(X_train, y_train, dataset_name='Run_or_walk_information') +automl.fit(X_train, y_train, dataset_name="Run_or_walk_information") s = automl.score(X_train, y_train) print(f"Train score {s}") @@ -61,16 +61,19 @@ r = permutation_importance(automl, X_test, y_test, n_repeats=10, random_state=0) sort_idx = r.importances_mean.argsort()[::-1] -plt.boxplot(r.importances[sort_idx].T, - labels=[dataset.feature_names[i] for i in sort_idx]) +plt.boxplot( + r.importances[sort_idx].T, labels=[dataset.feature_names[i] for i in sort_idx] +) plt.xticks(rotation=90) plt.tight_layout() plt.show() for i in sort_idx[::-1]: - print(f"{dataset.feature_names[i]:10s}: {r.importances_mean[i]:.3f} +/- " - f"{r.importances_std[i]:.3f}") + print( + f"{dataset.feature_names[i]:10s}: {r.importances_mean[i]:.3f} +/- " + f"{r.importances_std[i]:.3f}" + ) ############################################################################################ # Create partial dependence (PD) and individual conditional expectation (ICE) plots - part 2 @@ -90,11 +93,14 @@ # combining ICE (thin lines) and PD (thick line) features = [1, 2] -plot_partial_dependence(automl, dataset.data, - features=features, - grid_resolution=5, - kind="both", - feature_names=dataset.feature_names) +plot_partial_dependence( + automl, + dataset.data, + features=features, + grid_resolution=5, + kind="both", + feature_names=dataset.feature_names, +) plt.tight_layout() plt.show() @@ -106,9 +112,12 @@ # these features. Again, we'll look at acceleration_y and acceleration_z. 
features = [[1, 2]] -plot_partial_dependence(automl, dataset.data, - features=features, - grid_resolution=5, - feature_names=dataset.feature_names) +plot_partial_dependence( + automl, + dataset.data, + features=features, + grid_resolution=5, + feature_names=dataset.feature_names, +) plt.tight_layout() plt.show() diff --git a/examples/40_advanced/example_interpretable_models.py b/examples/40_advanced/example_interpretable_models.py index a78695082c..7b551de7b8 100644 --- a/examples/40_advanced/example_interpretable_models.py +++ b/examples/40_advanced/example_interpretable_models.py @@ -29,7 +29,9 @@ # Show available preprocessors # ============================ -from autosklearn.pipeline.components.feature_preprocessing import FeaturePreprocessorChoice +from autosklearn.pipeline.components.feature_preprocessing import ( + FeaturePreprocessorChoice, +) for name in FeaturePreprocessorChoice.get_components(): print(name) @@ -39,8 +41,9 @@ # ============ X, y = sklearn.datasets.load_breast_cancer(return_X_y=True) -X_train, X_test, y_train, y_test = \ - sklearn.model_selection.train_test_split(X, y, random_state=1) +X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, y, random_state=1 +) ############################################################################ # Build and fit a classifier @@ -55,18 +58,18 @@ automl = autosklearn.classification.AutoSklearnClassifier( time_left_for_this_task=120, per_run_time_limit=30, - tmp_folder='/tmp/autosklearn_interpretable_models_example_tmp', + tmp_folder="/tmp/autosklearn_interpretable_models_example_tmp", include={ - 'classifier': [ - 'decision_tree', 'lda', 'sgd' + "classifier": ["decision_tree", "lda", "sgd"], + "feature_preprocessor": [ + "no_preprocessing", + "polynomial", + "select_percentile_classification", ], - 'feature_preprocessor': [ - 'no_preprocessing', 'polynomial', 'select_percentile_classification' - ] }, ensemble_size=1, ) -automl.fit(X_train, y_train, dataset_name='breast_cancer') +automl.fit(X_train, y_train, dataset_name="breast_cancer") ############################################################################ # Print the final ensemble constructed by auto-sklearn diff --git a/examples/40_advanced/example_metrics.py b/examples/40_advanced/example_metrics.py index 2cf39f1553..33d0f678fd 100644 --- a/examples/40_advanced/example_metrics.py +++ b/examples/40_advanced/example_metrics.py @@ -51,8 +51,9 @@ def error_wk(solution, prediction, extra_argument): # ============ X, y = sklearn.datasets.load_breast_cancer(return_X_y=True) -X_train, X_test, y_train, y_test = \ - sklearn.model_selection.train_test_split(X, y, random_state=1) +X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, y, random_state=1 +) ############################################################################ # Print a list of available metrics @@ -68,7 +69,7 @@ def error_wk(solution, prediction, extra_argument): # First example: Use predefined accuracy metric # ============================================= -print("#"*80) +print("#" * 80) print("Use predefined accuracy metric") cls = autosklearn.classification.AutoSklearnClassifier( time_left_for_this_task=60, @@ -87,7 +88,7 @@ def error_wk(solution, prediction, extra_argument): # Second example: Use own accuracy metric # ======================================= -print("#"*80) +print("#" * 80) print("Use self defined accuracy metric") accuracy_scorer = autosklearn.metrics.make_scorer( name="accu", @@ -114,15 +115,15 @@ def error_wk(solution, prediction, 
extra_argument): # Third example: Use own error metric # =================================== -print("#"*80) +print("#" * 80) print("Use self defined error metric") error_rate = autosklearn.metrics.make_scorer( - name='error', + name="error", score_func=error, optimum=0, greater_is_better=False, needs_proba=False, - needs_threshold=False + needs_threshold=False, ) cls = autosklearn.classification.AutoSklearnClassifier( time_left_for_this_task=60, @@ -141,7 +142,7 @@ def error_wk(solution, prediction, extra_argument): # Fourth example: Use own accuracy metric with additional argument # ================================================================ -print("#"*80) +print("#" * 80) print("Use self defined accuracy with additional argument") accuracy_scorer = autosklearn.metrics.make_scorer( name="accu_add", @@ -153,10 +154,7 @@ def error_wk(solution, prediction, extra_argument): extra_argument=None, ) cls = autosklearn.classification.AutoSklearnClassifier( - time_left_for_this_task=60, - per_run_time_limit=30, - seed=1, - metric=accuracy_scorer + time_left_for_this_task=60, per_run_time_limit=30, seed=1, metric=accuracy_scorer ) cls.fit(X_train, y_train) @@ -169,7 +167,7 @@ def error_wk(solution, prediction, extra_argument): # Fifth example: Use own accuracy metric with additional argument # =============================================================== -print("#"*80) +print("#" * 80) print("Use self defined error with additional argument") error_rate = autosklearn.metrics.make_scorer( name="error_add", diff --git a/examples/40_advanced/example_pandas_train_test.py b/examples/40_advanced/example_pandas_train_test.py index 910cac4c31..7e584fd8aa 100644 --- a/examples/40_advanced/example_pandas_train_test.py +++ b/examples/40_advanced/example_pandas_train_test.py @@ -58,22 +58,19 @@ # Targets for classification are also automatically encoded # If using fetch_openml, data is already properly encoded, below # is an example for user reference -X = pd.DataFrame( - data=X, - columns=['A' + str(i) for i in range(1, 15)] -) -desired_boolean_columns = ['A1'] -desired_categorical_columns = ['A4', 'A5', 'A6', 'A8', 'A9', 'A11', 'A12'] -desired_numerical_columns = ['A2', 'A3', 'A7', 'A10', 'A13', 'A14'] +X = pd.DataFrame(data=X, columns=["A" + str(i) for i in range(1, 15)]) +desired_boolean_columns = ["A1"] +desired_categorical_columns = ["A4", "A5", "A6", "A8", "A9", "A11", "A12"] +desired_numerical_columns = ["A2", "A3", "A7", "A10", "A13", "A14"] for column in X.columns: if column in desired_boolean_columns: - X[column] = X[column].astype('bool') + X[column] = X[column].astype("bool") elif column in desired_categorical_columns: - X[column] = X[column].astype('category') + X[column] = X[column].astype("category") else: X[column] = pd.to_numeric(X[column]) -y = pd.DataFrame(y, dtype='category') +y = pd.DataFrame(y, dtype="category") X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( X, y, test_size=0.5, random_state=3 @@ -100,15 +97,15 @@ ############################################################################ # Plot the ensemble performance # =================================== -# The *performance_over_time_* attribute returns a pandas dataframe, which can +# The *performance_over_time_* attribute returns a pandas dataframe, which can # be directly used for plotting poT = cls.performance_over_time_ poT.plot( - x='Timestamp', - kind='line', + x="Timestamp", + kind="line", legend=True, - title='Auto-sklearn accuracy over time', + title="Auto-sklearn accuracy over time", 
grid=True, ) plt.show() diff --git a/examples/40_advanced/example_resampling.py b/examples/40_advanced/example_resampling.py index 124316a60a..aa6a272373 100644 --- a/examples/40_advanced/example_resampling.py +++ b/examples/40_advanced/example_resampling.py @@ -22,8 +22,9 @@ # ============ X, y = sklearn.datasets.load_breast_cancer(return_X_y=True) -X_train, X_test, y_train, y_test = \ - sklearn.model_selection.train_test_split(X, y, random_state=1) +X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, y, random_state=1 +) ############################################################################ # Holdout @@ -32,15 +33,15 @@ automl = autosklearn.classification.AutoSklearnClassifier( time_left_for_this_task=120, per_run_time_limit=30, - tmp_folder='/tmp/autosklearn_resampling_example_tmp', + tmp_folder="/tmp/autosklearn_resampling_example_tmp", disable_evaluator_output=False, # 'holdout' with 'train_size'=0.67 is the default argument setting # for AutoSklearnClassifier. It is explicitly specified in this example # for demonstrational purpose. - resampling_strategy='holdout', - resampling_strategy_arguments={'train_size': 0.67}, + resampling_strategy="holdout", + resampling_strategy_arguments={"train_size": 0.67}, ) -automl.fit(X_train, y_train, dataset_name='breast_cancer') +automl.fit(X_train, y_train, dataset_name="breast_cancer") ############################################################################ # Get the Score of the final ensemble @@ -57,18 +58,18 @@ automl = autosklearn.classification.AutoSklearnClassifier( time_left_for_this_task=120, per_run_time_limit=30, - tmp_folder='/tmp/autosklearn_resampling_example_tmp', + tmp_folder="/tmp/autosklearn_resampling_example_tmp", disable_evaluator_output=False, - resampling_strategy='cv', - resampling_strategy_arguments={'folds': 5}, + resampling_strategy="cv", + resampling_strategy_arguments={"folds": 5}, ) -automl.fit(X_train, y_train, dataset_name='breast_cancer') +automl.fit(X_train, y_train, dataset_name="breast_cancer") # One can use models trained during cross-validation directly to predict # for unseen data. For this, all k models trained during k-fold # cross-validation are considered as a single soft-voting ensemble inside # the ensemble constructed with ensemble selection. -print('Before re-fit') +print("Before re-fit") predictions = automl.predict(X_test) print("Accuracy score CV", sklearn.metrics.accuracy_score(y_test, predictions)) @@ -78,7 +79,7 @@ # During fit(), models are fit on individual cross-validation folds. To use # all available data, we call refit() which trains all models in the # final ensemble on the whole dataset. -print('After re-fit') +print("After re-fit") automl.refit(X_train.copy(), y_train.copy()) predictions = automl.predict(X_test) print("Accuracy score CV", sklearn.metrics.accuracy_score(y_test, predictions)) @@ -106,11 +107,11 @@ automl = autosklearn.classification.AutoSklearnClassifier( time_left_for_this_task=120, per_run_time_limit=30, - tmp_folder='/tmp/autosklearn_resampling_example_tmp', + tmp_folder="/tmp/autosklearn_resampling_example_tmp", disable_evaluator_output=False, resampling_strategy=resampling_strategy, ) -automl.fit(X_train, y_train, dataset_name='breast_cancer') +automl.fit(X_train, y_train, dataset_name="breast_cancer") print(automl.sprint_statistics()) @@ -126,4 +127,6 @@ # Obviously, this score is pretty bad as we "destroyed" the dataset by # splitting it on the first feature. 
predictions = automl.predict(X_test) -print("Accuracy score custom split", sklearn.metrics.accuracy_score(y_test, predictions)) +print( + "Accuracy score custom split", sklearn.metrics.accuracy_score(y_test, predictions) +) diff --git a/examples/40_advanced/example_single_configuration.py b/examples/40_advanced/example_single_configuration.py index 3d230f4ab0..d216caef7c 100644 --- a/examples/40_advanced/example_single_configuration.py +++ b/examples/40_advanced/example_single_configuration.py @@ -46,9 +46,7 @@ # We will limit the configuration space only to # have RandomForest as a valid model. We recommend enabling all # possible models to get a better performance. - include={ - 'classifier': ['random_forest'] - }, + include={"classifier": ["random_forest"]}, delete_tmp_folder_after_terminate=False, ) @@ -60,17 +58,21 @@ # min_samples_split in the Random Forest. We recommend you to look into # how the ConfigSpace package works here: # https://automl.github.io/ConfigSpace/master/ -cs = cls.get_configuration_space(X, y, dataset_name='kr-vs-kp') +cs = cls.get_configuration_space(X, y, dataset_name="kr-vs-kp") config = cs.sample_configuration() -config._values['classifier:random_forest:min_samples_split'] = 11 +config._values["classifier:random_forest:min_samples_split"] = 11 # Make sure that your changed configuration complies with the configuration space config.is_valid_configuration() -pipeline, run_info, run_value = cls.fit_pipeline(X=X_train, y=y_train, - dataset_name='kr-vs-kp', - config=config, - X_test=X_test, y_test=y_test) +pipeline, run_info, run_value = cls.fit_pipeline( + X=X_train, + y=y_train, + dataset_name="kr-vs-kp", + config=config, + X_test=X_test, + y_test=y_test, +) # This object complies with Scikit-Learn Pipeline API. # https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html @@ -84,9 +86,9 @@ # We can make sure that our pipeline configuration was honored as follows print("Passed Configuration:", pipeline.config) -print("Random Forest:", pipeline.named_steps['classifier'].choice.estimator) +print("Random Forest:", pipeline.named_steps["classifier"].choice.estimator) # We can also search for new configurations using the fit() method # Any configurations found by Auto-Sklearn -- even the ones created using # fit_pipeline() are stored to disk and can be used for Ensemble Selection -cs = cls.fit(X, y, dataset_name='kr-vs-kp') +cs = cls.fit(X, y, dataset_name="kr-vs-kp") diff --git a/examples/40_advanced/example_text_preprocessing.py b/examples/40_advanced/example_text_preprocessing.py index f60188781b..76c2d91cfc 100644 --- a/examples/40_advanced/example_text_preprocessing.py +++ b/examples/40_advanced/example_text_preprocessing.py @@ -25,20 +25,28 @@ print(f"{X.info()}\n") # manually convert these to string columns -X = X.astype({'name': 'string', 'ticket': 'string', 'cabin': 'string', 'boat': 'string', - 'home.dest': 'string'}) +X = X.astype( + { + "name": "string", + "ticket": "string", + "cabin": "string", + "boat": "string", + "home.dest": "string", + } +) # now *auto-sklearn* handles the string columns with its text feature preprocessing pipeline -X_train, X_test, y_train, y_test = \ - sklearn.model_selection.train_test_split(X, y, random_state=1) +X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, y, random_state=1 +) cls = autosklearn.classification.AutoSklearnClassifier( time_left_for_this_task=30, # Bellow two flags are provided to speed up calculations # Not recommended for a real implementation 
initial_configurations_via_metalearning=0, - smac_scenario_args={'runcount_limit': 1}, + smac_scenario_args={"runcount_limit": 1}, ) cls.fit(X_train, y_train, X_test, y_test) @@ -48,20 +56,24 @@ X, y = sklearn.datasets.fetch_openml(data_id=40945, return_X_y=True, as_frame=True) -X = X.select_dtypes(exclude=['object']) +X = X.select_dtypes(exclude=["object"]) -X_train, X_test, y_train, y_test = \ - sklearn.model_selection.train_test_split(X, y, random_state=1) +X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, y, random_state=1 +) cls = autosklearn.classification.AutoSklearnClassifier( time_left_for_this_task=30, # Bellow two flags are provided to speed up calculations # Not recommended for a real implementation initial_configurations_via_metalearning=0, - smac_scenario_args={'runcount_limit': 1}, + smac_scenario_args={"runcount_limit": 1}, ) cls.fit(X_train, y_train, X_test, y_test) predictions = cls.predict(X_test) -print("Accuracy score without text preprocessing", sklearn.metrics.accuracy_score(y_test, predictions)) +print( + "Accuracy score without text preprocessing", + sklearn.metrics.accuracy_score(y_test, predictions), +) diff --git a/examples/60_search/example_parallel_manual_spawning_cli.py b/examples/60_search/example_parallel_manual_spawning_cli.py index 41200cd78c..fa2bff375b 100644 --- a/examples/60_search/example_parallel_manual_spawning_cli.py +++ b/examples/60_search/example_parallel_manual_spawning_cli.py @@ -68,7 +68,7 @@ from autosklearn.classification import AutoSklearnClassifier from autosklearn.constants import MULTICLASS_CLASSIFICATION -tmp_folder = '/tmp/autosklearn_parallel_3_example_tmp' +tmp_folder = "/tmp/autosklearn_parallel_3_example_tmp" worker_processes = [] @@ -83,7 +83,7 @@ # location. This filename is also given to the worker so they can find all # relevant information to connect to the scheduler. -scheduler_file_name = 'scheduler-file.json' +scheduler_file_name = "scheduler-file.json" ############################################################################ @@ -99,12 +99,16 @@ # We will now execute this bash command from within Python to have a # self-contained example: + def cli_start_scheduler(scheduler_file_name): - command = ( - f"dask-scheduler --scheduler-file {scheduler_file_name} --idle-timeout 10" + command = f"dask-scheduler --scheduler-file {scheduler_file_name} --idle-timeout 10" + proc = subprocess.run( + command, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + shell=True, + check=True, ) - proc = subprocess.run(command, stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, shell=True, check=True) while proc.returncode is None: time.sleep(1) @@ -112,7 +116,7 @@ def cli_start_scheduler(scheduler_file_name): if __name__ == "__main__": process_python_worker = multiprocessing.Process( target=cli_start_scheduler, - args=(scheduler_file_name, ), + args=(scheduler_file_name,), ) process_python_worker.start() worker_processes.append(process_python_worker) @@ -141,22 +145,25 @@ def cli_start_scheduler(scheduler_file_name): # We disable dask's memory management by passing ``--memory-limit`` as # Auto-sklearn does the memory management itself. 
+ def cli_start_worker(scheduler_file_name): command = ( "DASK_DISTRIBUTED__WORKER__DAEMON=False " "dask-worker --nthreads 1 --lifetime 35 --memory-limit 0 " f"--scheduler-file {scheduler_file_name}" ) - proc = subprocess.run(command, stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, shell=True) + proc = subprocess.run( + command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True + ) while proc.returncode is None: time.sleep(1) -if __name__ == '__main__': + +if __name__ == "__main__": for _ in range(2): process_cli_worker = multiprocessing.Process( target=cli_start_worker, - args=(scheduler_file_name, ), + args=(scheduler_file_name,), ) process_cli_worker.start() worker_processes.append(process_cli_worker) @@ -178,8 +185,9 @@ def cli_start_worker(scheduler_file_name): # ~~~~~~~~~~~~~~~~~~ if __name__ == "__main__": X, y = sklearn.datasets.load_breast_cancer(return_X_y=True) - X_train, X_test, y_train, y_test = \ - sklearn.model_selection.train_test_split(X, y, random_state=1) + X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, y, random_state=1 + ) automl = AutoSklearnClassifier( delete_tmp_folder_after_terminate=False, @@ -198,7 +206,7 @@ def cli_start_worker(scheduler_file_name): automl.fit_ensemble( y_train, task=MULTICLASS_CLASSIFICATION, - dataset_name='digits', + dataset_name="digits", ensemble_size=20, ensemble_nbest=50, ) @@ -215,7 +223,7 @@ def cli_start_worker(scheduler_file_name): # This is only necessary if the workers are started from within this python # script. In a real application one would start them directly from the command # line. -if __name__ == '__main__': +if __name__ == "__main__": process_python_worker.join() for process in worker_processes: process.join() diff --git a/examples/60_search/example_parallel_manual_spawning_python.py b/examples/60_search/example_parallel_manual_spawning_python.py index ed723598a9..75c5bcee30 100644 --- a/examples/60_search/example_parallel_manual_spawning_python.py +++ b/examples/60_search/example_parallel_manual_spawning_python.py @@ -58,7 +58,7 @@ from autosklearn.classification import AutoSklearnClassifier from autosklearn.constants import MULTICLASS_CLASSIFICATION -tmp_folder = '/tmp/autosklearn_parallel_2_example_tmp' +tmp_folder = "/tmp/autosklearn_parallel_2_example_tmp" ############################################################################ @@ -73,8 +73,9 @@ # https://docs.dask.org/en/latest/setup/python-advanced.html for further # information. + def start_python_worker(scheduler_address): - dask.config.set({'distributed.worker.daemon': False}) + dask.config.set({"distributed.worker.daemon": False}) async def do_work(): async with dask.distributed.Nanny( @@ -97,14 +98,17 @@ async def do_work(): # To use auto-sklearn in parallel we must guard the code with # ``if __name__ == '__main__'``. We then start a dask cluster as a context, # which means that it is automatically stopped once all computation is done. -if __name__ == '__main__': +if __name__ == "__main__": X, y = sklearn.datasets.load_breast_cancer(return_X_y=True) - X_train, X_test, y_train, y_test = \ - sklearn.model_selection.train_test_split(X, y, random_state=1) + X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, y, random_state=1 + ) # 1. Create a dask scheduler (LocalCluster) with dask.distributed.LocalCluster( - n_workers=0, processes=True, threads_per_worker=1, + n_workers=0, + processes=True, + threads_per_worker=1, ) as cluster: # 2. 
Start the workers @@ -114,7 +118,7 @@ async def do_work(): for _ in range(2): process_python_worker = multiprocessing.Process( target=start_python_worker, - args=(cluster.scheduler_address, ), + args=(cluster.scheduler_address,), ) process_python_worker.start() worker_processes.append(process_python_worker) @@ -141,7 +145,7 @@ async def do_work(): automl.fit_ensemble( y_train, task=MULTICLASS_CLASSIFICATION, - dataset_name='digits', + dataset_name="digits", ensemble_size=20, ensemble_nbest=50, ) diff --git a/examples/60_search/example_parallel_n_jobs.py b/examples/60_search/example_parallel_n_jobs.py index b7265ce3fa..1cb5014ca8 100644 --- a/examples/60_search/example_parallel_n_jobs.py +++ b/examples/60_search/example_parallel_n_jobs.py @@ -27,26 +27,27 @@ # Data Loading # ============ X, y = sklearn.datasets.load_breast_cancer(return_X_y=True) -X_train, X_test, y_train, y_test = \ - sklearn.model_selection.train_test_split(X, y, random_state=1) +X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, y, random_state=1 +) ############################################################################ # Build and fit a classifier # ========================== # # To use ``n_jobs_`` we must guard the code -if __name__ == '__main__': +if __name__ == "__main__": automl = autosklearn.classification.AutoSklearnClassifier( time_left_for_this_task=120, per_run_time_limit=30, - tmp_folder='/tmp/autosklearn_parallel_1_example_tmp', + tmp_folder="/tmp/autosklearn_parallel_1_example_tmp", n_jobs=4, # Each one of the 4 jobs is allocated 3GB memory_limit=3072, seed=5, ) - automl.fit(X_train, y_train, dataset_name='breast_cancer') + automl.fit(X_train, y_train, dataset_name="breast_cancer") # Print statistics about the auto-sklearn run such as number of # iterations, number of models failed with a time out. diff --git a/examples/60_search/example_random_search.py b/examples/60_search/example_random_search.py index 2c9cc76695..520c8c18b0 100644 --- a/examples/60_search/example_random_search.py +++ b/examples/60_search/example_random_search.py @@ -29,8 +29,9 @@ # ============ X, y = sklearn.datasets.load_breast_cancer(return_X_y=True) -X_train, X_test, y_train, y_test = \ - sklearn.model_selection.train_test_split(X, y, random_state=1) +X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, y, random_state=1 +) ############################################################################ @@ -48,8 +49,10 @@ def get_roar_object_callback( """Random online adaptive racing.""" if n_jobs > 1 or (dask_client and len(dask_client.nthreads()) > 1): - raise ValueError("Please make sure to guard the code invoking Auto-sklearn by " - "`if __name__ == '__main__'` and remove this exception.") + raise ValueError( + "Please make sure to guard the code invoking Auto-sklearn by " + "`if __name__ == '__main__'` and remove this exception." 
+ ) scenario = Scenario(scenario_dict) return ROAR( @@ -66,15 +69,15 @@ def get_roar_object_callback( automl = autosklearn.classification.AutoSklearnClassifier( time_left_for_this_task=60, per_run_time_limit=15, - tmp_folder='/tmp/autosklearn_random_search_example_tmp', + tmp_folder="/tmp/autosklearn_random_search_example_tmp", initial_configurations_via_metalearning=0, # The callback to get the SMAC object get_smac_object_callback=get_roar_object_callback, ) -automl.fit(X_train, y_train, dataset_name='breast_cancer') +automl.fit(X_train, y_train, dataset_name="breast_cancer") -print('#' * 80) -print('Results for ROAR.') +print("#" * 80) +print("Results for ROAR.") # Print the final ensemble constructed by auto-sklearn via ROAR. pprint(automl.show_models(), indent=4) predictions = automl.predict(X_test) @@ -88,22 +91,18 @@ def get_roar_object_callback( # Fit a classifier using Random Search # ==================================== def get_random_search_object_callback( - scenario_dict, - seed, - ta, - ta_kwargs, - metalearning_configurations, - n_jobs, - dask_client + scenario_dict, seed, ta, ta_kwargs, metalearning_configurations, n_jobs, dask_client ): - """ Random search """ + """Random search""" if n_jobs > 1 or (dask_client and len(dask_client.nthreads()) > 1): - raise ValueError("Please make sure to guard the code invoking Auto-sklearn by " - "`if __name__ == '__main__'` and remove this exception.") + raise ValueError( + "Please make sure to guard the code invoking Auto-sklearn by " + "`if __name__ == '__main__'` and remove this exception." + ) - scenario_dict['minR'] = len(scenario_dict['instances']) - scenario_dict['initial_incumbent'] = 'RANDOM' + scenario_dict["minR"] = len(scenario_dict["instances"]) + scenario_dict["initial_incumbent"] = "RANDOM" scenario = Scenario(scenario_dict) return ROAR( scenario=scenario, @@ -119,15 +118,15 @@ def get_random_search_object_callback( automl = autosklearn.classification.AutoSklearnClassifier( time_left_for_this_task=60, per_run_time_limit=15, - tmp_folder='/tmp/autosklearn_random_search_example_tmp', + tmp_folder="/tmp/autosklearn_random_search_example_tmp", initial_configurations_via_metalearning=0, # Passing the callback to get the SMAC object get_smac_object_callback=get_random_search_object_callback, ) -automl.fit(X_train, y_train, dataset_name='breast_cancer') +automl.fit(X_train, y_train, dataset_name="breast_cancer") -print('#' * 80) -print('Results for random search.') +print("#" * 80) +print("Results for random search.") # Print the final ensemble constructed by auto-sklearn via random search. 
pprint(automl.show_models(), indent=4) diff --git a/examples/60_search/example_sequential.py b/examples/60_search/example_sequential.py index fad088396d..1ff63649da 100644 --- a/examples/60_search/example_sequential.py +++ b/examples/60_search/example_sequential.py @@ -22,8 +22,9 @@ # ====================================== X, y = sklearn.datasets.load_breast_cancer(return_X_y=True) -X_train, X_test, y_train, y_test = \ - sklearn.model_selection.train_test_split(X, y, random_state=1) +X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, y, random_state=1 +) ############################################################################ # Build and fit the classifier @@ -32,14 +33,14 @@ automl = autosklearn.classification.AutoSklearnClassifier( time_left_for_this_task=120, per_run_time_limit=30, - tmp_folder='/tmp/autosklearn_sequential_example_tmp', + tmp_folder="/tmp/autosklearn_sequential_example_tmp", # Do not construct ensembles in parallel to avoid using more than one # core at a time. The ensemble will be constructed after auto-sklearn # finished fitting all machine learning models. ensemble_size=0, delete_tmp_folder_after_terminate=False, ) -automl.fit(X_train, y_train, dataset_name='breast_cancer') +automl.fit(X_train, y_train, dataset_name="breast_cancer") # This call to fit_ensemble uses all models trained in the previous call # to fit to build an ensemble which can be used with automl.predict() diff --git a/examples/60_search/example_successive_halving.py b/examples/60_search/example_successive_halving.py index fdb29da6e0..e57be7f157 100644 --- a/examples/60_search/example_successive_halving.py +++ b/examples/60_search/example_successive_halving.py @@ -27,6 +27,7 @@ # Define a callback that instantiates SuccessiveHalving # ===================================================== + def get_smac_object_callback(budget_type): def get_smac_object( scenario_dict, @@ -43,8 +44,10 @@ def get_smac_object( from smac.scenario.scenario import Scenario if n_jobs > 1 or (dask_client and len(dask_client.nthreads()) > 1): - raise ValueError("Please make sure to guard the code invoking Auto-sklearn by " - "`if __name__ == '__main__'` and remove this exception.") + raise ValueError( + "Please make sure to guard the code invoking Auto-sklearn by " + "`if __name__ == '__main__'` and remove this exception." 
+ ) scenario = Scenario(scenario_dict) if len(metalearning_configurations) > 0: @@ -54,7 +57,7 @@ def get_smac_object( initial_configurations = None rh2EPM = RunHistory2EPM4LogCost - ta_kwargs['budget_type'] = budget_type + ta_kwargs["budget_type"] = budget_type return SMAC4AC( scenario=scenario, @@ -66,14 +69,15 @@ def get_smac_object( run_id=seed, intensifier=SuccessiveHalving, intensifier_kwargs={ - 'initial_budget': 10.0, - 'max_budget': 100, - 'eta': 2, - 'min_chall': 1 + "initial_budget": 10.0, + "max_budget": 100, + "eta": 2, + "min_chall": 1, }, n_jobs=n_jobs, dask_client=dask_client, ) + return get_smac_object @@ -82,8 +86,9 @@ def get_smac_object( # ============ X, y = sklearn.datasets.load_breast_cancer(return_X_y=True) -X_train, X_test, y_train, y_test = \ - sklearn.model_selection.train_test_split(X, y, random_state=1, shuffle=True) +X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, y, random_state=1, shuffle=True +) ############################################################################ # Build and fit a classifier @@ -92,23 +97,26 @@ def get_smac_object( automl = autosklearn.classification.AutoSklearnClassifier( time_left_for_this_task=40, per_run_time_limit=10, - tmp_folder='/tmp/autosklearn_sh_example_tmp', + tmp_folder="/tmp/autosklearn_sh_example_tmp", disable_evaluator_output=False, # 'holdout' with 'train_size'=0.67 is the default argument setting # for AutoSklearnClassifier. It is explicitly specified in this example # for demonstrational purpose. - resampling_strategy='holdout', - resampling_strategy_arguments={'train_size': 0.67}, + resampling_strategy="holdout", + resampling_strategy_arguments={"train_size": 0.67}, include={ - 'classifier': [ - 'extra_trees', 'gradient_boosting', 'random_forest', - 'sgd', 'passive_aggressive' + "classifier": [ + "extra_trees", + "gradient_boosting", + "random_forest", + "sgd", + "passive_aggressive", ], - 'feature_preprocessor': ['no_preprocessing'] + "feature_preprocessor": ["no_preprocessing"], }, - get_smac_object_callback=get_smac_object_callback('iterations'), + get_smac_object_callback=get_smac_object_callback("iterations"), ) -automl.fit(X_train, y_train, dataset_name='breast_cancer') +automl.fit(X_train, y_train, dataset_name="breast_cancer") pprint(automl.show_models(), indent=4) predictions = automl.predict(X_test) @@ -122,25 +130,29 @@ def get_smac_object( # ======================================================== X, y = sklearn.datasets.load_breast_cancer(return_X_y=True) -X_train, X_test, y_train, y_test = \ - sklearn.model_selection.train_test_split(X, y, random_state=1, shuffle=True) +X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, y, random_state=1, shuffle=True +) automl = autosklearn.classification.AutoSklearnClassifier( time_left_for_this_task=40, per_run_time_limit=10, - tmp_folder='/tmp/autosklearn_sh_example_tmp_01', + tmp_folder="/tmp/autosklearn_sh_example_tmp_01", disable_evaluator_output=False, - resampling_strategy='cv', + resampling_strategy="cv", include={ - 'classifier': [ - 'extra_trees', 'gradient_boosting', 'random_forest', - 'sgd', 'passive_aggressive' + "classifier": [ + "extra_trees", + "gradient_boosting", + "random_forest", + "sgd", + "passive_aggressive", ], - 'feature_preprocessor': ['no_preprocessing'] + "feature_preprocessor": ["no_preprocessing"], }, - get_smac_object_callback=get_smac_object_callback('iterations'), + get_smac_object_callback=get_smac_object_callback("iterations"), ) -automl.fit(X_train, y_train, 
dataset_name='breast_cancer') +automl.fit(X_train, y_train, dataset_name="breast_cancer") # Print the final ensemble constructed by auto-sklearn. pprint(automl.show_models(), indent=4) @@ -156,25 +168,29 @@ def get_smac_object( # ============================================================= X, y = sklearn.datasets.load_breast_cancer(return_X_y=True) -X_train, X_test, y_train, y_test = \ - sklearn.model_selection.train_test_split(X, y, random_state=1, shuffle=True) +X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, y, random_state=1, shuffle=True +) automl = autosklearn.classification.AutoSklearnClassifier( time_left_for_this_task=40, per_run_time_limit=10, - tmp_folder='/tmp/autosklearn_sh_example_tmp_cv_02', + tmp_folder="/tmp/autosklearn_sh_example_tmp_cv_02", disable_evaluator_output=False, - resampling_strategy='cv-iterative-fit', + resampling_strategy="cv-iterative-fit", include={ - 'classifier': [ - 'extra_trees', 'gradient_boosting', 'random_forest', - 'sgd', 'passive_aggressive' + "classifier": [ + "extra_trees", + "gradient_boosting", + "random_forest", + "sgd", + "passive_aggressive", ], - 'feature_preprocessor': ['no_preprocessing'] + "feature_preprocessor": ["no_preprocessing"], }, - get_smac_object_callback=get_smac_object_callback('iterations'), + get_smac_object_callback=get_smac_object_callback("iterations"), ) -automl.fit(X_train, y_train, dataset_name='breast_cancer') +automl.fit(X_train, y_train, dataset_name="breast_cancer") # Print the final ensemble constructed by auto-sklearn. pprint(automl.show_models(), indent=4) @@ -190,22 +206,23 @@ def get_smac_object( # =============================================================== X, y = sklearn.datasets.load_breast_cancer(return_X_y=True) -X_train, X_test, y_train, y_test = \ - sklearn.model_selection.train_test_split(X, y, random_state=1, shuffle=True) +X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, y, random_state=1, shuffle=True +) automl = autosklearn.classification.AutoSklearnClassifier( time_left_for_this_task=40, per_run_time_limit=10, - tmp_folder='/tmp/autosklearn_sh_example_tmp_03', + tmp_folder="/tmp/autosklearn_sh_example_tmp_03", disable_evaluator_output=False, # 'holdout' with 'train_size'=0.67 is the default argument setting # for AutoSklearnClassifier. It is explicitly specified in this example # for demonstrational purpose. - resampling_strategy='holdout', - resampling_strategy_arguments={'train_size': 0.67}, - get_smac_object_callback=get_smac_object_callback('subsample'), + resampling_strategy="holdout", + resampling_strategy_arguments={"train_size": 0.67}, + get_smac_object_callback=get_smac_object_callback("subsample"), ) -automl.fit(X_train, y_train, dataset_name='breast_cancer') +automl.fit(X_train, y_train, dataset_name="breast_cancer") # Print the final ensemble constructed by auto-sklearn. 
pprint(automl.show_models(), indent=4) @@ -222,27 +239,26 @@ def get_smac_object( # subsamples otherwise X, y = sklearn.datasets.load_breast_cancer(return_X_y=True) -X_train, X_test, y_train, y_test = \ - sklearn.model_selection.train_test_split(X, y, random_state=1, shuffle=True) +X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, y, random_state=1, shuffle=True +) automl = autosklearn.classification.AutoSklearnClassifier( time_left_for_this_task=40, per_run_time_limit=10, - tmp_folder='/tmp/autosklearn_sh_example_tmp_04', + tmp_folder="/tmp/autosklearn_sh_example_tmp_04", disable_evaluator_output=False, # 'holdout' with 'train_size'=0.67 is the default argument setting # for AutoSklearnClassifier. It is explicitly specified in this example # for demonstrational purpose. - resampling_strategy='holdout', - resampling_strategy_arguments={'train_size': 0.67}, + resampling_strategy="holdout", + resampling_strategy_arguments={"train_size": 0.67}, include={ - 'classifier': [ - 'extra_trees', 'gradient_boosting', 'random_forest', 'sgd' - ] + "classifier": ["extra_trees", "gradient_boosting", "random_forest", "sgd"] }, - get_smac_object_callback=get_smac_object_callback('mixed'), + get_smac_object_callback=get_smac_object_callback("mixed"), ) -automl.fit(X_train, y_train, dataset_name='breast_cancer') +automl.fit(X_train, y_train, dataset_name="breast_cancer") # Print the final ensemble constructed by auto-sklearn. pprint(automl.show_models(), indent=4) diff --git a/examples/80_extending/example_extending_classification.py b/examples/80_extending/example_extending_classification.py index b6132f4c18..b5112c022b 100644 --- a/examples/80_extending/example_extending_classification.py +++ b/examples/80_extending/example_extending_classification.py @@ -9,16 +9,22 @@ from pprint import pprint from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import CategoricalHyperparameter, \ - UniformIntegerHyperparameter, UniformFloatHyperparameter +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformIntegerHyperparameter, + UniformFloatHyperparameter, +) import sklearn.metrics import autosklearn.classification import autosklearn.pipeline.components.classification -from autosklearn.pipeline.components.base \ - import AutoSklearnClassificationAlgorithm -from autosklearn.pipeline.constants import DENSE, SIGNED_DATA, UNSIGNED_DATA, \ - PREDICTIONS +from autosklearn.pipeline.components.base import AutoSklearnClassificationAlgorithm +from autosklearn.pipeline.constants import ( + DENSE, + SIGNED_DATA, + UNSIGNED_DATA, + PREDICTIONS, +) from sklearn.datasets import load_breast_cancer from sklearn.model_selection import train_test_split @@ -28,8 +34,8 @@ # Create MLP classifier component for auto-sklearn # ================================================ -class MLPClassifier(AutoSklearnClassificationAlgorithm): +class MLPClassifier(AutoSklearnClassificationAlgorithm): def __init__( self, hidden_layer_depth, @@ -52,15 +58,18 @@ def fit(self, X, y): self.alpha = float(self.alpha) from sklearn.neural_network import MLPClassifier + hidden_layer_sizes = tuple( self.num_nodes_per_layer for i in range(self.hidden_layer_depth) ) - self.estimator = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, - activation=self.activation, - alpha=self.alpha, - solver=self.solver, - random_state=self.random_state) + self.estimator = MLPClassifier( + hidden_layer_sizes=hidden_layer_sizes, + activation=self.activation, + alpha=self.alpha, + 
solver=self.solver, + random_state=self.random_state, + ) self.estimator.fit(X, y) return self @@ -77,17 +86,17 @@ def predict_proba(self, X): @staticmethod def get_properties(dataset_properties=None): return { - 'shortname': 'MLP Classifier', - 'name': 'MLP Classifier', - 'handles_regression': False, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': False, - 'handles_multioutput': False, - 'is_deterministic': False, + "shortname": "MLP Classifier", + "name": "MLP Classifier", + "handles_regression": False, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": False, + "handles_multioutput": False, + "is_deterministic": False, # Both input and output must be tuple(iterable) - 'input': [DENSE, SIGNED_DATA, UNSIGNED_DATA], - 'output': [PREDICTIONS] + "input": [DENSE, SIGNED_DATA, UNSIGNED_DATA], + "output": [PREDICTIONS], } @staticmethod @@ -100,18 +109,25 @@ def get_hyperparameter_search_space(dataset_properties=None): name="num_nodes_per_layer", lower=16, upper=216, default_value=32 ) activation = CategoricalHyperparameter( - name="activation", choices=['identity', 'logistic', 'tanh', 'relu'], - default_value='relu' + name="activation", + choices=["identity", "logistic", "tanh", "relu"], + default_value="relu", ) alpha = UniformFloatHyperparameter( name="alpha", lower=0.0001, upper=1.0, default_value=0.0001 ) solver = CategoricalHyperparameter( - name="solver", choices=['lbfgs', 'sgd', 'adam'], default_value='adam' + name="solver", choices=["lbfgs", "sgd", "adam"], default_value="adam" + ) + cs.add_hyperparameters( + [ + hidden_layer_depth, + num_nodes_per_layer, + activation, + alpha, + solver, + ] ) - cs.add_hyperparameters([ - hidden_layer_depth, num_nodes_per_layer, activation, alpha, solver, - ]) return cs @@ -134,13 +150,11 @@ def get_hyperparameter_search_space(dataset_properties=None): clf = autosklearn.classification.AutoSklearnClassifier( time_left_for_this_task=30, per_run_time_limit=10, - include={ - 'classifier': ['MLPClassifier'] - }, + include={"classifier": ["MLPClassifier"]}, # The two flags below are provided to speed up calculations # Not recommended for a real implementation initial_configurations_via_metalearning=0, - smac_scenario_args={'runcount_limit': 5}, + smac_scenario_args={"runcount_limit": 5}, ) clf.fit(X_train, y_train) diff --git a/examples/80_extending/example_extending_data_preprocessor.py b/examples/80_extending/example_extending_data_preprocessor.py index 7fdd72e971..aa5c443255 100644 --- a/examples/80_extending/example_extending_data_preprocessor.py +++ b/examples/80_extending/example_extending_data_preprocessor.py @@ -21,9 +21,8 @@ # Create NoPreprocessing component for auto-sklearn # ================================================= class NoPreprocessing(AutoSklearnPreprocessingAlgorithm): - def __init__(self, **kwargs): - """ This preprocessor does not change the data """ + """This preprocessor does not change the data""" # Some internal checks make sure parameters are set for key, val in kwargs.items(): setattr(self, key, val) @@ -37,16 +36,16 @@ def transform(self, X): @staticmethod def get_properties(dataset_properties=None): return { - 'shortname': 'NoPreprocessing', -
"name": "NoPreprocessing", + "handles_regression": True, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": True, + "is_deterministic": True, + "input": (SPARSE, DENSE, UNSIGNED_DATA), + "output": (INPUT,), } @staticmethod @@ -70,13 +69,11 @@ def get_hyperparameter_search_space(dataset_properties=None): clf = autosklearn.classification.AutoSklearnClassifier( time_left_for_this_task=120, - include={ - 'data_preprocessor': ['NoPreprocessing'] - }, + include={"data_preprocessor": ["NoPreprocessing"]}, # Bellow two flags are provided to speed up calculations # Not recommended for a real implementation initial_configurations_via_metalearning=0, - smac_scenario_args={'runcount_limit': 5}, + smac_scenario_args={"runcount_limit": 5}, ) clf.fit(X_train, y_train) diff --git a/examples/80_extending/example_extending_preprocessor.py b/examples/80_extending/example_extending_preprocessor.py index 9ac93a45b3..1eb3fc1daf 100644 --- a/examples/80_extending/example_extending_preprocessor.py +++ b/examples/80_extending/example_extending_preprocessor.py @@ -10,16 +10,17 @@ from pprint import pprint from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import UniformFloatHyperparameter, CategoricalHyperparameter +from ConfigSpace.hyperparameters import ( + UniformFloatHyperparameter, + CategoricalHyperparameter, +) from ConfigSpace.conditions import InCondition import sklearn.metrics import autosklearn.classification import autosklearn.pipeline.components.feature_preprocessing -from autosklearn.pipeline.components.base \ - import AutoSklearnPreprocessingAlgorithm -from autosklearn.pipeline.constants import DENSE, SIGNED_DATA, \ - UNSIGNED_DATA +from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm +from autosklearn.pipeline.constants import DENSE, SIGNED_DATA, UNSIGNED_DATA from autosklearn.util.common import check_none from sklearn.datasets import load_breast_cancer @@ -30,7 +31,6 @@ # Create LDA component for auto-sklearn # ===================================== class LDA(AutoSklearnPreprocessingAlgorithm): - def __init__(self, solver, tol, shrinkage=None, random_state=None): self.solver = solver self.shrinkage = shrinkage @@ -46,6 +46,7 @@ def fit(self, X, y=None): self.tol = float(self.tol) import sklearn.discriminant_analysis + self.preprocessor = sklearn.discriminant_analysis.LinearDiscriminantAnalysis( shrinkage=self.shrinkage, solver=self.solver, @@ -62,23 +63,23 @@ def transform(self, X): @staticmethod def get_properties(dataset_properties=None): return { - 'shortname': 'LDA', - 'name': 'Linear Discriminant Analysis', - 'handles_regression': False, - 'handles_classification': True, - 'handles_multiclass': False, - 'handles_multilabel': False, - 'handles_multioutput': False, - 'is_deterministic': True, - 'input': (DENSE, UNSIGNED_DATA, SIGNED_DATA), - 'output': (DENSE, UNSIGNED_DATA, SIGNED_DATA) + "shortname": "LDA", + "name": "Linear Discriminant Analysis", + "handles_regression": False, + "handles_classification": True, + "handles_multiclass": False, + "handles_multilabel": False, + "handles_multioutput": False, + "is_deterministic": True, + "input": (DENSE, UNSIGNED_DATA, SIGNED_DATA), + "output": (DENSE, UNSIGNED_DATA, SIGNED_DATA), } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): cs = ConfigurationSpace() solver = CategoricalHyperparameter( - name="solver", choices=['svd', 'lsqr', 'eigen'], default_value='svd' + name="solver", 
choices=["svd", "lsqr", "eigen"], default_value="svd" ) shrinkage = UniformFloatHyperparameter( name="shrinkage", lower=0.0, upper=1.0, default_value=0.5 @@ -87,7 +88,7 @@ def get_hyperparameter_search_space(dataset_properties=None): name="tol", lower=0.0001, upper=1, default_value=0.0001 ) cs.add_hyperparameters([solver, shrinkage, tol]) - shrinkage_condition = InCondition(shrinkage, solver, ['lsqr', 'eigen']) + shrinkage_condition = InCondition(shrinkage, solver, ["lsqr", "eigen"]) cs.add_condition(shrinkage_condition) return cs @@ -115,13 +116,11 @@ def get_hyperparameter_search_space(dataset_properties=None): clf = autosklearn.classification.AutoSklearnClassifier( time_left_for_this_task=30, - include={ - 'feature_preprocessor': ['LDA'] - }, + include={"feature_preprocessor": ["LDA"]}, # Bellow two flags are provided to speed up calculations # Not recommended for a real implementation initial_configurations_via_metalearning=0, - smac_scenario_args={'runcount_limit': 5}, + smac_scenario_args={"runcount_limit": 5}, ) clf.fit(X_train, y_train) diff --git a/examples/80_extending/example_extending_regression.py b/examples/80_extending/example_extending_regression.py index 3bdc008d4e..4d6987a9db 100644 --- a/examples/80_extending/example_extending_regression.py +++ b/examples/80_extending/example_extending_regression.py @@ -9,16 +9,24 @@ from pprint import pprint from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import UniformFloatHyperparameter, \ - UniformIntegerHyperparameter, CategoricalHyperparameter +from ConfigSpace.hyperparameters import ( + UniformFloatHyperparameter, + UniformIntegerHyperparameter, + CategoricalHyperparameter, +) from ConfigSpace.conditions import EqualsCondition import sklearn.metrics import autosklearn.regression import autosklearn.pipeline.components.regression from autosklearn.pipeline.components.base import AutoSklearnRegressionAlgorithm -from autosklearn.pipeline.constants import SPARSE, DENSE, \ - SIGNED_DATA, UNSIGNED_DATA, PREDICTIONS +from autosklearn.pipeline.constants import ( + SPARSE, + DENSE, + SIGNED_DATA, + UNSIGNED_DATA, + PREDICTIONS, +) from sklearn.datasets import load_diabetes from sklearn.model_selection import train_test_split @@ -28,8 +36,8 @@ # Implement kernel ridge regression component for auto-sklearn # ============================================================ -class KernelRidgeRegression(AutoSklearnRegressionAlgorithm): +class KernelRidgeRegression(AutoSklearnRegressionAlgorithm): def __init__(self, alpha, kernel, gamma, degree, coef0, random_state=None): self.alpha = alpha self.kernel = kernel @@ -46,12 +54,13 @@ def fit(self, X, y): self.coef0 = float(self.coef0) import sklearn.kernel_ridge + self.estimator = sklearn.kernel_ridge.KernelRidge( alpha=self.alpha, kernel=self.kernel, gamma=self.gamma, degree=self.degree, - coef0=self.coef0 + coef0=self.coef0, ) self.estimator.fit(X, y) return self @@ -64,42 +73,46 @@ def predict(self, X): @staticmethod def get_properties(dataset_properties=None): return { - 'shortname': 'KRR', - 'name': 'Kernel Ridge Regression', - 'handles_regression': True, - 'handles_classification': False, - 'handles_multiclass': False, - 'handles_multilabel': False, - 'handles_multioutput': True, - 'is_deterministic': True, - 'input': (SPARSE, DENSE, UNSIGNED_DATA, SIGNED_DATA), - 'output': (PREDICTIONS,) + "shortname": "KRR", + "name": "Kernel Ridge Regression", + "handles_regression": True, + "handles_classification": False, + "handles_multiclass": False, + 
"handles_multilabel": False, + "handles_multioutput": True, + "is_deterministic": True, + "input": (SPARSE, DENSE, UNSIGNED_DATA, SIGNED_DATA), + "output": (PREDICTIONS,), } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): cs = ConfigurationSpace() alpha = UniformFloatHyperparameter( - name='alpha', lower=10 ** -5, upper=1, log=True, default_value=1.0 + name="alpha", lower=10**-5, upper=1, log=True, default_value=1.0 ) kernel = CategoricalHyperparameter( - name='kernel', + name="kernel", # We restrict ourselves to two possible kernels for this example - choices=['polynomial', 'rbf'], - default_value='polynomial' + choices=["polynomial", "rbf"], + default_value="polynomial", ) gamma = UniformFloatHyperparameter( - name='gamma', lower=0.00001, upper=1, default_value=0.1, log=True + name="gamma", lower=0.00001, upper=1, default_value=0.1, log=True ) degree = UniformIntegerHyperparameter( - name='degree', lower=2, upper=5, default_value=3 + name="degree", lower=2, upper=5, default_value=3 ) coef0 = UniformFloatHyperparameter( - name='coef0', lower=1e-2, upper=1e2, log=True, default_value=1, + name="coef0", + lower=1e-2, + upper=1e2, + log=True, + default_value=1, ) cs.add_hyperparameters([alpha, kernel, gamma, degree, coef0]) - degree_condition = EqualsCondition(degree, kernel, 'polynomial') - coef0_condition = EqualsCondition(coef0, kernel, 'polynomial') + degree_condition = EqualsCondition(degree, kernel, "polynomial") + coef0_condition = EqualsCondition(coef0, kernel, "polynomial") cs.add_conditions([degree_condition, coef0_condition]) return cs @@ -123,13 +136,11 @@ def get_hyperparameter_search_space(dataset_properties=None): reg = autosklearn.regression.AutoSklearnRegressor( time_left_for_this_task=30, per_run_time_limit=10, - include={ - 'regressor': ['KernelRidgeRegression'] - }, + include={"regressor": ["KernelRidgeRegression"]}, # Bellow two flags are provided to speed up calculations # Not recommended for a real implementation initial_configurations_via_metalearning=0, - smac_scenario_args={'runcount_limit': 5}, + smac_scenario_args={"runcount_limit": 5}, ) reg.fit(X_train, y_train) diff --git a/examples/80_extending/example_restrict_number_of_hyperparameters.py b/examples/80_extending/example_restrict_number_of_hyperparameters.py index 9c6ec2501f..d8bd2f4a98 100644 --- a/examples/80_extending/example_restrict_number_of_hyperparameters.py +++ b/examples/80_extending/example_restrict_number_of_hyperparameters.py @@ -9,15 +9,19 @@ """ from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import UniformIntegerHyperparameter, UniformFloatHyperparameter +from ConfigSpace.hyperparameters import ( + UniformIntegerHyperparameter, + UniformFloatHyperparameter, +) from sklearn.datasets import load_breast_cancer from sklearn.model_selection import train_test_split import autosklearn.classification import autosklearn.pipeline.components.classification -from autosklearn.pipeline.components.classification \ - import AutoSklearnClassificationAlgorithm +from autosklearn.pipeline.components.classification import ( + AutoSklearnClassificationAlgorithm, +) from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA, PREDICTIONS, SPARSE @@ -29,8 +33,8 @@ # default parametrization (``max_features``). Instead, it also # tunes the number of estimators (``n_estimators``). 
-class CustomRandomForest(AutoSklearnClassificationAlgorithm): +class CustomRandomForest(AutoSklearnClassificationAlgorithm): def __init__(self, n_estimators, max_features, random_state=None): self.n_estimators = n_estimators self.max_features = max_features @@ -67,16 +71,16 @@ def predict_proba(self, X): @staticmethod def get_properties(dataset_properties=None): return { - 'shortname': 'RF', - 'name': 'Random Forest Classifier', - 'handles_regression': False, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': False, - 'is_deterministic': True, - 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,) + "shortname": "RF", + "name": "Random Forest Classifier", + "handles_regression": False, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": False, + "is_deterministic": True, + "input": (DENSE, SPARSE, UNSIGNED_DATA), + "output": (PREDICTIONS,), } @staticmethod @@ -87,8 +91,12 @@ def get_hyperparameter_search_space(dataset_properties=None): # m is the total number of features, and max_features is the hyperparameter specified below. # The default is 0.5, which yields sqrt(m) features as max_features in the estimator. This # corresponds with Geurts' heuristic. - max_features = UniformFloatHyperparameter("max_features", 0., 1., default_value=0.5) - n_estimators = UniformIntegerHyperparameter("n_estimators", 10, 1000, default_value=100) + max_features = UniformFloatHyperparameter( + "max_features", 0.0, 1.0, default_value=0.5 + ) + n_estimators = UniformIntegerHyperparameter( + "n_estimators", 10, 1000, default_value=100 + ) cs.add_hyperparameters([max_features, n_estimators]) return cs @@ -114,13 +122,11 @@ def get_hyperparameter_search_space(dataset_properties=None): time_left_for_this_task=30, per_run_time_limit=10, # Here we exclude auto-sklearn's default random forest component - exclude={ - 'classifier': ['random_forest'] - }, + exclude={"classifier": ["random_forest"]}, # Bellow two flags are provided to speed up calculations # Not recommended for a real implementation initial_configurations_via_metalearning=0, - smac_scenario_args={'runcount_limit': 1}, + smac_scenario_args={"runcount_limit": 1}, ) clf.fit(X_train, y_train) @@ -131,5 +137,5 @@ def get_hyperparameter_search_space(dataset_properties=None): # Observe that this configuration space only contains our custom random # forest, but not auto-sklearn's ``random_forest`` cs = clf.get_configuration_space(X_train, y_train) -assert 'random_forest' not in str(cs) +assert "random_forest" not in str(cs) print(cs) diff --git a/misc/create_hyperparameter_table.py b/misc/create_hyperparameter_table.py index dd23f8ac29..7495ee686a 100644 --- a/misc/create_hyperparameter_table.py +++ b/misc/create_hyperparameter_table.py @@ -15,43 +15,43 @@ CONST = "constant" UN = "unparameterized" -template_string = \ -""" +template_string = r""" \documentclass{article} %% For LaTeX2 \usepackage[a4paper, left=5mm, right=5mm, top=5mm, bottom=5mm]{geometry} -%%\\usepackage[landscape]{geometry} -\\usepackage{multirow} %% import command \multicolmun -\\usepackage{tabularx} %% Convenient table formatting -\\usepackage{booktabs} %% provides \\toprule, \midrule and \\bottomrule +%%\usepackage[landscape]{geometry} +\usepackage{multirow} %% import command \multicolmun +\usepackage{tabularx} %% Convenient table formatting +\usepackage{booktabs} %% provides \\toprule, \midrule and \\bottomrule -\\begin{document} 
+\begin{document} %s -\\end{document} +\end{document} """ -caption_str = "Number of Hyperparameters for each possible %s " \ - "for a dataset with these properties: %s" - -table_str = \ -""" -\\begin{table}[t!] -\\centering -\\scriptsize -\\caption{ %s } -\\begin{tabularx}{\\textwidth}{ X X X X X X } -\\toprule -name & \#$\lambda$ & cat (cond) & cont (cond) & const & un \\\\ -\\toprule -\\\\ +caption_str = ( + "Number of Hyperparameters for each possible %s " + "for a dataset with these properties: %s" +) + +table_str = r""" +\begin{table}[t!] +\centering +\scriptsize +\caption{ %s } +\begin{tabularx}{\textwidth}{ X X X X X X } +\toprule +name & \#$\lambda$ & cat (cond) & cont (cond) & const & un \\ +\toprule +\\ %s -\\\\ -\\toprule -\\bottomrule -\\end{tabularx} -\\end{table} +\\ +\toprule +\bottomrule +\end{tabularx} +\end{table} """ @@ -59,11 +59,13 @@ def get_dict(task_type="classifier", **kwargs): assert task_type in ("classifier", "regressor") if task_type == "classifier": - cs = autosklearn.pipeline.classification.SimpleClassificationPipeline\ - .get_hyperparameter_search_space(dataset_properties=kwargs) + cs = autosklearn.pipeline.classification.SimpleClassificationPipeline.get_hyperparameter_search_space( + dataset_properties=kwargs + ) elif task_type == "regressor": - cs = autosklearn.pipeline.regression.SimpleRegressionPipeline\ - .get_hyperparameter_search_space(dataset_properties=kwargs) + cs = autosklearn.pipeline.regression.SimpleRegressionPipeline.get_hyperparameter_search_space( + dataset_properties=kwargs + ) else: raise ValueError("'task_type' is not in ('classifier', 'regressor')") @@ -73,7 +75,7 @@ def get_dict(task_type="classifier", **kwargs): for h in cs.get_hyperparameters(): if h.name == "feature_preprocessor:__choice__": preprocessor = h - elif h.name == (task_type + ':__choice__'): + elif h.name == (task_type + ":__choice__"): estimator = h if estimator is None: @@ -100,8 +102,9 @@ def get_dict(task_type="classifier", **kwargs): preprocessor_dict[i][UN] = 0 for h in cs.get_hyperparameters(): - if h.name == "feature_preprocessor:__choice__" or \ - h.name == (task_type + ':__choice__'): + if h.name == "feature_preprocessor:__choice__" or h.name == ( + task_type + ":__choice__" + ): continue # walk over both dicts for d in (estimator_dict, preprocessor_dict): @@ -116,14 +119,18 @@ def get_dict(task_type="classifier", **kwargs): d[est][CAT] += 1 elif isinstance(h, ConfigSpace.hyperparameters.Constant): d[est][CONST] += 1 - elif isinstance(h, ConfigSpace.hyperparameters.UnParametrizedHyperparameter): + elif isinstance( + h, ConfigSpace.hyperparameters.UnParametrizedHyperparameter + ): d[est][UN] += 1 else: raise ValueError("Don't know that type: %s" % type(h)) for h in cs.get_conditions(): - if h.parent.name == (task_type + ':__choice__') or h.parent.name == \ - "feature_preprocessor:__choice__": + if ( + h.parent.name == (task_type + ":__choice__") + or h.parent.name == "feature_preprocessor:__choice__" + ): # ignore this condition # print "IGNORE", h continue @@ -132,22 +139,30 @@ def get_dict(task_type="classifier", **kwargs): for d in (estimator_dict, preprocessor_dict): est = h.child.name.split(":")[1] if est not in d: - #print "Could not find %s" % est + # print "Could not find %s" % est continue - #print "####" - #print vars(h) - #print h.parent - #print type(h) - if isinstance(h.child, ConfigSpace.hyperparameters.UniformIntegerHyperparameter): + # print "####" + # print vars(h) + # print h.parent + # print type(h) + if isinstance( + h.child, 
ConfigSpace.hyperparameters.UniformIntegerHyperparameter + ): d[est][COND][CONT] += 1 - elif isinstance(h.child, ConfigSpace.hyperparameters.UniformFloatHyperparameter): + elif isinstance( + h.child, ConfigSpace.hyperparameters.UniformFloatHyperparameter + ): d[est][COND][CONT] += 1 - elif isinstance(h.child, ConfigSpace.hyperparameters.CategoricalHyperparameter): + elif isinstance( + h.child, ConfigSpace.hyperparameters.CategoricalHyperparameter + ): d[est][COND][CAT] += 1 elif isinstance(h.child, ConfigSpace.hyperparameters.Constant): d[est][COND][CONST] += 1 - elif isinstance(h.child, ConfigSpace.hyperparameters.UnParametrizedHyperparameter): + elif isinstance( + h.child, ConfigSpace.hyperparameters.UnParametrizedHyperparameter + ): d[est][COND][UN] += 1 else: raise ValueError("Don't know that type: %s" % type(h)) @@ -159,7 +174,11 @@ def build_table(d): lines = list() for est in d.keys(): sum_ = 0 - t_list = list([est.replace("_", " "), ]) + t_list = list( + [ + est.replace("_", " "), + ] + ) for t in (CAT, CONT): sum_ += d[est][t] t_list.append("%d (%d)" % (d[est][t], d[est][COND][t])) @@ -175,33 +194,68 @@ def main(): parser = ArgumentParser() # General Options - parser.add_argument("-s", "--save", dest="save", default=None, - help="Where to save plot instead of showing it?") - parser.add_argument("-t", "--type", dest="task_type", default="classifier", - choices=("classifier", ), help="Type of dataset") - parser.add_argument("--sparse", dest="sparse", default=False, - action="store_true", help="dataset property") + parser.add_argument( + "-s", + "--save", + dest="save", + default=None, + help="Where to save plot instead of showing it?", + ) + parser.add_argument( + "-t", + "--type", + dest="task_type", + default="classifier", + choices=("classifier",), + help="Type of dataset", + ) + parser.add_argument( + "--sparse", + dest="sparse", + default=False, + action="store_true", + help="dataset property", + ) prop = parser.add_mutually_exclusive_group(required=True) - prop.add_argument("--multilabel", dest="multilabel", default=False, - action="store_true", help="dataset property") - prop.add_argument("--multiclass", dest="multiclass", default=False, - action="store_true", help="dataset property") - prop.add_argument("--binary", dest="binary", default=False, - action="store_true", help="dataset property") + prop.add_argument( + "--multilabel", + dest="multilabel", + default=False, + action="store_true", + help="dataset property", + ) + prop.add_argument( + "--multiclass", + dest="multiclass", + default=False, + action="store_true", + help="dataset property", + ) + prop.add_argument( + "--binary", + dest="binary", + default=False, + action="store_true", + help="dataset property", + ) args, unknown = parser.parse_known_args() - props = {"sparse": args.sparse, - "multilabel": args.multilabel, - "multiclass": args.multiclass} + props = { + "sparse": args.sparse, + "multilabel": args.multilabel, + "multiclass": args.multiclass, + } est_dict, preproc_dict = get_dict(task_type=args.task_type, **props) est_table = build_table(est_dict) preproc_table = build_table(preproc_dict) est_table = table_str % (caption_str % (args.task_type, str(props)), est_table) - preproc_table = table_str % (caption_str % ( - "feature_preprocessor", str(props)), preproc_table) + preproc_table = table_str % ( + caption_str % ("feature_preprocessor", str(props)), + preproc_table, + ) tex_doc = template_string % "\n".join([est_table, preproc_table]) if args.save is None: @@ -210,7 +264,7 @@ def main(): fh = 
open(args.save, "w") fh.write(tex_doc) fh.close() - proc = subprocess.Popen(shlex.split('pdflatex %s' % args.save)) + proc = subprocess.Popen(shlex.split("pdflatex %s" % args.save)) proc.communicate() try: os.remove(args.save.replace(".tex", ".aux")) @@ -221,4 +275,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/misc/create_list_of_potential_models.py b/misc/create_list_of_potential_models.py index 8153c639e7..cec7959ab1 100644 --- a/misc/create_list_of_potential_models.py +++ b/misc/create_list_of_potential_models.py @@ -5,30 +5,32 @@ import sklearn.base -files = glob.glob(os.path.join(os.path.dirname(sklearn.__file__), "**/*.py"), - recursive=True) +files = glob.glob( + os.path.join(os.path.dirname(sklearn.__file__), "**/*.py"), recursive=True +) + def find_all(cls): found = set() for file in files: - parts = file.split('/') - parts[-1] = parts[-1].replace('.py', '') - sklearn_dir = parts.index('sklearn') - name = '.'.join(parts[sklearn_dir:]) + parts = file.split("/") + parts[-1] = parts[-1].replace(".py", "") + sklearn_dir = parts.index("sklearn") + name = ".".join(parts[sklearn_dir:]) module = importlib.import_module(name) for member in module.__dict__.values(): if not inspect.isclass(member): continue if issubclass(member, cls): found.add(member) - print('#####') + print("#####") found = list(found) found.sort(key=lambda t: str(t)) for f in found: print(f) return found -#classifiers = find_all(sklearn.base.ClassifierMixin) -#regressors = find_all(sklearn.base.RegressorMixin) -preprocs = find_all(sklearn.base.TransformerMixin) +# classifiers = find_all(sklearn.base.ClassifierMixin) +# regressors = find_all(sklearn.base.RegressorMixin) +preprocs = find_all(sklearn.base.TransformerMixin) diff --git a/mypy.ini b/mypy.ini deleted file mode 100644 index 8c8b6589af..0000000000 --- a/mypy.ini +++ /dev/null @@ -1,8 +0,0 @@ -[mypy] -# Reports any config lines that are not recognized -warn_unused_configs=True -ignore_missing_imports=True -follow_imports=skip -disallow_untyped_defs=True -disallow_incomplete_defs=True -disallow_untyped_decorators=True diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000..0e48e3fc5f --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,152 @@ +# For TOML reference +# https://learnxinyminutes.com/docs/toml/ + +[tool.pytest.ini_options] +testpaths = ["test"] +minversion = "3.7" +#addopts = "--cov=autosklearn" + +[tool.coverage.run] +branch = true +context = "autosklearn" + +[tool.coverage.report] +show_missing = true +skip_covered = true +exclude_lines = [ + "pragma: no cover", + '\.\.\.', + "raise NotImplementedError", + "if TYPE_CHECKING" +] + +[tool.black] +target-version = ['py37'] + +[tool.isort] +py_version = "37" +profile = "black" # Play nicely with black +src_paths = ["autosklearn", "test"] +known_types = ["typing", "abc"] # We put these in their own section TYPES +known_first_party = ["autosklearn"] # Say that autosklearn is FIRSTPARTY +known_test = ["test"] # Say that test.* is TEST +sections = ["FUTURE", "TYPES", "STDLIB", "THIRDPARTY", "FIRSTPARTY", "TEST", "LOCALFOLDER"] # section ordering +multi_line_output = 3 # https://pycqa.github.io/isort/docs/configuration/multi_line_output_modes.html + +[tool.pydocstyle] +convention = "numpy" +add-ignore = [ # http://www.pydocstyle.org/en/stable/error_codes.html + "D100", # Missing docstring in public module + "D101", # Missing docstring in public class + "D104", # Missing docstring in public package + "D105", # Missing docstring in 
magic method + + "D203", # 1 blank line required before class docstring + "D205", # 1 blank line required between summary and description + "D210", # No whitespaces allowed surrounding docstring text + "D212", # Multi-line docstring summary should start at the first line + "D213", # Multi-line docstring summary should start at the second line + + "D400", # First line should end with a period + "D401", # First line should be in imperative mood + "D404", # First word of the docstring should not be "This" + "D413", # Missing blank line after last section + "D415" # First line should end with a period, question mark, or exclamation point +] + +[tool.mypy] +python_version = "3.7" + +show_error_codes = true + +warn_unused_configs = true # warn about unused [tool.mypy] lines + +follow_imports = "normal" # Type check top level api code we use from imports +ignore_missing_imports = false # prefer explicit ignores + +disallow_untyped_defs = true # All functions must have types +disallow_untyped_decorators = true # ... even decorators +disallow_incomplete_defs = true # ...all types + +# This is a problem with the tests of `automl_common` being distributed as a submodule +# probably indicative that is should be a package. +exclude = "autosklearn/automl_common/test" + +# This is handled by automl_common itself in its own CI +[[tool.mypy.overrides]] +module = ["autosklearn.automl_common.common.*"] +ignore_errors = true + +# Submodules that need to be updated with mypy +[[tool.mypy.overrides]] +module = [ + "autosklearn", #__init__ + "autosklearn.estimators", + "autosklearn.automl", + "autosklearn.smbo", + "autosklearn.experimental.askl2", + "autosklearn.ensemble_builder", + "autosklearn.ensembles.singlebest_ensemble", + "autosklearn.ensembles.ensemble_selection", + "autosklearn.evaluation", #__init__ + "autosklearn.evaluation.abstract_evaluator", + "autosklearn.evaluation.test_evaluator", + "autosklearn.evaluation.train_evaluator", + "autosklearn.metalearning.input.aslib_simple", + "autosklearn.metalearning.mismbo", + "autosklearn.metalearning.metafeatures.metafeature", + "autosklearn.metalearning.metafeatures.metafeatures", + "autosklearn.metalearning.metalearning.meta_base", + "autosklearn.metalearning.metalearning.metrics.misc", + "autosklearn.metalearning.metalearning.create_datasets", + "autosklearn.metalearning.metalearning.kNearestDatasets.kND", + "autosklearn.metalearning.metalearning.clustering.gmeans", + "autosklearn.metalearning.optimizers.optimizer_base", + "autosklearn.metalearning.optimizers.metalearn_optimizer.metalearn_optimizer_parser", + "autosklearn.metalearning.optimizers.metalearn_optimizer.metalearner", + "autosklearn.pipeline.base", + "autosklearn.pipeline.classification", + "autosklearn.pipeline.regression", + "autosklearn.pipeline.components.base", + "autosklearn.pipeline.components.data_preprocessing.*", + "autosklearn.pipeline.components.regression.*", + "autosklearn.pipeline.components.classification.*", + "autosklearn.pipeline.components.feature_preprocessing.*", + "autosklearn.pipeline.util", + "autosklearn.pipeline.logging_", + "autosklearn.pipeline.create_searchspace_util", + "autosklearn.pipeline.implementations.util", + "autosklearn.pipeline.implementations.SparseOneHotEncoder", + "autosklearn.pipeline.implementations.MinorityCoalescer", + "autosklearn.pipeline.implementations.CategoryShift", + "autosklearn.experimental.selector", + "autosklearn.data.validation", + "autosklearn.data.abstract_data_manager", + "autosklearn.data.xy_data_manager", + 
"autosklearn.data.target_validator", + "autosklearn.data.feature_validator", + "autosklearn.util.single_threaded_client", + "autosklearn.util.logging_", +] +ignore_errors = true + +# Packages without exported types +[[tool.mypy.overrides]] +module = [ + "sklearn.*", + "dask.*", + "ConfigSpace.*", + "arff.*", + "scipy.*", + "smac.*", + "pandas.*", + "pynisher.*", + "distro.*", + "joblib.*", + "threadpoolctl.*", + "setuptools.*", + "pkg_resources.*", + "yaml.*", +] +ignore_missing_imports = true + diff --git a/pytest.ini b/pytest.ini deleted file mode 100644 index 049e247a21..0000000000 --- a/pytest.ini +++ /dev/null @@ -1,3 +0,0 @@ -[pytest] -testpaths = - test diff --git a/scripts/01_create_commands.py b/scripts/01_create_commands.py index c6e28c606b..72e406d3d7 100644 --- a/scripts/01_create_commands.py +++ b/scripts/01_create_commands.py @@ -5,45 +5,49 @@ import openml -sys.path.append('.') +sys.path.append(".") from update_metadata_util import classification_tasks, regression_tasks parser = argparse.ArgumentParser() -parser.add_argument('--working-directory', type=str, required=True) -parser.add_argument('--test', action='store_true') +parser.add_argument("--working-directory", type=str, required=True) +parser.add_argument("--test", action="store_true") args = parser.parse_args() working_directory = args.working_directory test = args.test -command_file_name = os.path.join(working_directory, 'metadata_commands.txt') +command_file_name = os.path.join(working_directory, "metadata_commands.txt") this_directory = os.path.dirname(os.path.abspath(__file__)) -script_name = 'run_auto-sklearn_for_metadata_generation.py' +script_name = "run_auto-sklearn_for_metadata_generation.py" absolute_script_name = os.path.join(this_directory, script_name) commands = [] -for task_id in (classification_tasks if not test else (233, 245, 258)): - for metric in ('accuracy', 'balanced_accuracy', 'roc_auc', 'logloss'): +for task_id in classification_tasks if not test else (233, 245, 258): + for metric in ("accuracy", "balanced_accuracy", "roc_auc", "logloss"): if ( len(openml.tasks.get_task(task_id, download_data=False).class_labels) > 2 - and metric == 'roc_auc' + and metric == "roc_auc" ): continue - command = ('python3 %s --working-directory %s --time-limit 86400 ' - '--per-run-time-limit 1800 --task-id %d -s 1 --metric %s' % - (absolute_script_name, working_directory, task_id, metric)) + command = ( + "python3 %s --working-directory %s --time-limit 86400 " + "--per-run-time-limit 1800 --task-id %d -s 1 --metric %s" + % (absolute_script_name, working_directory, task_id, metric) + ) commands.append(command) -for task_id in (regression_tasks if not test else (360029, 360033)): - for metric in ('r2', 'root_mean_squared_error', 'mean_absolute_error'): - command = ('python3 %s --working-directory %s --time-limit 86400 ' - '--per-run-time-limit 1800 --task-id %d -s 1 --metric %s' % - (absolute_script_name, working_directory, task_id, metric)) +for task_id in regression_tasks if not test else (360029, 360033): + for metric in ("r2", "root_mean_squared_error", "mean_absolute_error"): + command = ( + "python3 %s --working-directory %s --time-limit 86400 " + "--per-run-time-limit 1800 --task-id %d -s 1 --metric %s" + % (absolute_script_name, working_directory, task_id, metric) + ) commands.append(command) -with open(command_file_name, 'w') as fh: +with open(command_file_name, "w") as fh: for command in commands: fh.writelines(command) - fh.write('\n') + fh.write("\n") diff --git a/scripts/02_retrieve_metadata.py 
b/scripts/02_retrieve_metadata.py index 611b190dfa..f87f65ecc4 100644 --- a/scripts/02_retrieve_metadata.py +++ b/scripts/02_retrieve_metadata.py @@ -16,8 +16,9 @@ from autosklearn.util import pipeline -def retrieve_matadata(validation_directory, metric, configuration_space, - cutoff=0, only_best=True): +def retrieve_matadata( + validation_directory, metric, configuration_space, cutoff=0, only_best=True +): if not only_best: raise NotImplementedError() if cutoff > 0: @@ -29,9 +30,9 @@ def retrieve_matadata(validation_directory, metric, configuration_space, configurations_to_ids = dict() try: - validation_trajectory_files = glob.glob(os.path.join( - validation_directory, '*', '*', 'validation_trajectory_*.json' - )) + validation_trajectory_files = glob.glob( + os.path.join(validation_directory, "*", "*", "validation_trajectory_*.json") + ) except FileNotFoundError: return {}, {} @@ -66,7 +67,8 @@ def retrieve_matadata(validation_directory, metric, configuration_space, try: best_configuration = Configuration( - configuration_space=configuration_space, values=config) + configuration_space=configuration_space, values=config + ) best_value = score best_configuration_dir = validation_trajectory_file except Exception as e: @@ -74,18 +76,22 @@ def retrieve_matadata(validation_directory, metric, configuration_space, n_broken += 1 if task_name is None: - print('Could not find any configuration better than the default configuration!') + print( + "Could not find any configuration better than the default configuration!" + ) continue if best_configuration is None: - print('Could not find a valid configuration; total %d, better %d, broken %d' - % (n_configs, n_better, n_broken)) + print( + "Could not find a valid configuration; total %d, better %d, broken %d" + % (n_configs, n_better, n_broken) + ) continue elif best_configuration in configurations_to_ids: - print('Found configuration in', best_configuration_dir) + print("Found configuration in", best_configuration_dir) config_id = configurations_to_ids[best_configuration] else: - print('Found configuration in', best_configuration_dir) + print("Found configuration in", best_configuration_dir) config_id = len(configurations_to_ids) configurations_to_ids[config_id] = best_configuration configurations[config_id] = best_configuration @@ -102,34 +108,33 @@ def retrieve_matadata(validation_directory, metric, configuration_space, return outputs, configurations -def write_output(outputs, configurations, output_dir, configuration_space, - metric): +def write_output(outputs, configurations, output_dir, configuration_space, metric): arff_object = dict() - arff_object['attributes'] = [('instance_id', 'STRING'), - ('repetition', 'NUMERIC'), - ('algorithm', 'STRING'), - (metric, 'NUMERIC'), - ('runstatus', - ['ok', 'timeout', 'memout', 'not_applicable', - 'crash', 'other'])] - arff_object['relation'] = "ALGORITHM_RUNS" - arff_object['description'] = "" + arff_object["attributes"] = [ + ("instance_id", "STRING"), + ("repetition", "NUMERIC"), + ("algorithm", "STRING"), + (metric, "NUMERIC"), + ("runstatus", ["ok", "timeout", "memout", "not_applicable", "crash", "other"]), + ] + arff_object["relation"] = "ALGORITHM_RUNS" + arff_object["description"] = "" data = [] keep_configurations = set() for dataset, (configuration_id, value) in outputs.items(): if not np.isfinite(value): - runstatus = 'not_applicable' + runstatus = "not_applicable" value = None else: - runstatus = 'ok' + runstatus = "ok" line = [dataset, 1, configuration_id + 1, value, runstatus] data.append(line) 
keep_configurations.add(configuration_id) - arff_object['data'] = data + arff_object["data"] = data with open(os.path.join(output_dir, "algorithm_runs.arff"), "w") as fh: arff.dump(arff_object, fh) @@ -139,7 +144,7 @@ def write_output(outputs, configurations, output_dir, configuration_space, if idx not in keep_configurations: continue configuration = configurations[idx] - line = {'idx': idx + 1} + line = {"idx": idx + 1} for hp_name in configuration: value = configuration[hp_name] if value is not None: @@ -147,7 +152,7 @@ def write_output(outputs, configurations, output_dir, configuration_space, hyperparameters.append(line) - fieldnames = ['idx'] + fieldnames = ["idx"] for hyperparameter in configuration_space.get_hyperparameters(): fieldnames.append(hyperparameter.name) fieldnames = [fieldnames[0]] + sorted(fieldnames[1:]) @@ -158,16 +163,17 @@ def write_output(outputs, configurations, output_dir, configuration_space, csv_writer.writerow(line) description = dict() - description['algorithms_deterministic'] = \ - ",".join([str(configuration_id + 1) - for configuration_id in sorted(configurations.keys())]) - description['algorithms_stochastic'] = \ - ",".join([]) - description['performance_measures'] = metric - description['performance_type'] = 'solution_quality' - - with open(os.path.join(output_dir, "description.results.txt"), - "w") as fh: + description["algorithms_deterministic"] = ",".join( + [ + str(configuration_id + 1) + for configuration_id in sorted(configurations.keys()) + ] + ) + description["algorithms_stochastic"] = ",".join([]) + description["performance_measures"] = metric + description["performance_type"] = "solution_quality" + + with open(os.path.join(output_dir, "description.results.txt"), "w") as fh: for key in description: fh.write("%s: %s\n" % (key, description[key])) @@ -184,44 +190,56 @@ def main(): cutoff = args.cutoff only_best = args.only_best - for task_type in ('classification', 'regression'): - if task_type == 'classification': + for task_type in ("classification", "regression"): + if task_type == "classification": metadata_sets = itertools.product( - [0, 1], [BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION], - CLASSIFICATION_METRICS) - input_directory = os.path.join(working_directory, 'configuration', - 'classification') - elif task_type == 'regression': - metadata_sets = itertools.product( - [0, 1], [REGRESSION], REGRESSION_METRICS) - input_directory = os.path.join(working_directory, 'configuration', - 'regression') + [0, 1], + [BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION], + CLASSIFICATION_METRICS, + ) + input_directory = os.path.join( + working_directory, "configuration", "classification" + ) + elif task_type == "regression": + metadata_sets = itertools.product([0, 1], [REGRESSION], REGRESSION_METRICS) + input_directory = os.path.join( + working_directory, "configuration", "regression" + ) else: raise ValueError(task_type) - output_dir = os.path.join(working_directory, 'configuration_results') + output_dir = os.path.join(working_directory, "configuration_results") for sparse, task, metric in metadata_sets: print(TASK_TYPES_TO_STRING[task], metric, sparse) - output_dir_ = os.path.join(output_dir, '%s_%s_%s' % ( - metric, TASK_TYPES_TO_STRING[task], - 'sparse' if sparse else 'dense')) + output_dir_ = os.path.join( + output_dir, + "%s_%s_%s" + % (metric, TASK_TYPES_TO_STRING[task], "sparse" if sparse else "dense"), + ) configuration_space = pipeline.get_configuration_space( - {'is_sparse': sparse, 'task': task}) + {"is_sparse": sparse, "task": task} + ) 
outputs, configurations = retrieve_matadata( validation_directory=input_directory, metric=metric, cutoff=cutoff, configuration_space=configuration_space, - only_best=only_best) + only_best=only_best, + ) if len(outputs) == 0: - print("No output found for %s, %s, %s" % - (metric, TASK_TYPES_TO_STRING[task], - 'sparse' if sparse else 'dense')) + print( + "No output found for %s, %s, %s" + % ( + metric, + TASK_TYPES_TO_STRING[task], + "sparse" if sparse else "dense", + ) + ) continue try: @@ -229,8 +247,9 @@ def main(): except: pass - write_output(outputs, configurations, output_dir_, - configuration_space, metric) + write_output( + outputs, configurations, output_dir_, configuration_space, metric + ) if __name__ == "__main__": diff --git a/scripts/03_calculate_metafeatures.py b/scripts/03_calculate_metafeatures.py index 1d058c5dae..3b32dde8e3 100644 --- a/scripts/03_calculate_metafeatures.py +++ b/scripts/03_calculate_metafeatures.py @@ -11,15 +11,22 @@ import numpy as np import pandas as pd -from autosklearn.constants import BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION, REGRESSION +from autosklearn.constants import ( + BINARY_CLASSIFICATION, + MULTICLASS_CLASSIFICATION, + REGRESSION, +) from autosklearn.metalearning.metafeatures import metafeatures -from autosklearn.smbo import _calculate_metafeatures, _calculate_metafeatures_encoded, \ - EXCLUDE_META_FEATURES_REGRESSION, EXCLUDE_META_FEATURES_CLASSIFICATION +from autosklearn.smbo import ( + _calculate_metafeatures, + _calculate_metafeatures_encoded, + EXCLUDE_META_FEATURES_REGRESSION, + EXCLUDE_META_FEATURES_CLASSIFICATION, +) from autosklearn.util.stopwatch import StopWatch -sys.path.append('.') -from update_metadata_util import load_task, classification_tasks, \ - regression_tasks +sys.path.append(".") +from update_metadata_util import load_task, classification_tasks, regression_tasks logger = logging.getLogger("03_calculate_metafeatures") @@ -28,7 +35,7 @@ def calculate_metafeatures(task_id): X_train, y_train, X_test, y_test, cat, task_type, dataset_name = load_task(task_id) watch = StopWatch() - if task_type == 'classification': + if task_type == "classification": if len(np.unique(y_train)) == 2: task_type = BINARY_CLASSIFICATION else: @@ -37,20 +44,27 @@ def calculate_metafeatures(task_id): task_type = REGRESSION _metafeatures_labels = _calculate_metafeatures( - x_train=X_train, y_train=y_train, data_feat_type=cat, - data_info_task=task_type, basename=dataset_name, logger_=logger, + x_train=X_train, + y_train=y_train, + data_feat_type=cat, + data_info_task=task_type, + basename=dataset_name, + logger_=logger, watcher=watch, ) _metafeatures_encoded_labels = _calculate_metafeatures_encoded( - x_train=X_train, y_train=y_train, data_feat_type=cat, - task=task_type, basename=dataset_name, logger_=logger, + x_train=X_train, + y_train=y_train, + data_feat_type=cat, + task=task_type, + basename=dataset_name, + logger_=logger, watcher=watch, ) mf = _metafeatures_labels - mf.metafeature_values.update( - _metafeatures_encoded_labels.metafeature_values) + mf.metafeature_values.update(_metafeatures_encoded_labels.metafeature_values) return mf @@ -59,15 +73,15 @@ def calculate_metafeatures(task_id): parser = ArgumentParser() parser.add_argument("--working-directory", type=str, required=True) parser.add_argument("--memory-limit", type=int, default=3072) - parser.add_argument("--test-mode", action='store_true') + parser.add_argument("--test-mode", action="store_true") args = parser.parse_args() working_directory = args.working_directory 
memory_limit = args.memory_limit test_mode = args.test_mode - for task_type in ('classification', 'regression'): - output_directory = os.path.join(working_directory, 'metafeatures', task_type) + for task_type in ("classification", "regression"): + output_directory = os.path.join(working_directory, "metafeatures", task_type) try: os.makedirs(output_directory) except: @@ -75,7 +89,7 @@ def calculate_metafeatures(task_id): all_metafeatures = {} - if task_type == 'classification': + if task_type == "classification": tasks = classification_tasks else: tasks = regression_tasks @@ -90,12 +104,9 @@ def producer(): for task_id in tasks: yield task_id - memory = joblib.Memory(location='/tmp/joblib', verbose=10) + memory = joblib.Memory(location="/tmp/joblib", verbose=10) cached_calculate_metafeatures = memory.cache(calculate_metafeatures) - mfs = [ - cached_calculate_metafeatures(task_id) - for task_id in producer() - ] + mfs = [cached_calculate_metafeatures(task_id) for task_id in producer()] for mf in mfs: if mf is not None: @@ -110,45 +121,50 @@ def producer(): for i, task_id in enumerate(all_metafeatures): calculation_times[task_id] = dict() for metafeature_name in sorted( - all_metafeatures[task_id].metafeature_values): + all_metafeatures[task_id].metafeature_values + ): metafeature_value = all_metafeatures[task_id].metafeature_values[ - metafeature_name] - calculation_times[task_id][metafeature_name] = \ - metafeature_value.time + metafeature_name + ] + calculation_times[task_id][metafeature_name] = metafeature_value.time if metafeature_value.type_ == "HELPERFUNCTION": - helperfunction_values[task_id][metafeature_name] = \ - metafeature_value.value + helperfunction_values[task_id][ + metafeature_name + ] = metafeature_value.value else: - metafeature_values[task_id][metafeature_name] = \ - metafeature_value.value + metafeature_values[task_id][ + metafeature_name + ] = metafeature_value.value calculation_times = pd.DataFrame(calculation_times).transpose() calculation_times = calculation_times.sort_index() - with open(os.path.join(output_directory, "calculation_times.csv"), - "w") as fh: + with open(os.path.join(output_directory, "calculation_times.csv"), "w") as fh: fh.write(calculation_times.to_csv()) # Write all metafeatures in the aslib1.0 format - metafeature_values = metafeature_values = pd.DataFrame(metafeature_values).transpose() + metafeature_values = metafeature_values = pd.DataFrame( + metafeature_values + ).transpose() metafeature_values = metafeature_values.sort_index() arff_object = dict() - arff_object['attributes'] = [('instance_id', 'STRING'), - ('repetition', 'NUMERIC')] + \ - [('%s' % name, 'NUMERIC') for name in - metafeature_values.columns] - arff_object['relation'] = "FEATURE_VALUES" - arff_object['description'] = "" + arff_object["attributes"] = [ + ("instance_id", "STRING"), + ("repetition", "NUMERIC"), + ] + [("%s" % name, "NUMERIC") for name in metafeature_values.columns] + arff_object["relation"] = "FEATURE_VALUES" + arff_object["description"] = "" data = [] for idx in metafeature_values.index: line = [idx, 1] - line += [value if np.isfinite(value) else None - for value in metafeature_values.loc[idx, :].values] + line += [ + value if np.isfinite(value) else None + for value in metafeature_values.loc[idx, :].values + ] data.append(line) - arff_object['data'] = data + arff_object["data"] = data - with open(os.path.join(output_directory, "feature_values.arff"), - "w") as fh: + with open(os.path.join(output_directory, "feature_values.arff"), "w") as fh: 
arff.dump(arff_object, fh) # Feature steps and runtimes according to the aslib1.0 format @@ -157,7 +173,8 @@ def producer(): exclude_metafeatures = ( EXCLUDE_META_FEATURES_CLASSIFICATION - if task_type == 'classification' else EXCLUDE_META_FEATURES_REGRESSION + if task_type == "classification" + else EXCLUDE_META_FEATURES_REGRESSION ) for metafeature_name in metafeatures.metafeatures.functions: @@ -174,42 +191,48 @@ def producer(): # Write the feature runstatus in the aslib1.0 format arff_object = dict() - arff_object['attributes'] = [('instance_id', 'STRING'), - ('repetition', 'NUMERIC')] + \ - [('%s' % name, - ['ok', 'timeout', 'memout', 'presolved', - 'crash', 'other']) - for name in feature_steps] - arff_object['relation'] = "FEATURE_RUNSTATUS" - arff_object['description'] = "" + arff_object["attributes"] = [ + ("instance_id", "STRING"), + ("repetition", "NUMERIC"), + ] + [ + ("%s" % name, ["ok", "timeout", "memout", "presolved", "crash", "other"]) + for name in feature_steps + ] + arff_object["relation"] = "FEATURE_RUNSTATUS" + arff_object["description"] = "" data = [] for idx in metafeature_values.index: line = [idx, 1] for feature_step in feature_steps: if feature_step in helperfunction_values[idx]: - line.append('ok' if helperfunction_values[feature_step] is not \ - None else 'other') + line.append( + "ok" + if helperfunction_values[feature_step] is not None + else "other" + ) elif feature_step in metafeature_values.loc[idx]: - line.append('ok' if np.isfinite(metafeature_values.loc[idx][ - feature_step]) else 'other') + line.append( + "ok" + if np.isfinite(metafeature_values.loc[idx][feature_step]) + else "other" + ) else: - line.append('other') + line.append("other") data.append(line) - arff_object['data'] = data + arff_object["data"] = data - with open(os.path.join(output_directory, "feature_runstatus.arff"), - "w") as fh: + with open(os.path.join(output_directory, "feature_runstatus.arff"), "w") as fh: arff.dump(arff_object, fh) arff_object = dict() - arff_object['attributes'] = [('instance_id', 'STRING'), - ('repetition', 'NUMERIC')] + \ - [('%s' % feature_step, 'NUMERIC') for - feature_step in feature_steps] - arff_object['relation'] = "FEATURE_COSTS" - arff_object['description'] = "" + arff_object["attributes"] = [ + ("instance_id", "STRING"), + ("repetition", "NUMERIC"), + ] + [("%s" % feature_step, "NUMERIC") for feature_step in feature_steps] + arff_object["relation"] = "FEATURE_COSTS" + arff_object["description"] = "" data = [] for instance_id in calculation_times.index: @@ -220,33 +243,35 @@ def producer(): for feature in feature_steps[feature_step]: time_ += calculation_times[feature][instance_id] if not np.isfinite(time_): - raise ValueError("Feature cost %s for instance %s and feature " - "step %s not finite" % (time_, instance_id, feature)) + raise ValueError( + "Feature cost %s for instance %s and feature " + "step %s not finite" % (time_, instance_id, feature) + ) line.append(time_) data.append(line) - arff_object['data'] = data + arff_object["data"] = data - with open(os.path.join(output_directory, "feature_costs.arff"), - "w") as fh: + with open(os.path.join(output_directory, "feature_costs.arff"), "w") as fh: arff.dump(arff_object, fh) # Write the features part of the description.txt to a file description = OrderedDict() - description['features_cutoff_time'] = '3600' - description['features_cutoff_memory'] = args.memory_limit - description['number_of_feature_steps'] = str(len(feature_steps)) + description["features_cutoff_time"] = "3600" + 
description["features_cutoff_memory"] = args.memory_limit + description["number_of_feature_steps"] = str(len(feature_steps)) for feature_step in feature_steps: - description['feature_step %s' % feature_step] = \ - ", ".join(feature_steps[feature_step]) - description['features_deterministic'] = ", ".join([ - metafeature_name for - metafeature_name in - metafeature_names]) - description['features_stochastic'] = '' - description['default_steps'] = ", ".join(feature_steps) - - with open(os.path.join(output_directory, - "description.features.txt"), "w") as fh: + description["feature_step %s" % feature_step] = ", ".join( + feature_steps[feature_step] + ) + description["features_deterministic"] = ", ".join( + [metafeature_name for metafeature_name in metafeature_names] + ) + description["features_stochastic"] = "" + description["default_steps"] = ", ".join(feature_steps) + + with open( + os.path.join(output_directory, "description.features.txt"), "w" + ) as fh: for entry in description: fh.write("%s: %s\n" % (entry, description[entry])) diff --git a/scripts/04_create_aslib_files.py b/scripts/04_create_aslib_files.py index d5e10a9c15..8c83dc1648 100644 --- a/scripts/04_create_aslib_files.py +++ b/scripts/04_create_aslib_files.py @@ -10,16 +10,16 @@ if __name__ == "__main__": parser = ArgumentParser() parser.add_argument("--working-directory", type=str, required=True) - parser.add_argument("--scenario_id", type=str, default='auto-sklearn') + parser.add_argument("--scenario_id", type=str, default="auto-sklearn") parser.add_argument("--algorithm_cutoff_time", type=int, default=1800) parser.add_argument("--algorithm_cutoff_memory", type=int, default=3072) args = parser.parse_args() working_directory = args.working_directory - output_dir = os.path.join(working_directory, 'metadata') - results_dir = os.path.join(working_directory, 'configuration_results') - metafeatures_dir = os.path.join(working_directory, 'metafeatures') + output_dir = os.path.join(working_directory, "metadata") + results_dir = os.path.join(working_directory, "configuration_results") + metafeatures_dir = os.path.join(working_directory, "metafeatures") scenario_id = args.scenario_id algorithm_cutoff_time = args.algorithm_cutoff_time @@ -31,25 +31,29 @@ except (OSError, IOError): pass - for task_type in ('classification', 'regression'): - if task_type == 'classification': + for task_type in ("classification", "regression"): + if task_type == "classification": metadata_sets = itertools.product( - [0, 1], [BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION], - CLASSIFICATION_METRICS) - elif task_type == 'regression': - metadata_sets = itertools.product( - [0, 1], [REGRESSION], REGRESSION_METRICS) + [0, 1], + [BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION], + CLASSIFICATION_METRICS, + ) + elif task_type == "regression": + metadata_sets = itertools.product([0, 1], [REGRESSION], REGRESSION_METRICS) else: raise ValueError(task_type) - input_directory = os.path.join(working_directory, 'configuration', task_type) + input_directory = os.path.join(working_directory, "configuration", task_type) metafeatures_dir_for_task = os.path.join(metafeatures_dir, task_type) for sparse, task, metric in metadata_sets: print(TASK_TYPES_TO_STRING[task], metric, sparse) - dir_name = '%s_%s_%s' % (metric, TASK_TYPES_TO_STRING[task], - 'sparse' if sparse else 'dense') + dir_name = "%s_%s_%s" % ( + metric, + TASK_TYPES_TO_STRING[task], + "sparse" if sparse else "dense", + ) output_dir_ = os.path.join(output_dir, dir_name) results_dir_ = os.path.join(results_dir, 
dir_name) @@ -67,21 +71,19 @@ pass # Create description.txt - with open(os.path.join(metafeatures_dir_for_task, - "description.features.txt")) as fh: + with open( + os.path.join(metafeatures_dir_for_task, "description.features.txt") + ) as fh: description_metafeatures = fh.read() - with open(os.path.join(results_dir_, - "description.results.txt")) as fh: + with open(os.path.join(results_dir_, "description.results.txt")) as fh: description_results = fh.read() description = [description_metafeatures, description_results] description.append("scenario_id: %s" % scenario_id) description.append("maximize: false") - description.append( - "algorithm_cutoff_time: %d" % algorithm_cutoff_time) - description.append( - "algorithm_cutoff_memory: %d" % algorithm_cutoff_memory) + description.append("algorithm_cutoff_time: %d" % algorithm_cutoff_time) + description.append("algorithm_cutoff_memory: %d" % algorithm_cutoff_memory) with open(os.path.join(output_dir_, "description.txt"), "w") as fh: for line in description: @@ -89,59 +91,54 @@ fh.write("\n") # Copy feature values and add instance id - with open(os.path.join(metafeatures_dir_for_task, - "feature_values.arff")) as fh: + with open( + os.path.join(metafeatures_dir_for_task, "feature_values.arff") + ) as fh: feature_values = arff.load(fh) - feature_values['relation'] = scenario_id + "_" + feature_values[ - 'relation'] + feature_values["relation"] = scenario_id + "_" + feature_values["relation"] - with open(os.path.join(output_dir_, "feature_values.arff"), - "w") as fh: + with open(os.path.join(output_dir_, "feature_values.arff"), "w") as fh: arff.dump(feature_values, fh) # Copy feature runstatus and add instance id - with open(os.path.join(metafeatures_dir_for_task, - "feature_runstatus.arff")) as fh: + with open( + os.path.join(metafeatures_dir_for_task, "feature_runstatus.arff") + ) as fh: feature_runstatus = arff.load(fh) - feature_runstatus['relation'] = scenario_id + "_" + \ - feature_runstatus['relation'] + feature_runstatus["relation"] = ( + scenario_id + "_" + feature_runstatus["relation"] + ) - with open(os.path.join(output_dir_, "feature_runstatus.arff"), "w") \ - as fh: + with open(os.path.join(output_dir_, "feature_runstatus.arff"), "w") as fh: arff.dump(feature_runstatus, fh) # Copy feature runstatus and add instance id with open( - os.path.join(metafeatures_dir_for_task, "feature_costs.arff")) as fh: + os.path.join(metafeatures_dir_for_task, "feature_costs.arff") + ) as fh: feature_costs = arff.load(fh) - feature_costs['relation'] = scenario_id + "_" + feature_costs[ - 'relation'] - for i in range(len(feature_costs['data'])): - for j in range(2, len(feature_costs['data'][i])): - feature_costs['data'][i][j] = \ - round(feature_costs['data'][i][j], 5) + feature_costs["relation"] = scenario_id + "_" + feature_costs["relation"] + for i in range(len(feature_costs["data"])): + for j in range(2, len(feature_costs["data"][i])): + feature_costs["data"][i][j] = round(feature_costs["data"][i][j], 5) - with open(os.path.join(output_dir_, "feature_costs.arff"), "w") \ - as fh: + with open(os.path.join(output_dir_, "feature_costs.arff"), "w") as fh: arff.dump(feature_costs, fh) # Copy algorithm runs and add instance id with open(os.path.join(results_dir_, "algorithm_runs.arff")) as fh: algorithm_runs = arff.load(fh) - algorithm_runs['relation'] = scenario_id + "_" + algorithm_runs[ - 'relation'] + algorithm_runs["relation"] = scenario_id + "_" + algorithm_runs["relation"] - with open(os.path.join(output_dir_, "algorithm_runs.arff"), "w") \ - as fh: 
+ with open(os.path.join(output_dir_, "algorithm_runs.arff"), "w") as fh: arff.dump(algorithm_runs, fh) # Copy configurations file with open(os.path.join(results_dir_, "configurations.csv")) as fh: algorithm_runs = fh.read() - with open(os.path.join(output_dir_, "configurations.csv"), "w") \ - as fh: + with open(os.path.join(output_dir_, "configurations.csv"), "w") as fh: fh.write(algorithm_runs) diff --git a/scripts/2015_nips_paper/plot/plot_ranks.py b/scripts/2015_nips_paper/plot/plot_ranks.py index 5be095389c..b2e85248b7 100644 --- a/scripts/2015_nips_paper/plot/plot_ranks.py +++ b/scripts/2015_nips_paper/plot/plot_ranks.py @@ -17,8 +17,8 @@ def read_csv(fn, has_header=True, data_type=str): """ data = list() header = None - with open(fn, 'r') as csvfile: - csv_reader = csv.reader(csvfile, delimiter=',', quotechar='|') + with open(fn, "r") as csvfile: + csv_reader = csv.reader(csvfile, delimiter=",", quotechar="|") for row in csv_reader: if header is None and has_header: header = row @@ -37,7 +37,7 @@ def fill_trajectory(performance_list, time_list): series = pd.concat(series_list, axis=1) # Fill missing performance values (NaNs) with last non-NaN value. - series = series.fillna(method='ffill') + series = series.fillna(method="ffill") # return the trajectories over seeds (series object) return series @@ -52,10 +52,10 @@ def main(): working_directory = "../log_output" # list of models - model_list = ['vanilla', 'ensemble', 'metalearning', 'meta_ensemble'] + model_list = ["vanilla", "ensemble", "metalearning", "meta_ensemble"] # list of seeds - seed_dir = os.path.join(working_directory, 'vanilla') + seed_dir = os.path.join(working_directory, "vanilla") seed_list = [seed for seed in os.listdir(seed_dir)] # list of tasks @@ -74,21 +74,23 @@ def main(): for seed in seed_list: # collect all csv files of different seeds for current model and # current task. 
- if model in ['vanilla', 'ensemble']: - csv_file = os.path.join(working_directory, - 'vanilla', - seed, - task_id, - "score_{}.csv".format(model) - ) - - elif model in ['metalearning', 'meta_ensemble']: - csv_file = os.path.join(working_directory, - 'metalearning', - seed, - task_id, - "score_{}.csv".format(model), - ) + if model in ["vanilla", "ensemble"]: + csv_file = os.path.join( + working_directory, + "vanilla", + seed, + task_id, + "score_{}.csv".format(model), + ) + + elif model in ["metalearning", "meta_ensemble"]: + csv_file = os.path.join( + working_directory, + "metalearning", + seed, + task_id, + "score_{}.csv".format(model), + ) csv_files.append(csv_file) performance_list = [] @@ -99,8 +101,9 @@ def main(): _, csv_data = read_csv(fl, has_header=True) csv_data = np.array(csv_data) # Replace too high values with args.maxsize - data = [min([sys.maxsize, float(i.strip())]) for i in - csv_data[:, 2]] # test trajectories are stored in third column + data = [ + min([sys.maxsize, float(i.strip())]) for i in csv_data[:, 2] + ] # test trajectories are stored in third column time_steps = [float(i.strip()) for i in csv_data[:, 0]] assert time_steps[0] == 0 @@ -123,15 +126,16 @@ def main(): n_tasks = len(task_list) for i in range(n_iter): - pick = np.random.choice(all_trajectories[0][0].shape[1], - size=(len(model_list))) + pick = np.random.choice(all_trajectories[0][0].shape[1], size=(len(model_list))) for j in range(n_tasks): all_trajectories_tmp = pd.DataFrame( - {model_list[k]: at[j].iloc[:, pick[k]] for - k, at in enumerate(all_trajectories)} + { + model_list[k]: at[j].iloc[:, pick[k]] + for k, at in enumerate(all_trajectories) + } ) - all_trajectories_tmp = all_trajectories_tmp.fillna(method='ffill', axis=0) + all_trajectories_tmp = all_trajectories_tmp.fillna(method="ffill", axis=0) r_tmp = all_trajectories_tmp.rank(axis=1) all_rankings.append(r_tmp) @@ -141,7 +145,7 @@ def main(): for ranking in all_rankings: ranks_for_model.append(ranking.loc[:, model]) ranks_for_model = pd.DataFrame(ranks_for_model) - ranks_for_model = ranks_for_model.fillna(method='ffill', axis=1) + ranks_for_model = ranks_for_model.fillna(method="ffill", axis=1) final_ranks.append(ranks_for_model.mean(skipna=True)) # Step 3. Plot the average ranks over time. 
@@ -155,8 +159,8 @@ def main(): X_data.append(max_runtime) y_data.append(y) plt.plot(X_data, y_data, label=model) - plt.xlabel('time [sec]') - plt.ylabel('average rank') + plt.xlabel("time [sec]") + plt.ylabel("average rank") plt.legend() plt.savefig(saveto) diff --git a/scripts/2015_nips_paper/run/remove_dataset_from_metadata.py b/scripts/2015_nips_paper/run/remove_dataset_from_metadata.py index f31e16e65f..d16e67e23c 100644 --- a/scripts/2015_nips_paper/run/remove_dataset_from_metadata.py +++ b/scripts/2015_nips_paper/run/remove_dataset_from_metadata.py @@ -4,25 +4,27 @@ from shutil import copyfile -def remove_dataset_from_aslib_arff(input_file, - output_file, - id, - ): +def remove_dataset_from_aslib_arff( + input_file, + output_file, + id, +): with open(input_file) as fh: arff_object = arff.load(fh) - for i in range(len(arff_object['data']) - 1, -1, -1): - if str(arff_object['data'][i][0]) == str(id): - del arff_object['data'][i] + for i in range(len(arff_object["data"]) - 1, -1, -1): + if str(arff_object["data"][i][0]) == str(id): + del arff_object["data"][i] with open(output_file, "w") as fh: arff.dump(arff_object, fh) del arff_object -def remove_dataset(metadata_directory, - output_directory, - id, - ): +def remove_dataset( + metadata_directory, + output_directory, + id, +): metadata_sub_directories = os.listdir(metadata_directory) for metadata_sub_directory in metadata_sub_directories: diff --git a/scripts/2015_nips_paper/run/run_auto_sklearn.py b/scripts/2015_nips_paper/run/run_auto_sklearn.py index 366280692e..960ab7be80 100644 --- a/scripts/2015_nips_paper/run/run_auto_sklearn.py +++ b/scripts/2015_nips_paper/run/run_auto_sklearn.py @@ -21,11 +21,12 @@ def load_task(task_id): X_test = X[test_indices] y_test = y[test_indices] dataset = openml.datasets.get_dataset(task.dataset_id) - _, _, cat = dataset.get_data(return_categorical_indicator=True, - target=task.target_name) + _, _, cat = dataset.get_data( + return_categorical_indicator=True, target=task.target_name + ) del _ del dataset - cat = ['categorical' if c else 'numerical' for c in cat] + cat = ["categorical" if c else "numerical" for c in cat] unique = np.unique(y_train) mapping = {unique_value: i for i, unique_value in enumerate(unique)} @@ -35,13 +36,14 @@ def load_task(task_id): return X_train, y_train, X_test, y_test, cat -def run_experiment(working_directory, - time_limit, - per_run_time_limit, - task_id, - seed, - use_metalearning, - ): +def run_experiment( + working_directory, + time_limit, + per_run_time_limit, + task_id, + seed, + use_metalearning, +): # set this to local dataset cache # openml.config.cache_directory = os.path.join(working_directory, "../cache") @@ -57,12 +59,14 @@ def run_experiment(working_directory, if use_metalearning is True: # path to the original metadata directory. metadata_directory = os.path.abspath(os.path.dirname(__file__)) - metadata_directory = os.path.join(metadata_directory, - "../../../autosklearn/metalearning/files/") + metadata_directory = os.path.join( + metadata_directory, "../../../autosklearn/metalearning/files/" + ) # Create new metadata directory not containing task_id. 
- new_metadata_directory = os.path.abspath(os.path.join(working_directory, - "metadata_%i" % task_id)) + new_metadata_directory = os.path.abspath( + os.path.join(working_directory, "metadata_%i" % task_id) + ) try: os.makedirs(new_metadata_directory) @@ -73,100 +77,105 @@ def run_experiment(working_directory, remove_dataset(metadata_directory, new_metadata_directory, task_id) automl_arguments = { - 'time_left_for_this_task': time_limit, - 'per_run_time_limit': per_run_time_limit, - 'initial_configurations_via_metalearning': 25, - 'ensemble_size': 0, - 'seed': seed, - 'memory_limit': 3072, - 'resampling_strategy': 'holdout', - 'resampling_strategy_arguments': {'train_size': 0.67}, - 'tmp_folder': tmp_dir, - 'delete_tmp_folder_after_terminate': False, - 'disable_evaluator_output': False, - 'metadata_directory': new_metadata_directory + "time_left_for_this_task": time_limit, + "per_run_time_limit": per_run_time_limit, + "initial_configurations_via_metalearning": 25, + "ensemble_size": 0, + "seed": seed, + "memory_limit": 3072, + "resampling_strategy": "holdout", + "resampling_strategy_arguments": {"train_size": 0.67}, + "tmp_folder": tmp_dir, + "delete_tmp_folder_after_terminate": False, + "disable_evaluator_output": False, + "metadata_directory": new_metadata_directory, } # Without metalearning else: automl_arguments = { - 'time_left_for_this_task': time_limit, - 'per_run_time_limit': per_run_time_limit, - 'initial_configurations_via_metalearning': 0, - 'ensemble_size': 0, - 'seed': seed, - 'memory_limit': 3072, - 'resampling_strategy': 'holdout', - 'resampling_strategy_arguments': {'train_size': 0.67}, - 'tmp_folder': tmp_dir, - 'delete_tmp_folder_after_terminate': False, - 'disable_evaluator_output': False, + "time_left_for_this_task": time_limit, + "per_run_time_limit": per_run_time_limit, + "initial_configurations_via_metalearning": 0, + "ensemble_size": 0, + "seed": seed, + "memory_limit": 3072, + "resampling_strategy": "holdout", + "resampling_strategy_arguments": {"train_size": 0.67}, + "tmp_folder": tmp_dir, + "delete_tmp_folder_after_terminate": False, + "disable_evaluator_output": False, } automl = AutoSklearnClassifier(**automl_arguments) X_train, y_train, X_test, y_test, cat = load_task(task_id) - automl.fit(X_train, y_train, - dataset_name=str(task_id), - X_test=X_test, y_test=y_test, - metric=balanced_accuracy) + automl.fit( + X_train, + y_train, + dataset_name=str(task_id), + X_test=X_test, + y_test=y_test, + metric=balanced_accuracy, + ) -def main(working_directory, - output_file, - task_id, - seed, - model, - time_limit, - per_run_time_limit): +def main( + working_directory, output_file, task_id, seed, model, time_limit, per_run_time_limit +): # vanilla and metalearning must be called first before ensemble and # meta_ensemble can be called, respectively. 
if model == "vanilla": - run_experiment(working_directory, - time_limit, - per_run_time_limit, - task_id, - seed, - use_metalearning=False, - ) - score_ensemble.main(working_directory, - output_file, - task_id, - seed, - ensemble_size=1, - ) + run_experiment( + working_directory, + time_limit, + per_run_time_limit, + task_id, + seed, + use_metalearning=False, + ) + score_ensemble.main( + working_directory, + output_file, + task_id, + seed, + ensemble_size=1, + ) elif model == "metalearning": - run_experiment(working_directory, - time_limit, - per_run_time_limit, - task_id, - seed, - use_metalearning=True, - ) - score_ensemble.main(working_directory, - output_file, - task_id, - seed, - ensemble_size=1, - ) + run_experiment( + working_directory, + time_limit, + per_run_time_limit, + task_id, + seed, + use_metalearning=True, + ) + score_ensemble.main( + working_directory, + output_file, + task_id, + seed, + ensemble_size=1, + ) else: - score_ensemble.main(working_directory, - output_file, - task_id, - seed, - ensemble_size=50, - ) + score_ensemble.main( + working_directory, + output_file, + task_id, + seed, + ensemble_size=50, + ) if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--working-directory', type=str, required=True) + parser.add_argument("--working-directory", type=str, required=True) parser.add_argument("--output-file", type=str, required=True) parser.add_argument("--time-limit", type=int, required=True) parser.add_argument("--per-runtime-limit", type=int, required=True) - parser.add_argument('--task-id', type=int, required=True) - parser.add_argument('-s', '--seed', type=int) + parser.add_argument("--task-id", type=int, required=True) + parser.add_argument("-s", "--seed", type=int) parser.add_argument("--model", type=str, required=True) args = parser.parse_args() @@ -178,11 +187,12 @@ def main(working_directory, time_limit = args.time_limit per_run_time_limit = args.per_runtime_limit - main(working_directory, - output_file, - task_id, - seed, - model, - time_limit, - per_run_time_limit, - ) + main( + working_directory, + output_file, + task_id, + seed, + model, + time_limit, + per_run_time_limit, + ) diff --git a/scripts/2015_nips_paper/run/score_ensemble.py b/scripts/2015_nips_paper/run/score_ensemble.py index 3d10954d94..1e873f01fd 100644 --- a/scripts/2015_nips_paper/run/score_ensemble.py +++ b/scripts/2015_nips_paper/run/score_ensemble.py @@ -14,21 +14,21 @@ def _load_file(f): - split = f.split('_') + split = f.split("_") as_seed = int(split[-2]) - ta_seed = int(split[-1].split('.')[0]) + ta_seed = int(split[-1].split(".")[0]) np_array = np.load(f) return np_array, (as_seed, ta_seed), os.path.getmtime(f) def read_files(directory, seed=None, n_jobs=1): - seed_pattern = '*' if seed is None else str(seed) - glob_pattern = os.path.join(directory, "predictions_*_%s_*.npy" % - seed_pattern) + seed_pattern = "*" if seed is None else str(seed) + glob_pattern = os.path.join(directory, "predictions_*_%s_*.npy" % seed_pattern) files = sorted(glob.glob(glob_pattern)) files = joblib.Parallel(n_jobs=n_jobs, verbose=10)( - joblib.delayed(_load_file)(f=f) for f in files) + joblib.delayed(_load_file)(f=f) for f in files + ) return files @@ -38,13 +38,13 @@ def main(input_directories, output_file, task_id, seed, ensemble_size, n_jobs=1) if isinstance(input_directories, str): # add seed and task id directories - input_directories += '/%i/%i' % (seed, task_id) + input_directories += "/%i/%i" % (seed, task_id) input_directories = [input_directories] else: 
new_directories = [] for dir in input_directories: - dir += '/%i/%i' % (seed, task_id) + dir += "/%i/%i" % (seed, task_id) new_directories.append(dir) input_directories = new_directories @@ -54,28 +54,28 @@ def main(input_directories, output_file, task_id, seed, ensemble_size, n_jobs=1) # Get the prediction files. for input_directory in input_directories: - print('Loading files from input directory:', input_directory) + print("Loading files from input directory:", input_directory) validation_files_ = read_files( - os.path.join(input_directory, - '.auto-sklearn/predictions_ensemble'), - n_jobs=n_jobs) + os.path.join(input_directory, ".auto-sklearn/predictions_ensemble"), + n_jobs=n_jobs, + ) validation_files.extend(validation_files_) test_files_ = read_files( - os.path.join(input_directory, - '.auto-sklearn/predictions_test'), - n_jobs=n_jobs) + os.path.join(input_directory, ".auto-sklearn/predictions_test"), + n_jobs=n_jobs, + ) test_files.extend(test_files_) assert len(validation_files_) > 0 assert len(validation_files_) == len(test_files_) - print('Loaded %d files!' % len(validation_files_)) + print("Loaded %d files!" % len(validation_files_)) # if not specified, we get all files. - seed_pattern = '*' if seed is None else str(seed) - glob_pattern = os.path.join(input_directory, - ".auto-sklearn", - "start_time_%s" % seed_pattern) + seed_pattern = "*" if seed is None else str(seed) + glob_pattern = os.path.join( + input_directory, ".auto-sklearn", "start_time_%s" % seed_pattern + ) start_time_files = glob.glob(glob_pattern) # find the earliest startime. @@ -90,14 +90,15 @@ def main(input_directories, output_file, task_id, seed, ensemble_size, n_jobs=1) validation_files.sort(key=lambda t: t[-1]) - keys_to_test_files = {test_file[1]: test_file - for test_file in test_files} + keys_to_test_files = {test_file[1]: test_file for test_file in test_files} # Resort such that both files have the same order - test_files = [keys_to_test_files[validation_file[1]] - for validation_file in validation_files] + test_files = [ + keys_to_test_files[validation_file[1]] for validation_file in validation_files + ] assert [validation_file[1] for validation_file in validation_files] == [ - test_file[1] for test_file in test_files] + test_file[1] for test_file in test_files + ] losses = [] top_models_at_step = dict() @@ -106,7 +107,7 @@ def main(input_directories, output_file, task_id, seed, ensemble_size, n_jobs=1) temporary_directory=input_directory, output_directory=input_directory + "_output", delete_tmp_folder_after_terminate=False, - prefix="auto-sklearn" + prefix="auto-sklearn", ) valid_labels = backend.load_targets_ensemble() score = balanced_accuracy @@ -124,46 +125,63 @@ def main(input_directories, output_file, task_id, seed, ensemble_size, n_jobs=1) if top_model in models_to_remove: models_to_remove.remove(top_model) - print("Removing the following %d models from the library: %s" - % (len(models_to_remove), models_to_remove)) + print( + "Removing the following %d models from the library: %s" + % (len(models_to_remove), models_to_remove) + ) for model_id in models_to_remove: validation_files[model_id] = None test_files[model_id] = None - print('Starting ensemble building!') + print("Starting ensemble building!") output = joblib.Parallel(n_jobs=n_jobs, verbose=20)( - joblib.delayed( - evaluate)(input_directory=input_directories[0], - validation_files=[validation_files[j] for - j in range(len(validation_files)) - if j in top_models_at_step[i]], - test_files=[test_files[j] for - j in range(len(test_files)) 
- if j in top_models_at_step[i]], - ensemble_size=ensemble_size) - for i in range(len(test_files))) + joblib.delayed(evaluate)( + input_directory=input_directories[0], + validation_files=[ + validation_files[j] + for j in range(len(validation_files)) + if j in top_models_at_step[i] + ], + test_files=[ + test_files[j] + for j in range(len(test_files)) + if j in top_models_at_step[i] + ], + ensemble_size=ensemble_size, + ) + for i in range(len(test_files)) + ) # Create output csv file file_path = os.path.abspath("%s/%s" % (input_directory, output_file)) with open(file_path, "w") as csv_file: - fieldnames = ['Time', 'Training (Empirical) Performance', - 'Test Set Performance'] + fieldnames = [ + "Time", + "Training (Empirical) Performance", + "Test Set Performance", + ] csv_writer = csv.DictWriter(csv_file, fieldnames=fieldnames) csv_writer.writeheader() # First time step - csv_writer.writerow({'Time': 0, - 'Training (Empirical) Performance': 1.0, - 'Test Set Performance': 1.0}) + csv_writer.writerow( + { + "Time": 0, + "Training (Empirical) Performance": 1.0, + "Test Set Performance": 1.0, + } + ) for i, o in enumerate(output): - csv_writer.writerow({'Time': o['ensemble_time'] - + o['time_function_evaluation'] - - starttime, - 'Training (Empirical) Performance': - o['ensemble_error'], - 'Test Set Performance': - o['ensemble_test_error']}) + csv_writer.writerow( + { + "Time": o["ensemble_time"] + + o["time_function_evaluation"] + - starttime, + "Training (Empirical) Performance": o["ensemble_error"], + "Test Set Performance": o["ensemble_test_error"], + } + ) def evaluate(input_directory, validation_files, test_files, ensemble_size=50): @@ -187,18 +205,18 @@ def evaluate(input_directory, validation_files, test_files, ensemble_size=50): # Build the ensemble start = time.time() - ensemble_selection = EnsembleSelection(ensemble_size=ensemble_size, - task_type=D.info['task'], - metric=score, - random_state=np.random.RandomState()) + ensemble_selection = EnsembleSelection( + ensemble_size=ensemble_size, + task_type=D.info["task"], + metric=score, + random_state=np.random.RandomState(), + ) validation_predictions = np.array([v[0] for v in validation_files]) test_predictions = np.array([t[0] for t in test_files]) - ensemble_selection.fit(validation_predictions, valid_labels, - identifiers=None) - y_hat_ensemble = ensemble_selection.predict(np.array( - validation_predictions)) + ensemble_selection.fit(validation_predictions, valid_labels, identifiers=None) + y_hat_ensemble = ensemble_selection.predict(np.array(validation_predictions)) y_hat_test = ensemble_selection.predict(np.array(test_predictions)) # Compute validation error @@ -209,21 +227,22 @@ def evaluate(input_directory, validation_files, test_files, ensemble_size=50): ensemble_time = time.time() - start - rval = {'ensemble_time': ensemble_time, - 'time_function_evaluation': time_function_evaluation, - 'ensemble_error': ensemble_error, - 'ensemble_test_error': ensemble_test_error} + rval = { + "ensemble_time": ensemble_time, + "time_function_evaluation": time_function_evaluation, + "ensemble_error": ensemble_error, + "ensemble_test_error": ensemble_test_error, + } return rval -if __name__ == '__main__': +if __name__ == "__main__": parser = ArgumentParser() - parser.add_argument('--input-directory', type=str, - required=True, nargs='+') - parser.add_argument('--task-id', type=int, required=True) - parser.add_argument('-s', '--seed', type=int) - parser.add_argument("--output-file", type=str, default='score_ensemble.csv') + 
parser.add_argument("--input-directory", type=str, required=True, nargs="+") + parser.add_argument("--task-id", type=int, required=True) + parser.add_argument("-s", "--seed", type=int) + parser.add_argument("--output-file", type=str, default="score_ensemble.csv") parser.add_argument("--ensemble-size", type=int, default=50) parser.add_argument("--n-jobs", type=int, default=1) args = parser.parse_args() diff --git a/scripts/2015_nips_paper/setup/get_tasks.py b/scripts/2015_nips_paper/setup/get_tasks.py index 09f06a0a64..98c4ee085e 100644 --- a/scripts/2015_nips_paper/setup/get_tasks.py +++ b/scripts/2015_nips_paper/setup/get_tasks.py @@ -4,30 +4,162 @@ # List of dataset IDs used for the NIPS experiment. -dataset_ids = [1000, 1002, 1018, 1019, 1020, 1021, 1036, 1040, 1041, 1049, 1050, 1053, - 1056, 1067, 1068, 1069, 1111, 1112, 1114, 1116, 1119, 1120, 1128, 1130, - 1134, 1138, 1139, 1142, 1146, 1161, 1166, 12, 14, 16, 179, 180, 181, 182, - 184, 185, 18, 21, 22, 23, 24, 26, 273, 28, 293, 300, 30, 31, 32, 351, 354, - 357, 36, 389, 38, 390, 391, 392, 393, 395, 396, 398, 399, 3, 401, 44, 46, - 554, 57, 60, 679, 6, 715, 718, 720, 722, 723, 727, 728, 734, 735, 737, - 740, 741, 743, 751, 752, 761, 772, 797, 799, 803, 806, 807, 813, 816, 819, - 821, 822, 823, 833, 837, 843, 845, 846, 847, 849, 866, 871, 881, 897, 901, - 903, 904, 910, 912, 913, 914, 917, 923, 930, 934, 953, 958, 959, 962, 966, - 971, 976, 977, 978, 979, 980, 991, 993, 995] +dataset_ids = [ + 1000, + 1002, + 1018, + 1019, + 1020, + 1021, + 1036, + 1040, + 1041, + 1049, + 1050, + 1053, + 1056, + 1067, + 1068, + 1069, + 1111, + 1112, + 1114, + 1116, + 1119, + 1120, + 1128, + 1130, + 1134, + 1138, + 1139, + 1142, + 1146, + 1161, + 1166, + 12, + 14, + 16, + 179, + 180, + 181, + 182, + 184, + 185, + 18, + 21, + 22, + 23, + 24, + 26, + 273, + 28, + 293, + 300, + 30, + 31, + 32, + 351, + 354, + 357, + 36, + 389, + 38, + 390, + 391, + 392, + 393, + 395, + 396, + 398, + 399, + 3, + 401, + 44, + 46, + 554, + 57, + 60, + 679, + 6, + 715, + 718, + 720, + 722, + 723, + 727, + 728, + 734, + 735, + 737, + 740, + 741, + 743, + 751, + 752, + 761, + 772, + 797, + 799, + 803, + 806, + 807, + 813, + 816, + 819, + 821, + 822, + 823, + 833, + 837, + 843, + 845, + 846, + 847, + 849, + 866, + 871, + 881, + 897, + 901, + 903, + 904, + 910, + 912, + 913, + 914, + 917, + 923, + 930, + 934, + 953, + 958, + 959, + 962, + 966, + 971, + 976, + 977, + 978, + 979, + 980, + 991, + 993, + 995, +] def get_task_ids(dataset_ids): # return task ids of corresponding datset ids. # active tasks - tasks_a = openml.tasks.list_tasks(task_type_id=1, status='active') + tasks_a = openml.tasks.list_tasks(task_type_id=1, status="active") tasks_a = pd.DataFrame.from_dict(tasks_a, orient="index") # query only those with holdout as the resampling startegy. tasks_a = tasks_a[(tasks_a.estimation_procedure == "33% Holdout set")] # deactivated tasks - tasks_d = openml.tasks.list_tasks(task_type_id=1, status='deactivated') + tasks_d = openml.tasks.list_tasks(task_type_id=1, status="deactivated") tasks_d = pd.DataFrame.from_dict(tasks_d, orient="index") tasks_d = tasks_d[(tasks_d.estimation_procedure == "33% Holdout set")] @@ -47,9 +179,9 @@ def get_task_ids(dataset_ids): def main(): task_ids = sorted(get_task_ids(dataset_ids)) - string_to_print = '' + string_to_print = "" for tid in task_ids: - string_to_print += str(tid) + ' ' + string_to_print += str(tid) + " " print(string_to_print) # print the task ids for bash script. 
diff --git a/scripts/run_auto-sklearn_for_metadata_generation.py b/scripts/run_auto-sklearn_for_metadata_generation.py index e1fc71a135..6b82b233c7 100644 --- a/scripts/run_auto-sklearn_for_metadata_generation.py +++ b/scripts/run_auto-sklearn_for_metadata_generation.py @@ -1,4 +1,4 @@ -if __name__ == '__main__': +if __name__ == "__main__": import argparse import json @@ -11,27 +11,35 @@ from autosklearn.classification import AutoSklearnClassifier from autosklearn.regression import AutoSklearnRegressor from autosklearn.evaluation import ExecuteTaFuncWithQueue, get_cost_of_crash - from autosklearn.metrics import accuracy, balanced_accuracy, roc_auc, log_loss, r2, \ - mean_squared_error, mean_absolute_error, root_mean_squared_error, CLASSIFICATION_METRICS, \ - REGRESSION_METRICS + from autosklearn.metrics import ( + accuracy, + balanced_accuracy, + roc_auc, + log_loss, + r2, + mean_squared_error, + mean_absolute_error, + root_mean_squared_error, + CLASSIFICATION_METRICS, + REGRESSION_METRICS, + ) from smac.runhistory.runhistory import RunInfo from smac.scenario.scenario import Scenario from smac.stats.stats import Stats from smac.tae import StatusType - sys.path.append('.') + sys.path.append(".") from update_metadata_util import load_task - parser = argparse.ArgumentParser() - parser.add_argument('--working-directory', type=str, required=True) - parser.add_argument('--time-limit', type=int, required=True) - parser.add_argument('--per-run-time-limit', type=int, required=True) - parser.add_argument('--task-id', type=int, required=True) - parser.add_argument('--metric', type=str, required=True) - parser.add_argument('-s', '--seed', type=int, required=True) - parser.add_argument('--unittest', action='store_true') + parser.add_argument("--working-directory", type=str, required=True) + parser.add_argument("--time-limit", type=int, required=True) + parser.add_argument("--per-run-time-limit", type=int, required=True) + parser.add_argument("--task-id", type=int, required=True) + parser.add_argument("--metric", type=str, required=True) + parser.add_argument("-s", "--seed", type=int, required=True) + parser.add_argument("--unittest", action="store_true") args = parser.parse_args() working_directory = args.working_directory @@ -44,8 +52,9 @@ X_train, y_train, X_test, y_test, cat, task_type, dataset_name = load_task(task_id) - configuration_output_dir = os.path.join(working_directory, 'configuration', - task_type) + configuration_output_dir = os.path.join( + working_directory, "configuration", task_type + ) os.makedirs(configuration_output_dir, exist_ok=True) tmp_dir = os.path.join(configuration_output_dir, str(task_id), metric) os.makedirs(tmp_dir, exist_ok=True) @@ -54,49 +63,55 @@ autosklearn_directory = os.path.join(tempdir, "dir") automl_arguments = { - 'time_left_for_this_task': time_limit, - 'per_run_time_limit': per_run_time_limit, - 'initial_configurations_via_metalearning': 0, - 'ensemble_size': 0, - 'ensemble_nbest': 0, - 'seed': seed, - 'memory_limit': 3072, - 'resampling_strategy': 'partial-cv', - 'delete_tmp_folder_after_terminate': False, - 'tmp_folder': autosklearn_directory, - 'disable_evaluator_output': True, + "time_left_for_this_task": time_limit, + "per_run_time_limit": per_run_time_limit, + "initial_configurations_via_metalearning": 0, + "ensemble_size": 0, + "ensemble_nbest": 0, + "seed": seed, + "memory_limit": 3072, + "resampling_strategy": "partial-cv", + "delete_tmp_folder_after_terminate": False, + "tmp_folder": autosklearn_directory, + "disable_evaluator_output": True, } if 
is_test: - automl_arguments['resampling_strategy_arguments'] = {'folds': 2} - if task_type == 'classification': - include = {'classifier': ['libsvm_svc'], 'feature_preprocessor': ['no_preprocessing']} - automl_arguments['include'] = include - elif task_type == 'regression': - include = {'regressor': ['extra_trees'], 'feature_preprocessor': ['no_preprocessing']} - automl_arguments['include'] = include + automl_arguments["resampling_strategy_arguments"] = {"folds": 2} + if task_type == "classification": + include = { + "classifier": ["libsvm_svc"], + "feature_preprocessor": ["no_preprocessing"], + } + automl_arguments["include"] = include + elif task_type == "regression": + include = { + "regressor": ["extra_trees"], + "feature_preprocessor": ["no_preprocessing"], + } + automl_arguments["include"] = include else: - raise ValueError('Unsupported task type: %s' % str(task_type)) + raise ValueError("Unsupported task type: %s" % str(task_type)) else: - automl_arguments['resampling_strategy_arguments'] = {'folds': 10} + automl_arguments["resampling_strategy_arguments"] = {"folds": 10} include = None metric = { - 'accuracy': accuracy, - 'balanced_accuracy': balanced_accuracy, - 'roc_auc': roc_auc, - 'logloss': log_loss, - 'r2': r2, - 'mean_squared_error': mean_squared_error, - 'root_mean_squared_error': root_mean_squared_error, - 'mean_absolute_error': mean_absolute_error, + "accuracy": accuracy, + "balanced_accuracy": balanced_accuracy, + "roc_auc": roc_auc, + "logloss": log_loss, + "r2": r2, + "mean_squared_error": mean_squared_error, + "root_mean_squared_error": root_mean_squared_error, + "mean_absolute_error": mean_absolute_error, }[metric] - automl_arguments['metric'] = metric + automl_arguments["metric"] = metric - if task_type == 'classification': + if task_type == "classification": automl = AutoSklearnClassifier(**automl_arguments) scorer_list = CLASSIFICATION_METRICS - elif task_type == 'regression': + elif task_type == "regression": automl = AutoSklearnRegressor(**automl_arguments) scorer_list = REGRESSION_METRICS else: @@ -104,8 +119,14 @@ scoring_functions = [scorer for name, scorer in scorer_list.items()] - automl.fit(X_train, y_train, dataset_name=dataset_name, - feat_type=cat, X_test=X_test, y_test=y_test) + automl.fit( + X_train, + y_train, + dataset_name=dataset_name, + feat_type=cat, + X_test=X_test, + y_test=y_test, + ) trajectory = automl.trajectory_ incumbent_id_to_model = {} @@ -117,40 +138,44 @@ else: memory_limit_factor = 2 - print('Starting to validate configurations') + print("Starting to validate configurations") for i, entry in enumerate(trajectory): - print('Starting to validate configuration %d/%d' % (i + 1, len(trajectory))) + print("Starting to validate configuration %d/%d" % (i + 1, len(trajectory))) incumbent_id = entry.incumbent_id train_performance = entry.train_perf if incumbent_id not in incumbent_id_to_model: config = entry.incumbent - logger = logging.getLogger('Testing:)') + logger = logging.getLogger("Testing:)") stats = Stats( - Scenario({ - 'cutoff_time': per_run_time_limit * 2, - 'run_obj': 'quality', - }) + Scenario( + { + "cutoff_time": per_run_time_limit * 2, + "run_obj": "quality", + } + ) ) stats.start_timing() # To avoid the output "first run crashed"... 
stats.submitted_ta_runs += 1 stats.finished_ta_runs += 1 - memory_lim = memory_limit_factor * automl_arguments['memory_limit'] + memory_lim = memory_limit_factor * automl_arguments["memory_limit"] pipeline, run_info, run_value = automl.fit_pipeline( - X=X_train, y=y_train, - X_test=X_test, y_test=y_test, - resampling_strategy='test', + X=X_train, + y=y_train, + X_test=X_test, + y_test=y_test, + resampling_strategy="test", memory_limit=memory_lim, disable_file_output=True, logger=logger, stats=stats, scoring_functions=scoring_functions, include=include, - metric=automl_arguments['metric'], - pynisher_context='spawn', - cutoff=per_run_time_limit*3, + metric=automl_arguments["metric"], + pynisher_context="spawn", + cutoff=per_run_time_limit * 3, config=config, ) @@ -159,58 +184,65 @@ # print(additional_run_info) - validated_trajectory.append(list(entry) + [task_id] + - [run_value.additional_info]) - print('Finished validating configuration %d/%d' % (i + 1, len(trajectory))) - print('Finished to validate configurations') - - print('Starting to copy data to configuration directory', flush=True) - validated_trajectory = [entry[:2] + [entry[2].get_dictionary()] + entry[3:] - for entry in validated_trajectory] - validated_trajectory_file = os.path.join(tmp_dir, 'validation_trajectory_%d.json' % seed) - with open(validated_trajectory_file, 'w') as fh: + validated_trajectory.append( + list(entry) + [task_id] + [run_value.additional_info] + ) + print("Finished validating configuration %d/%d" % (i + 1, len(trajectory))) + print("Finished to validate configurations") + + print("Starting to copy data to configuration directory", flush=True) + validated_trajectory = [ + entry[:2] + [entry[2].get_dictionary()] + entry[3:] + for entry in validated_trajectory + ] + validated_trajectory_file = os.path.join( + tmp_dir, "validation_trajectory_%d.json" % seed + ) + with open(validated_trajectory_file, "w") as fh: json.dump(validated_trajectory, fh, indent=4) - for dirpath, dirnames, filenames in os.walk(autosklearn_directory, topdown=False): print(dirpath, dirnames, filenames) for filename in filenames: - if filename == 'datamanager.pkl': + if filename == "datamanager.pkl": os.remove(os.path.join(dirpath, filename)) - elif filename == 'configspace.pcs': + elif filename == "configspace.pcs": os.remove(os.path.join(dirpath, filename)) for dirname in dirnames: - if dirname in ('models', 'cv_models'): + if dirname in ("models", "cv_models"): os.rmdir(os.path.join(dirpath, dirname)) - print('*' * 80) - print('Going to copy the configuration directory') - script = 'cp -r %s %s' % (autosklearn_directory, os.path.join(tmp_dir, 'auto-sklearn-output')) + print("*" * 80) + print("Going to copy the configuration directory") + script = "cp -r %s %s" % ( + autosklearn_directory, + os.path.join(tmp_dir, "auto-sklearn-output"), + ) proc = subprocess.run( script, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True, - executable='/bin/bash', + executable="/bin/bash", ) - print('*' * 80) + print("*" * 80) print(script) print(proc.stdout) print(proc.stderr) - print('Finished copying the configuration directory') + print("Finished copying the configuration directory") - if not tempdir.startswith('/tmp'): - raise ValueError('%s must not start with /tmp' % tempdir) - script = 'rm -rf %s' % tempdir - print('*' * 80) + if not tempdir.startswith("/tmp"): + raise ValueError("%s must not start with /tmp" % tempdir) + script = "rm -rf %s" % tempdir + print("*" * 80) print(script) proc = subprocess.run( script, 
stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True, - executable='/bin/bash', + executable="/bin/bash", ) print(proc.stdout) print(proc.stderr) - print('Finished configuring') + print("Finished configuring") diff --git a/scripts/update_metadata_util.py b/scripts/update_metadata_util.py index 153e63c6cf..8ed99d9bd0 100644 --- a/scripts/update_metadata_util.py +++ b/scripts/update_metadata_util.py @@ -3,37 +3,327 @@ classification_tasks = [ - 232, 236, 241, 245, 253, 254, 256, 258, 260, 262, 267, 271, 273, 275, 279, 288, 336, - 340, 2119, 2120, 2121, 2122, 2123, 2125, 2356, 3044, 3047, 3048, 3049, 3053, 3054, - 3055, 75089, 75092, 75093, 75098, 75100, 75108, 75109, 75112, 75114, 75115, 75116, - 75118, 75120, 75121, 75125, 75126, 75129, 75131, 75133, 75134, 75136, 75139, 75141, - 75142, 75143, 75146, 75147, 75148, 75149, 75153, 75154, 75156, 75157, 75159, 75161, - 75163, 75166, 75169, 75171, 75173, 75174, 75176, 75178, 75179, 75180, 75184, 75185, - 75187, 75192, 75195, 75196, 75199, 75210, 75212, 75213, 75215, 75217, 75219, 75221, - 75223, 75225, 75232, 75233, 75234, 75235, 75236, 75237, 75239, 75250, 126021, 126024, - 126028, 126030, 126031, 146574, 146575, 146576, 146577, 146578, 146583, 146586, - 146592, 146593, 146594, 146596, 146597, 146600, 146601, 146602, 146603, 146679, - 166859, 166866, 166872, 166875, 166882, 166897, 166905, 166906, 166913, 166915, - 166931, 166932, 166944, 166950, 166951, 166953, 166956, 166957, 166958, 166959, - 166970, 166996, 167085, 167086, 167087, 167088, 167089, 167090, 167094, 167096, - 167097, 167099, 167100, 167101, 167103, 167105, 167106, 167202, 167203, 167204, - 167205, 168785, 168791, 189779, 189786, 189828, 189829, 189836, 189840, 189841, - 189843, 189844, 189845, 189846, 189857, 189858, 189859, 189863, 189864, 189869, - 189870, 189875, 189878, 189880, 189881, 189882, 189883, 189884, 189887, 189890, - 189893, 189894, 189899, 189900, 189902, 190154, 190155, 190156, 190157, 190158, - 190159, 211720, 211721, 211722, 211723, 211724 + 232, + 236, + 241, + 245, + 253, + 254, + 256, + 258, + 260, + 262, + 267, + 271, + 273, + 275, + 279, + 288, + 336, + 340, + 2119, + 2120, + 2121, + 2122, + 2123, + 2125, + 2356, + 3044, + 3047, + 3048, + 3049, + 3053, + 3054, + 3055, + 75089, + 75092, + 75093, + 75098, + 75100, + 75108, + 75109, + 75112, + 75114, + 75115, + 75116, + 75118, + 75120, + 75121, + 75125, + 75126, + 75129, + 75131, + 75133, + 75134, + 75136, + 75139, + 75141, + 75142, + 75143, + 75146, + 75147, + 75148, + 75149, + 75153, + 75154, + 75156, + 75157, + 75159, + 75161, + 75163, + 75166, + 75169, + 75171, + 75173, + 75174, + 75176, + 75178, + 75179, + 75180, + 75184, + 75185, + 75187, + 75192, + 75195, + 75196, + 75199, + 75210, + 75212, + 75213, + 75215, + 75217, + 75219, + 75221, + 75223, + 75225, + 75232, + 75233, + 75234, + 75235, + 75236, + 75237, + 75239, + 75250, + 126021, + 126024, + 126028, + 126030, + 126031, + 146574, + 146575, + 146576, + 146577, + 146578, + 146583, + 146586, + 146592, + 146593, + 146594, + 146596, + 146597, + 146600, + 146601, + 146602, + 146603, + 146679, + 166859, + 166866, + 166872, + 166875, + 166882, + 166897, + 166905, + 166906, + 166913, + 166915, + 166931, + 166932, + 166944, + 166950, + 166951, + 166953, + 166956, + 166957, + 166958, + 166959, + 166970, + 166996, + 167085, + 167086, + 167087, + 167088, + 167089, + 167090, + 167094, + 167096, + 167097, + 167099, + 167100, + 167101, + 167103, + 167105, + 167106, + 167202, + 167203, + 167204, + 167205, + 168785, + 168791, + 189779, + 189786, + 189828, + 
189829, + 189836, + 189840, + 189841, + 189843, + 189844, + 189845, + 189846, + 189857, + 189858, + 189859, + 189863, + 189864, + 189869, + 189870, + 189875, + 189878, + 189880, + 189881, + 189882, + 189883, + 189884, + 189887, + 189890, + 189893, + 189894, + 189899, + 189900, + 189902, + 190154, + 190155, + 190156, + 190157, + 190158, + 190159, + 211720, + 211721, + 211722, + 211723, + 211724, ] regression_tasks = [ - 359997, 359998, 359999, 360000, 360001, 360002, 360003, 167146, 360004, 360005, 360006, - 360007, 211696, 360009, 360010, 360011, 360012, 360013, 360014, 360015, 360016, 360017, - 360018, 360019, 360020, 360021, 360022, 360023, 360024, 360025, 360026, 360027, 360028, - 360029, 360030, 360031, 360032, 360033, 360034, 360035, 360036, 360037, 360038, 360039, - 360040, 360041, 360042, 360043, 360044, 360045, 360046, 360047, 360048, 360049, 360050, - 360051, 360052, 360053, 360054, 360055, 360056, 360057, 360058, 360059, 360060, 360061, - 360062, 360063, 360064, 360066, 360067, 360068, 360069, 360070, 360071, 360072, 360073, - 360074, 360075, 360076, 360077, 360078, 360079, 360080, 360081, 360082, 360083, 360084, - 360085, 360086, 360087, 360088, 360089, 360090, 360091, 360092, 360093, 360094, 360095, - 360096, 360097, 360098, 360100, 360101, 360102, 360103, 360104, 360105, 360106, 360107, + 359997, + 359998, + 359999, + 360000, + 360001, + 360002, + 360003, + 167146, + 360004, + 360005, + 360006, + 360007, + 211696, + 360009, + 360010, + 360011, + 360012, + 360013, + 360014, + 360015, + 360016, + 360017, + 360018, + 360019, + 360020, + 360021, + 360022, + 360023, + 360024, + 360025, + 360026, + 360027, + 360028, + 360029, + 360030, + 360031, + 360032, + 360033, + 360034, + 360035, + 360036, + 360037, + 360038, + 360039, + 360040, + 360041, + 360042, + 360043, + 360044, + 360045, + 360046, + 360047, + 360048, + 360049, + 360050, + 360051, + 360052, + 360053, + 360054, + 360055, + 360056, + 360057, + 360058, + 360059, + 360060, + 360061, + 360062, + 360063, + 360064, + 360066, + 360067, + 360068, + 360069, + 360070, + 360071, + 360072, + 360073, + 360074, + 360075, + 360076, + 360077, + 360078, + 360079, + 360080, + 360081, + 360082, + 360083, + 360084, + 360085, + 360086, + 360087, + 360088, + 360089, + 360090, + 360091, + 360092, + 360093, + 360094, + 360095, + 360096, + 360097, + 360098, + 360100, + 360101, + 360102, + 360103, + 360104, + 360105, + 360106, + 360107, 360108, ] @@ -51,13 +341,13 @@ def load_task(task_id): name = dataset.name.lower() del _ del dataset - cat = {i: 'categorical' if c else 'numerical' for i, c in enumerate(cat)} + cat = {i: "categorical" if c else "numerical" for i, c in enumerate(cat)} if isinstance(task, openml.tasks.OpenMLClassificationTask): - task_type = 'classification' + task_type = "classification" elif isinstance(task, openml.tasks.OpenMLRegressionTask): - task_type = 'regression' + task_type = "regression" else: - raise ValueError('Unknown task type') + raise ValueError("Unknown task type") return X_train, y_train, X_test, y_test, cat, task_type, name diff --git a/setup.py b/setup.py index 6107e60321..003b573bd4 100644 --- a/setup.py +++ b/setup.py @@ -1,38 +1,46 @@ # -*- encoding: utf-8 -*- import os import sys -from setuptools import setup, find_packages +from setuptools import find_packages, setup + +HERE = os.path.abspath(os.path.dirname(__file__)) # Check if Auto-sklearn *could* run on the given system -if os.name != 'posix': +if os.name != "posix": raise ValueError( - 'Detected unsupported operating system: %s. 
Please check ' - 'the compability information of auto-sklearn: https://automl.github.io' - '/auto-sklearn/master/installation.html#windows-osx-compatibility' % - sys.platform + "Detected unsupported operating system: %s. Please check " + "the compability information of auto-sklearn: https://automl.github.io" + "/auto-sklearn/master/installation.html#windows-osx-compatibility" + % sys.platform ) if sys.version_info < (3, 7): raise ValueError( - 'Unsupported Python version %d.%d.%d found. Auto-sklearn requires Python ' - '3.7 or higher.' % (sys.version_info.major, sys.version_info.minor, sys.version_info.micro) + "Unsupported Python version %d.%d.%d found. Auto-sklearn requires Python " + "3.7 or higher." + % (sys.version_info.major, sys.version_info.minor, sys.version_info.micro) ) -HERE = os.path.abspath(os.path.dirname(__file__)) -with open(os.path.join(HERE, 'requirements.txt')) as fp: - install_reqs = [r.rstrip() for r in fp.readlines() - if not r.startswith('#') and not r.startswith('git+')] +with open(os.path.join(HERE, "requirements.txt")) as fp: + install_reqs = [ + r.rstrip() + for r in fp.readlines() + if not r.startswith("#") and not r.startswith("git+") + ] -extras_reqs={ +extras_reqs = { "test": [ "pytest>=4.6", - "mypy", + "pytest-cov", "pytest-xdist", "pytest-timeout", + "mypy", + "isort", + "black", + "pydocstyle", "openml", "pre-commit", - "pytest-cov", ], "examples": [ "matplotlib", @@ -46,32 +54,32 @@ "sphinx_bootstrap_theme", "numpydoc", "sphinx_toolbox", - "docutils==0.16" + "docutils==0.16", ], } -with open(os.path.join(HERE, 'autosklearn', '__version__.py')) as fh: +with open(os.path.join(HERE, "autosklearn", "__version__.py")) as fh: version = fh.readlines()[-1].split()[-1].strip("\"'") -with open(os.path.join(HERE, 'README.md')) as fh: +with open(os.path.join(HERE, "README.md")) as fh: long_description = fh.read() setup( - name='auto-sklearn', - author='Matthias Feurer', - author_email='feurerm@informatik.uni-freiburg.de', - description='Automated machine learning.', + name="auto-sklearn", + author="Matthias Feurer", + author_email="feurerm@informatik.uni-freiburg.de", + description="Automated machine learning.", long_description=long_description, - long_description_content_type='text/markdown', + long_description_content_type="text/markdown", version=version, - packages=find_packages(exclude=['test', 'scripts', 'examples']), + packages=find_packages(exclude=["test", "scripts", "examples"]), extras_require=extras_reqs, install_requires=install_reqs, include_package_data=True, - license='BSD3', - platforms=['Linux'], + license="BSD3", + platforms=["Linux"], classifiers=[ "Environment :: Console", "Intended Audience :: Developers", @@ -83,10 +91,10 @@ "Operating System :: OS Independent", "Topic :: Scientific/Engineering :: Artificial Intelligence", "Topic :: Scientific/Engineering :: Information Analysis", - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", ], - python_requires='>=3.7', - url='https://automl.github.io/auto-sklearn', + python_requires=">=3.7", + url="https://automl.github.io/auto-sklearn", ) diff --git a/test/conftest.py b/test/conftest.py index d3df7508cd..16a285b9df 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -3,12 +3,12 @@ import time import unittest.mock -from dask.distributed import Client, get_client import psutil import pytest 
+from dask.distributed import Client, get_client -from autosklearn.automl_common.common.utils.backend import create, Backend from autosklearn.automl import AutoML +from autosklearn.automl_common.common.utils.backend import Backend, create class AutoMLStub(AutoML): @@ -36,9 +36,11 @@ def automl_stub(request): def backend(request): test_dir = os.path.dirname(__file__) - tmp = os.path.join(test_dir, '.tmp__%s__%s' % (request.module.__name__, request.node.name)) + tmp = os.path.join( + test_dir, ".tmp__%s__%s" % (request.module.__name__, request.node.name) + ) - for dir in (tmp, ): + for dir in (tmp,): for i in range(10): if os.path.exists(dir): try: @@ -49,14 +51,12 @@ def backend(request): # Make sure the folders we wanna create do not already exist. backend = create( - temporary_directory=tmp, - output_directory=None, - prefix="auto-sklearn" + temporary_directory=tmp, output_directory=None, prefix="auto-sklearn" ) def get_finalizer(tmp_dir): def session_run_at_end(): - for dir in (tmp_dir, ): + for dir in (tmp_dir,): for i in range(10): if os.path.exists(dir): try: @@ -64,7 +64,9 @@ def session_run_at_end(): break except OSError: time.sleep(1) + return session_run_at_end + request.addfinalizer(get_finalizer(tmp)) return backend @@ -72,7 +74,7 @@ def session_run_at_end(): @pytest.fixture(scope="function") def tmp_dir(request): - return _dir_fixture('tmp', request) + return _dir_fixture("tmp", request) def _dir_fixture(dir_type, request): @@ -124,8 +126,10 @@ def session_run_at_end(): client.shutdown() client.close() del client + return session_run_at_end - request.addfinalizer(get_finalizer(client.scheduler_info()['address'])) + + request.addfinalizer(get_finalizer(client.scheduler_info()["address"])) return client @@ -149,8 +153,10 @@ def session_run_at_end(): client.shutdown() client.close() del client + return session_run_at_end - request.addfinalizer(get_finalizer(client.scheduler_info()['address'])) + + request.addfinalizer(get_finalizer(client.scheduler_info()["address"])) return client diff --git a/test/test_automl/automl_utils.py b/test/test_automl/automl_utils.py index 768f94ff8d..577ea97359 100644 --- a/test/test_automl/automl_utils.py +++ b/test/test_automl/automl_utils.py @@ -1,17 +1,17 @@ # -*- encoding: utf-8 -*- -import re -import os -import glob import typing -import numpy as np +import glob +import os +import re +import numpy as np scores_dict = { - 'train_single': ["single_best_train_score", "single_best_optimization_score"], - 'test_single': ["single_best_test_score"], - 'train_ensamble': ["ensemble_optimization_score"], - 'test_ensamble': ["ensemble_test_score"] + "train_single": ["single_best_train_score", "single_best_optimization_score"], + "test_single": ["single_best_test_score"], + "train_ensamble": ["ensemble_optimization_score"], + "test_ensamble": ["ensemble_test_score"], } @@ -19,15 +19,15 @@ def print_debug_information(automl): # In case it is called with estimator, # Get the automl object - if hasattr(automl, 'automl_'): + if hasattr(automl, "automl_"): automl = automl.automl_ # Log file path - log_file = glob.glob(os.path.join( - automl._backend.temporary_directory, 'AutoML*.log'))[0] + log_file = glob.glob( + os.path.join(automl._backend.temporary_directory, "AutoML*.log") + )[0] - include_messages = ['INFO', 'DEBUG', 'WARN', - 'CRITICAL', 'ERROR', 'FATAL'] + include_messages = ["INFO", "DEBUG", "WARN", "CRITICAL", "ERROR", "FATAL"] # There is a lot of content in the log files. 
Only # parsing the main message and ignore the metalearning @@ -37,53 +37,69 @@ def print_debug_information(automl): content = logfile.readlines() # Get the messages to debug easier! - content = [line for line in content if any( - msg in line for msg in include_messages - ) and 'metalearning' not in line] + content = [ + line + for line in content + if any(msg in line for msg in include_messages) + and "metalearning" not in line + ] except Exception as e: return str(e) # Also add the run history if any - if hasattr(automl, 'runhistory_') and hasattr(automl.runhistory_, 'data'): + if hasattr(automl, "runhistory_") and hasattr(automl.runhistory_, "data"): for k, v in automl.runhistory_.data.items(): content += ["{}->{}".format(k, v)] else: - content += ['No RunHistory'] + content += ["No RunHistory"] # Also add the ensemble history if any if len(automl.ensemble_performance_history) > 0: content += [str(h) for h in automl.ensemble_performance_history] else: - content += ['No Ensemble History'] + content += ["No Ensemble History"] return os.linesep.join(content) def _includes(scores, all_scores): - return all(score in all_scores for score in scores) and len(scores) == len(all_scores) + return all(score in all_scores for score in scores) and len(scores) == len( + all_scores + ) def count_succeses(cv_results): return np.sum( - [status in ['Success', 'Success (but do not advance to higher budget)'] - for status in cv_results['status']] + [ + status in ["Success", "Success (but do not advance to higher budget)"] + for status in cv_results["status"] + ] ) def includes_all_scores(scores): - all_scores = scores_dict["train_single"] + scores_dict["test_single"] + \ - scores_dict["train_ensamble"] + scores_dict["test_ensamble"] + ["Timestamp"] + all_scores = ( + scores_dict["train_single"] + + scores_dict["test_single"] + + scores_dict["train_ensamble"] + + scores_dict["test_ensamble"] + + ["Timestamp"] + ) return _includes(scores, all_scores) def include_single_scores(scores): - all_scores = scores_dict["train_single"] + scores_dict["test_single"] + ["Timestamp"] + all_scores = ( + scores_dict["train_single"] + scores_dict["test_single"] + ["Timestamp"] + ) return _includes(scores, all_scores) def includes_train_scores(scores): - all_scores = scores_dict["train_single"] + scores_dict["train_ensamble"] + ["Timestamp"] + all_scores = ( + scores_dict["train_single"] + scores_dict["train_ensamble"] + ["Timestamp"] + ) return _includes(scores, all_scores) @@ -113,7 +129,7 @@ def parse_logfile(self) -> typing.List[str]: assert os.path.exists(self.logfile), "{} not found".format(self.logfile) with open(self.logfile) as fh: - content = [line.strip() for line in fh if re.search(r'[\w+]', line)] + content = [line.strip() for line in fh if re.search(r"[\w+]", line)] return content def count_ensembler_iterations(self) -> int: @@ -129,11 +145,12 @@ def count_ensembler_iterations(self) -> int: # We expect the start msg to be something like: # [DEBUG] [2020-11-26 19:22:42,160:EnsembleBuilder] \ # Function called with argument: (61.... 
- # [DEBUG] [2020-11-30 11:53:47,069:EnsembleBuilder] Function called with argument: - # (28.246965646743774, 1, False), {} + # [DEBUG] [2020-11-30 11:53:47,069:EnsembleBuilder] \ + # Function called with argument: (28.246965646743774, 1, False), {} match = re.search( - r'EnsembleBuilder]\s+Function called with argument:\s+\(\d+\.\d+, (\d+), \w+', - line) + r"EnsembleBuilder]\s+Function called with argument:\s+\(\d+\.\d+, (\d+), \w+", # noqa: E501 + line, + ) if match: iterations.append(int(match.group(1))) @@ -143,19 +160,15 @@ def count_ensembler_iterations(self) -> int: # time left: 61.266255 # [DEBUG] [2020-11-27 20:27:28,044:EnsembleBuilder] Starting iteration 2, # time left: 10.603252 - match = re.search( - r'EnsembleBuilder]\s+Starting iteration (\d+)', - line) + match = re.search(r"EnsembleBuilder]\s+Starting iteration (\d+)", line) if match: iterations_from_inside_ensemble_builder.append(int(match.group(1))) # The ensemble builder might not be called if there is no time. # Here we expect the msg: - # [DEBUG] [2020-11-27 20:27:28,044:EnsembleBuilder] Not starting iteration 2, - # as time left: 1.59324 - match = re.search( - r'EnsembleBuilder]\s+Not starting iteration (\d+)', - line) + # [DEBUG] [2020-11-27 20:27:28,044:EnsembleBuilder] \ + # Not starting iteration 2, as time left: 1.59324 + match = re.search(r"EnsembleBuilder]\s+Not starting iteration (\d+)", line) if match: iterations_from_inside_ensemble_builder.append(int(match.group(1))) @@ -174,49 +187,80 @@ def count_ensembler_success_pynisher_calls(self) -> int: # [DEBUG] [2020-11-30 11:54:05,984:EnsembleBuilder] return value: # (([{'Timestamp': Timestamp('2020-11- 30 11:54:05.983837'), # 'ensemble_optimization_score': 0.9787234042553191}], 50, None, None, None), 0) - return_msgs = len([line for line in self.lines if re.search( - r'EnsembleBuilder]\s+return value:.*Timestamp', line)]) + return_msgs = len( + [ + line + for line in self.lines + if re.search(r"EnsembleBuilder]\s+return value:.*Timestamp", line) + ] + ) return return_msgs def count_tae_pynisher_calls(self) -> int: # We expect the return msg to be something like: - # [DEBUG] [2020-12-16 11:57:08,987:Client-pynisher] Function called with argument: () - # , {'queue': , 'config': 1 - # [DEBUG] [2020-12-16 11:57:10,537:Client-pynisher] Function called with argument: () - # , {'queue': , - # 'config': Configuration: + """ + [DEBUG] [2020-12-16 11:57:08,987:Client-pynisher] Function called with argument: (), + {'queue': , 'config': 1 + [DEBUG] [2020-12-16 11:57:10,537:Client-pynisher] Function called with argument: (), + {'queue': , 'config': Configuration: + """ # noqa: E501 # Only the parenthesis below need to be escaped, ] and { do not. 
- call_msgs = len([line for line in self.lines if re.search( - r'pynisher]\s+Function called with argument: \(\), {', line)]) + call_msgs = len( + [ + line + for line in self.lines + if re.search( + r"pynisher]\s+Function called with argument: \(\), {", line + ) + ] + ) return call_msgs def count_tae_pynisher_returns(self) -> int: # We expect the return msg to be something like: # [DEBUG] [2020-11-30 11:53:11,264:pynisher] return value: (None, 0) # [DEBUG] [2020-11-30 11:53:13,768:pynisher] return value: (None, 0) - return_msgs = len([line for line in self.lines if re.search( - r'pynisher]\s+return value:\s+', line)]) + return_msgs = len( + [ + line + for line in self.lines + if re.search(r"pynisher]\s+return value:\s+", line) + ] + ) # When the pynisher pipe is prematurely closed, we also expect: # Your function call closed the pipe prematurely # -> Subprocess probably got an uncatchable signal # We expect the return msg to be something like: # OR # Something else went wrong, sorry. - premature_msgs = len([line for line in self.lines if re.search( - r'pynisher]\s+Your function call closed the pipe prematurely', line)]) - failure_msgs = len([line for line in self.lines if re.search( - r'pynisher]\s+Something else went wrong, sorry.', line)]) + premature_msgs = len( + [ + line + for line in self.lines + if re.search( + r"pynisher]\s+Your function call closed the pipe prematurely", line + ) + ] + ) + failure_msgs = len( + [ + line + for line in self.lines + if re.search(r"pynisher]\s+Something else went wrong, sorry.", line) + ] + ) return return_msgs + premature_msgs + failure_msgs def get_automl_setting_from_log(self, dataset_name: str, setting: str) -> str: for line in self.lines: # We expect messages of the form - # [DEBUG] [2020-11-30 11:53:10,457:AutoML(5):breast_cancer] ensemble_size: 50 - # [DEBUG] [2020-11-30 11:53:10,457:AutoML(5):breast_cancer] ensemble_nbest: 50 - match = re.search( - f"{dataset_name}]\\s*{setting}\\s*:\\s*(\\w+)", - line) + """ + [DEBUG] [2020-11-30 11:53:10,457:AutoML(5):breast_cancer] ensemble_size: 50 + [DEBUG] [2020-11-30 11:53:10,457:AutoML(5):breast_cancer] ensemble_nbest: 50 + """ # noqa: E501 + match = re.search(f"{dataset_name}]\\s*{setting}\\s*:\\s*(\\w+)", line) if match: return match.group(1) + return None diff --git a/test/test_automl/test_automl.py b/test/test_automl/test_automl.py index 4e509d8755..37040f0560 100644 --- a/test/test_automl/test_automl.py +++ b/test/test_automl/test_automl.py @@ -1,12 +1,11 @@ # -*- encoding: utf-8 -*- from typing import Dict, List, Union +import glob import itertools import os import pickle -import sys import time -import glob import unittest import unittest.mock import warnings @@ -14,35 +13,46 @@ import numpy as np import pandas as pd import pytest -from scipy.sparse import csr_matrix, spmatrix import sklearn.datasets -from sklearn.ensemble import VotingRegressor, VotingClassifier -from smac.scenario.scenario import Scenario +from scipy.sparse import csr_matrix, spmatrix +from sklearn.ensemble import VotingClassifier, VotingRegressor from smac.facade.roar_facade import ROAR +from smac.scenario.scenario import Scenario +from smac.tae import StatusType -from autosklearn.automl import AutoML, AutoMLClassifier, AutoMLRegressor, _model_predict -from autosklearn.data.validation import InputValidator import autosklearn.automl -from autosklearn.data.xy_data_manager import XYDataManager -from autosklearn.metrics import ( - accuracy, log_loss, balanced_accuracy, default_metric_for_task -) -from 
autosklearn.evaluation.abstract_evaluator import MyDummyClassifier, MyDummyRegressor -from autosklearn.util.data import default_dataset_compression_arg -from autosklearn.util.logging_ import PickableLoggerAdapter import autosklearn.pipeline.util as putil +from autosklearn.automl import AutoML, AutoMLClassifier, AutoMLRegressor, _model_predict from autosklearn.constants import ( - MULTICLASS_CLASSIFICATION, BINARY_CLASSIFICATION, + CLASSIFICATION_TASKS, + MULTICLASS_CLASSIFICATION, MULTILABEL_CLASSIFICATION, - REGRESSION, MULTIOUTPUT_REGRESSION, - CLASSIFICATION_TASKS, + REGRESSION, ) -from smac.tae import StatusType +from autosklearn.data.validation import InputValidator +from autosklearn.data.xy_data_manager import XYDataManager +from autosklearn.evaluation.abstract_evaluator import ( + MyDummyClassifier, + MyDummyRegressor, +) +from autosklearn.metrics import ( + accuracy, + balanced_accuracy, + default_metric_for_task, + log_loss, +) +from autosklearn.util.data import default_dataset_compression_arg +from autosklearn.util.logging_ import PickableLoggerAdapter -sys.path.append(os.path.dirname(__file__)) -from automl_utils import print_debug_information, count_succeses, AutoMLLogParser, includes_all_scores, includes_train_scores, performance_over_time_is_plausible # noqa (E402: module level import not at top of file) +from test.test_automl.automl_utils import ( + AutoMLLogParser, + count_succeses, + includes_train_scores, + performance_over_time_is_plausible, + print_debug_information, +) class AutoMLStub(AutoML): @@ -57,7 +67,7 @@ def __del__(self): def test_fit(dask_client): - X_train, Y_train, X_test, Y_test = putil.get_dataset('iris') + X_train, Y_train, X_test, Y_test = putil.get_dataset("iris") automl = autosklearn.automl.AutoML( seed=0, time_left_for_this_task=30, @@ -80,13 +90,7 @@ def test_fit(dask_client): def test_fit_roar(dask_client_single_worker): def get_roar_object_callback( - scenario_dict, - seed, - ta, - ta_kwargs, - dask_client, - n_jobs, - **kwargs + scenario_dict, seed, ta, ta_kwargs, dask_client, n_jobs, **kwargs ): """Random online adaptive racing. 
@@ -101,7 +105,7 @@ def get_roar_object_callback( n_jobs=n_jobs, ) - X_train, Y_train, X_test, Y_test = putil.get_dataset('iris') + X_train, Y_train, X_test, Y_test = putil.get_dataset("iris") automl = autosklearn.automl.AutoML( time_left_for_this_task=30, per_run_time_limit=5, @@ -126,8 +130,7 @@ def test_refit_shuffle_on_fail(dask_client): failing_model = unittest.mock.Mock() failing_model.fit.side_effect = [ValueError(), ValueError(), None] - failing_model.fit_transformer.side_effect = [ - ValueError(), ValueError(), (None, {})] + failing_model.fit_transformer.side_effect = [ValueError(), ValueError(), (None, {})] failing_model.get_max_iter.return_value = 100 auto = AutoML(30, 5, dask_client=dask_client) @@ -135,7 +138,7 @@ def test_refit_shuffle_on_fail(dask_client): ensemble_mock.get_selected_model_identifiers.return_value = [(1, 1, 50.0)] auto.ensemble_ = ensemble_mock auto.InputValidator = InputValidator() - for budget_type in [None, 'iterations']: + for budget_type in [None, "iterations"]: auto._budget_type = budget_type auto.models_ = {(1, 1, 50.0): failing_model} @@ -153,12 +156,11 @@ def test_refit_shuffle_on_fail(dask_client): def test_only_loads_ensemble_models(automl_stub): - def side_effect(ids, *args, **kwargs): return models if ids is identifiers else {} # Add a resampling strategy as this is required by load_models - automl_stub._resampling_strategy = 'holdout' + automl_stub._resampling_strategy = "holdout" identifiers = [(1, 2), (3, 4)] models = [42] @@ -171,7 +173,7 @@ def side_effect(ids, *args, **kwargs): assert models == automl_stub.models_ assert automl_stub.cv_models_ is None - automl_stub._resampling_strategy = 'cv' + automl_stub._resampling_strategy = "cv" models = [42] automl_stub._backend.load_cv_models_by_identifiers.side_effect = side_effect @@ -192,7 +194,7 @@ def test_check_for_models_if_no_ensemble(automl_stub): def test_raises_if_no_models(automl_stub): automl_stub._backend.load_ensemble.return_value = None automl_stub._backend.list_all_models.return_value = [] - automl_stub._resampling_strategy = 'holdout' + automl_stub._resampling_strategy = "holdout" automl_stub._disable_evaluator_output = False with pytest.raises(ValueError): @@ -205,7 +207,7 @@ def test_raises_if_no_models(automl_stub): def test_delete_non_candidate_models(dask_client): seed = 555 - X, Y, _, _ = putil.get_dataset('iris') + X, Y, _, _ = putil.get_dataset("iris") automl = autosklearn.automl.AutoML( delete_tmp_folder_after_terminate=False, time_left_for_this_task=60, @@ -213,11 +215,8 @@ def test_delete_non_candidate_models(dask_client): ensemble_nbest=3, seed=seed, initial_configurations_via_metalearning=0, - resampling_strategy='holdout', - include={ - 'classifier': ['sgd'], - 'feature_preprocessor': ['no_preprocessing'] - }, + resampling_strategy="holdout", + include={"classifier": ["sgd"], "feature_preprocessor": ["no_preprocessing"]}, metric=accuracy, dask_client=dask_client, # Force model to be deleted. 
That is, from 50 which is the @@ -229,23 +228,31 @@ def test_delete_non_candidate_models(dask_client): # Assert at least one model file has been deleted and that there were no # deletion errors - log_file_path = glob.glob(os.path.join( - automl._backend.temporary_directory, 'AutoML(' + str(seed) + '):*.log')) + log_file_path = glob.glob( + os.path.join( + automl._backend.temporary_directory, "AutoML(" + str(seed) + "):*.log" + ) + ) with open(log_file_path[0]) as log_file: log_content = log_file.read() - assert 'Deleted files of non-candidate model' in log_content, log_content - assert 'Failed to delete files of non-candidate model' not in log_content, log_content - assert 'Failed to lock model' not in log_content, log_content + assert "Deleted files of non-candidate model" in log_content, log_content + assert ( + "Failed to delete files of non-candidate model" not in log_content + ), log_content + assert "Failed to lock model" not in log_content, log_content # Assert that the files of the models used by the ensemble weren't deleted model_files = automl._backend.list_all_models(seed=seed) model_files_idx = set() for m_file in model_files: # Extract the model identifiers from the filename - m_file = os.path.split(m_file)[1].replace('.model', '').split('.', 2) + m_file = os.path.split(m_file)[1].replace(".model", "").split(".", 2) model_files_idx.add((int(m_file[0]), int(m_file[1]), float(m_file[2]))) ensemble_members_idx = set(automl.ensemble_.identifiers_) - assert ensemble_members_idx.issubset(model_files_idx), (ensemble_members_idx, model_files_idx) + assert ensemble_members_idx.issubset(model_files_idx), ( + ensemble_members_idx, + model_files_idx, + ) del automl @@ -257,17 +264,23 @@ def test_binary_score_and_include(dask_client): """ data = sklearn.datasets.make_classification( - n_samples=400, n_features=10, n_redundant=1, n_informative=3, - n_repeated=1, n_clusters_per_class=2, random_state=1) + n_samples=400, + n_features=10, + n_redundant=1, + n_informative=3, + n_repeated=1, + n_clusters_per_class=2, + random_state=1, + ) X_train = data[0][:200] Y_train = data[1][:200] X_test = data[0][200:] Y_test = data[1][200:] automl = autosklearn.automl.AutoML( - 20, 5, - include={'classifier': ['sgd'], - 'feature_preprocessor': ['no_preprocessing']}, + 20, + 5, + include={"classifier": ["sgd"], "feature_preprocessor": ["no_preprocessing"]}, metric=accuracy, dask_client=dask_client, ) @@ -286,10 +299,11 @@ def test_binary_score_and_include(dask_client): def test_automl_outputs(dask_client): - X_train, Y_train, X_test, Y_test = putil.get_dataset('iris') - name = 'iris' + X_train, Y_train, X_test, Y_test = putil.get_dataset("iris") + name = "iris" auto = autosklearn.automl.AutoML( - 30, 5, + 30, + 5, initial_configurations_via_metalearning=0, seed=100, metric=accuracy, @@ -307,59 +321,70 @@ def test_automl_outputs(dask_client): ) data_manager_file = os.path.join( - auto._backend.temporary_directory, - '.auto-sklearn', - 'datamanager.pkl' + auto._backend.temporary_directory, ".auto-sklearn", "datamanager.pkl" ) # pickled data manager (without one hot encoding!) 
- with open(data_manager_file, 'rb') as fh: + with open(data_manager_file, "rb") as fh: D = pickle.load(fh) - assert np.allclose(D.data['X_train'], X_train) + assert np.allclose(D.data["X_train"], X_train) # Check that all directories are there fixture = [ - 'true_targets_ensemble.npy', - 'start_time_100', - 'datamanager.pkl', - 'ensemble_read_preds.pkl', - 'ensemble_read_losses.pkl', - 'runs', - 'ensembles', - 'ensemble_history.json', + "true_targets_ensemble.npy", + "start_time_100", + "datamanager.pkl", + "ensemble_read_preds.pkl", + "ensemble_read_losses.pkl", + "runs", + "ensembles", + "ensemble_history.json", ] - assert ( - sorted(os.listdir(os.path.join(auto._backend.temporary_directory, - '.auto-sklearn'))) - == sorted(fixture) - ) + assert sorted( + os.listdir(os.path.join(auto._backend.temporary_directory, ".auto-sklearn")) + ) == sorted(fixture) # At least one ensemble, one validation, one test prediction and one # model and one ensemble - fixture = glob.glob(os.path.join( - auto._backend.temporary_directory, - '.auto-sklearn', 'runs', '*', 'predictions_ensemble*npy', - )) + fixture = glob.glob( + os.path.join( + auto._backend.temporary_directory, + ".auto-sklearn", + "runs", + "*", + "predictions_ensemble*npy", + ) + ) assert len(fixture) > 0 - fixture = glob.glob(os.path.join(auto._backend.temporary_directory, '.auto-sklearn', - 'runs', '*', '100.*.model')) + fixture = glob.glob( + os.path.join( + auto._backend.temporary_directory, + ".auto-sklearn", + "runs", + "*", + "100.*.model", + ) + ) assert len(fixture) > 0 - fixture = os.listdir(os.path.join(auto._backend.temporary_directory, - '.auto-sklearn', 'ensembles')) - assert '100.0000000000.ensemble' in fixture + fixture = os.listdir( + os.path.join(auto._backend.temporary_directory, ".auto-sklearn", "ensembles") + ) + assert "100.0000000000.ensemble" in fixture # Start time - start_time_file_path = os.path.join(auto._backend.temporary_directory, - '.auto-sklearn', "start_time_100") - with open(start_time_file_path, 'r') as fh: + start_time_file_path = os.path.join( + auto._backend.temporary_directory, ".auto-sklearn", "start_time_100" + ) + with open(start_time_file_path, "r") as fh: start_time = float(fh.read()) assert time.time() - start_time >= 10, print_debug_information(auto) # Then check that the logger matches the run expectation - logfile = glob.glob(os.path.join( - auto._backend.temporary_directory, 'AutoML*.log'))[0] + logfile = glob.glob(os.path.join(auto._backend.temporary_directory, "AutoML*.log"))[ + 0 + ] parser = AutoMLLogParser(logfile) # The number of ensemble trajectories properly in log file @@ -381,42 +406,61 @@ def test_automl_outputs(dask_client): # Dummy not in run history total_calls_to_pynisher_log = parser.count_tae_pynisher_calls() - 1 total_returns_from_pynisher_log = parser.count_tae_pynisher_returns() - 1 - total_elements_rh = len([run_value for run_value in auto.runhistory_.data.values( - ) if run_value.status == StatusType.RUNNING]) + total_elements_rh = len( + [ + run_value + for run_value in auto.runhistory_.data.values() + if run_value.status == StatusType.RUNNING + ] + ) # Make sure we register all calls to pynisher # The less than or equal here is added as a WA as # https://github.com/automl/SMAC3/pull/712 is not yet integrated - assert total_elements_rh <= total_calls_to_pynisher_log, print_debug_information(auto) + assert total_elements_rh <= total_calls_to_pynisher_log, print_debug_information( + auto + ) # Make sure we register all returns from pynisher - assert 
total_elements_rh <= total_returns_from_pynisher_log, print_debug_information(auto) + assert ( + total_elements_rh <= total_returns_from_pynisher_log + ), print_debug_information(auto) # Lastly check that settings are print to logfile - ensemble_size = parser.get_automl_setting_from_log(auto._dataset_name, 'ensemble_size') + ensemble_size = parser.get_automl_setting_from_log( + auto._dataset_name, "ensemble_size" + ) assert auto._ensemble_size == int(ensemble_size) del auto -@pytest.mark.parametrize("datasets", [('breast_cancer', BINARY_CLASSIFICATION), - ('wine', MULTICLASS_CLASSIFICATION), - ('diabetes', REGRESSION)]) +@pytest.mark.parametrize( + "datasets", + [ + ("breast_cancer", BINARY_CLASSIFICATION), + ("wine", MULTICLASS_CLASSIFICATION), + ("diabetes", REGRESSION), + ], +) def test_do_dummy_prediction(dask_client, datasets): name, task = datasets X_train, Y_train, X_test, Y_test = putil.get_dataset(name) datamanager = XYDataManager( - X_train, Y_train, - X_test, Y_test, + X_train, + Y_train, + X_test, + Y_test, task=task, dataset_name=name, - feat_type={i: 'numerical' for i in range(X_train.shape[1])}, + feat_type={i: "numerical" for i in range(X_train.shape[1])}, ) auto = autosklearn.automl.AutoML( - 20, 5, + 20, + 5, initial_configurations_via_metalearning=25, metric=accuracy, dask_client=dask_client, @@ -433,18 +477,18 @@ def test_do_dummy_prediction(dask_client, datasets): D = auto._backend.load_datamanager() # Check if data manager is correcly loaded - assert D.info['task'] == datamanager.info['task'] + assert D.info["task"] == datamanager.info["task"] auto._do_dummy_prediction(D, 1) # Ensure that the dummy predictions are not in the current working # directory, but in the temporary directory. - unexpected_directory = os.path.join(os.getcwd(), '.auto-sklearn') + unexpected_directory = os.path.join(os.getcwd(), ".auto-sklearn") expected_directory = os.path.join( auto._backend.temporary_directory, - '.auto-sklearn', - 'runs', - '1_1_0.0', - 'predictions_ensemble_1_1_0.0.npy' + ".auto-sklearn", + "runs", + "1_1_0.0", + "predictions_ensemble_1_1_0.0.npy", ) assert not os.path.exists(unexpected_directory) assert os.path.exists(expected_directory) @@ -454,27 +498,30 @@ def test_do_dummy_prediction(dask_client, datasets): del auto -@unittest.mock.patch('autosklearn.evaluation.ExecuteTaFuncWithQueue.run') +@unittest.mock.patch("autosklearn.evaluation.ExecuteTaFuncWithQueue.run") def test_fail_if_dummy_prediction_fails(ta_run_mock, dask_client): - X_train, Y_train, X_test, Y_test = putil.get_dataset('iris') + X_train, Y_train, X_test, Y_test = putil.get_dataset("iris") datamanager = XYDataManager( - X_train, Y_train, - X_test, Y_test, + X_train, + Y_train, + X_test, + Y_test, task=2, - feat_type={i: 'Numerical' for i in range(X_train.shape[1])}, - dataset_name='iris', + feat_type={i: "Numerical" for i in range(X_train.shape[1])}, + dataset_name="iris", ) time_for_this_task = 30 per_run_time = 10 - auto = autosklearn.automl.AutoML(time_for_this_task, - per_run_time, - initial_configurations_via_metalearning=25, - metric=accuracy, - dask_client=dask_client, - delete_tmp_folder_after_terminate=False, - ) + auto = autosklearn.automl.AutoML( + time_for_this_task, + per_run_time, + initial_configurations_via_metalearning=25, + metric=accuracy, + dask_client=dask_client, + delete_tmp_folder_after_terminate=False, + ) auto._backend = auto._create_backend() auto._backend._make_internals_directory() auto._backend.save_datamanager(datamanager) @@ -497,55 +544,55 @@ def 
test_fail_if_dummy_prediction_fails(ta_run_mock, dask_client): auto._do_dummy_prediction(datamanager, 1) except ValueError: raised = True - assert not raised, 'Exception raised' + assert not raised, "Exception raised" # Case 2. Check that if statustype returned by ta.run() != success, # the function raises error. ta_run_mock.return_value = StatusType.CRASHED, None, None, {} with pytest.raises( ValueError, - match='Dummy prediction failed with run state StatusType.CRASHED and additional output: {}.' # noqa + match="Dummy prediction failed with run state StatusType.CRASHED and additional output: {}.", # noqa ): auto._do_dummy_prediction(datamanager, 1) ta_run_mock.return_value = StatusType.ABORT, None, None, {} with pytest.raises( ValueError, - match='Dummy prediction failed with run state StatusType.ABORT ' - 'and additional output: {}.', + match="Dummy prediction failed with run state StatusType.ABORT " + "and additional output: {}.", ): auto._do_dummy_prediction(datamanager, 1) ta_run_mock.return_value = StatusType.TIMEOUT, None, None, {} with pytest.raises( ValueError, - match='Dummy prediction failed with run state StatusType.TIMEOUT ' - 'and additional output: {}.' + match="Dummy prediction failed with run state StatusType.TIMEOUT " + "and additional output: {}.", ): auto._do_dummy_prediction(datamanager, 1) ta_run_mock.return_value = StatusType.MEMOUT, None, None, {} with pytest.raises( ValueError, - match='Dummy prediction failed with run state StatusType.MEMOUT ' - 'and additional output: {}.', + match="Dummy prediction failed with run state StatusType.MEMOUT " + "and additional output: {}.", ): auto._do_dummy_prediction(datamanager, 1) ta_run_mock.return_value = StatusType.CAPPED, None, None, {} with pytest.raises( ValueError, - match='Dummy prediction failed with run state StatusType.CAPPED ' - 'and additional output: {}.' + match="Dummy prediction failed with run state StatusType.CAPPED " + "and additional output: {}.", ): auto._do_dummy_prediction(datamanager, 1) - ta_run_mock.return_value = StatusType.CRASHED, None, None, {'exitcode': -6} + ta_run_mock.return_value = StatusType.CRASHED, None, None, {"exitcode": -6} with pytest.raises( ValueError, - match='The error suggests that the provided memory limits were too tight.', + match="The error suggests that the provided memory limits are too tight.", ): auto._do_dummy_prediction(datamanager, 1) -@unittest.mock.patch('autosklearn.smbo.AutoMLSMBO.run_smbo') +@unittest.mock.patch("autosklearn.smbo.AutoMLSMBO.run_smbo") def test_exceptions_inside_log_in_smbo(smbo_run_mock, dask_client): # Below importing and shutdown is a workaround, to make sure @@ -553,6 +600,7 @@ def test_exceptions_inside_log_in_smbo(smbo_run_mock, dask_client): # this test with multiple other test at the same time causes this # test to fail. 
This resets the singletons of the logging class import logging + logging.shutdown() automl = autosklearn.automl.AutoML( @@ -563,15 +611,15 @@ def test_exceptions_inside_log_in_smbo(smbo_run_mock, dask_client): delete_tmp_folder_after_terminate=False, ) - dataset_name = 'test_exceptions_inside_log' + dataset_name = "test_exceptions_inside_log" # Create a custom exception to prevent other errors to slip in class MyException(Exception): pass - X_train, Y_train, X_test, Y_test = putil.get_dataset('iris') + X_train, Y_train, X_test, Y_test = putil.get_dataset("iris") # The first call is on dummy predictor failure - message = str(np.random.randint(100)) + '_run_smbo' + message = str(np.random.randint(100)) + "_run_smbo" smbo_run_mock.side_effect = MyException(message) with pytest.raises(MyException): @@ -583,10 +631,12 @@ class MyException(Exception): ) # make sure that the logfile was created - logger_name = 'AutoML(%d):%s' % (1, dataset_name) + logger_name = "AutoML(%d):%s" % (1, dataset_name) logger = logging.getLogger(logger_name) - logfile = os.path.join(automl._backend.temporary_directory, logger_name + '.log') - assert os.path.exists(logfile), print_debug_information(automl) + str(automl._clean_logger()) + logfile = os.path.join(automl._backend.temporary_directory, logger_name + ".log") + assert os.path.exists(logfile), print_debug_information(automl) + str( + automl._clean_logger() + ) # Give some time for the error message to be printed in the # log file @@ -604,19 +654,21 @@ class MyException(Exception): automl._clean_logger() if not found_message: - pytest.fail("Did not find {} in the log file {} for logger {}/{}/{}".format( - message, - print_debug_information(automl), - vars(automl._logger.logger), - vars(logger), - vars(logging.getLogger()) - )) + pytest.fail( + "Did not find {} in the log file {} for logger {}/{}/{}".format( + message, + print_debug_information(automl), + vars(automl._logger.logger), + vars(logger), + vars(logging.getLogger()), + ) + ) @pytest.mark.parametrize("metric", [log_loss, balanced_accuracy]) def test_load_best_individual_model(metric, dask_client): - X_train, Y_train, X_test, Y_test = putil.get_dataset('iris') + X_train, Y_train, X_test, Y_test = putil.get_dataset("iris") automl = autosklearn.automl.AutoML( time_left_for_this_task=30, per_run_time_limit=5, @@ -645,9 +697,9 @@ def test_load_best_individual_model(metric, dask_client): assert get_models_with_weights[0][0] == 1.0 # Match a toy dataset - if metric.name == 'balanced_accuracy': + if metric.name == "balanced_accuracy": assert automl.score(X_test, Y_test) > 0.9 - elif metric.name == 'log_loss': + elif metric.name == "log_loss": # Seen values in github actions of 0.6978304740364537 assert automl.score(X_test, Y_test) < 0.7 else: @@ -667,17 +719,18 @@ def test_fail_if_feat_type_on_pandas_input(dask_client): dask_client=dask_client, ) - X_train = pd.DataFrame({'a': [1, 1], 'c': [1, 2]}) + X_train = pd.DataFrame({"a": [1, 1], "c": [1, 2]}) y_train = [1, 0] - with pytest.raises( - ValueError, - match="" - "providing the option feat_type to the fit method is not supported when using a Dataframe" - ): + msg = ( + "providing the option feat_type to the fit method is not supported" + " when using a Dataframe." 
+ ) + with pytest.raises(ValueError, match=msg): automl.fit( - X_train, y_train, + X_train, + y_train, task=BINARY_CLASSIFICATION, - feat_type={1: 'Categorical', 2: 'Numerical'}, + feat_type={1: "Categorical", 2: "Numerical"}, ) @@ -686,7 +739,7 @@ def data_input_and_target_types(): # Create valid inputs X_ndarray = np.random.random(size=(n_rows, 5)) - X_ndarray[X_ndarray < .9] = 0 + X_ndarray[X_ndarray < 0.9] = 0 # Binary Classificaiton y_binary_ndarray = np.random.random(size=n_rows) @@ -696,7 +749,9 @@ def data_input_and_target_types(): # Multiclass classification y_multiclass_ndarray = np.random.random(size=n_rows) y_multiclass_ndarray[y_multiclass_ndarray > 0.66] = 2 - y_multiclass_ndarray[(y_multiclass_ndarray <= 0.66) & (y_multiclass_ndarray >= 0.33)] = 1 + y_multiclass_ndarray[ + (y_multiclass_ndarray <= 0.66) & (y_multiclass_ndarray >= 0.33) + ] = 1 y_multiclass_ndarray[y_multiclass_ndarray < 0.33] = 0 # Multilabel classificaiton @@ -789,11 +844,7 @@ def test_input_and_target_types(dask_client, X, y, X_test, y_test, task): # To save time fitting and only validate the inputs we only return # the configuration space automl.fit( - X=X, - y=y, - X_test=X_test, - y_test=y_test, - only_return_configuration_space=True + X=X, y=y, X_test=X_test, y_test=y_test, only_return_configuration_space=True ) assert automl._task == task assert automl._metric.name == default_metric_for_task[task].name @@ -801,21 +852,15 @@ def test_input_and_target_types(dask_client, X, y, X_test, y_test, task): def data_test_model_predict_outsputs_correct_shapes(): datasets = sklearn.datasets - binary = datasets.make_classification( - n_samples=5, n_classes=2, random_state=0 - ) + binary = datasets.make_classification(n_samples=5, n_classes=2, random_state=0) multiclass = datasets.make_classification( n_samples=5, n_informative=3, n_classes=3, random_state=0 ) multilabel = datasets.make_multilabel_classification( n_samples=5, n_classes=3, random_state=0 ) - regression = datasets.make_regression( - n_samples=5, random_state=0 - ) - multioutput = datasets.make_regression( - n_samples=5, n_targets=3, random_state=0 - ) + regression = datasets.make_regression(n_samples=5, random_state=0) + multioutput = datasets.make_regression(n_samples=5, n_targets=3, random_state=0) # TODO issue 1169 # While testing output shapes, realised all models are wrapped to provide @@ -841,17 +886,15 @@ def regressor(X, y): # How cross validation models are currently grouped together def voting_classifier(X, y): classifiers = [ - MyDummyClassifier(config=1, random_state=0).fit(X, y) - for _ in range(5) + MyDummyClassifier(config=1, random_state=0).fit(X, y) for _ in range(5) ] - vc = VotingClassifier(estimators=None, voting='soft') + vc = VotingClassifier(estimators=None, voting="soft") vc.estimators_ = classifiers return vc def voting_regressor(X, y): regressors = [ - MyDummyRegressor(config=1, random_state=0).fit(X, y) - for _ in range(5) + MyDummyRegressor(config=1, random_state=0).fit(X, y) for _ in range(5) ] vr = VotingRegressor(estimators=None) vr.estimators_ = regressors @@ -859,41 +902,41 @@ def voting_regressor(X, y): test_data = { BINARY_CLASSIFICATION: { - 'models': [classifier(*binary), voting_classifier(*binary)], - 'data': binary, + "models": [classifier(*binary), voting_classifier(*binary)], + "data": binary, # prob of false/true for the one class - 'expected_output_shape': (len(binary[0]), 2) + "expected_output_shape": (len(binary[0]), 2), }, MULTICLASS_CLASSIFICATION: { - 'models': [classifier(*multiclass), 
voting_classifier(*multiclass)], - 'data': multiclass, + "models": [classifier(*multiclass), voting_classifier(*multiclass)], + "data": multiclass, # prob of true for each possible class - 'expected_output_shape': (len(multiclass[0]), 3) + "expected_output_shape": (len(multiclass[0]), 3), }, MULTILABEL_CLASSIFICATION: { - 'models': [classifier(*multilabel), voting_classifier(*multilabel)], - 'data': multilabel, + "models": [classifier(*multilabel), voting_classifier(*multilabel)], + "data": multilabel, # probability of true for each binary label - 'expected_output_shape': (len(multilabel[0]), 3) # type: ignore + "expected_output_shape": (len(multilabel[0]), 3), # type: ignore }, REGRESSION: { - 'models': [regressor(*regression), voting_regressor(*regression)], - 'data': regression, + "models": [regressor(*regression), voting_regressor(*regression)], + "data": regression, # array of single outputs - 'expected_output_shape': (len(regression[0]), ) + "expected_output_shape": (len(regression[0]),), }, MULTIOUTPUT_REGRESSION: { - 'models': [regressor(*multioutput), voting_regressor(*multioutput)], - 'data': multioutput, + "models": [regressor(*multioutput), voting_regressor(*multioutput)], + "data": multioutput, # array of vector otuputs - 'expected_output_shape': (len(multioutput[0]), 3) - } + "expected_output_shape": (len(multioutput[0]), 3), + }, } return itertools.chain.from_iterable( [ - (model, cfg['data'], task, cfg['expected_output_shape']) - for model in cfg['models'] + (model, cfg["data"], task, cfg["expected_output_shape"]) + for model in cfg["models"] ] for task, cfg in test_data.items() ) @@ -901,7 +944,7 @@ def voting_regressor(X, y): @pytest.mark.parametrize( "model, data, task, expected_output_shape", - data_test_model_predict_outsputs_correct_shapes() + data_test_model_predict_outsputs_correct_shapes(), ) def test_model_predict_outputs_correct_shapes(model, data, task, expected_output_shape): X, y = data @@ -912,12 +955,12 @@ def test_model_predict_outputs_correct_shapes(model, data, task, expected_output def test_model_predict_outputs_warnings_to_logs(): X = list(range(20)) task = REGRESSION - logger = PickableLoggerAdapter('test_model_predict_correctly_outputs_warnings') + logger = PickableLoggerAdapter("test_model_predict_correctly_outputs_warnings") logger.warning = unittest.mock.Mock() class DummyModel: def predict(self, x): - warnings.warn('test warning', Warning) + warnings.warn("test warning", Warning) return x model = DummyModel() @@ -933,7 +976,7 @@ def test_model_predict_outputs_to_stdout_if_no_logger(): class DummyModel: def predict(self, x): - warnings.warn('test warning', Warning) + warnings.warn("test warning", Warning) return x model = DummyModel() @@ -959,7 +1002,7 @@ def test_param_dataset_compression_false(dataset_compression: bool) -> None: auto = AutoMLRegressor( time_left_for_this_task=30, per_run_time_limit=5, - dataset_compression=dataset_compression + dataset_compression=dataset_compression, ) assert auto._dataset_compression is None @@ -980,14 +1023,16 @@ def test_construction_param_dataset_compression_true(dataset_compression: bool) auto = AutoMLRegressor( time_left_for_this_task=30, per_run_time_limit=5, - dataset_compression=dataset_compression + dataset_compression=dataset_compression, ) assert auto._dataset_compression == default_dataset_compression_arg @pytest.mark.parametrize("dataset_compression", [{"memory_allocation": 0.2}]) -def test_construction_param_dataset_compression_valid_dict(dataset_compression: Dict) -> None: +def 
test_construction_param_dataset_compression_valid_dict( + dataset_compression: Dict, +) -> None: """ Parameters ---------- @@ -1001,7 +1046,7 @@ def test_construction_param_dataset_compression_valid_dict(dataset_compression: auto = AutoMLRegressor( time_left_for_this_task=30, per_run_time_limit=5, - dataset_compression=dataset_compression + dataset_compression=dataset_compression, ) expected_memory_allocation = dataset_compression["memory_allocation"] @@ -1012,7 +1057,9 @@ def test_construction_param_dataset_compression_valid_dict(dataset_compression: assert auto._dataset_compression["methods"] == expected_methods -@pytest.mark.parametrize("dataset_compression", [{"methods": ["precision", "subsample"]}]) +@pytest.mark.parametrize( + "dataset_compression", [{"methods": ["precision", "subsample"]}] +) @pytest.mark.parametrize("X", [np.ones((100, 10), dtype=int)]) @pytest.mark.parametrize("y", [np.random.random((100,))]) @unittest.mock.patch("autosklearn.automl.reduce_dataset_size_if_too_large") @@ -1020,7 +1067,7 @@ def test_fit_performs_dataset_compression_without_precision_with_int( mock_reduce_dataset: unittest.mock.MagicMock, dataset_compression: Dict, X: np.ndarray, - y: np.ndarray + y: np.ndarray, ) -> None: """We can't reduce the precision of ints as we do with floats. Suppose someone was to pass a column with `max_int64` and `min_int64`, any reduction of bits will @@ -1053,7 +1100,7 @@ def test_fit_performs_dataset_compression_without_precision_with_int( auto = AutoMLRegressor( time_left_for_this_task=30, # not used but required per_run_time_limit=5, # not used but required - dataset_compression=dataset_compression + dataset_compression=dataset_compression, ) # To prevent fitting anything we use `only_return_configuration_space` @@ -1066,36 +1113,48 @@ def test_fit_performs_dataset_compression_without_precision_with_int( @pytest.mark.parametrize("dataset_compression", [True]) -@pytest.mark.parametrize("X", [ - np.empty((10, 10)), - csr_matrix(np.identity(10)), - pytest.param( - np.empty((10, 10)).tolist(), - marks=pytest.mark.xfail(reason="Converted to dataframe by InputValidator") - ), - pytest.param( - pd.DataFrame(np.empty((10, 10))), - marks=pytest.mark.xfail(reason="No pandas support yet for dataset compression") - ) -]) -@pytest.mark.parametrize("y", [ - np.random.random((10, 1)), - np.random.random((10, 1)).tolist(), - pytest.param( - pd.Series(np.random.random((10,))), - marks=pytest.mark.xfail(reason="No pandas support yet for dataset compression") - ), - pytest.param( - pd.DataFrame(np.random.random((10, 10))), - marks=pytest.mark.xfail(reason="No pandas support yet for dataset compression") - ) -]) +@pytest.mark.parametrize( + "X", + [ + np.empty((10, 10)), + csr_matrix(np.identity(10)), + pytest.param( + np.empty((10, 10)).tolist(), + marks=pytest.mark.xfail(reason="Converted to dataframe by InputValidator"), + ), + pytest.param( + pd.DataFrame(np.empty((10, 10))), + marks=pytest.mark.xfail( + reason="No pandas support yet for dataset compression" + ), + ), + ], +) +@pytest.mark.parametrize( + "y", + [ + np.random.random((10, 1)), + np.random.random((10, 1)).tolist(), + pytest.param( + pd.Series(np.random.random((10,))), + marks=pytest.mark.xfail( + reason="No pandas support yet for dataset compression" + ), + ), + pytest.param( + pd.DataFrame(np.random.random((10, 10))), + marks=pytest.mark.xfail( + reason="No pandas support yet for dataset compression" + ), + ), + ], +) @unittest.mock.patch("autosklearn.automl.reduce_dataset_size_if_too_large") def 
test_fit_performs_dataset_compression( mock_reduce_dataset: unittest.mock.MagicMock, dataset_compression: bool, X: Union[np.ndarray, spmatrix, List, pd.DataFrame], - y: Union[np.ndarray, List, pd.Series, pd.DataFrame] + y: Union[np.ndarray, List, pd.Series, pd.DataFrame], ) -> None: """ Parameters @@ -1122,7 +1181,7 @@ def test_fit_performs_dataset_compression( auto = AutoMLRegressor( time_left_for_this_task=30, # not used but required per_run_time_limit=5, # not used but required - dataset_compression=dataset_compression + dataset_compression=dataset_compression, ) # To prevent fitting anything we use `only_return_configuration_space` diff --git a/test/test_automl/test_estimators.py b/test/test_automl/test_estimators.py index b32d1d0026..ac60e51472 100644 --- a/test/test_automl/test_estimators.py +++ b/test/test_automl/test_estimators.py @@ -3,67 +3,74 @@ import copy import glob import importlib -import os import inspect import itertools +import os import pickle import re import sys import tempfile import unittest import unittest.mock -import pytest -from ConfigSpace.configuration_space import Configuration import joblib -from joblib import cpu_count import numpy as np import numpy.ma as npma import pandas as pd +import pytest import sklearn -import sklearn.model_selection as model_selection -import sklearn.dummy import sklearn.datasets -from sklearn.base import clone -from sklearn.base import ClassifierMixin, RegressorMixin -from sklearn.base import is_classifier -from smac.tae import StatusType +import sklearn.dummy +import sklearn.model_selection as model_selection +from ConfigSpace.configuration_space import Configuration from dask.distributed import Client +from joblib import cpu_count +from sklearn.base import ClassifierMixin, RegressorMixin, clone, is_classifier +from smac.tae import StatusType -from autosklearn.data.validation import InputValidator +import autosklearn.estimators # noqa F401 import autosklearn.pipeline.util as putil +from autosklearn.automl import AutoMLClassifier +from autosklearn.data.validation import InputValidator from autosklearn.ensemble_builder import MODEL_FN_RE -import autosklearn.estimators # noqa F401 from autosklearn.estimators import ( - AutoSklearnEstimator, AutoSklearnRegressor, AutoSklearnClassifier + AutoSklearnClassifier, + AutoSklearnEstimator, + AutoSklearnRegressor, ) -from autosklearn.metrics import accuracy, f1_macro, mean_squared_error, r2 -from autosklearn.automl import AutoMLClassifier from autosklearn.experimental.askl2 import AutoSklearn2Classifier +from autosklearn.metrics import accuracy, f1_macro, mean_squared_error, r2 from autosklearn.smbo import get_smac_object sys.path.append(os.path.dirname(__file__)) -from automl_utils import print_debug_information, count_succeses, includes_train_scores, includes_all_scores, include_single_scores, performance_over_time_is_plausible # noqa (E402: module level import not at top of file) +from automl_utils import ( # noqa (E402: module level import not at top of file) + count_succeses, + include_single_scores, + includes_all_scores, + includes_train_scores, + performance_over_time_is_plausible, + print_debug_information, +) def test_fit_n_jobs(tmp_dir): - X_train, Y_train, X_test, Y_test = putil.get_dataset('breast_cancer') + X_train, Y_train, X_test, Y_test = putil.get_dataset("breast_cancer") # test parallel Classifier to predict classes, not only indices Y_train += 1 Y_test += 1 class get_smac_object_wrapper: - def __call__(self, *args, **kwargs): - self.n_jobs = kwargs['n_jobs'] + self.n_jobs 
= kwargs["n_jobs"] smac = get_smac_object(*args, **kwargs) self.dask_n_jobs = smac.solver.tae_runner.n_workers self.dask_client_n_jobs = len( - smac.solver.tae_runner.client.scheduler_info()['workers'] + smac.solver.tae_runner.client.scheduler_info()["workers"] ) return smac + get_smac_object_wrapper_instance = get_smac_object_wrapper() automl = AutoSklearnClassifier( @@ -75,8 +82,7 @@ def __call__(self, *args, **kwargs): initial_configurations_via_metalearning=0, ensemble_size=5, n_jobs=2, - include={'classifier': ['sgd'], - 'feature_preprocessor': ['no_preprocessing']}, + include={"classifier": ["sgd"], "feature_preprocessor": ["no_preprocessing"]}, get_smac_object_callback=get_smac_object_wrapper_instance, max_models_on_disc=None, ) @@ -84,17 +90,24 @@ def __call__(self, *args, **kwargs): automl.fit(X_train, Y_train) # Test that the argument is correctly passed to SMAC - assert getattr(get_smac_object_wrapper_instance, 'n_jobs') == 2 - assert getattr(get_smac_object_wrapper_instance, 'dask_n_jobs') == 2 - assert getattr(get_smac_object_wrapper_instance, 'dask_client_n_jobs') == 2 + assert getattr(get_smac_object_wrapper_instance, "n_jobs") == 2 + assert getattr(get_smac_object_wrapper_instance, "dask_n_jobs") == 2 + assert getattr(get_smac_object_wrapper_instance, "dask_client_n_jobs") == 2 available_num_runs = set() for run_key, run_value in automl.automl_.runhistory_.data.items(): - if run_value.additional_info is not None and 'num_run' in run_value.additional_info: - available_num_runs.add(run_value.additional_info['num_run']) + if ( + run_value.additional_info is not None + and "num_run" in run_value.additional_info + ): + available_num_runs.add(run_value.additional_info["num_run"]) available_predictions = set() predictions = glob.glob( - os.path.join(automl.automl_._backend.get_runs_directory(), '*', 'predictions_ensemble*.npy') + os.path.join( + automl.automl_._backend.get_runs_directory(), + "*", + "predictions_ensemble*.npy", + ) ) seeds = set() for prediction in predictions: @@ -117,7 +130,7 @@ def __call__(self, *args, **kwargs): seeds = set() for ensemble_file in ensembles: - seeds.add(int(ensemble_file.split('.')[0].split('_')[0])) + seeds.add(int(ensemble_file.split(".")[0].split("_")[0])) assert len(seeds) == 1 assert count_succeses(automl.cv_results_) > 0 @@ -132,7 +145,7 @@ def test_feat_type_wrong_arguments(): # Every Auto-Sklearn estimator has a backend, that allows a single # call to fit X = np.zeros((100, 100)) - y = np.zeros((100, )) + y = np.zeros((100,)) cls = AutoSklearnClassifier(ensemble_size=0) expected_msg = r".*feat_type does not have same number of " @@ -143,43 +156,55 @@ def test_feat_type_wrong_arguments(): cls = AutoSklearnClassifier(ensemble_size=0) expected_msg = r".*feat_type must only contain strings.*" with pytest.raises(ValueError, match=expected_msg): - cls.fit(X=X, y=y, feat_type=[True]*100) + cls.fit(X=X, y=y, feat_type=[True] * 100) cls = AutoSklearnClassifier(ensemble_size=0) expected_msg = r".*Only `Categorical`, `Numerical` and `String` are" "valid feature types, you passed `Car`.*" with pytest.raises(ValueError, match=expected_msg): - cls.fit(X=X, y=y, feat_type=['Car']*100) + cls.fit(X=X, y=y, feat_type=["Car"] * 100) # Mock AutoSklearnEstimator.fit so the test doesn't actually run fit(). -@unittest.mock.patch('autosklearn.estimators.AutoSklearnEstimator.fit') +@unittest.mock.patch("autosklearn.estimators.AutoSklearnEstimator.fit") def test_type_of_target(mock_estimator): # Test that classifier raises error for illegal target types. 
- X = np.array([[1, 2], - [2, 3], - [3, 4], - [4, 5], - ]) + X = np.array( + [ + [1, 2], + [2, 3], + [3, 4], + [4, 5], + ] + ) # Possible target types y_binary = np.array([0, 0, 1, 1]) y_continuous = np.array([0.1, 1.3, 2.1, 4.0]) y_multiclass = np.array([0, 1, 2, 0]) - y_multilabel = np.array([[0, 1], - [1, 1], - [1, 0], - [0, 0], - ]) - y_multiclass_multioutput = np.array([[0, 1], - [1, 3], - [2, 2], - [5, 3], - ]) - y_continuous_multioutput = np.array([[0.1, 1.5], - [1.2, 3.5], - [2.7, 2.7], - [5.5, 3.9], - ]) + y_multilabel = np.array( + [ + [0, 1], + [1, 1], + [1, 0], + [0, 0], + ] + ) + y_multiclass_multioutput = np.array( + [ + [0, 1], + [1, 3], + [2, 2], + [5, 3], + ] + ) + y_continuous_multioutput = np.array( + [ + [0.1, 1.5], + [1.2, 3.5], + [2.7, 2.7], + [5.5, 3.9], + ] + ) cls = AutoSklearnClassifier(ensemble_size=0) cls.automl_ = unittest.mock.Mock() @@ -208,20 +233,19 @@ def test_type_of_target(mock_estimator): try: cls.fit(X, y_binary) except ValueError: - pytest.fail("cls.fit() raised ValueError while fitting " - "binary targets") + pytest.fail("cls.fit() raised ValueError while fitting " "binary targets") try: cls.fit(X, y_multiclass) except ValueError: - pytest.fail("cls.fit() raised ValueError while fitting " - "multiclass targets") + pytest.fail("cls.fit() raised ValueError while fitting " "multiclass targets") try: cls.fit(X, y_multilabel) except ValueError: - pytest.fail("cls.fit() raised ValueError while fitting " - "multilabel-indicator targets") + pytest.fail( + "cls.fit() raised ValueError while fitting " "multilabel-indicator targets" + ) # Test that regressor raises error for illegal target types. reg = AutoSklearnRegressor(ensemble_size=0) @@ -230,12 +254,18 @@ def test_type_of_target(mock_estimator): expected_msg = r".*Regression with data of type" " multilabel-indicator is not supported.*" with pytest.raises(ValueError, match=expected_msg): - reg.fit(X=X, y=y_multilabel,) + reg.fit( + X=X, + y=y_multilabel, + ) expected_msg = r".*Regression with data of type" " multiclass-multioutput is not supported.*" with pytest.raises(ValueError, match=expected_msg): - reg.fit(X=X, y=y_multiclass_multioutput,) + reg.fit( + X=X, + y=y_multiclass_multioutput, + ) # Legal target types: continuous, multiclass, # continuous-multioutput, @@ -243,37 +273,38 @@ def test_type_of_target(mock_estimator): try: reg.fit(X, y_continuous) except ValueError: - pytest.fail("reg.fit() raised ValueError while fitting " - "continuous targets") + pytest.fail("reg.fit() raised ValueError while fitting " "continuous targets") try: reg.fit(X, y_multiclass) except ValueError: - pytest.fail("reg.fit() raised ValueError while fitting " - "multiclass targets") + pytest.fail("reg.fit() raised ValueError while fitting " "multiclass targets") try: reg.fit(X, y_continuous_multioutput) except ValueError: - pytest.fail("reg.fit() raised ValueError while fitting " - "continuous_multioutput targets") + pytest.fail( + "reg.fit() raised ValueError while fitting " + "continuous_multioutput targets" + ) try: reg.fit(X, y_binary) except ValueError: - pytest.fail("reg.fit() raised ValueError while fitting " - "binary targets") + pytest.fail("reg.fit() raised ValueError while fitting " "binary targets") def test_performance_over_time_no_ensemble(tmp_dir): - X_train, Y_train, X_test, Y_test = putil.get_dataset('iris') + X_train, Y_train, X_test, Y_test = putil.get_dataset("iris") - cls = AutoSklearnClassifier(time_left_for_this_task=30, - per_run_time_limit=5, - tmp_folder=tmp_dir, - seed=1, - 
initial_configurations_via_metalearning=0, - ensemble_size=0,) + cls = AutoSklearnClassifier( + time_left_for_this_task=30, + per_run_time_limit=5, + tmp_folder=tmp_dir, + seed=1, + initial_configurations_via_metalearning=0, + ensemble_size=0, + ) cls.fit(X_train, Y_train, X_test, Y_test) @@ -285,16 +316,17 @@ def test_performance_over_time_no_ensemble(tmp_dir): def test_cv_results(tmp_dir): # TODO restructure and actually use real SMAC output from a long run # to do this unittest! - X_train, Y_train, X_test, Y_test = putil.get_dataset('iris') + X_train, Y_train, X_test, Y_test = putil.get_dataset("iris") - cls = AutoSklearnClassifier(time_left_for_this_task=30, - per_run_time_limit=5, - tmp_folder=tmp_dir, - seed=1, - initial_configurations_via_metalearning=0, - ensemble_size=0, - scoring_functions=[autosklearn.metrics.precision, - autosklearn.metrics.roc_auc]) + cls = AutoSklearnClassifier( + time_left_for_this_task=30, + per_run_time_limit=5, + tmp_folder=tmp_dir, + seed=1, + initial_configurations_via_metalearning=0, + ensemble_size=0, + scoring_functions=[autosklearn.metrics.precision, autosklearn.metrics.roc_auc], + ) params = cls.get_params() original_params = copy.deepcopy(params) @@ -303,23 +335,27 @@ def test_cv_results(tmp_dir): cv_results = cls.cv_results_ assert isinstance(cv_results, dict), type(cv_results) - assert isinstance(cv_results['mean_test_score'], np.ndarray), type( - cv_results['mean_test_score']) - assert isinstance(cv_results['mean_fit_time'], np.ndarray), type( - cv_results['mean_fit_time'] + assert isinstance(cv_results["mean_test_score"], np.ndarray), type( + cv_results["mean_test_score"] ) - assert isinstance(cv_results['params'], list), type(cv_results['params']) - assert isinstance(cv_results['rank_test_scores'], np.ndarray), type( - cv_results['rank_test_scores'] + assert isinstance(cv_results["mean_fit_time"], np.ndarray), type( + cv_results["mean_fit_time"] ) - assert isinstance(cv_results['metric_precision'], npma.MaskedArray), type( - cv_results['metric_precision'] + assert isinstance(cv_results["params"], list), type(cv_results["params"]) + assert isinstance(cv_results["rank_test_scores"], np.ndarray), type( + cv_results["rank_test_scores"] ) - assert isinstance(cv_results['metric_roc_auc'], npma.MaskedArray), type( - cv_results['metric_roc_auc'] + assert isinstance(cv_results["metric_precision"], npma.MaskedArray), type( + cv_results["metric_precision"] ) - cv_result_items = [isinstance(val, npma.MaskedArray) for key, val in - cv_results.items() if key.startswith('param_')] + assert isinstance(cv_results["metric_roc_auc"], npma.MaskedArray), type( + cv_results["metric_roc_auc"] + ) + cv_result_items = [ + isinstance(val, npma.MaskedArray) + for key, val in cv_results.items() + if key.startswith("param_") + ] assert all(cv_result_items), cv_results.items() # Compare the state of the model parameters with the original parameters @@ -337,21 +373,20 @@ def test_cv_results(tmp_dir): assert joblib.hash(new_value) == joblib.hash(original_value), ( "Estimator %s should not change or mutate " " the parameter %s from %s to %s during fit." 
- % (cls, param_name, original_value, new_value)) + % (cls, param_name, original_value, new_value) + ) # Comply with https://scikit-learn.org/dev/glossary.html#term-classes is_classifier(cls) - assert hasattr(cls, 'classes_') + assert hasattr(cls, "classes_") -@pytest.mark.parametrize('estimator_type,dataset_name', [ - (AutoSklearnClassifier, 'iris'), - (AutoSklearnRegressor, 'boston') -]) +@pytest.mark.parametrize( + "estimator_type,dataset_name", + [(AutoSklearnClassifier, "iris"), (AutoSklearnRegressor, "boston")], +) def test_leaderboard( - tmp_dir: str, - estimator_type: Type[AutoSklearnEstimator], - dataset_name: str + tmp_dir: str, estimator_type: Type[AutoSklearnEstimator], dataset_name: str ): # Comprehensive test tasks a substantial amount of time, manually set if # required. @@ -361,16 +396,16 @@ def test_leaderboard( # Create a dict of all possible param values for each param # with some invalid one's of the incorrect type include_combinations = itertools.chain( - itertools.combinations(column_types['all'], item_count) + itertools.combinations(column_types["all"], item_count) for item_count in range(1, MAX_COMBO_SIZE_FOR_INCLUDE_PARAM) ) valid_params = { - 'detailed': [True, False], - 'ensemble_only': [True, False], - 'top_k': [-10, 0, 1, 10, 'all'], - 'sort_by': [*column_types['all'], 'invalid'], - 'sort_order': ['ascending', 'descending', 'auto', 'invalid', None], - 'include': itertools.chain([None, 'invalid', 'type'], include_combinations), + "detailed": [True, False], + "ensemble_only": [True, False], + "top_k": [-10, 0, 1, 10, "all"], + "sort_by": [*column_types["all"], "invalid"], + "sort_order": ["ascending", "descending", "auto", "invalid", None], + "include": itertools.chain([None, "invalid", "type"], include_combinations), } # Create a generator of all possible combinations of valid_params @@ -381,55 +416,49 @@ def test_leaderboard( X_train, Y_train, _, _ = putil.get_dataset(dataset_name) model = estimator_type( - time_left_for_this_task=30, - per_run_time_limit=5, - tmp_folder=tmp_dir, - seed=1 + time_left_for_this_task=30, per_run_time_limit=5, tmp_folder=tmp_dir, seed=1 ) model.fit(X_train, Y_train) for params in params_generator: # Convert from iterator to solid list - if params['include'] is not None and not isinstance(params['include'], str): - params['include'] = list(params['include']) + if params["include"] is not None and not isinstance(params["include"], str): + params["include"] = list(params["include"]) # Invalid top_k should raise an error, is a positive int or 'all' - if not (params['top_k'] == 'all' or params['top_k'] > 0): + if not (params["top_k"] == "all" or params["top_k"] > 0): with pytest.raises(ValueError): model.leaderboard(**params) # Invalid sort_by column - elif params['sort_by'] not in column_types['all']: + elif params["sort_by"] not in column_types["all"]: with pytest.raises(ValueError): model.leaderboard(**params) # Shouldn't accept an invalid sort order - elif params['sort_order'] not in ['ascending', 'descending', 'auto']: + elif params["sort_order"] not in ["ascending", "descending", "auto"]: with pytest.raises(ValueError): model.leaderboard(**params) # include is single str but not valid elif ( - isinstance(params['include'], str) - and params['include'] not in column_types['all'] + isinstance(params["include"], str) + and params["include"] not in column_types["all"] ): with pytest.raises(ValueError): model.leaderboard(**params) # Crash if include is list but contains invalid column elif ( - isinstance(params['include'], list) - and 
len(set(params['include']) - set(column_types['all'])) != 0 + isinstance(params["include"], list) + and len(set(params["include"]) - set(column_types["all"])) != 0 ): with pytest.raises(ValueError): model.leaderboard(**params) # Can't have just model_id, in both single str and list case - elif ( - params['include'] == 'model_id' - or params['include'] == ['model_id'] - ): + elif params["include"] == "model_id" or params["include"] == ["model_id"]: with pytest.raises(ValueError): model.leaderboard(**params) @@ -439,8 +468,8 @@ def test_leaderboard( # top_k should never be less than the rows given back # It can however be larger - if isinstance(params['top_k'], int): - assert params['top_k'] >= len(leaderboard) + if isinstance(params["top_k"], int): + assert params["top_k"] >= len(leaderboard) # Check the right columns are present and in the right order # The model_id is set as the index, not included in pandas columns @@ -449,43 +478,47 @@ def test_leaderboard( def exclude(lst, s): return [x for x in lst if x != s] - if params['include'] is not None: + if params["include"] is not None: # Include with only single str should be the only column - if isinstance(params['include'], str): - assert params['include'] in columns and len(columns) == 1 + if isinstance(params["include"], str): + assert params["include"] in columns and len(columns) == 1 # Include as a list should have all the columns without model_id else: - assert columns == exclude(params['include'], 'model_id') - elif params['detailed']: - assert columns == exclude(column_types['detailed'], 'model_id') + assert columns == exclude(params["include"], "model_id") + elif params["detailed"]: + assert columns == exclude(column_types["detailed"], "model_id") else: - assert columns == exclude(column_types['simple'], 'model_id') + assert columns == exclude(column_types["simple"], "model_id") # Ensure that if it's ensemble only # Can only check if 'ensemble_weight' is present - if ( - params['ensemble_only'] - and 'ensemble_weight' in columns - ): - assert all(leaderboard['ensemble_weight'] > 0) - - -@pytest.mark.parametrize('estimator', [AutoSklearnRegressor]) -@pytest.mark.parametrize('resampling_strategy', ['holdout']) -@pytest.mark.parametrize('X', [ - np.asarray([[1.0, 1.0, 1.0]] * 25 + [[2.0, 2.0, 2.0]] * 25 + - [[3.0, 3.0, 3.0]] * 25 + [[4.0, 4.0, 4.0]] * 25) -]) -@pytest.mark.parametrize('y', [ - np.asarray([1.0] * 25 + [2.0] * 25 + [3.0] * 25 + [4.0] * 25) -]) + if params["ensemble_only"] and "ensemble_weight" in columns: + assert all(leaderboard["ensemble_weight"] > 0) + + +@pytest.mark.parametrize("estimator", [AutoSklearnRegressor]) +@pytest.mark.parametrize("resampling_strategy", ["holdout"]) +@pytest.mark.parametrize( + "X", + [ + np.asarray( + [[1.0, 1.0, 1.0]] * 25 + + [[2.0, 2.0, 2.0]] * 25 + + [[3.0, 3.0, 3.0]] * 25 + + [[4.0, 4.0, 4.0]] * 25 + ) + ], +) +@pytest.mark.parametrize( + "y", [np.asarray([1.0] * 25 + [2.0] * 25 + [3.0] * 25 + [4.0] * 25)] +) def test_show_models_with_holdout( tmp_dir: str, dask_client: Client, estimator: AutoSklearnEstimator, resampling_strategy: str, X: np.ndarray, - y: np.ndarray + y: np.ndarray, ) -> None: """ Parameters @@ -521,39 +554,44 @@ def test_show_models_with_holdout( per_run_time_limit=5, tmp_folder=tmp_dir, resampling_strategy=resampling_strategy, - dask_client=dask_client + dask_client=dask_client, ) automl.fit(X, y) models = automl.show_models().values() - model_keys = set([ - 'model_id', 'rank', 'cost', 'ensemble_weight', - 'data_preprocessor', 'feature_preprocessor', - 
'regressor', 'sklearn_regressor' - ]) + model_keys = set( + [ + "model_id", + "rank", + "cost", + "ensemble_weight", + "data_preprocessor", + "feature_preprocessor", + "regressor", + "sklearn_regressor", + ] + ) assert all([model_keys == set(model.keys()) for model in models]) - assert all([model['regressor'] for model in models]) - assert all([model['sklearn_regressor'] for model in models]) + assert all([model["regressor"] for model in models]) + assert all([model["sklearn_regressor"] for model in models]) assert not any([None in model.values() for model in models]) -@pytest.mark.parametrize('estimator', [AutoSklearnClassifier]) -@pytest.mark.parametrize('resampling_strategy', ['cv']) -@pytest.mark.parametrize('X', [ - np.asarray([[1.0, 1.0, 1.0]] * 50 + [[2.0, 2.0, 2.0]] * 50) -]) -@pytest.mark.parametrize('y', [ - np.asarray([1] * 50 + [2] * 50) -]) +@pytest.mark.parametrize("estimator", [AutoSklearnClassifier]) +@pytest.mark.parametrize("resampling_strategy", ["cv"]) +@pytest.mark.parametrize( + "X", [np.asarray([[1.0, 1.0, 1.0]] * 50 + [[2.0, 2.0, 2.0]] * 50)] +) +@pytest.mark.parametrize("y", [np.asarray([1] * 50 + [2] * 50)]) def test_show_models_with_cv( tmp_dir: str, dask_client: Client, estimator: AutoSklearnEstimator, resampling_strategy: str, X: np.ndarray, - y: np.ndarray + y: np.ndarray, ) -> None: """ Parameters @@ -578,12 +616,12 @@ def test_show_models_with_cv( Expects ------- - * Expects all the model dictionaries to have ``model_keys`` - * Expects no model to have any ``None`` value - * Expects all the estimators in a model to have ``estimator_keys`` - * Expects all model estimators to have an auto-sklearn wrapped model ``classifier`` - * Expects all model estimators to have a sklearn wrapped model ``sklearn_classifier`` - * Expects no estimator to have ``None`` value + * Expects all the model dictionaries to have `model_keys` + * Expects no model to have any `None` value + * Expects all the estimators in a model to have `estimator_keys` + * Expects all model estimators to have an auto-sklearn wrapped model `classifier` + * Expects all model estimators to have a sklearn wrapped model `sklearn_classifier` + * Expects no estimator to have None """ automl = estimator( @@ -591,37 +629,59 @@ def test_show_models_with_cv( per_run_time_limit=5, tmp_folder=tmp_dir, resampling_strategy=resampling_strategy, - dask_client=dask_client + dask_client=dask_client, ) automl.fit(X, y) models = automl.show_models().values() - model_keys = set([ - 'model_id', 'rank', - 'cost', 'ensemble_weight', - 'voting_model', 'estimators' - ]) + model_keys = set( + ["model_id", "rank", "cost", "ensemble_weight", "voting_model", "estimators"] + ) - estimator_keys = set([ - 'data_preprocessor', 'balancing', - 'feature_preprocessor', 'classifier', - 'sklearn_classifier' - ]) + estimator_keys = set( + [ + "data_preprocessor", + "balancing", + "feature_preprocessor", + "classifier", + "sklearn_classifier", + ] + ) assert all([model_keys == set(model.keys()) for model in models]) assert not any([None in model.values() for model in models]) - assert all([estimator_keys == set(estimator.keys()) - for model in models for estimator in model['estimators']]) - assert all([estimator['classifier'] - for model in models for estimator in model['estimators']]) - assert all([estimator['sklearn_classifier'] - for model in models for estimator in model['estimators']]) - assert not any([None in estimator.values() - for model in models for estimator in model['estimators']]) + assert all( + [ + estimator_keys == 
set(estimator.keys()) + for model in models + for estimator in model["estimators"] + ] + ) + assert all( + [ + estimator["classifier"] + for model in models + for estimator in model["estimators"] + ] + ) + assert all( + [ + estimator["sklearn_classifier"] + for model in models + for estimator in model["estimators"] + ] + ) + assert not any( + [ + None in estimator.values() + for model in models + for estimator in model["estimators"] + ] + ) -@unittest.mock.patch('autosklearn.estimators.AutoSklearnEstimator.build_automl') +@unittest.mock.patch("autosklearn.estimators.AutoSklearnEstimator.build_automl") def test_fit_n_jobs_negative(build_automl_patch): n_cores = cpu_count() cls = AutoSklearnEstimator(n_jobs=-1, ensemble_size=0) @@ -634,12 +694,17 @@ def test_get_number_of_available_cores(): assert n_cores >= 1, n_cores -@unittest.mock.patch('autosklearn.automl.AutoML.predict') +@unittest.mock.patch("autosklearn.automl.AutoML.predict") def test_multiclass_prediction(predict_mock, dask_client): - predicted_probabilities = [[0, 0, 0.99], [0, 0.99, 0], [0.99, 0, 0], - [0, 0.99, 0], [0, 0, 0.99]] + predicted_probabilities = [ + [0, 0, 0.99], + [0, 0.99, 0], + [0.99, 0, 0], + [0, 0.99, 0], + [0, 0, 0.99], + ] predicted_indexes = [2, 1, 0, 1, 2] - expected_result = ['c', 'b', 'a', 'b', 'c'] + expected_result = ["c", "b", "a", "b", "c"] predict_mock.return_value = np.array(predicted_probabilities) @@ -650,7 +715,7 @@ def test_multiclass_prediction(predict_mock, dask_client): ) classifier.InputValidator = InputValidator(is_classification=True) classifier.InputValidator.target_validator.fit( - pd.DataFrame(expected_result, dtype='category'), + pd.DataFrame(expected_result, dtype="category"), ) classifier.InputValidator._is_fitted = True @@ -659,13 +724,15 @@ def test_multiclass_prediction(predict_mock, dask_client): np.testing.assert_array_equal(expected_result, actual_result) -@unittest.mock.patch('autosklearn.automl.AutoML.predict') +@unittest.mock.patch("autosklearn.automl.AutoML.predict") def test_multilabel_prediction(predict_mock, dask_client): - predicted_probabilities = [[0.99, 0], - [0.99, 0], - [0, 0.99], - [0.99, 0.99], - [0.99, 0.99]] + predicted_probabilities = [ + [0.99, 0], + [0.99, 0], + [0, 0.99], + [0.99, 0.99], + [0.99, 0.99], + ] predicted_indexes = np.array([[1, 0], [1, 0], [0, 1], [1, 1], [1, 1]]) predict_mock.return_value = np.array(predicted_probabilities) @@ -677,11 +744,14 @@ def test_multilabel_prediction(predict_mock, dask_client): ) classifier.InputValidator = InputValidator(is_classification=True) classifier.InputValidator.target_validator.fit( - pd.DataFrame(predicted_indexes, dtype='int64'), + pd.DataFrame(predicted_indexes, dtype="int64"), ) classifier.InputValidator._is_fitted = True - assert classifier.InputValidator.target_validator.type_of_target == 'multilabel-indicator' + assert ( + classifier.InputValidator.target_validator.type_of_target + == "multilabel-indicator" + ) actual_result = classifier.predict([None] * len(predicted_indexes)) @@ -689,68 +759,66 @@ def test_multilabel_prediction(predict_mock, dask_client): def test_can_pickle_classifier(tmp_dir, dask_client): - X_train, Y_train, X_test, Y_test = putil.get_dataset('iris') - automl = AutoSklearnClassifier(time_left_for_this_task=30, - delete_tmp_folder_after_terminate=False, - per_run_time_limit=5, - tmp_folder=tmp_dir, - dask_client=dask_client, - ) + X_train, Y_train, X_test, Y_test = putil.get_dataset("iris") + automl = AutoSklearnClassifier( + time_left_for_this_task=30, + 
delete_tmp_folder_after_terminate=False, + per_run_time_limit=5, + tmp_folder=tmp_dir, + dask_client=dask_client, + ) automl.fit(X_train, Y_train) initial_predictions = automl.predict(X_test) - initial_accuracy = sklearn.metrics.accuracy_score(Y_test, - initial_predictions) + initial_accuracy = sklearn.metrics.accuracy_score(Y_test, initial_predictions) assert initial_accuracy >= 0.75 assert count_succeses(automl.cv_results_) > 0 assert includes_train_scores(automl.performance_over_time_.columns) is True assert performance_over_time_is_plausible(automl.performance_over_time_) is True # Test pickle - dump_file = os.path.join(tmp_dir, 'automl.dump.pkl') + dump_file = os.path.join(tmp_dir, "automl.dump.pkl") - with open(dump_file, 'wb') as f: + with open(dump_file, "wb") as f: pickle.dump(automl, f) - with open(dump_file, 'rb') as f: + with open(dump_file, "rb") as f: restored_automl = pickle.load(f) restored_predictions = restored_automl.predict(X_test) - restored_accuracy = sklearn.metrics.accuracy_score(Y_test, - restored_predictions) + restored_accuracy = sklearn.metrics.accuracy_score(Y_test, restored_predictions) assert restored_accuracy >= 0.75 assert initial_accuracy == restored_accuracy # Test joblib - dump_file = os.path.join(tmp_dir, 'automl.dump.joblib') + dump_file = os.path.join(tmp_dir, "automl.dump.joblib") joblib.dump(automl, dump_file) restored_automl = joblib.load(dump_file) restored_predictions = restored_automl.predict(X_test) - restored_accuracy = sklearn.metrics.accuracy_score(Y_test, - restored_predictions) + restored_accuracy = sklearn.metrics.accuracy_score(Y_test, restored_predictions) assert restored_accuracy >= 0.75 assert initial_accuracy == restored_accuracy def test_multilabel(tmp_dir, dask_client): - X_train, Y_train, X_test, Y_test = putil.get_dataset( - 'iris', make_multilabel=True) - automl = AutoSklearnClassifier(time_left_for_this_task=30, - per_run_time_limit=5, - tmp_folder=tmp_dir, - dask_client=dask_client, - ) + X_train, Y_train, X_test, Y_test = putil.get_dataset("iris", make_multilabel=True) + automl = AutoSklearnClassifier( + time_left_for_this_task=30, + per_run_time_limit=5, + tmp_folder=tmp_dir, + dask_client=dask_client, + ) automl.fit(X_train, Y_train) predictions = automl.predict(X_test) assert predictions.shape == (50, 3), print_debug_information(automl) - assert count_succeses(automl.cv_results_) > 0, print_debug_information(automl) + assert count_succeses(automl.cv_results_) > 0, print_debug_information(automl) assert includes_train_scores(automl.performance_over_time_.columns) is True assert performance_over_time_is_plausible(automl.performance_over_time_) is True @@ -763,20 +831,25 @@ def test_multilabel(tmp_dir, dask_client): def test_binary(tmp_dir, dask_client): - X_train, Y_train, X_test, Y_test = putil.get_dataset( - 'iris', make_binary=True) - automl = AutoSklearnClassifier(time_left_for_this_task=40, - delete_tmp_folder_after_terminate=False, - per_run_time_limit=10, - tmp_folder=tmp_dir, - dask_client=dask_client, - ) + X_train, Y_train, X_test, Y_test = putil.get_dataset("iris", make_binary=True) + automl = AutoSklearnClassifier( + time_left_for_this_task=40, + delete_tmp_folder_after_terminate=False, + per_run_time_limit=10, + tmp_folder=tmp_dir, + dask_client=dask_client, + ) - automl.fit(X_train, Y_train, X_test=X_test, y_test=Y_test, - dataset_name='binary_test_dataset') + automl.fit( + X_train, + Y_train, + X_test=X_test, + y_test=Y_test, + dataset_name="binary_test_dataset", + ) predictions = automl.predict(X_test) - 
assert predictions.shape == (50, ), print_debug_information(automl)
+    assert predictions.shape == (50,), print_debug_information(automl)
 
     score = accuracy(Y_test, predictions)
     assert score > 0.9, print_debug_information(automl)
@@ -794,7 +867,7 @@ def test_classification_pandas_support(tmp_dir, dask_client):
     )
 
     # Drop NAN!!
-    X = X.dropna(axis='columns')
+    X = X.dropna(axis="columns")
 
     # This test only make sense if input is dataframe
     assert isinstance(X, pd.DataFrame)
@@ -802,7 +875,7 @@ def test_classification_pandas_support(tmp_dir, dask_client):
     automl = AutoSklearnClassifier(
         time_left_for_this_task=30,
         per_run_time_limit=5,
-        exclude={'classifier': ['libsvm_svc']},
+        exclude={"classifier": ["libsvm_svc"]},
         dask_client=dask_client,
         seed=5,
         tmp_folder=tmp_dir,
@@ -828,12 +901,13 @@ def test_classification_pandas_support(tmp_dir, dask_client):
 
 
 def test_regression(tmp_dir, dask_client):
-    X_train, Y_train, X_test, Y_test = putil.get_dataset('boston')
-    automl = AutoSklearnRegressor(time_left_for_this_task=30,
-                                  per_run_time_limit=5,
-                                  tmp_folder=tmp_dir,
-                                  dask_client=dask_client,
-                                  )
+    X_train, Y_train, X_test, Y_test = putil.get_dataset("boston")
+    automl = AutoSklearnRegressor(
+        time_left_for_this_task=30,
+        per_run_time_limit=5,
+        tmp_folder=tmp_dir,
+        dask_client=dask_client,
+    )
 
     automl.fit(X_train, Y_train)
 
@@ -842,8 +916,9 @@ def test_regression(tmp_dir, dask_client):
     predictions = automl.predict(X_test)
 
     score = mean_squared_error(Y_test, predictions)
 
     # On average np.sqrt(30) away from the target -> ~5.5 on average
-    # Results with select rates drops avg score to a range of -32.40 to -37, on 30 seconds
-    # constraint. With more time_left_for_this_task this is no longer an issue
+    # Results with select rates drop the avg score to a range of -32.40 to -37,
+    # on 30 seconds constraint.
+ # With more time_left_for_this_task this is no longer an issue assert score >= -37, print_debug_information(automl) assert count_succeses(automl.cv_results_) > 0 assert includes_train_scores(automl.performance_over_time_.columns) is True @@ -856,13 +931,16 @@ def test_cv_regression(tmp_dir, dask_client): a regressor """ - X_train, Y_train, X_test, Y_test = putil.get_dataset('boston', train_size_maximum=300) - automl = AutoSklearnRegressor(time_left_for_this_task=60, - per_run_time_limit=10, - resampling_strategy='cv', - tmp_folder=tmp_dir, - dask_client=dask_client, - ) + X_train, Y_train, X_test, Y_test = putil.get_dataset( + "boston", train_size_maximum=300 + ) + automl = AutoSklearnRegressor( + time_left_for_this_task=60, + per_run_time_limit=10, + resampling_strategy="cv", + tmp_folder=tmp_dir, + dask_client=dask_client, + ) automl.fit(X_train, Y_train) @@ -913,13 +991,15 @@ def test_autosklearn_classification_methods_returns_self(dask_client): Currently this method only tests that the methods of AutoSklearnClassifier is able to fit using fit(), fit_ensemble() and refit() """ - X_train, y_train, X_test, y_test = putil.get_dataset('iris') - automl = AutoSklearnClassifier(time_left_for_this_task=60, - delete_tmp_folder_after_terminate=False, - per_run_time_limit=10, - ensemble_size=0, - dask_client=dask_client, - exclude={'feature_preprocessor': ['fast_ica']}) + X_train, y_train, X_test, y_test = putil.get_dataset("iris") + automl = AutoSklearnClassifier( + time_left_for_this_task=60, + delete_tmp_folder_after_terminate=False, + per_run_time_limit=10, + ensemble_size=0, + dask_client=dask_client, + exclude={"feature_preprocessor": ["fast_ica"]}, + ) automl_fitted = automl.fit(X_train, y_train) @@ -936,12 +1016,14 @@ def test_autosklearn_classification_methods_returns_self(dask_client): # Currently this class only tests that the methods of AutoSklearnRegressor # that should return self actually return self. 
def test_autosklearn_regression_methods_returns_self(dask_client): - X_train, y_train, X_test, y_test = putil.get_dataset('boston') - automl = AutoSklearnRegressor(time_left_for_this_task=30, - delete_tmp_folder_after_terminate=False, - per_run_time_limit=5, - dask_client=dask_client, - ensemble_size=0) + X_train, y_train, X_test, y_test = putil.get_dataset("boston") + automl = AutoSklearnRegressor( + time_left_for_this_task=30, + delete_tmp_folder_after_terminate=False, + per_run_time_limit=5, + dask_client=dask_client, + ensemble_size=0, + ) automl_fitted = automl.fit(X_train, y_train) assert automl is automl_fitted @@ -954,10 +1036,13 @@ def test_autosklearn_regression_methods_returns_self(dask_client): def test_autosklearn2_classification_methods_returns_self(dask_client): - X_train, y_train, X_test, y_test = putil.get_dataset('iris') - automl = AutoSklearn2Classifier(time_left_for_this_task=60, ensemble_size=0, - delete_tmp_folder_after_terminate=False, - dask_client=dask_client) + X_train, y_train, X_test, y_test = putil.get_dataset("iris") + automl = AutoSklearn2Classifier( + time_left_for_this_task=60, + ensemble_size=0, + delete_tmp_folder_after_terminate=False, + dask_client=dask_client, + ) automl_fitted = automl.fit(X_train, y_train) @@ -971,18 +1056,23 @@ def test_autosklearn2_classification_methods_returns_self(dask_client): assert automl is automl_refitted predictions = automl_fitted.predict(X_test) - assert sklearn.metrics.accuracy_score( - y_test, predictions - ) >= 2 / 3, print_debug_information(automl) + assert ( + sklearn.metrics.accuracy_score(y_test, predictions) >= 2 / 3 + ), print_debug_information(automl) pickle.dumps(automl_fitted) def test_autosklearn2_classification_methods_returns_self_sparse(dask_client): - X_train, y_train, X_test, y_test = putil.get_dataset('breast_cancer', make_sparse=True) - automl = AutoSklearn2Classifier(time_left_for_this_task=60, ensemble_size=0, - delete_tmp_folder_after_terminate=False, - dask_client=dask_client) + X_train, y_train, X_test, y_test = putil.get_dataset( + "breast_cancer", make_sparse=True + ) + automl = AutoSklearn2Classifier( + time_left_for_this_task=60, + ensemble_size=0, + delete_tmp_folder_after_terminate=False, + dask_client=dask_client, + ) automl_fitted = automl.fit(X_train, y_train) @@ -996,32 +1086,39 @@ def test_autosklearn2_classification_methods_returns_self_sparse(dask_client): assert automl is automl_refitted predictions = automl_fitted.predict(X_test) - assert sklearn.metrics.accuracy_score( - y_test, predictions - ) >= 2 / 3, print_debug_information(automl) + assert ( + sklearn.metrics.accuracy_score(y_test, predictions) >= 2 / 3 + ), print_debug_information(automl) assert "boosting" not in str(automl.get_configuration_space(X=X_train, y=y_train)) pickle.dumps(automl_fitted) -@pytest.mark.parametrize("class_", [AutoSklearnClassifier, AutoSklearnRegressor, - AutoSklearn2Classifier]) +@pytest.mark.parametrize( + "class_", [AutoSklearnClassifier, AutoSklearnRegressor, AutoSklearn2Classifier] +) def test_check_estimator_signature(class_): # Make sure signature is store in self - expected_subclass = ClassifierMixin if 'Classifier' in str(class_) else RegressorMixin + expected_subclass = ( + ClassifierMixin if "Classifier" in str(class_) else RegressorMixin + ) assert issubclass(class_, expected_subclass) estimator = class_() for expected in list(inspect.signature(class_).parameters): assert hasattr(estimator, expected) -@pytest.mark.parametrize("selector_path", [None, # No XDG_CACHE_HOME provided - '/', # 
XDG_CACHE_HOME has no permission
-                                         tempfile.gettempdir(),  # in the user cache
-                                         ])
+@pytest.mark.parametrize(
+    "selector_path",
+    [
+        None,  # No XDG_CACHE_HOME provided
+        "/",  # XDG_CACHE_HOME has no permission
+        tempfile.gettempdir(),  # in the user cache
+    ],
+)
 def test_selector_file_askl2_can_be_created(selector_path):
-    with unittest.mock.patch('os.environ.get') as mock_foo:
+    with unittest.mock.patch("os.environ.get") as mock_foo:
         mock_foo.return_value = selector_path
         if selector_path is not None and not os.access(selector_path, os.W_OK):
             with pytest.raises(PermissionError):
@@ -1029,7 +1126,9 @@ def test_selector_file_askl2_can_be_created(selector_path):
         else:
            importlib.reload(autosklearn.experimental.askl2)
            for metric in autosklearn.experimental.askl2.metrics:
-                assert os.path.exists(autosklearn.experimental.askl2.selector_files[metric.name])
+                assert os.path.exists(
+                    autosklearn.experimental.askl2.selector_files[metric.name]
+                )
            if selector_path is None or not os.access(selector_path, os.W_OK):
                # We default to home in worst case
                assert os.path.expanduser("~") in str(
@@ -1047,34 +1146,38 @@ def test_selector_file_askl2_can_be_created(selector_path):
 def test_check_askl2_same_arguments_as_askl() -> None:
     """Check the asklearn2 has the same args as asklearn1
 
-    This test is useful for when adding args to asklearn1 to make sure we update asklearn2
-
     Expects
     -------
-    * The set of arguments for AutoSklearnClassifier is the same as AutoSklearn2Classifier
-      except for a few expected arugments
+    * The set of arguments for AutoSklearnClassifier is the same as
+      AutoSklearn2Classifier except for a few expected arguments.
     """
-    autosklearn1_classifier_args = set(inspect.getfullargspec(AutoSklearnEstimator.__init__).args)
-    autosklearn2_classifier_args = set(inspect.getfullargspec(AutoSklearn2Classifier.__init__).args)
+    autosklearn1_classifier_args = set(
+        inspect.getfullargspec(AutoSklearnEstimator.__init__).args
+    )
+    autosklearn2_classifier_args = set(
+        inspect.getfullargspec(AutoSklearn2Classifier.__init__).args
+    )
 
     extra_arguments = autosklearn1_classifier_args - autosklearn2_classifier_args
-    expected_extra_args = set([
-        'exclude',
-        'include',
-        'resampling_strategy_arguments',
-        'get_smac_object_callback',
-        'initial_configurations_via_metalearning',
-        'resampling_strategy',
-        'metadata_directory',
-        'get_trials_callback',
-    ])
+    expected_extra_args = set(
+        [
+            "exclude",
+            "include",
+            "resampling_strategy_arguments",
+            "get_smac_object_callback",
+            "initial_configurations_via_metalearning",
+            "resampling_strategy",
+            "metadata_directory",
+            "get_trials_callback",
+        ]
+    )
     unexpected_args = extra_arguments - expected_extra_args
     assert len(unexpected_args) == 0, unexpected_args
 
 
-@pytest.mark.parametrize("task_type", ['classification', 'regression'])
-@pytest.mark.parametrize("resampling_strategy", ['test', 'cv', 'holdout'])
+@pytest.mark.parametrize("task_type", ["classification", "regression"])
+@pytest.mark.parametrize("resampling_strategy", ["test", "cv", "holdout"])
 @pytest.mark.parametrize("disable_file_output", [True, False])
 def test_fit_pipeline(dask_client, task_type, resampling_strategy, disable_file_output):
     """
@@ -1082,14 +1185,16 @@ def test_fit_pipeline(dask_client, task_type, resampling_strategy, disable_file_
     space, fit a classification pipeline with an acceptable score
     """
     X_train, y_train, X_test, y_test = putil.get_dataset(
-        'iris' if task_type == 'classification' else 'boston'
+        "iris" if task_type == "classification" else "boston"
+    )
+    estimator = (
+
AutoSklearnClassifier if task_type == "classification" else AutoSklearnRegressor ) - estimator = AutoSklearnClassifier if task_type == 'classification' else AutoSklearnRegressor seed = 3 if task_type == "classification": - include = {'classifier': ['random_forest']} + include = {"classifier": ["random_forest"]} else: - include = {'regressor': ['random_forest']} + include = {"regressor": ["random_forest"]} automl = estimator( delete_tmp_folder_after_terminate=False, time_left_for_this_task=120, @@ -1101,11 +1206,16 @@ def test_fit_pipeline(dask_client, task_type, resampling_strategy, disable_file_ include=include, seed=seed, # We cannot get the configuration space with 'test' not fit with it - resampling_strategy=resampling_strategy if resampling_strategy != 'test' else 'holdout', + resampling_strategy=resampling_strategy + if resampling_strategy != "test" + else "holdout", ) - config = automl.get_configuration_space(X_train, y_train, - X_test=X_test, y_test=y_test, - ).get_default_configuration() + config = automl.get_configuration_space( + X_train, + y_train, + X_test=X_test, + y_test=y_test, + ).get_default_configuration() pipeline, run_info, run_value = automl.fit_pipeline( X=X_train, @@ -1114,7 +1224,7 @@ def test_fit_pipeline(dask_client, task_type, resampling_strategy, disable_file_ X_test=X_test, y_test=y_test, disable_file_output=disable_file_output, - resampling_strategy=resampling_strategy + resampling_strategy=resampling_strategy, ) assert isinstance(run_info.config, Configuration) @@ -1124,20 +1234,20 @@ def test_fit_pipeline(dask_client, task_type, resampling_strategy, disable_file_ assert run_value.cost < 0.2 # Make sure that the pipeline can be pickled - dump_file = os.path.join(tempfile.gettempdir(), 'automl.dump.pkl') - with open(dump_file, 'wb') as f: + dump_file = os.path.join(tempfile.gettempdir(), "automl.dump.pkl") + with open(dump_file, "wb") as f: pickle.dump(pipeline, f) - if resampling_strategy == 'test' or disable_file_output: + if resampling_strategy == "test" or disable_file_output: # We do not produce a pipeline in 'test' assert pipeline is None - elif resampling_strategy == 'cv': + elif resampling_strategy == "cv": # We should have fitted a Voting estimator - assert hasattr(pipeline, 'estimators_') + assert hasattr(pipeline, "estimators_") else: # We should have fitted a pipeline with named_steps - assert hasattr(pipeline, 'named_steps') - assert 'RandomForest' in pipeline.steps[-1][-1].choice.__class__.__name__ + assert hasattr(pipeline, "named_steps") + assert "RandomForest" in pipeline.steps[-1][-1].choice.__class__.__name__ # Num run should be 2, as 1 is for dummy classifier and we have not launch # another pipeline @@ -1145,25 +1255,30 @@ def test_fit_pipeline(dask_client, task_type, resampling_strategy, disable_file_ # Check the re-sampling strategy num_run_dir = automl.automl_._backend.get_numrun_directory( - seed, num_run, budget=0.0) - cv_model_path = os.path.join(num_run_dir, automl.automl_._backend.get_cv_model_filename( - seed, num_run, budget=0.0)) - model_path = os.path.join(num_run_dir, automl.automl_._backend.get_model_filename( - seed, num_run, budget=0.0)) - if resampling_strategy == 'test' or disable_file_output: + seed, num_run, budget=0.0 + ) + cv_model_path = os.path.join( + num_run_dir, + automl.automl_._backend.get_cv_model_filename(seed, num_run, budget=0.0), + ) + model_path = os.path.join( + num_run_dir, + automl.automl_._backend.get_model_filename(seed, num_run, budget=0.0), + ) + if resampling_strategy == "test" or 
disable_file_output: # No file output is expected assert not os.path.exists(num_run_dir) else: # We expect the model path always # And the cv model only on 'cv' assert os.path.exists(model_path) - if resampling_strategy == 'cv': + if resampling_strategy == "cv": assert os.path.exists(cv_model_path) - elif resampling_strategy == 'holdout': + elif resampling_strategy == "holdout": assert not os.path.exists(cv_model_path) -@pytest.mark.parametrize("data_type", ['pandas', 'numpy']) +@pytest.mark.parametrize("data_type", ["pandas", "numpy"]) @pytest.mark.parametrize("include_categorical", [True, False]) def test_pass_categorical_and_numeric_columns_to_pipeline( dask_client, data_type, include_categorical @@ -1179,17 +1294,17 @@ def test_pass_categorical_and_numeric_columns_to_pipeline( if include_categorical: X = np.insert(X, n_features, values=0, axis=1) - if data_type == 'pandas': + if data_type == "pandas": X = pd.DataFrame(X) y = pd.DataFrame(y, dtype="category") # Set the last column to categorical if include_categorical: - X.loc[:, n_features] = X.loc[:, n_features].astype('category') # type: ignore + X.loc[:, n_features] = X.loc[:, n_features].astype("category") # Specify the feature_types - if data_type == 'numpy' and include_categorical: - feat_type = ['numerical'] * n_features + ['categorical'] + if data_type == "numpy" and include_categorical: + feat_type = ["numerical"] * n_features + ["categorical"] else: feat_type = None @@ -1207,17 +1322,25 @@ def test_pass_categorical_and_numeric_columns_to_pipeline( ensemble_size=0, seed=0, dask_client=dask_client, - include={'classifier': ['random_forest']}, + include={"classifier": ["random_forest"]}, ) config_space = automl.get_configuration_space( - X_train, y_train, X_test=X_test, y_test=y_test, feat_type=feat_type, + X_train, + y_train, + X_test=X_test, + y_test=y_test, + feat_type=feat_type, ) config = config_space.get_default_configuration() pipeline, _, run_value = automl.fit_pipeline( - X=X_train, y=y_train, X_test=X_test, y_test=y_test, - config=config, feat_type=feat_type, + X=X_train, + y=y_train, + X_test=X_test, + y_test=y_test, + config=config, + feat_type=feat_type, ) assert pipeline is not None, "Expected a pipeline from automl.fit_pipeline" @@ -1237,18 +1360,17 @@ def test_pass_categorical_and_numeric_columns_to_pipeline( if include_categorical: expected_feat_types = { i: feature_type - for i, feature_type - in enumerate(['numerical'] * (n_columns-1) + ['categorical']) + for i, feature_type in enumerate( + ["numerical"] * (n_columns - 1) + ["categorical"] + ) } else: expected_feat_types = { - i: feature_type - for i, feature_type - in enumerate(['numerical'] * n_columns) + i: feature_type for i, feature_type in enumerate(["numerical"] * n_columns) } - pipeline_feat_types = pipeline.named_steps['data_preprocessor'].choice.feat_type + pipeline_feat_types = pipeline.named_steps["data_preprocessor"].choice.feat_type assert expected_feat_types == pipeline_feat_types @@ -1260,20 +1382,27 @@ def test_autosklearn_anneal(as_frame): so is a good testcase for unit-testing """ X, y = sklearn.datasets.fetch_openml(data_id=2, return_X_y=True, as_frame=as_frame) - automl = AutoSklearnClassifier(time_left_for_this_task=60, ensemble_size=0, - delete_tmp_folder_after_terminate=False, - initial_configurations_via_metalearning=0, - smac_scenario_args={'runcount_limit': 6}, - resampling_strategy='holdout-iterative-fit') + automl = AutoSklearnClassifier( + time_left_for_this_task=60, + ensemble_size=0, + delete_tmp_folder_after_terminate=False, + 
initial_configurations_via_metalearning=0, + smac_scenario_args={"runcount_limit": 6}, + resampling_strategy="holdout-iterative-fit", + ) if as_frame: # Let autosklearn calculate the feat types automl_fitted = automl.fit(X, y) else: - X_, y_ = sklearn.datasets.fetch_openml(data_id=2, return_X_y=True, as_frame=True) - feat_type = ['categorical' if X_[col].dtype.name == 'category' else 'numerical' - for col in X_.columns] + X_, y_ = sklearn.datasets.fetch_openml( + data_id=2, return_X_y=True, as_frame=True + ) + feat_type = [ + "categorical" if X_[col].dtype.name == "category" else "numerical" + for col in X_.columns + ] automl_fitted = automl.fit(X, y, feat_type=feat_type) @@ -1289,9 +1418,9 @@ def test_autosklearn_anneal(as_frame): assert automl_fitted.score(X, y) > 0.75 -@pytest.mark.parametrize("dataset_compression", [ - False, True, {"memory_allocation": 0.2} -]) +@pytest.mark.parametrize( + "dataset_compression", [False, True, {"memory_allocation": 0.2}] +) def test_param_dataset_compression(dataset_compression: Union[bool, Dict[str, Any]]): """We expect this does not get parsed and modified until it gets to the AutoML class, In the meantime, it's value remains whatever was passed in. diff --git a/test/test_data/__init__.py b/test/test_data/__init__.py index cc3cd7becd..e298f0f075 100644 --- a/test/test_data/__init__.py +++ b/test/test_data/__init__.py @@ -1,2 +1,2 @@ # -*- encoding: utf-8 -*- -__author__ = 'feurerm' +__author__ = "feurerm" diff --git a/test/test_data/test_feature_validator.py b/test/test_data/test_feature_validator.py index 012ef1a179..0414cd31b4 100644 --- a/test/test_data/test_feature_validator.py +++ b/test/test_data/test_feature_validator.py @@ -1,14 +1,10 @@ import numpy as np - import pandas as pd -from pandas.api.types import is_numeric_dtype, is_categorical_dtype, is_string_dtype - import pytest - -from scipy import sparse - import sklearn.datasets import sklearn.model_selection +from pandas.api.types import is_categorical_dtype, is_numeric_dtype, is_string_dtype +from scipy import sparse from autosklearn.data.feature_validator import FeatureValidator @@ -16,118 +12,135 @@ # Fixtures to be used in this class. 
By default all elements have 100 datapoints @pytest.fixture def input_data_featuretest(request): - if request.param == 'numpy_categoricalonly_nonan': + if request.param == "numpy_categoricalonly_nonan": return np.random.randint(10, size=(100, 10)) - elif request.param == 'numpy_numericalonly_nonan': + elif request.param == "numpy_numericalonly_nonan": return np.random.uniform(10, size=(100, 10)) - elif request.param == 'numpy_mixed_nonan': - return np.column_stack([ - np.random.uniform(10, size=(100, 3)), - np.random.randint(10, size=(100, 3)), - np.random.uniform(10, size=(100, 3)), - np.random.randint(10, size=(100, 1)), - ]) - elif request.param == 'numpy_string_nonan': - return np.array([ - ['a', 'b', 'c', 'a', 'b', 'c'], - ['a', 'b', 'd', 'r', 'b', 'c'], - ]) - elif request.param == 'numpy_categoricalonly_nan': - array = np.random.randint(10, size=(100, 10)).astype('float') + elif request.param == "numpy_mixed_nonan": + return np.column_stack( + [ + np.random.uniform(10, size=(100, 3)), + np.random.randint(10, size=(100, 3)), + np.random.uniform(10, size=(100, 3)), + np.random.randint(10, size=(100, 1)), + ] + ) + elif request.param == "numpy_string_nonan": + return np.array( + [ + ["a", "b", "c", "a", "b", "c"], + ["a", "b", "d", "r", "b", "c"], + ] + ) + elif request.param == "numpy_categoricalonly_nan": + array = np.random.randint(10, size=(100, 10)).astype("float") array[50, 0:5] = np.nan return array - elif request.param == 'numpy_numericalonly_nan': - array = np.random.uniform(10, size=(100, 10)).astype('float') + elif request.param == "numpy_numericalonly_nan": + array = np.random.uniform(10, size=(100, 10)).astype("float") array[50, 0:5] = np.nan # Somehow array is changed to dtype object after np.nan - return array.astype('float') - elif request.param == 'numpy_mixed_nan': - array = np.column_stack([ - np.random.uniform(10, size=(100, 3)), - np.random.randint(10, size=(100, 3)), - np.random.uniform(10, size=(100, 3)), - np.random.randint(10, size=(100, 1)), - ]) + return array.astype("float") + elif request.param == "numpy_mixed_nan": + array = np.column_stack( + [ + np.random.uniform(10, size=(100, 3)), + np.random.randint(10, size=(100, 3)), + np.random.uniform(10, size=(100, 3)), + np.random.randint(10, size=(100, 1)), + ] + ) array[50, 0:5] = np.nan return array - elif request.param == 'numpy_string_nan': - return np.array([ - ['a', 'b', 'c', 'a', 'b', 'c'], - [np.nan, 'b', 'd', 'r', 'b', 'c'], - ]) - elif request.param == 'pandas_categoricalonly_nonan': - return pd.DataFrame([ - {'A': 1, 'B': 2}, - {'A': 3, 'B': 4}, - ], dtype='category') - elif request.param == 'pandas_numericalonly_nonan': - return pd.DataFrame([ - {'A': 1, 'B': 2}, - {'A': 3, 'B': 4}, - ], dtype='float') - elif request.param == 'pandas_mixed_nonan': - frame = pd.DataFrame([ - {'A': 1, 'B': 2}, - {'A': 3, 'B': 4}, - ], dtype='category') - frame['B'] = pd.to_numeric(frame['B']) + elif request.param == "numpy_string_nan": + return np.array( + [ + ["a", "b", "c", "a", "b", "c"], + [np.nan, "b", "d", "r", "b", "c"], + ] + ) + elif request.param == "pandas_categoricalonly_nonan": + return pd.DataFrame( + [ + {"A": 1, "B": 2}, + {"A": 3, "B": 4}, + ], + dtype="category", + ) + elif request.param == "pandas_numericalonly_nonan": + return pd.DataFrame( + [ + {"A": 1, "B": 2}, + {"A": 3, "B": 4}, + ], + dtype="float", + ) + elif request.param == "pandas_mixed_nonan": + frame = pd.DataFrame( + [ + {"A": 1, "B": 2}, + {"A": 3, "B": 4}, + ], + dtype="category", + ) + frame["B"] = pd.to_numeric(frame["B"]) return 
frame - elif request.param == 'pandas_categoricalonly_nan': - return pd.DataFrame([ - {'A': 1, 'B': 2, 'C': np.nan}, - {'A': 3, 'C': np.nan}, - ], dtype='category') - elif request.param == 'pandas_numericalonly_nan': - return pd.DataFrame([ - {'A': 1, 'B': 2, 'C': np.nan}, - {'A': 3, 'C': np.nan}, - ], dtype='float') - elif request.param == 'pandas_mixed_nan': - frame = pd.DataFrame([ - {'A': 1, 'B': 2, 'C': 8}, - {'A': 3, 'B': 4}, - ], dtype='category') - frame['B'] = pd.to_numeric(frame['B']) + elif request.param == "pandas_categoricalonly_nan": + return pd.DataFrame( + [ + {"A": 1, "B": 2, "C": np.nan}, + {"A": 3, "C": np.nan}, + ], + dtype="category", + ) + elif request.param == "pandas_numericalonly_nan": + return pd.DataFrame( + [ + {"A": 1, "B": 2, "C": np.nan}, + {"A": 3, "C": np.nan}, + ], + dtype="float", + ) + elif request.param == "pandas_mixed_nan": + frame = pd.DataFrame( + [ + {"A": 1, "B": 2, "C": 8}, + {"A": 3, "B": 4}, + ], + dtype="category", + ) + frame["B"] = pd.to_numeric(frame["B"]) return frame - elif request.param == 'pandas_string_nonan': - return pd.DataFrame([ - {'A': 1, 'B': 2}, - {'A': 3, 'B': 4}, - ], dtype='string') - elif request.param == 'list_categoricalonly_nonan': - return [ - ['a', 'b', 'c', 'd'], - ['e', 'f', 'c', 'd'], - ] - elif request.param == 'list_numericalonly_nonan': - return [ - [1, 2, 3, 4], - [5, 6, 7, 8] - ] - elif request.param == 'list_mixed_nonan': - return [ - ['a', 2, 3, 4], - ['b', 6, 7, 8] - ] - elif request.param == 'list_categoricalonly_nan': - return [ - ['a', 'b', 'c', np.nan], - ['e', 'f', 'c', 'd'], - ] - elif request.param == 'list_numericalonly_nan': + elif request.param == "pandas_string_nonan": + return pd.DataFrame( + [ + {"A": 1, "B": 2}, + {"A": 3, "B": 4}, + ], + dtype="string", + ) + elif request.param == "list_categoricalonly_nonan": return [ - [1, 2, 3, np.nan], - [5, 6, 7, 8] + ["a", "b", "c", "d"], + ["e", "f", "c", "d"], ] - elif request.param == 'list_mixed_nan': + elif request.param == "list_numericalonly_nonan": + return [[1, 2, 3, 4], [5, 6, 7, 8]] + elif request.param == "list_mixed_nonan": + return [["a", 2, 3, 4], ["b", 6, 7, 8]] + elif request.param == "list_categoricalonly_nan": return [ - ['a', np.nan, 3, 4], - ['b', 6, 7, 8] + ["a", "b", "c", np.nan], + ["e", "f", "c", "d"], ] - elif 'sparse' in request.param: + elif request.param == "list_numericalonly_nan": + return [[1, 2, 3, np.nan], [5, 6, 7, 8]] + elif request.param == "list_mixed_nan": + return [["a", np.nan, 3, 4], ["b", 6, 7, 8]] + elif "sparse" in request.param: # We expect the names to be of the type sparse_csc_nonan - sparse_, type_, nan_ = request.param.split('_') - if 'nonan' in nan_: + sparse_, type_, nan_ = request.param.split("_") + if "nonan" in nan_: data = np.ones(3) else: data = np.array([1, 2, np.nan]) @@ -135,26 +148,27 @@ def input_data_featuretest(request): # Then the type of sparse row_ind = np.array([0, 1, 2]) col_ind = np.array([1, 2, 1]) - if 'csc' in type_: + if "csc" in type_: return sparse.csc_matrix((data, (row_ind, col_ind))) - elif 'csr' in type_: + elif "csr" in type_: return sparse.csr_matrix((data, (row_ind, col_ind))) - elif 'coo' in type_: + elif "coo" in type_: return sparse.coo_matrix((data, (row_ind, col_ind))) - elif 'bsr' in type_: + elif "bsr" in type_: return sparse.bsr_matrix((data, (row_ind, col_ind))) - elif 'lil' in type_: + elif "lil" in type_: return sparse.lil_matrix((data)) - elif 'dok' in type_: + elif "dok" in type_: return sparse.dok_matrix(np.vstack((data, data, data))) - elif 'dia' in type_: + 
elif "dia" in type_: return sparse.dia_matrix(np.vstack((data, data, data))) else: ValueError("Unsupported indirect fixture {}".format(request.param)) - elif 'openml' in request.param: - _, openml_id = request.param.split('_') - X, y = sklearn.datasets.fetch_openml(data_id=int(openml_id), - return_X_y=True, as_frame=True) + elif "openml" in request.param: + _, openml_id = request.param.split("_") + X, y = sklearn.datasets.fetch_openml( + data_id=int(openml_id), return_X_y=True, as_frame=True + ) return X else: ValueError("Unsupported indirect fixture {}".format(request.param)) @@ -162,37 +176,37 @@ def input_data_featuretest(request): # Actual checks for the features @pytest.mark.parametrize( - 'input_data_featuretest', + "input_data_featuretest", ( - 'numpy_categoricalonly_nonan', - 'numpy_numericalonly_nonan', - 'numpy_mixed_nonan', - 'numpy_categoricalonly_nan', - 'numpy_numericalonly_nan', - 'numpy_mixed_nan', - 'pandas_categoricalonly_nonan', - 'pandas_numericalonly_nonan', - 'pandas_mixed_nonan', - 'pandas_numericalonly_nan', - 'list_numericalonly_nonan', - 'list_numericalonly_nan', - 'sparse_bsr_nonan', - 'sparse_bsr_nan', - 'sparse_coo_nonan', - 'sparse_coo_nan', - 'sparse_csc_nonan', - 'sparse_csc_nan', - 'sparse_csr_nonan', - 'sparse_csr_nan', - 'sparse_dia_nonan', - 'sparse_dia_nan', - 'sparse_dok_nonan', - 'sparse_dok_nan', - 'sparse_lil_nonan', - 'sparse_lil_nan', - 'openml_40981', # Australian + "numpy_categoricalonly_nonan", + "numpy_numericalonly_nonan", + "numpy_mixed_nonan", + "numpy_categoricalonly_nan", + "numpy_numericalonly_nan", + "numpy_mixed_nan", + "pandas_categoricalonly_nonan", + "pandas_numericalonly_nonan", + "pandas_mixed_nonan", + "pandas_numericalonly_nan", + "list_numericalonly_nonan", + "list_numericalonly_nan", + "sparse_bsr_nonan", + "sparse_bsr_nan", + "sparse_coo_nonan", + "sparse_coo_nan", + "sparse_csc_nonan", + "sparse_csc_nan", + "sparse_csr_nonan", + "sparse_csr_nan", + "sparse_dia_nonan", + "sparse_dia_nan", + "sparse_dok_nonan", + "sparse_dok_nan", + "sparse_lil_nonan", + "sparse_lil_nan", + "openml_40981", # Australian ), - indirect=True + indirect=True, ) def test_featurevalidator_supported_types(input_data_featuretest): validator = FeatureValidator() @@ -209,43 +223,45 @@ def test_featurevalidator_supported_types(input_data_featuretest): @pytest.mark.parametrize( - 'input_data_featuretest', + "input_data_featuretest", ( - 'numpy_string_nonan', - 'numpy_string_nan', + "numpy_string_nonan", + "numpy_string_nan", ), - indirect=True + indirect=True, ) def test_featurevalidator_unsupported_numpy(input_data_featuretest): validator = FeatureValidator() - with pytest.raises(ValueError, match=r".*When providing a numpy array.*not supported."): + with pytest.raises( + ValueError, match=r".*When providing a numpy array.*not supported." 
+ ): validator.fit(input_data_featuretest) @pytest.mark.parametrize( - 'input_data_featuretest', + "input_data_featuretest", ( - 'numpy_categoricalonly_nonan', - 'numpy_mixed_nonan', - 'numpy_categoricalonly_nan', - 'numpy_mixed_nan', - 'pandas_categoricalonly_nonan', - 'pandas_mixed_nonan', - 'sparse_bsr_nonan', - 'sparse_bsr_nan', - 'sparse_coo_nonan', - 'sparse_coo_nan', - 'sparse_csc_nonan', - 'sparse_csc_nan', - 'sparse_csr_nonan', - 'sparse_csr_nan', - 'sparse_dia_nonan', - 'sparse_dia_nan', - 'sparse_dok_nonan', - 'sparse_dok_nan', - 'sparse_lil_nonan', + "numpy_categoricalonly_nonan", + "numpy_mixed_nonan", + "numpy_categoricalonly_nan", + "numpy_mixed_nan", + "pandas_categoricalonly_nonan", + "pandas_mixed_nonan", + "sparse_bsr_nonan", + "sparse_bsr_nan", + "sparse_coo_nonan", + "sparse_coo_nan", + "sparse_csc_nonan", + "sparse_csc_nan", + "sparse_csr_nonan", + "sparse_csr_nan", + "sparse_dia_nonan", + "sparse_dia_nan", + "sparse_dok_nonan", + "sparse_dok_nan", + "sparse_lil_nonan", ), - indirect=True + indirect=True, ) def test_featurevalidator_fitontypeA_transformtypeB(input_data_featuretest): """ @@ -276,20 +292,24 @@ def test_featurevalidatorget_feat_type_from_columns(): """ validator = FeatureValidator() - df = pd.DataFrame([ - {'int': 1, 'float': 1.0, 'category': 'one', 'bool': True}, - {'int': 2, 'float': 2.0, 'category': 'two', 'bool': False}, - ]) + df = pd.DataFrame( + [ + {"int": 1, "float": 1.0, "category": "one", "bool": True}, + {"int": 2, "float": 2.0, "category": "two", "bool": False}, + ] + ) for col in df.columns: df[col] = df[col].astype(col) feature_types = validator.get_feat_type_from_columns(df) - assert feature_types == {'int': 'numerical', - 'float': 'numerical', - 'category': 'categorical', - 'bool': 'categorical'} + assert feature_types == { + "int": "numerical", + "float": "numerical", + "category": "categorical", + "bool": "categorical", + } def test_features_unsupported_calls_are_raised(): @@ -300,28 +320,37 @@ def test_features_unsupported_calls_are_raised(): """ validator = FeatureValidator() with pytest.raises(ValueError, match=r"Auto-sklearn does not support time"): + validator.fit(pd.DataFrame({"datetime": [pd.Timestamp("20180310")]})) + with pytest.raises( + ValueError, match=r"Auto-sklearn only supports.*yet, the provided input" + ): + validator.fit({"input1": 1, "input2": 2}) + validator = FeatureValidator() + with pytest.raises( + ValueError, match=r"The feature dimensionality of the train and test" + ): validator.fit( - pd.DataFrame({'datetime': [pd.Timestamp('20180310')]}) + X_train=np.array([[1, 2, 3], [4, 5, 6]]), + X_test=np.array([[1, 2, 3, 4], [4, 5, 6, 7]]), ) - with pytest.raises(ValueError, match=r"Auto-sklearn only supports.*yet, the provided input"): - validator.fit({'input1': 1, 'input2': 2}) - validator = FeatureValidator() - with pytest.raises(ValueError, match=r"The feature dimensionality of the train and test"): - validator.fit(X_train=np.array([[1, 2, 3], [4, 5, 6]]), - X_test=np.array([[1, 2, 3, 4], [4, 5, 6, 7]]), - ) - with pytest.raises(ValueError, match=r"Cannot call transform on a validator that is not fit"): + with pytest.raises( + ValueError, match=r"Cannot call transform on a validator that is not fit" + ): validator.transform(np.array([[1, 2, 3], [4, 5, 6]])) - validator = FeatureValidator(feat_type=['Numerical']) - with pytest.raises(ValueError, match=r"providing the option feat_type to the fit method is.*"): + validator = FeatureValidator(feat_type=["Numerical"]) + with pytest.raises( + ValueError, 
match=r"providing the option feat_type to the fit method is.*" + ): validator.fit(pd.DataFrame([[1, 2, 3], [4, 5, 6]])) with pytest.raises(ValueError, match=r"feat_type does not have same number of.*"): validator.fit(np.array([[1, 2, 3], [4, 5, 6]])) validator = FeatureValidator(feat_type=[1, 2, 3]) with pytest.raises(ValueError, match=r"feat_type must only contain strings.*"): validator.fit(np.array([[1, 2, 3], [4, 5, 6]])) - validator = FeatureValidator(feat_type=['1', '2', '3']) - with pytest.raises(ValueError, match=r"Only `Categorical`, `Numerical` and `String` are.*"): + validator = FeatureValidator(feat_type=["1", "2", "3"]) + with pytest.raises( + ValueError, match=r"Only `Categorical`, `Numerical` and `String` are.*" + ): validator.fit(np.array([[1, 2, 3], [4, 5, 6]])) @@ -331,16 +360,16 @@ def test_no_new_category_after_fit(): without throwing an error """ # Then make sure we catch categorical extra categories - x = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [5, 6, 7, 8]}, dtype='category') + x = pd.DataFrame({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8]}, dtype="category") validator = FeatureValidator() validator.fit(x) - x['A'] = x['A'].apply(lambda x: x*x) + x["A"] = x["A"].apply(lambda x: x * x) validator.transform(x) # Actual checks for the features @pytest.mark.parametrize( - 'openml_id', + "openml_id", ( 40981, # Australian 3, # kr-vs-kp @@ -349,32 +378,37 @@ def test_no_new_category_after_fit(): 40984, # Segment ), ) -@pytest.mark.parametrize('train_data_type', ('numpy', 'pandas', 'list')) -@pytest.mark.parametrize('test_data_type', ('numpy', 'pandas', 'list')) -def test_featurevalidator_new_data_after_fit(openml_id, - train_data_type, test_data_type): +@pytest.mark.parametrize("train_data_type", ("numpy", "pandas", "list")) +@pytest.mark.parametrize("test_data_type", ("numpy", "pandas", "list")) +def test_featurevalidator_new_data_after_fit( + openml_id, train_data_type, test_data_type +): # List is currently not supported as infer_objects # cast list objects to type objects - if train_data_type == 'list' or test_data_type == 'list': + if train_data_type == "list" or test_data_type == "list": pytest.skip() validator = FeatureValidator() - if train_data_type == 'numpy': - X, y = sklearn.datasets.fetch_openml(data_id=openml_id, - return_X_y=True, as_frame=False) - elif train_data_type == 'pandas': - X, y = sklearn.datasets.fetch_openml(data_id=openml_id, - return_X_y=True, as_frame=True) + if train_data_type == "numpy": + X, y = sklearn.datasets.fetch_openml( + data_id=openml_id, return_X_y=True, as_frame=False + ) + elif train_data_type == "pandas": + X, y = sklearn.datasets.fetch_openml( + data_id=openml_id, return_X_y=True, as_frame=True + ) else: - X, y = sklearn.datasets.fetch_openml(data_id=openml_id, - return_X_y=True, as_frame=True) + X, y = sklearn.datasets.fetch_openml( + data_id=openml_id, return_X_y=True, as_frame=True + ) X = X.values.tolist() y = y.values.tolist() X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( - X, y, random_state=1) + X, y, random_state=1 + ) validator.fit(X_train) @@ -391,7 +425,7 @@ def test_featurevalidator_new_data_after_fit(openml_id, @pytest.mark.parametrize( - 'openml_id', + "openml_id", ( 40981, # Australian 3, # kr-vs-kp @@ -403,10 +437,12 @@ def test_featurevalidator_new_data_after_fit(openml_id, ) def test_list_to_dataframe(openml_id): - X_pandas, y_pandas = sklearn.datasets.fetch_openml(data_id=openml_id, - return_X_y=True, as_frame=True) + X_pandas, y_pandas = sklearn.datasets.fetch_openml( + data_id=openml_id, 
return_X_y=True, as_frame=True + ) X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( - X_pandas, y_pandas, random_state=1) + X_pandas, y_pandas, random_state=1 + ) X_list = X_train.values.tolist() validator = FeatureValidator() @@ -439,24 +475,24 @@ def test_list_to_dataframe(openml_id): @pytest.mark.parametrize( - 'input_data_featuretest', + "input_data_featuretest", ( - 'sparse_bsr_nonan', - 'sparse_bsr_nan', - 'sparse_coo_nonan', - 'sparse_coo_nan', - 'sparse_csc_nonan', - 'sparse_csc_nan', - 'sparse_csr_nonan', - 'sparse_csr_nan', - 'sparse_dia_nonan', - 'sparse_dia_nan', - 'sparse_dok_nonan', - 'sparse_dok_nan', - 'sparse_lil_nonan', - 'sparse_lil_nan', + "sparse_bsr_nonan", + "sparse_bsr_nan", + "sparse_coo_nonan", + "sparse_coo_nan", + "sparse_csc_nonan", + "sparse_csc_nan", + "sparse_csr_nonan", + "sparse_csr_nan", + "sparse_dia_nonan", + "sparse_dia_nan", + "sparse_dok_nonan", + "sparse_dok_nan", + "sparse_lil_nonan", + "sparse_lil_nan", ), - indirect=True + indirect=True, ) def test_sparse_output_is_csr(input_data_featuretest): validator = FeatureValidator() @@ -467,7 +503,9 @@ def test_sparse_output_is_csr(input_data_featuretest): def test_unsupported_dataframe_sparse(): - df = pd.DataFrame({'A': pd.Series(pd.arrays.SparseArray(np.random.randn(10)))}) + df = pd.DataFrame({"A": pd.Series(pd.arrays.SparseArray(np.random.randn(10)))}) validator = FeatureValidator() - with pytest.raises(ValueError, match=r"Auto-sklearn does not yet support sparse pandas"): + with pytest.raises( + ValueError, match=r"Auto-sklearn does not yet support sparse pandas" + ): validator.fit(df) diff --git a/test/test_data/test_target_validator.py b/test/test_data/test_target_validator.py index 09e075b85f..e57f464c72 100644 --- a/test/test_data/test_target_validator.py +++ b/test/test_data/test_target_validator.py @@ -1,14 +1,10 @@ import numpy as np - import pandas as pd - import pytest -from pandas.api.types import is_numeric_dtype, is_bool_dtype - -from scipy import sparse - import sklearn.datasets import sklearn.model_selection +from pandas.api.types import is_bool_dtype, is_numeric_dtype +from scipy import sparse from sklearn.utils.multiclass import type_of_target from autosklearn.data.target_validator import TargetValidator @@ -17,80 +13,85 @@ # Fixtures to be used in this class. 
By default all elements have 100 datapoints @pytest.fixture def input_data_targettest(request): - if request.param == 'series_binary': + if request.param == "series_binary": return pd.Series([1, -1, -1, 1]) - elif request.param == 'series_multiclass': + elif request.param == "series_multiclass": return pd.Series([1, 0, 2]) - elif request.param == 'series_multilabel': + elif request.param == "series_multilabel": return pd.Series([[1, 0], [0, 1]]) - elif request.param == 'series_continuous': + elif request.param == "series_continuous": return pd.Series([0.1, 0.6, 0.7]) - elif request.param == 'series_continuous-multioutput': + elif request.param == "series_continuous-multioutput": return pd.Series([[1.5, 2.0], [3.0, 1.6]]) - elif request.param == 'pandas_binary': + elif request.param == "pandas_binary": return pd.DataFrame([1, -1, -1, 1]) - elif request.param == 'pandas_multiclass': + elif request.param == "pandas_multiclass": return pd.DataFrame([1, 0, 2]) - elif request.param == 'pandas_multilabel': + elif request.param == "pandas_multilabel": return pd.DataFrame([[1, 0], [0, 1]]) - elif request.param == 'pandas_continuous': + elif request.param == "pandas_continuous": return pd.DataFrame([0.1, 0.6, 0.7]) - elif request.param == 'pandas_continuous-multioutput': + elif request.param == "pandas_continuous-multioutput": return pd.DataFrame([[1.5, 2.0], [3.0, 1.6]]) - elif request.param == 'numpy_binary': + elif request.param == "numpy_binary": return np.array([1, -1, -1, 1]) - elif request.param == 'numpy_multiclass': + elif request.param == "numpy_multiclass": return np.array([1, 0, 2]) - elif request.param == 'numpy_multilabel': + elif request.param == "numpy_multilabel": return np.array([[1, 0], [0, 1]]) - elif request.param == 'numpy_continuous': + elif request.param == "numpy_continuous": return np.array([0.1, 0.6, 0.7]) - elif request.param == 'numpy_continuous-multioutput': + elif request.param == "numpy_continuous-multioutput": return np.array([[1.5, 2.0], [3.0, 1.6]]) - elif request.param == 'list_binary': + elif request.param == "list_binary": return [1, -1, -1, 1] - elif request.param == 'list_multiclass': + elif request.param == "list_multiclass": return [1, 0, 2] - elif request.param == 'list_multilabel': + elif request.param == "list_multilabel": return [[0, 1], [1, 0]] - elif request.param == 'list_continuous': + elif request.param == "list_continuous": return [0.1, 0.6, 0.7] - elif request.param == 'list_continuous-multioutput': + elif request.param == "list_continuous-multioutput": return [[1.5, 2.0], [3.0, 1.6]] - elif 'openml' in request.param: - _, openml_id = request.param.split('_') - X, y = sklearn.datasets.fetch_openml(data_id=int(openml_id), - return_X_y=True, as_frame=True) - if len(y.shape) > 1 and y.shape[1] > 1 and np.any(y.eq('TRUE').any(1).to_numpy()): + elif "openml" in request.param: + _, openml_id = request.param.split("_") + X, y = sklearn.datasets.fetch_openml( + data_id=int(openml_id), return_X_y=True, as_frame=True + ) + if ( + len(y.shape) > 1 + and y.shape[1] > 1 + and np.any(y.eq("TRUE").any(1).to_numpy()) + ): # This 'if' is only asserted for multi-label data # Force the downloaded data to be interpreted as multilabel y = y.dropna() - y.replace('FALSE', 0, inplace=True) - y.replace('TRUE', 1, inplace=True) + y.replace("FALSE", 0, inplace=True) + y.replace("TRUE", 1, inplace=True) y = y.astype(int) return y - elif 'sparse' in request.param: + elif "sparse" in request.param: # We expect the names to be of the type sparse_csc_nonan - sparse_, type_, nan_ 
= request.param.split('_') - if 'nonan' in nan_: + sparse_, type_, nan_ = request.param.split("_") + if "nonan" in nan_: data = np.ones(3) else: data = np.array([1, 2, np.nan]) # Then the type of sparse - if 'csc' in type_: + if "csc" in type_: return sparse.csc_matrix(data) - elif 'csr' in type_: + elif "csr" in type_: return sparse.csr_matrix(data) - elif 'coo' in type_: + elif "coo" in type_: return sparse.coo_matrix(data) - elif 'bsr' in type_: + elif "bsr" in type_: return sparse.bsr_matrix(data) - elif 'lil' in type_: + elif "lil" in type_: return sparse.lil_matrix(data) - elif 'dok' in type_: + elif "dok" in type_: return sparse.dok_matrix(np.vstack((data, data, data))) - elif 'dia' in type_: + elif "dia" in type_: return sparse.dia_matrix(np.vstack((data, data, data))) else: ValueError("Unsupported indirect fixture {}".format(request.param)) @@ -100,29 +101,29 @@ def input_data_targettest(request): # Actual checks for the targets @pytest.mark.parametrize( - 'input_data_targettest', + "input_data_targettest", ( - 'series_binary', - 'series_multiclass', - 'series_continuous', - 'pandas_binary', - 'pandas_multiclass', - 'pandas_multilabel', - 'pandas_continuous', - 'pandas_continuous-multioutput', - 'numpy_binary', - 'numpy_multiclass', - 'numpy_multilabel', - 'numpy_continuous', - 'numpy_continuous-multioutput', - 'list_binary', - 'list_multiclass', - 'list_multilabel', - 'list_continuous', - 'list_continuous-multioutput', - 'openml_204', + "series_binary", + "series_multiclass", + "series_continuous", + "pandas_binary", + "pandas_multiclass", + "pandas_multilabel", + "pandas_continuous", + "pandas_continuous-multioutput", + "numpy_binary", + "numpy_multiclass", + "numpy_multilabel", + "numpy_continuous", + "numpy_continuous-multioutput", + "list_binary", + "list_multiclass", + "list_multilabel", + "list_continuous", + "list_continuous-multioutput", + "openml_204", ), - indirect=True + indirect=True, ) def test_targetvalidator_supported_types_noclassification(input_data_targettest): y = input_data_targettest @@ -146,19 +147,19 @@ def test_targetvalidator_supported_types_noclassification(input_data_targettest) @pytest.mark.parametrize( - 'input_data_targettest', + "input_data_targettest", ( - 'series_binary', - 'series_multiclass', - 'pandas_binary', - 'pandas_multiclass', - 'numpy_binary', - 'numpy_multiclass', - 'list_binary', - 'list_multiclass', - 'openml_2', + "series_binary", + "series_multiclass", + "pandas_binary", + "pandas_multiclass", + "numpy_binary", + "numpy_multiclass", + "list_binary", + "list_multiclass", + "openml_2", ), - indirect=True + indirect=True, ) def test_targetvalidator_supported_types_classification(input_data_targettest): y = input_data_targettest # Just to remove visual clutter @@ -177,10 +178,7 @@ def test_targetvalidator_supported_types_classification(input_data_targettest): assert isinstance(y_inverse, np.ndarray) # Assert that y_encoded is numeric and not boolean - assert ( - is_numeric_dtype(y_encoded.dtype) - and not is_bool_dtype(y_encoded.dtype) - ) + assert is_numeric_dtype(y_encoded.dtype) and not is_bool_dtype(y_encoded.dtype) # Assert dtype is presevered with y -> y_encoded -> y_inverse def dtype(arr): @@ -205,7 +203,7 @@ def dtype(arr): if len(shape) == 2 and shape[1] == 1: # For cases where y = [[1], [2], [3]], # we expect y_inverse, y_encodedd to have been flattened to [1,2,3] - expected_shape = (shape[0], ) + expected_shape = (shape[0],) else: expected_shape = shape @@ -221,7 +219,7 @@ def dtype(arr): # # As a result of this, we don't 
encode 'multilabel-indicator' labels and # there is nothing else to check here - if validator.type_of_target == 'multilabel-indicator': + if validator.type_of_target == "multilabel-indicator": assert validator.encoder is None else: @@ -242,112 +240,112 @@ def dtype(arr): @pytest.mark.parametrize( - 'input_data_targettest', + "input_data_targettest", ( - 'series_binary', - 'pandas_binary', - 'numpy_binary', - 'list_binary', - 'openml_1066', + "series_binary", + "pandas_binary", + "numpy_binary", + "list_binary", + "openml_1066", ), - indirect=True + indirect=True, ) def test_targetvalidator_binary(input_data_targettest): - assert type_of_target(input_data_targettest) == 'binary' + assert type_of_target(input_data_targettest) == "binary" validator = TargetValidator(is_classification=True) # Test the X_test also! validator.fit(input_data_targettest, input_data_targettest) transformed_y = validator.transform(input_data_targettest) - assert type_of_target(transformed_y) == 'binary' + assert type_of_target(transformed_y) == "binary" @pytest.mark.parametrize( - 'input_data_targettest', + "input_data_targettest", ( - 'series_multiclass', - 'pandas_multiclass', - 'numpy_multiclass', - 'list_multiclass', - 'openml_54', + "series_multiclass", + "pandas_multiclass", + "numpy_multiclass", + "list_multiclass", + "openml_54", ), - indirect=True + indirect=True, ) def test_targetvalidator_multiclass(input_data_targettest): - assert type_of_target(input_data_targettest) == 'multiclass' + assert type_of_target(input_data_targettest) == "multiclass" validator = TargetValidator(is_classification=True) # Test the X_test also! validator.fit(input_data_targettest, input_data_targettest) transformed_y = validator.transform(input_data_targettest) - assert type_of_target(transformed_y) == 'multiclass' + assert type_of_target(transformed_y) == "multiclass" @pytest.mark.parametrize( - 'input_data_targettest', + "input_data_targettest", ( - 'pandas_multilabel', - 'numpy_multilabel', - 'list_multilabel', - 'openml_40594', + "pandas_multilabel", + "numpy_multilabel", + "list_multilabel", + "openml_40594", ), - indirect=True + indirect=True, ) def test_targetvalidator_multilabel(input_data_targettest): - assert type_of_target(input_data_targettest) == 'multilabel-indicator' + assert type_of_target(input_data_targettest) == "multilabel-indicator" validator = TargetValidator(is_classification=True) # Test the X_test also! validator.fit(input_data_targettest, input_data_targettest) transformed_y = validator.transform(input_data_targettest) - assert type_of_target(transformed_y) == 'multilabel-indicator' + assert type_of_target(transformed_y) == "multilabel-indicator" @pytest.mark.parametrize( - 'input_data_targettest', + "input_data_targettest", ( - 'series_continuous', - 'pandas_continuous', - 'numpy_continuous', - 'list_continuous', - 'openml_531', + "series_continuous", + "pandas_continuous", + "numpy_continuous", + "list_continuous", + "openml_531", ), - indirect=True + indirect=True, ) def test_targetvalidator_continuous(input_data_targettest): - assert type_of_target(input_data_targettest) == 'continuous' + assert type_of_target(input_data_targettest) == "continuous" validator = TargetValidator(is_classification=False) # Test the X_test also! 
validator.fit(input_data_targettest, input_data_targettest) transformed_y = validator.transform(input_data_targettest) - assert type_of_target(transformed_y) == 'continuous' + assert type_of_target(transformed_y) == "continuous" @pytest.mark.parametrize( - 'input_data_targettest', + "input_data_targettest", ( - 'pandas_continuous-multioutput', - 'numpy_continuous-multioutput', - 'list_continuous-multioutput', - 'openml_41483', + "pandas_continuous-multioutput", + "numpy_continuous-multioutput", + "list_continuous-multioutput", + "openml_41483", ), - indirect=True + indirect=True, ) def test_targetvalidator_continuous_multioutput(input_data_targettest): - assert type_of_target(input_data_targettest) == 'continuous-multioutput' + assert type_of_target(input_data_targettest) == "continuous-multioutput" validator = TargetValidator(is_classification=False) # Test the X_test also! validator.fit(input_data_targettest, input_data_targettest) transformed_y = validator.transform(input_data_targettest) - assert type_of_target(transformed_y) == 'continuous-multioutput' + assert type_of_target(transformed_y) == "continuous-multioutput" @pytest.mark.parametrize( - 'input_data_targettest', + "input_data_targettest", ( - 'series_binary', - 'pandas_binary', - 'numpy_binary', - 'list_binary', + "series_binary", + "pandas_binary", + "numpy_binary", + "list_binary", ), - indirect=True + indirect=True, ) def test_targetvalidator_fitontypeA_transformtypeB(input_data_targettest): """ @@ -370,12 +368,12 @@ def test_targetvalidator_fitontypeA_transformtypeB(input_data_targettest): @pytest.mark.parametrize( - 'input_data_targettest', + "input_data_targettest", ( - 'series_multilabel', - 'series_continuous-multioutput', + "series_multilabel", + "series_continuous-multioutput", ), - indirect=True + indirect=True, ) def test_type_of_target_unsupported(input_data_targettest): """ @@ -393,43 +391,63 @@ def test_target_unsupported(): when providing not supported data input """ validator = TargetValidator(is_classification=True) - with pytest.raises(ValueError, match=r"The dimensionality of the train and test targets"): + with pytest.raises( + ValueError, match=r"The dimensionality of the train and test targets" + ): validator.fit( np.array([[0, 1, 0], [0, 1, 1]]), np.array([[0, 1, 0, 0], [0, 1, 1, 1]]), ) - with pytest.raises(ValueError, match=r"Train and test targets must both have the same dtypes"): + with pytest.raises( + ValueError, match=r"Train and test targets must both have the same dtypes" + ): validator.fit( - pd.DataFrame({'a': [1, 2, 3]}), - pd.DataFrame({'a': [True, False, False]}), + pd.DataFrame({"a": [1, 2, 3]}), + pd.DataFrame({"a": [True, False, False]}), ) with pytest.raises(ValueError, match=r"Provided targets are not supported.*"): validator.fit( np.array([[0, 1, 2], [0, 3, 4]]), np.array([[0, 1, 2, 5], [0, 3, 4, 6]]), ) - with pytest.raises(ValueError, match="Train and test targets must both have the same"): + with pytest.raises( + ValueError, match="Train and test targets must both have the same" + ): validator.fit( - pd.DataFrame({'string': ['foo']}), - pd.DataFrame({'int': [1]}), + pd.DataFrame({"string": ["foo"]}), + pd.DataFrame({"int": [1]}), ) - with pytest.raises(ValueError, match=r"Auto-sklearn only supports Numpy arrays, .*"): - validator.fit({'input1': 1, 'input2': 2}) - with pytest.raises(ValueError, match=r"arget values cannot contain missing/NaN values"): + with pytest.raises( + ValueError, match=r"Auto-sklearn only supports Numpy arrays, .*" + ): + validator.fit({"input1": 1, 
"input2": 2}) + with pytest.raises( + ValueError, match=r"arget values cannot contain missing/NaN values" + ): validator.fit(np.array([np.nan, 1, 2])) - with pytest.raises(ValueError, match=r"arget values cannot contain missing/NaN values"): + with pytest.raises( + ValueError, match=r"arget values cannot contain missing/NaN values" + ): validator.fit(sparse.csr_matrix(np.array([1, 2, np.nan]))) - with pytest.raises(ValueError, match=r"TargetValidator must have fit\(\) called first"): + with pytest.raises( + ValueError, match=r"TargetValidator must have fit\(\) called first" + ): validator.transform(np.array([1, 2, 3])) - with pytest.raises(ValueError, match=r"TargetValidator must have fit\(\) called first"): + with pytest.raises( + ValueError, match=r"TargetValidator must have fit\(\) called first" + ): validator.inverse_transform(np.array([1, 2, 3])) - with pytest.raises(ValueError, match=r"Multi-dimensional classification is not yet supported"): + with pytest.raises( + ValueError, match=r"Multi-dimensional classification is not yet supported" + ): validator._fit(np.array([[1, 2, 3], [1, 5, 6]])) # Dia/ DOK are not supported as type of target makes calls len on the array # which causes TypeError: len() of unsized object. Basically, sparse data as # multi-label is the only thing that makes sense in this format. - with pytest.raises(ValueError, match=r"The provided data could not be interpreted by Sklearn"): + with pytest.raises( + ValueError, match=r"The provided data could not be interpreted by Sklearn" + ): validator.fit(sparse.dia_matrix(np.array([1, 2, 3]))) validator.fit(np.array([[0, 1, 0], [0, 1, 1]])) @@ -443,22 +461,21 @@ def test_targetvalidator_inversetransform(): """ validator = TargetValidator(is_classification=True) validator.fit( - pd.DataFrame(data=['a', 'a', 'b', 'c', 'a'], dtype='category'), + pd.DataFrame(data=["a", "a", "b", "c", "a"], dtype="category"), ) y = validator.transform( - pd.DataFrame(data=['a', 'a', 'b', 'c', 'a'], dtype='category'), + pd.DataFrame(data=["a", "a", "b", "c", "a"], dtype="category"), ) np.testing.assert_array_almost_equal(np.array([0, 0, 1, 2, 0]), y) y_decoded = validator.inverse_transform(y) - assert ['a', 'a', 'b', 'c', 'a'] == y_decoded.tolist() + assert ["a", "a", "b", "c", "a"] == y_decoded.tolist() - assert validator.classes_.tolist() == ['a', 'b', 'c'] + assert validator.classes_.tolist() == ["a", "b", "c"] validator = TargetValidator(is_classification=True) multi_label = pd.DataFrame( - np.array([[1, 0, 0, 1], [0, 0, 1, 1], [0, 0, 0, 0]]), - dtype=bool + np.array([[1, 0, 0, 1], [0, 0, 1, 1], [0, 0, 0, 0]]), dtype=bool ) validator.fit(multi_label) y = validator.transform(multi_label) @@ -473,18 +490,18 @@ def test_targetvalidator_inversetransform(): # Actual checks for the targets @pytest.mark.parametrize( - 'input_data_targettest', + "input_data_targettest", ( - 'series_binary', - 'series_multiclass', - 'pandas_binary', - 'pandas_multiclass', - 'numpy_binary', - 'numpy_multiclass', - 'list_binary', - 'list_multiclass', + "series_binary", + "series_multiclass", + "pandas_binary", + "pandas_multiclass", + "numpy_binary", + "numpy_multiclass", + "list_binary", + "list_multiclass", ), - indirect=True + indirect=True, ) def test_unknown_categories_in_targets(input_data_targettest): validator = TargetValidator(is_classification=True) diff --git a/test/test_data/test_validation.py b/test/test_data/test_validation.py index 7bc2cb3dc5..4d09c65075 100644 --- a/test/test_data/test_validation.py +++ b/test/test_data/test_validation.py @@ -1,34 
+1,33 @@ import numpy as np - import pandas as pd - import pytest - -from scipy import sparse - import sklearn.datasets import sklearn.model_selection +from scipy import sparse from autosklearn.data.validation import InputValidator -@pytest.mark.parametrize('openmlid', [2, 40975, 40984]) -@pytest.mark.parametrize('as_frame', [True, False]) +@pytest.mark.parametrize("openmlid", [2, 40975, 40984]) +@pytest.mark.parametrize("as_frame", [True, False]) def test_data_validation_for_classification(openmlid, as_frame): - x, y = sklearn.datasets.fetch_openml(data_id=openmlid, return_X_y=True, as_frame=as_frame) + x, y = sklearn.datasets.fetch_openml( + data_id=openmlid, return_X_y=True, as_frame=as_frame + ) validator = InputValidator(is_classification=True) if as_frame: # NaN is not supported in categories, so # drop columns with them. nan_cols = [i for i in x.columns if x[i].isnull().any()] - cat_cols = [i for i in x.columns if x[i].dtype.name in ['category', 'bool']] + cat_cols = [i for i in x.columns if x[i].dtype.name in ["category", "bool"]] unsupported_columns = list(set(nan_cols) & set(cat_cols)) if len(unsupported_columns) > 0: x.drop(unsupported_columns, axis=1, inplace=True) X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( - x, y, test_size=0.33, random_state=0) + x, y, test_size=0.33, random_state=0 + ) validator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test) @@ -49,23 +48,26 @@ def test_data_validation_for_classification(openmlid, as_frame): validator.feature_validator.feat_type is not None -@pytest.mark.parametrize('openmlid', [505, 546, 531]) -@pytest.mark.parametrize('as_frame', [True, False]) +@pytest.mark.parametrize("openmlid", [505, 546, 531]) +@pytest.mark.parametrize("as_frame", [True, False]) def test_data_validation_for_regression(openmlid, as_frame): - x, y = sklearn.datasets.fetch_openml(data_id=openmlid, return_X_y=True, as_frame=as_frame) + x, y = sklearn.datasets.fetch_openml( + data_id=openmlid, return_X_y=True, as_frame=as_frame + ) validator = InputValidator(is_classification=False) if as_frame: # NaN is not supported in categories, so # drop columns with them. 
nan_cols = [i for i in x.columns if x[i].isnull().any()] - cat_cols = [i for i in x.columns if x[i].dtype.name in ['category', 'bool']] + cat_cols = [i for i in x.columns if x[i].dtype.name in ["category", "bool"]] unsupported_columns = list(set(nan_cols) & set(cat_cols)) if len(unsupported_columns) > 0: x.drop(unsupported_columns, axis=1, inplace=True) X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( - x, y, test_size=0.33, random_state=0) + x, y, test_size=0.33, random_state=0 + ) validator.fit(X_train=X_train, y_train=y_train) @@ -83,7 +85,9 @@ def test_data_validation_for_regression(openmlid, as_frame): def test_sparse_data_validation_for_regression(): - X, y = sklearn.datasets.make_regression(n_samples=100, n_features=50, random_state=0) + X, y = sklearn.datasets.make_regression( + n_samples=100, n_features=50, random_state=0 + ) X_sp = sparse.coo_matrix(X) validator = InputValidator(is_classification=False) @@ -118,7 +122,9 @@ def test_validation_unsupported(): X_test=np.array([[0, 1, 0], [0, 1, 1]]), y_test=np.array([0, 1, 0, 0, 0, 0]), ) - with pytest.raises(ValueError, match=r"Cannot call transform on a validator .*fitted"): + with pytest.raises( + ValueError, match=r"Cannot call transform on a validator .*fitted" + ): validator.transform( X=np.array([[0, 1, 0], [0, 1, 1]]), y=np.array([0, 1]), diff --git a/test/test_ensemble_builder/__init__.py b/test/test_ensemble_builder/__init__.py index 51b8efdf22..b74c2a5ccb 100644 --- a/test/test_ensemble_builder/__init__.py +++ b/test/test_ensemble_builder/__init__.py @@ -1,2 +1,2 @@ # -*- encoding: utf-8 -*- -__author__ = 'mlindauer' +__author__ = "mlindauer" diff --git a/test/test_ensemble_builder/ensemble_utils.py b/test/test_ensemble_builder/ensemble_utils.py index b98021c7bd..fa0f22e9e7 100644 --- a/test/test_ensemble_builder/ensemble_utils.py +++ b/test/test_ensemble_builder/ensemble_utils.py @@ -5,47 +5,55 @@ import numpy as np -from autosklearn.automl_common.common.ensemble_building.abstract_ensemble import AbstractEnsemble - -from autosklearn.metrics import make_scorer +from autosklearn.automl_common.common.ensemble_building.abstract_ensemble import ( + AbstractEnsemble, +) from autosklearn.ensemble_builder import EnsembleBuilder +from autosklearn.metrics import make_scorer def scorer_function(a, b): return 0.9 -MockMetric = make_scorer('mock', scorer_function) +MockMetric = make_scorer("mock", scorer_function) class BackendMock(object): - def __init__(self, target_directory): - this_directory = os.path.abspath( - os.path.dirname(__file__) + this_directory = os.path.abspath(os.path.dirname(__file__)) + shutil.copytree( + os.path.join(this_directory, "data"), os.path.join(target_directory) ) - shutil.copytree(os.path.join(this_directory, 'data'), os.path.join(target_directory)) self.temporary_directory = target_directory - self.internals_directory = os.path.join(self.temporary_directory, '.auto-sklearn') + self.internals_directory = os.path.join( + self.temporary_directory, ".auto-sklearn" + ) def load_datamanager(self): manager = unittest.mock.Mock() manager.__reduce__ = lambda self: (unittest.mock.MagicMock, ()) - array = np.load(os.path.join( - self.temporary_directory, - '.auto-sklearn', - 'runs', '0_3_100.0', - 'predictions_test_0_3_100.0.npy' - )) + array = np.load( + os.path.join( + self.temporary_directory, + ".auto-sklearn", + "runs", + "0_3_100.0", + "predictions_test_0_3_100.0.npy", + ) + ) manager.data.get.return_value = array return manager def load_targets_ensemble(self): - with 
open(os.path.join( - self.temporary_directory, - ".auto-sklearn", - "predictions_ensemble_true.npy" - ), "rb") as fp: + with open( + os.path.join( + self.temporary_directory, + ".auto-sklearn", + "predictions_ensemble_true.npy", + ), + "rb", + ) as fp: y = np.load(fp, allow_pickle=True) return y @@ -56,13 +64,15 @@ def save_predictions_as_txt(self, predictions, subset, idx, prefix, precision): return def get_runs_directory(self) -> str: - return os.path.join(self.temporary_directory, '.auto-sklearn', 'runs') + return os.path.join(self.temporary_directory, ".auto-sklearn", "runs") def get_numrun_directory(self, seed: int, num_run: int, budget: float) -> str: - return os.path.join(self.get_runs_directory(), '%d_%d_%s' % (seed, num_run, budget)) + return os.path.join( + self.get_runs_directory(), "%d_%d_%s" % (seed, num_run, budget) + ) def get_model_filename(self, seed: int, idx: int, budget: float) -> str: - return '%s.%s.%s.model' % (seed, idx, budget) + return "%s.%s.%s.model" % (seed, idx, budget) def compare_read_preds(read_preds1, read_preds2): @@ -91,13 +101,15 @@ def compare_read_preds(read_preds1, read_preds2): class EnsembleBuilderMemMock(EnsembleBuilder): - def fit_ensemble(self, selected_keys): return True - def predict(self, set_: str, - ensemble: AbstractEnsemble, - selected_keys: list, - n_preds: int, - index_run: int): + def predict( + self, + set_: str, + ensemble: AbstractEnsemble, + selected_keys: list, + n_preds: int, + index_run: int, + ): np.ones([10000000, 1000000]) diff --git a/test/test_ensemble_builder/test_ensemble.py b/test/test_ensemble_builder/test_ensemble.py index 335c07eca2..3533da37cd 100644 --- a/test/test_ensemble_builder/test_ensemble.py +++ b/test/test_ensemble_builder/test_ensemble.py @@ -1,35 +1,40 @@ import os +import pickle +import shutil import sys import time import unittest.mock -import pickle -import pytest -import shutil import dask.distributed import numpy as np import pandas as pd -from smac.runhistory.runhistory import RunValue, RunKey, RunHistory +import pytest +from smac.runhistory.runhistory import RunHistory, RunKey, RunValue -from autosklearn.constants import MULTILABEL_CLASSIFICATION, BINARY_CLASSIFICATION -from autosklearn.metrics import roc_auc, accuracy, log_loss +from autosklearn.constants import BINARY_CLASSIFICATION, MULTILABEL_CLASSIFICATION from autosklearn.ensemble_builder import ( - EnsembleBuilder, - EnsembleBuilderManager, Y_ENSEMBLE, - Y_VALID, Y_TEST, + Y_VALID, + EnsembleBuilder, + EnsembleBuilderManager, ) from autosklearn.ensembles.singlebest_ensemble import SingleBest +from autosklearn.metrics import accuracy, log_loss, roc_auc this_directory = os.path.dirname(__file__) sys.path.append(this_directory) -from ensemble_utils import BackendMock, compare_read_preds, EnsembleBuilderMemMock, MockMetric # noqa (E402: module level import not at top of file) +from ensemble_utils import ( # noqa (E402: module level import not at top of file) + BackendMock, + EnsembleBuilderMemMock, + MockMetric, + compare_read_preds, +) @pytest.fixture(scope="function") def ensemble_backend(request): - test_id = '%s_%s' % (request.module.__name__, request.node.name) + test_id = "%s_%s" % (request.module.__name__, request.node.name) test_dir = os.path.join(this_directory, test_id) try: @@ -46,7 +51,9 @@ def session_run_at_end(): shutil.rmtree(test_dir) except: # noqa E722 pass + return session_run_at_end + request.addfinalizer(get_finalizer(backend)) return backend @@ -58,10 +65,7 @@ def ensemble_run_history(request): run_history = RunHistory() 
run_history._add( RunKey( - config_id=3, - instance_id='{"task_id": "breast_cancer"}', - seed=1, - budget=3.0 + config_id=3, instance_id='{"task_id": "breast_cancer"}', seed=1, budget=3.0 ), RunValue( cost=0.11347517730496459, @@ -70,30 +74,29 @@ def ensemble_run_history(request): starttime=time.time(), endtime=time.time(), additional_info={ - 'duration': 0.20323538780212402, - 'num_run': 3, - 'configuration_origin': 'Random Search'} + "duration": 0.20323538780212402, + "num_run": 3, + "configuration_origin": "Random Search", + }, ), status=None, origin=None, ) run_history._add( RunKey( - config_id=6, - instance_id='{"task_id": "breast_cancer"}', - seed=1, - budget=6.0 + config_id=6, instance_id='{"task_id": "breast_cancer"}', seed=1, budget=6.0 ), RunValue( - cost=2*0.11347517730496459, - time=2*0.21858787536621094, + cost=2 * 0.11347517730496459, + time=2 * 0.21858787536621094, status=None, starttime=time.time(), endtime=time.time(), additional_info={ - 'duration': 0.20323538780212402, - 'num_run': 6, - 'configuration_origin': 'Random Search'} + "duration": 0.20323538780212402, + "num_run": 6, + "configuration_origin": "Random Search", + }, ), status=None, origin=None, @@ -118,13 +121,13 @@ def testRead(ensemble_backend): filename = os.path.join( ensemble_backend.temporary_directory, - ".auto-sklearn/runs/0_1_0.0/predictions_ensemble_0_1_0.0.npy" + ".auto-sklearn/runs/0_1_0.0/predictions_ensemble_0_1_0.0.npy", ) assert ensbuilder.read_losses[filename]["ens_loss"] == 0.5 filename = os.path.join( ensemble_backend.temporary_directory, - ".auto-sklearn/runs/0_2_0.0/predictions_ensemble_0_2_0.0.npy" + ".auto-sklearn/runs/0_2_0.0/predictions_ensemble_0_2_0.0.npy", ) assert ensbuilder.read_losses[filename]["ens_loss"] == 0.0 @@ -132,13 +135,13 @@ def testRead(ensemble_backend): @pytest.mark.parametrize( "ensemble_nbest,max_models_on_disc,exp", ( - (1, None, 1), - (1.0, None, 2), - (0.1, None, 1), - (0.9, None, 1), - (1, 2, 1), - (2, 1, 1), - ) + (1, None, 1), + (1.0, None, 2), + (0.1, None, 1), + (0.9, None, 1), + (1, 2, 1), + (2, 1, 1), + ), ) def testNBest(ensemble_backend, ensemble_nbest, max_models_on_disc, exp): ensbuilder = EnsembleBuilder( @@ -158,26 +161,29 @@ def testNBest(ensemble_backend, ensemble_nbest, max_models_on_disc, exp): fixture = os.path.join( ensemble_backend.temporary_directory, - ".auto-sklearn/runs/0_2_0.0/predictions_ensemble_0_2_0.0.npy" + ".auto-sklearn/runs/0_2_0.0/predictions_ensemble_0_2_0.0.npy", ) assert sel_keys[0] == fixture -@pytest.mark.parametrize("test_case,exp", [ - # If None, no reduction - (None, 2), - # If Int, limit only on exceed - (4, 2), - (1, 1), - # If Float, translate float to # models. - # below, mock of each file is 100 Mb and 4 files .model and .npy (test/val/pred) exist - # per run (except for run3, there they are 5). Now, it takes 500MB for run 3 and - # another 500 MB of slack because we keep as much space as the largest model - # available as slack - (1499.0, 1), - (1500.0, 2), - (9999.0, 2), -]) +@pytest.mark.parametrize( + "test_case,exp", + [ + # If None, no reduction + (None, 2), + # If Int, limit only on exceed + (4, 2), + (1, 1), + # If Float, translate float to # models. + # below, mock of each file is 100 Mb and 4 files .model and .npy (test/val/pred) + # per run (except for run3, there they are 5). 
Now, it takes 500MB for run 3 and + # another 500 MB of slack because we keep as much space as the largest model + # available as slack + (1499.0, 1), + (1500.0, 2), + (9999.0, 2), + ], +) def testMaxModelsOnDisc(ensemble_backend, test_case, exp): ensemble_nbest = 4 ensbuilder = EnsembleBuilder( @@ -190,8 +196,8 @@ def testMaxModelsOnDisc(ensemble_backend, test_case, exp): max_models_on_disc=test_case, ) - with unittest.mock.patch('os.path.getsize') as mock: - mock.return_value = 100*1024*1024 + with unittest.mock.patch("os.path.getsize") as mock: + mock.return_value = 100 * 1024 * 1024 ensbuilder.compute_loss_per_model() sel_keys = ensbuilder.get_n_best_preds() assert len(sel_keys) == exp, test_case @@ -211,26 +217,26 @@ def testMaxModelsOnDisc2(ensemble_backend): ) ensbuilder.read_preds = {} for i in range(50): - ensbuilder.read_losses['pred'+str(i)] = { - 'ens_loss': -i*10, - 'num_run': i, - 'loaded': 1, + ensbuilder.read_losses["pred" + str(i)] = { + "ens_loss": -i * 10, + "num_run": i, + "loaded": 1, "seed": 1, - "disc_space_cost_mb": 50*i, + "disc_space_cost_mb": 50 * i, } - ensbuilder.read_preds['pred'+str(i)] = {Y_ENSEMBLE: True} + ensbuilder.read_preds["pred" + str(i)] = {Y_ENSEMBLE: True} sel_keys = ensbuilder.get_n_best_preds() - assert ['pred49', 'pred48', 'pred47'] == sel_keys + assert ["pred49", "pred48", "pred47"] == sel_keys # Make sure at least one model is kept alive ensbuilder.max_models_on_disc = 0.0 sel_keys = ensbuilder.get_n_best_preds() - assert ['pred49'] == sel_keys + assert ["pred49"] == sel_keys @pytest.mark.parametrize( "performance_range_threshold,exp", - ((0.0, 4), (0.1, 4), (0.3, 3), (0.5, 2), (0.6, 2), (0.8, 1), (1.0, 1), (1, 1)) + ((0.0, 4), (0.1, 4), (0.3, 3), (0.5, 2), (0.6, 2), (0.8, 1), (1.0, 1), (1, 1)), ) def testPerformanceRangeThreshold(ensemble_backend, performance_range_threshold, exp): ensbuilder = EnsembleBuilder( @@ -240,14 +246,14 @@ def testPerformanceRangeThreshold(ensemble_backend, performance_range_threshold, metric=roc_auc, seed=0, # important to find the test files ensemble_nbest=100, - performance_range_threshold=performance_range_threshold + performance_range_threshold=performance_range_threshold, ) ensbuilder.read_losses = { - 'A': {'ens_loss': -1, 'num_run': 1, 'loaded': -1, "seed": 1}, - 'B': {'ens_loss': -2, 'num_run': 2, 'loaded': -1, "seed": 1}, - 'C': {'ens_loss': -3, 'num_run': 3, 'loaded': -1, "seed": 1}, - 'D': {'ens_loss': -4, 'num_run': 4, 'loaded': -1, "seed": 1}, - 'E': {'ens_loss': -5, 'num_run': 5, 'loaded': -1, "seed": 1}, + "A": {"ens_loss": -1, "num_run": 1, "loaded": -1, "seed": 1}, + "B": {"ens_loss": -2, "num_run": 2, "loaded": -1, "seed": 1}, + "C": {"ens_loss": -3, "num_run": 3, "loaded": -1, "seed": 1}, + "D": {"ens_loss": -4, "num_run": 4, "loaded": -1, "seed": 1}, + "E": {"ens_loss": -5, "num_run": 5, "loaded": -1, "seed": 1}, } ensbuilder.read_preds = { key: {key_2: True for key_2 in (Y_ENSEMBLE, Y_VALID, Y_TEST)} @@ -261,12 +267,19 @@ def testPerformanceRangeThreshold(ensemble_backend, performance_range_threshold, @pytest.mark.parametrize( "performance_range_threshold,ensemble_nbest,exp", ( - (0.0, 1, 1), (0.0, 1.0, 4), (0.1, 2, 2), (0.3, 4, 3), - (0.5, 1, 1), (0.6, 10, 2), (0.8, 0.5, 1), (1, 1.0, 1) - ) + (0.0, 1, 1), + (0.0, 1.0, 4), + (0.1, 2, 2), + (0.3, 4, 3), + (0.5, 1, 1), + (0.6, 10, 2), + (0.8, 0.5, 1), + (1, 1.0, 1), + ), ) -def testPerformanceRangeThresholdMaxBest(ensemble_backend, performance_range_threshold, - ensemble_nbest, exp): +def testPerformanceRangeThresholdMaxBest( + ensemble_backend, 
performance_range_threshold, ensemble_nbest, exp +): ensbuilder = EnsembleBuilder( backend=ensemble_backend, dataset_name="TEST", @@ -278,11 +291,11 @@ def testPerformanceRangeThresholdMaxBest(ensemble_backend, performance_range_thr max_models_on_disc=None, ) ensbuilder.read_losses = { - 'A': {'ens_loss': -1, 'num_run': 1, 'loaded': -1, "seed": 1}, - 'B': {'ens_loss': -2, 'num_run': 2, 'loaded': -1, "seed": 1}, - 'C': {'ens_loss': -3, 'num_run': 3, 'loaded': -1, "seed": 1}, - 'D': {'ens_loss': -4, 'num_run': 4, 'loaded': -1, "seed": 1}, - 'E': {'ens_loss': -5, 'num_run': 5, 'loaded': -1, "seed": 1}, + "A": {"ens_loss": -1, "num_run": 1, "loaded": -1, "seed": 1}, + "B": {"ens_loss": -2, "num_run": 2, "loaded": -1, "seed": 1}, + "C": {"ens_loss": -3, "num_run": 3, "loaded": -1, "seed": 1}, + "D": {"ens_loss": -4, "num_run": 4, "loaded": -1, "seed": 1}, + "E": {"ens_loss": -5, "num_run": 5, "loaded": -1, "seed": 1}, } ensbuilder.read_preds = { key: {key_2: True for key_2 in (Y_ENSEMBLE, Y_VALID, Y_TEST)} @@ -295,13 +308,14 @@ def testPerformanceRangeThresholdMaxBest(ensemble_backend, performance_range_thr def testFallBackNBest(ensemble_backend): - ensbuilder = EnsembleBuilder(backend=ensemble_backend, - dataset_name="TEST", - task_type=BINARY_CLASSIFICATION, - metric=roc_auc, - seed=0, # important to find the test files - ensemble_nbest=1 - ) + ensbuilder = EnsembleBuilder( + backend=ensemble_backend, + dataset_name="TEST", + task_type=BINARY_CLASSIFICATION, + metric=roc_auc, + seed=0, # important to find the test files + ensemble_nbest=1, + ) ensbuilder.compute_loss_per_model() print() @@ -311,19 +325,19 @@ def testFallBackNBest(ensemble_backend): filename = os.path.join( ensemble_backend.temporary_directory, - ".auto-sklearn/runs/0_2_0.0/predictions_ensemble_0_2_0.0.npy" + ".auto-sklearn/runs/0_2_0.0/predictions_ensemble_0_2_0.0.npy", ) ensbuilder.read_losses[filename]["ens_loss"] = -1 filename = os.path.join( ensemble_backend.temporary_directory, - ".auto-sklearn/runs/0_3_100.0/predictions_ensemble_0_3_100.0.npy" + ".auto-sklearn/runs/0_3_100.0/predictions_ensemble_0_3_100.0.npy", ) ensbuilder.read_losses[filename]["ens_loss"] = -1 filename = os.path.join( ensemble_backend.temporary_directory, - ".auto-sklearn/runs/0_1_0.0/predictions_ensemble_0_1_0.0.npy" + ".auto-sklearn/runs/0_1_0.0/predictions_ensemble_0_1_0.0.npy", ) ensbuilder.read_losses[filename]["ens_loss"] = -1 @@ -331,7 +345,7 @@ def testFallBackNBest(ensemble_backend): fixture = os.path.join( ensemble_backend.temporary_directory, - ".auto-sklearn/runs/0_1_0.0/predictions_ensemble_0_1_0.0.npy" + ".auto-sklearn/runs/0_1_0.0/predictions_ensemble_0_1_0.0.npy", ) assert len(sel_keys) == 1 assert sel_keys[0] == fixture @@ -339,13 +353,14 @@ def testFallBackNBest(ensemble_backend): def testGetValidTestPreds(ensemble_backend): - ensbuilder = EnsembleBuilder(backend=ensemble_backend, - dataset_name="TEST", - task_type=BINARY_CLASSIFICATION, - metric=roc_auc, - seed=0, # important to find the test files - ensemble_nbest=1 - ) + ensbuilder = EnsembleBuilder( + backend=ensemble_backend, + dataset_name="TEST", + task_type=BINARY_CLASSIFICATION, + metric=roc_auc, + seed=0, # important to find the test files + ensemble_nbest=1, + ) ensbuilder.compute_loss_per_model() @@ -353,15 +368,15 @@ def testGetValidTestPreds(ensemble_backend): # different name. 
num_run=2 is selected when doing sorted() d1 = os.path.join( ensemble_backend.temporary_directory, - ".auto-sklearn/runs/0_1_0.0/predictions_ensemble_0_1_0.0.npy" + ".auto-sklearn/runs/0_1_0.0/predictions_ensemble_0_1_0.0.npy", ) d2 = os.path.join( ensemble_backend.temporary_directory, - ".auto-sklearn/runs/0_2_0.0/predictions_ensemble_0_2_0.0.npy" + ".auto-sklearn/runs/0_2_0.0/predictions_ensemble_0_2_0.0.npy", ) d3 = os.path.join( ensemble_backend.temporary_directory, - ".auto-sklearn/runs/0_3_100.0/predictions_ensemble_0_3_100.0.npy" + ".auto-sklearn/runs/0_3_100.0/predictions_ensemble_0_3_100.0.npy", ) sel_keys = ensbuilder.get_n_best_preds() @@ -371,10 +386,13 @@ def testGetValidTestPreds(ensemble_backend): # Number of read files should be three and # predictions_ensemble_0_4_0.0.npy must not be in there assert len(ensbuilder.read_preds) == 3 - assert os.path.join( + assert ( + os.path.join( ensemble_backend.temporary_directory, - ".auto-sklearn/runs/0_4_0.0/predictions_ensemble_0_4_0.0.npy" - ) not in ensbuilder.read_preds + ".auto-sklearn/runs/0_4_0.0/predictions_ensemble_0_4_0.0.npy", + ) + not in ensbuilder.read_preds + ) # not selected --> should still be None assert ensbuilder.read_preds[d1][Y_VALID] is None @@ -403,7 +421,7 @@ def testEntireEnsembleBuilder(ensemble_backend): d2 = os.path.join( ensemble_backend.temporary_directory, - ".auto-sklearn/runs/0_2_0.0/predictions_ensemble_0_2_0.0.npy" + ".auto-sklearn/runs/0_2_0.0/predictions_ensemble_0_2_0.0.npy", ) sel_keys = ensbuilder.get_n_best_preds() @@ -454,11 +472,13 @@ def test_main(ensemble_backend): seed=0, # important to find the test files ensemble_nbest=2, max_models_on_disc=None, - ) + ) ensbuilder.SAVE2DISC = False run_history, ensemble_nbest, _, _, _ = ensbuilder.main( - time_left=np.inf, iteration=1, return_predictions=False, + time_left=np.inf, + iteration=1, + return_predictions=False, ) assert len(ensbuilder.read_preds) == 3 @@ -473,26 +493,26 @@ def test_main(ensemble_backend): # As the data loader loads the same val/train/test # we expect 1.0 as score and all keys available expected_performance = { - 'ensemble_val_score': 1.0, - 'ensemble_test_score': 1.0, - 'ensemble_optimization_score': 1.0, + "ensemble_val_score": 1.0, + "ensemble_test_score": 1.0, + "ensemble_optimization_score": 1.0, } # Make sure that expected performance is a subset of the run history assert all(item in run_history[0].items() for item in expected_performance.items()) - assert 'Timestamp' in run_history[0] - assert isinstance(run_history[0]['Timestamp'], pd.Timestamp) + assert "Timestamp" in run_history[0] + assert isinstance(run_history[0]["Timestamp"], pd.Timestamp) assert os.path.exists( - os.path.join(ensemble_backend.internals_directory, 'ensemble_read_preds.pkl') + os.path.join(ensemble_backend.internals_directory, "ensemble_read_preds.pkl") ), os.listdir(ensemble_backend.internals_directory) assert os.path.exists( - os.path.join(ensemble_backend.internals_directory, 'ensemble_read_losses.pkl') + os.path.join(ensemble_backend.internals_directory, "ensemble_read_losses.pkl") ), os.listdir(ensemble_backend.internals_directory) def test_run_end_at(ensemble_backend): - with unittest.mock.patch('pynisher.enforce_limits') as pynisher_mock: + with unittest.mock.patch("pynisher.enforce_limits") as pynisher_mock: ensbuilder = EnsembleBuilder( backend=ensemble_backend, dataset_name="TEST", @@ -501,69 +521,74 @@ def test_run_end_at(ensemble_backend): seed=0, # important to find the test files ensemble_nbest=2, max_models_on_disc=None, - ) + ) 
ensbuilder.SAVE2DISC = False current_time = time.time() - ensbuilder.run(end_at=current_time + 10, iteration=1, pynisher_context='forkserver') - # 4 seconds left because: 10 seconds - 5 seconds overhead - very little overhead, + ensbuilder.run( + end_at=current_time + 10, iteration=1, pynisher_context="forkserver" + ) + # 4 seconds left because: 10 seconds - 5 seconds overhead - little overhead # but then rounded to an integer assert pynisher_mock.call_args_list[0][1]["wall_time_in_s"], 4 def testLimit(ensemble_backend): - ensbuilder = EnsembleBuilderMemMock(backend=ensemble_backend, - dataset_name="TEST", - task_type=BINARY_CLASSIFICATION, - metric=roc_auc, - seed=0, # important to find the test files - ensemble_nbest=10, - # small to trigger MemoryException - memory_limit=100, - ) + ensbuilder = EnsembleBuilderMemMock( + backend=ensemble_backend, + dataset_name="TEST", + task_type=BINARY_CLASSIFICATION, + metric=roc_auc, + seed=0, # important to find the test files + ensemble_nbest=10, + # small to trigger MemoryException + memory_limit=100, + ) ensbuilder.SAVE2DISC = False read_losses_file = os.path.join( - ensemble_backend.internals_directory, - 'ensemble_read_losses.pkl' + ensemble_backend.internals_directory, "ensemble_read_losses.pkl" ) read_preds_file = os.path.join( - ensemble_backend.internals_directory, - 'ensemble_read_preds.pkl' + ensemble_backend.internals_directory, "ensemble_read_preds.pkl" ) def mtime_mock(filename): mtimes = { - 'predictions_ensemble_0_1_0.0.npy': 0, - 'predictions_valid_0_1_0.0.npy': 0.1, - 'predictions_test_0_1_0.0.npy': 0.2, - 'predictions_ensemble_0_2_0.0.npy': 1, - 'predictions_valid_0_2_0.0.npy': 1.1, - 'predictions_test_0_2_0.0.npy': 1.2, - 'predictions_ensemble_0_3_100.0.npy': 2, - 'predictions_valid_0_3_100.0.npy': 2.1, - 'predictions_test_0_3_100.0.npy': 2.2, + "predictions_ensemble_0_1_0.0.npy": 0, + "predictions_valid_0_1_0.0.npy": 0.1, + "predictions_test_0_1_0.0.npy": 0.2, + "predictions_ensemble_0_2_0.0.npy": 1, + "predictions_valid_0_2_0.0.npy": 1.1, + "predictions_test_0_2_0.0.npy": 1.2, + "predictions_ensemble_0_3_100.0.npy": 2, + "predictions_valid_0_3_100.0.npy": 2.1, + "predictions_test_0_3_100.0.npy": 2.2, } return mtimes[os.path.split(filename)[1]] - with unittest.mock.patch('logging.getLogger') as get_logger_mock, \ - unittest.mock.patch('logging.config.dictConfig') as _, \ - unittest.mock.patch('os.path.getmtime') as mtime: + with unittest.mock.patch( + "logging.getLogger" + ) as get_logger_mock, unittest.mock.patch( + "logging.config.dictConfig" + ) as _, unittest.mock.patch( + "os.path.getmtime" + ) as mtime: logger_mock = unittest.mock.Mock() logger_mock.handlers = [] get_logger_mock.return_value = logger_mock mtime.side_effect = mtime_mock - ensbuilder.run(time_left=1000, iteration=0, pynisher_context='fork') + ensbuilder.run(time_left=1000, iteration=0, pynisher_context="fork") assert os.path.exists(read_losses_file) assert not os.path.exists(read_preds_file) assert logger_mock.warning.call_count == 1 - ensbuilder.run(time_left=1000, iteration=0, pynisher_context='fork') + ensbuilder.run(time_left=1000, iteration=0, pynisher_context="fork") assert os.path.exists(read_losses_file) assert not os.path.exists(read_preds_file) assert logger_mock.warning.call_count == 2 - ensbuilder.run(time_left=1000, iteration=0, pynisher_context='fork') + ensbuilder.run(time_left=1000, iteration=0, pynisher_context="fork") assert os.path.exists(read_losses_file) assert not os.path.exists(read_preds_file) assert logger_mock.warning.call_count == 
3 @@ -571,7 +596,7 @@ def mtime_mock(filename): # it should try to reduce ensemble_nbest until it also failed at 2 assert ensbuilder.ensemble_nbest == 1 - ensbuilder.run(time_left=1000, iteration=0, pynisher_context='fork') + ensbuilder.run(time_left=1000, iteration=0, pynisher_context="fork") assert os.path.exists(read_losses_file) assert not os.path.exists(read_preds_file) assert logger_mock.warning.call_count == 4 @@ -579,9 +604,9 @@ def mtime_mock(filename): # it should next reduce the number of models to read at most assert ensbuilder.read_at_most == 1 - # And then it still runs, but basically won't do anything any more except for raising error - # messages via the logger - ensbuilder.run(time_left=1000, iteration=0, pynisher_context='fork') + # And then it still runs, but basically won't do anything any more except for + # raising error messages via the logger + ensbuilder.run(time_left=1000, iteration=0, pynisher_context="fork") assert os.path.exists(read_losses_file) assert not os.path.exists(read_preds_file) assert logger_mock.warning.call_count == 4 @@ -592,8 +617,9 @@ def mtime_mock(filename): logger_mock.error.call_args_list ) for i in range(len(logger_mock.error.call_args_list)): - assert 'Memory Exception -- Unable to further reduce' in str( - logger_mock.error.call_args_list[i]) + assert "Memory Exception -- Unable to further reduce" in str( + logger_mock.error.call_args_list[i] + ) def test_read_pickle_read_preds(ensemble_backend): @@ -610,15 +636,14 @@ def test_read_pickle_read_preds(ensemble_backend): seed=0, # important to find the test files ensemble_nbest=2, max_models_on_disc=None, - ) + ) ensbuilder.SAVE2DISC = False ensbuilder.main(time_left=np.inf, iteration=1, return_predictions=False) # Check that the memory was created ensemble_memory_file = os.path.join( - ensemble_backend.internals_directory, - 'ensemble_read_preds.pkl' + ensemble_backend.internals_directory, "ensemble_read_preds.pkl" ) assert os.path.exists(ensemble_memory_file) @@ -630,8 +655,7 @@ def test_read_pickle_read_preds(ensemble_backend): assert last_hash == ensbuilder.last_hash ensemble_memory_file = os.path.join( - ensemble_backend.internals_directory, - 'ensemble_read_losses.pkl' + ensemble_backend.internals_directory, "ensemble_read_losses.pkl" ) assert os.path.exists(ensemble_memory_file) @@ -650,21 +674,23 @@ def test_read_pickle_read_preds(ensemble_backend): seed=0, # important to find the test files ensemble_nbest=2, max_models_on_disc=None, - ) + ) compare_read_preds(ensbuilder2.read_preds, ensbuilder.read_preds) compare_read_preds(ensbuilder2.read_losses, ensbuilder.read_losses) assert ensbuilder2.last_hash == ensbuilder.last_hash @pytest.mark.parametrize("metric", [log_loss, accuracy]) -@unittest.mock.patch('os.path.exists') -def test_get_identifiers_from_run_history(exists, metric, ensemble_run_history, ensemble_backend): +@unittest.mock.patch("os.path.exists") +def test_get_identifiers_from_run_history( + exists, metric, ensemble_run_history, ensemble_backend +): exists.return_value = True ensemble = SingleBest( - metric=log_loss, - seed=1, - run_history=ensemble_run_history, - backend=ensemble_backend, + metric=log_loss, + seed=1, + run_history=ensemble_run_history, + backend=ensemble_backend, ) # Just one model @@ -682,7 +708,7 @@ def test_ensemble_builder_process_realrun(dask_client_single_worker, ensemble_ba start_time=time.time(), time_left_for_ensembles=1000, backend=ensemble_backend, - dataset_name='Test', + dataset_name="Test", task=BINARY_CLASSIFICATION, metric=MockMetric, 
ensemble_size=50, @@ -701,12 +727,12 @@ def test_ensemble_builder_process_realrun(dask_client_single_worker, ensemble_ba result = future.result() history, _, _, _, _ = result - assert 'ensemble_optimization_score' in history[0] - assert history[0]['ensemble_optimization_score'] == 0.9 - assert 'ensemble_val_score' in history[0] - assert history[0]['ensemble_val_score'] == 0.9 - assert 'ensemble_test_score' in history[0] - assert history[0]['ensemble_test_score'] == 0.9 + assert "ensemble_optimization_score" in history[0] + assert history[0]["ensemble_optimization_score"] == 0.9 + assert "ensemble_val_score" in history[0] + assert history[0]["ensemble_val_score"] == 0.9 + assert "ensemble_test_score" in history[0] + assert history[0]["ensemble_test_score"] == 0.9 def test_ensemble_builder_nbest_remembered( @@ -722,7 +748,7 @@ def test_ensemble_builder_nbest_remembered( start_time=time.time(), time_left_for_ensembles=1000, backend=ensemble_backend, - dataset_name='Test', + dataset_name="Test", task=MULTILABEL_CLASSIFICATION, metric=roc_auc, ensemble_size=50, @@ -740,7 +766,9 @@ def test_ensemble_builder_nbest_remembered( future = manager.futures[0] dask.distributed.wait([future]) # wait for the ensemble process to finish assert future.result() == ([], 5, None, None, None) - file_path = os.path.join(ensemble_backend.internals_directory, 'ensemble_read_preds.pkl') + file_path = os.path.join( + ensemble_backend.internals_directory, "ensemble_read_preds.pkl" + ) assert not os.path.exists(file_path) manager.build_ensemble(dask_client_single_worker, unit_test=True) diff --git a/test/test_ensemble_builder/test_ensemble_selection.py b/test/test_ensemble_builder/test_ensemble_selection.py index c03060c037..44e00229fb 100644 --- a/test/test_ensemble_builder/test_ensemble_selection.py +++ b/test/test_ensemble_builder/test_ensemble_selection.py @@ -1,5 +1,4 @@ import numpy as np - import pytest from autosklearn.constants import BINARY_CLASSIFICATION, REGRESSION @@ -12,10 +11,12 @@ def testEnsembleSelection(): Makes sure ensemble selection fit method creates an ensemble correctly """ - ensemble = EnsembleSelection(ensemble_size=10, - task_type=REGRESSION, - random_state=0, - metric=root_mean_squared_error) + ensemble = EnsembleSelection( + ensemble_size=10, + task_type=REGRESSION, + random_state=0, + metric=root_mean_squared_error, + ) # We create a problem such that we encourage the addition of members to the ensemble # Fundamentally, the average of 10 sequential number is 5.5 @@ -23,24 +24,57 @@ def testEnsembleSelection(): predictions = [] for i in range(1, 20): pred = np.full((100), i, dtype=np.float32) - pred[i*5:5*(i+1)] = 5.5 * i + pred[i * 5 : 5 * (i + 1)] = 5.5 * i predictions.append(pred) ensemble.fit(predictions, y_true, identifiers=[(i, i, i) for i in range(20)]) - np.testing.assert_array_equal(ensemble.weights_, - np.array([0.1, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, - 0., 0., 0., 0., 0., 0., 0., 0., - 0., 0., 0.])) + np.testing.assert_array_equal( + ensemble.weights_, + np.array( + [ + 0.1, + 0.2, + 0.2, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + ] + ), + ) assert ensemble.identifiers_ == [(i, i, i) for i in range(20)] - np.testing.assert_array_almost_equal(np.array(ensemble.trajectory_), - np.array([3.462296925452813, 2.679202306657711, - 2.2748626436960375, 2.065717187806695, - 1.7874562615598728, 1.6983448128441783, - 1.559451106330085, 1.5316326052614575, - 1.3801950121782542, 1.3554980575295374])) + 
np.testing.assert_array_almost_equal( + np.array(ensemble.trajectory_), + np.array( + [ + 3.462296925452813, + 2.679202306657711, + 2.2748626436960375, + 2.065717187806695, + 1.7874562615598728, + 1.6983448128441783, + 1.559451106330085, + 1.5316326052614575, + 1.3801950121782542, + 1.3554980575295374, + ] + ), + ) def testPredict(): @@ -54,52 +88,38 @@ def testPredict(): # we first exclude all occurrences of zero in self.weights_, and then # apply the weights. # If none of the above is the case, predict() raises Error. - ensemble = EnsembleSelection(ensemble_size=3, - task_type=BINARY_CLASSIFICATION, - random_state=0, - metric=accuracy, - ) + ensemble = EnsembleSelection( + ensemble_size=3, + task_type=BINARY_CLASSIFICATION, + random_state=0, + metric=accuracy, + ) # Test for case 1. Create (3, 2, 2) predictions. - per_model_pred = np.array([ - [[0.9, 0.1], - [0.4, 0.6]], - [[0.8, 0.2], - [0.3, 0.7]], - [[1.0, 0.0], - [0.1, 0.9]] - ]) + per_model_pred = np.array( + [[[0.9, 0.1], [0.4, 0.6]], [[0.8, 0.2], [0.3, 0.7]], [[1.0, 0.0], [0.1, 0.9]]] + ) # Weights of 3 hypothetical models ensemble.weights_ = [0.7, 0.2, 0.1] pred = ensemble.predict(per_model_pred) - truth = np.array([[0.89, 0.11], # This should be the true prediction. - [0.35, 0.65]]) + truth = np.array( + [[0.89, 0.11], [0.35, 0.65]] # This should be the true prediction. + ) assert np.allclose(pred, truth) # Test for case 2. - per_model_pred = np.array([ - [[0.9, 0.1], - [0.4, 0.6]], - [[0.8, 0.2], - [0.3, 0.7]], - [[1.0, 0.0], - [0.1, 0.9]] - ]) + per_model_pred = np.array( + [[[0.9, 0.1], [0.4, 0.6]], [[0.8, 0.2], [0.3, 0.7]], [[1.0, 0.0], [0.1, 0.9]]] + ) # The third model now has weight of zero. ensemble.weights_ = [0.7, 0.2, 0.0, 0.1] pred = ensemble.predict(per_model_pred) - truth = np.array([[0.89, 0.11], - [0.35, 0.65]]) + truth = np.array([[0.89, 0.11], [0.35, 0.65]]) assert np.allclose(pred, truth) # Test for error case. - per_model_pred = np.array([ - [[0.9, 0.1], - [0.4, 0.6]], - [[0.8, 0.2], - [0.3, 0.7]], - [[1.0, 0.0], - [0.1, 0.9]] - ]) + per_model_pred = np.array( + [[[0.9, 0.1], [0.4, 0.6]], [[0.8, 0.2], [0.3, 0.7]], [[1.0, 0.0], [0.1, 0.9]]] + ) # Now the weights have 2 zero weights and 2 non-zero weights, # which is incompatible. 
ensemble.weights_ = [0.6, 0.0, 0.0, 0.4] diff --git a/test/test_evaluation/__init__.py b/test/test_evaluation/__init__.py index cc3cd7becd..e298f0f075 100644 --- a/test/test_evaluation/__init__.py +++ b/test/test_evaluation/__init__.py @@ -1,2 +1,2 @@ # -*- encoding: utf-8 -*- -__author__ = 'feurerm' +__author__ = "feurerm" diff --git a/test/test_evaluation/evaluation_util.py b/test/test_evaluation/evaluation_util.py index e8ba4edf07..d8bf017c35 100644 --- a/test/test_evaluation/evaluation_util.py +++ b/test/test_evaluation/evaluation_util.py @@ -1,28 +1,53 @@ import functools -import traceback import tempfile +import traceback import unittest import numpy as np -from numpy.linalg import LinAlgError import sklearn.datasets -from sklearn import preprocessing import sklearn.model_selection +from numpy.linalg import LinAlgError +from sklearn import preprocessing from autosklearn.automl_common.common.utils.backend import Backend - -from autosklearn.constants import \ - MULTICLASS_CLASSIFICATION, MULTILABEL_CLASSIFICATION, BINARY_CLASSIFICATION, REGRESSION -from autosklearn.util.data import convert_to_bin +from autosklearn.constants import ( + BINARY_CLASSIFICATION, + MULTICLASS_CLASSIFICATION, + MULTILABEL_CLASSIFICATION, + REGRESSION, +) from autosklearn.data.xy_data_manager import XYDataManager +from autosklearn.metrics import ( + accuracy, + balanced_accuracy, + f1_macro, + f1_micro, + f1_weighted, + log_loss, + precision_macro, + precision_micro, + precision_weighted, + recall_macro, + recall_micro, + recall_weighted, +) from autosklearn.pipeline.util import get_dataset -from autosklearn.metrics import accuracy, balanced_accuracy, f1_macro, f1_micro, f1_weighted, \ - log_loss, precision_macro, precision_micro, precision_weighted, recall_macro, \ - recall_micro, recall_weighted +from autosklearn.util.data import convert_to_bin -SCORER_LIST = [accuracy, balanced_accuracy, f1_macro, f1_micro, f1_weighted, log_loss, - precision_macro, precision_micro, precision_weighted, recall_macro, - recall_micro, recall_weighted] +SCORER_LIST = [ + accuracy, + balanced_accuracy, + f1_macro, + f1_micro, + f1_weighted, + log_loss, + precision_macro, + precision_micro, + precision_weighted, + recall_macro, + recall_micro, + recall_weighted, +] N_TEST_RUNS = 5 @@ -32,14 +57,16 @@ def get_evaluation_backend(): backend_mock.temporary_directory = tempfile.gettempdir() # Assign a default data - backend_mock.load_datamanager.return_value = get_multiclass_classification_datamanager() + backend_mock.load_datamanager.return_value = ( + get_multiclass_classification_datamanager() + ) return backend_mock class Dummy(object): def __init__(self): - self.name = 'Dummy' + self.name = "Dummy" class BaseEvaluatorTest(unittest.TestCase): @@ -61,82 +88,85 @@ def __fit(self, function_handle): function_handle() return True except KeyError as e: - if 'Floating-point under-/overflow occurred at epoch' in \ - e.args[0] or \ - 'removed all features' in e.args[0] or \ - 'failed to create intent' in e.args[0]: + if ( + "Floating-point under-/overflow occurred at epoch" in e.args[0] + or "removed all features" in e.args[0] + or "failed to create intent" in e.args[0] + ): pass else: traceback.print_exc() raise e except ValueError as e: - if 'Floating-point under-/overflow occurred at epoch' in e.args[ - 0] or \ - 'removed all features' in e.args[0] or \ - 'failed to create intent' in e.args[0]: + if ( + "Floating-point under-/overflow occurred at epoch" in e.args[0] + or "removed all features" in e.args[0] + or "failed to create intent" 
in e.args[0] + ): pass else: raise e except LinAlgError as e: - if 'not positive definite, even with jitter' in e.args[0]: + if "not positive definite, even with jitter" in e.args[0]: pass else: raise e except RuntimeWarning as e: - if 'invalid value encountered in sqrt' in e.args[0]: + if "invalid value encountered in sqrt" in e.args[0]: pass - elif 'divide by zero encountered in divide' in e.args[0]: + elif "divide by zero encountered in divide" in e.args[0]: pass else: raise e except UserWarning as e: - if 'FastICA did not converge' in e.args[0]: + if "FastICA did not converge" in e.args[0]: pass else: raise e def get_multiclass_classification_datamanager(): - X_train, Y_train, X_test, Y_test = get_dataset('iris') + X_train, Y_train, X_test, Y_test = get_dataset("iris") indices = list(range(X_train.shape[0])) np.random.seed(1) np.random.shuffle(indices) X_train = X_train[indices] Y_train = Y_train[indices] - X_valid = X_test[:25, ] - Y_valid = Y_test[:25, ] - X_test = X_test[25:, ] - Y_test = Y_test[25:, ] + X_valid = X_test[ + :25, + ] + Y_valid = Y_test[ + :25, + ] + X_test = X_test[ + 25:, + ] + Y_test = Y_test[ + 25:, + ] D = Dummy() - D.info = { - 'task': MULTICLASS_CLASSIFICATION, - 'is_sparse': False, - 'label_num': 3 - } + D.info = {"task": MULTICLASS_CLASSIFICATION, "is_sparse": False, "label_num": 3} D.data = { - 'X_train': X_train, - 'Y_train': Y_train, - 'X_valid': X_valid, - 'Y_valid': Y_valid, - 'X_test': X_test, - 'Y_test': Y_test + "X_train": X_train, + "Y_train": Y_train, + "X_valid": X_valid, + "Y_valid": Y_valid, + "X_test": X_test, + "Y_test": Y_test, } - D.feat_type = {0: 'numerical', - 1: 'Numerical', - 2: 'numerical', - 3: 'numerical'} + D.feat_type = {0: "numerical", 1: "Numerical", 2: "numerical", 3: "numerical"} return D def get_abalone_datamanager(): # https://www.openml.org/d/183 - dataset_name = 'abalone' + dataset_name = "abalone" data = sklearn.datasets.fetch_openml(data_id=183, as_frame=True) feat_type = { - i: 'Categorical' if x.name == 'category' else 'Numerical' - for i, x in enumerate(data['data'].dtypes) + i: "Categorical" if x.name == "category" else "Numerical" + for i, x in enumerate(data["data"].dtypes) } X, y = sklearn.datasets.fetch_openml(data_id=183, return_X_y=True, as_frame=False) y = preprocessing.LabelEncoder().fit_transform(y) @@ -145,17 +175,19 @@ def get_abalone_datamanager(): ) D = XYDataManager( - X_train, y_train, - X_test, y_test, + X_train, + y_train, + X_test, + y_test, MULTICLASS_CLASSIFICATION, feat_type, - dataset_name + dataset_name, ) return D def get_multilabel_classification_datamanager(): - X_train, Y_train, X_test, Y_test = get_dataset('iris') + X_train, Y_train, X_test, Y_test = get_dataset("iris") indices = list(range(X_train.shape[0])) np.random.seed(1) np.random.shuffle(indices) @@ -171,34 +203,35 @@ def get_multilabel_classification_datamanager(): # Y_test_[:, Y_test[i]] = 1 # Y_test = Y_test_ - X_valid = X_test[:25, ] - Y_valid = Y_test[:25, ] - X_test = X_test[25:, ] - Y_test = Y_test[25:, ] + X_valid = X_test[ + :25, + ] + Y_valid = Y_test[ + :25, + ] + X_test = X_test[ + 25:, + ] + Y_test = Y_test[ + 25:, + ] D = Dummy() - D.info = { - 'task': MULTILABEL_CLASSIFICATION, - 'is_sparse': False, - 'label_num': 3 - } + D.info = {"task": MULTILABEL_CLASSIFICATION, "is_sparse": False, "label_num": 3} D.data = { - 'X_train': X_train, - 'Y_train': Y_train, - 'X_valid': X_valid, - 'Y_valid': Y_valid, - 'X_test': X_test, - 'Y_test': Y_test + "X_train": X_train, + "Y_train": Y_train, + "X_valid": X_valid, + "Y_valid": 
Y_valid, + "X_test": X_test, + "Y_test": Y_test, } - D.feat_type = {0: 'numerical', - 1: 'Numerical', - 2: 'numerical', - 3: 'numerical'} + D.feat_type = {0: "numerical", 1: "Numerical", 2: "numerical", 3: "numerical"} return D def get_binary_classification_datamanager(): - X_train, Y_train, X_test, Y_test = get_dataset('iris') + X_train, Y_train, X_test, Y_test = get_dataset("iris") indices = list(range(X_train.shape[0])) np.random.seed(1) np.random.shuffle(indices) @@ -213,99 +246,108 @@ def get_binary_classification_datamanager(): X_test = X_test[eliminate_class_two] Y_test = Y_test[eliminate_class_two] - X_valid = X_test[:25, ] - Y_valid = Y_test[:25, ] - X_test = X_test[25:, ] - Y_test = Y_test[25:, ] + X_valid = X_test[ + :25, + ] + Y_valid = Y_test[ + :25, + ] + X_test = X_test[ + 25:, + ] + Y_test = Y_test[ + 25:, + ] D = Dummy() - D.info = { - 'task': BINARY_CLASSIFICATION, - 'is_sparse': False, - 'label_num': 2 - } + D.info = {"task": BINARY_CLASSIFICATION, "is_sparse": False, "label_num": 2} D.data = { - 'X_train': X_train, - 'Y_train': Y_train.reshape((-1, 1)), - 'X_valid': X_valid, - 'Y_valid': Y_valid.reshape((-1, 1)), - 'X_test': X_test, - 'Y_test': Y_test.reshape((-1, 1)) + "X_train": X_train, + "Y_train": Y_train.reshape((-1, 1)), + "X_valid": X_valid, + "Y_valid": Y_valid.reshape((-1, 1)), + "X_test": X_test, + "Y_test": Y_test.reshape((-1, 1)), } - D.feat_type = {0: 'numerical', - 1: 'Numerical', - 2: 'numerical', - 3: 'numerical'} + D.feat_type = {0: "numerical", 1: "Numerical", 2: "numerical", 3: "numerical"} return D def get_regression_datamanager(): - X_train, Y_train, X_test, Y_test = get_dataset('boston') + X_train, Y_train, X_test, Y_test = get_dataset("boston") indices = list(range(X_train.shape[0])) np.random.seed(1) np.random.shuffle(indices) X_train = X_train[indices] Y_train = Y_train[indices] - X_valid = X_test[:200, ] - Y_valid = Y_test[:200, ] - X_test = X_test[200:, ] - Y_test = Y_test[200:, ] + X_valid = X_test[ + :200, + ] + Y_valid = Y_test[ + :200, + ] + X_test = X_test[ + 200:, + ] + Y_test = Y_test[ + 200:, + ] D = Dummy() - D.info = { - 'task': REGRESSION, - 'is_sparse': False, - 'label_num': 1 - } + D.info = {"task": REGRESSION, "is_sparse": False, "label_num": 1} D.data = { - 'X_train': X_train, - 'Y_train': Y_train.reshape((-1, 1)), - 'X_valid': X_valid, - 'Y_valid': Y_valid.reshape((-1, 1)), - 'X_test': X_test, - 'Y_test': Y_test.reshape((-1, 1)) + "X_train": X_train, + "Y_train": Y_train.reshape((-1, 1)), + "X_valid": X_valid, + "Y_valid": Y_valid.reshape((-1, 1)), + "X_test": X_test, + "Y_test": Y_test.reshape((-1, 1)), } - D.feat_type = {i: 'numerical' for i in range(X_train.shape[1])} + D.feat_type = {i: "numerical" for i in range(X_train.shape[1])} return D def get_500_classes_datamanager(): weights = ([0.002] * 475) + ([0.001] * 25) - X, Y = sklearn.datasets.make_classification(n_samples=1000, - n_features=20, - n_classes=500, - n_clusters_per_class=1, - n_informative=15, - n_redundant=5, - n_repeated=0, - weights=weights, - flip_y=0, - class_sep=1.0, - hypercube=True, - shift=None, - scale=1.0, - shuffle=True, - random_state=1) + X, Y = sklearn.datasets.make_classification( + n_samples=1000, + n_features=20, + n_classes=500, + n_clusters_per_class=1, + n_informative=15, + n_redundant=5, + n_repeated=0, + weights=weights, + flip_y=0, + class_sep=1.0, + hypercube=True, + shift=None, + scale=1.0, + shuffle=True, + random_state=1, + ) D = Dummy() - D.info = { - 'task': MULTICLASS_CLASSIFICATION, - 'is_sparse': False, - 'label_num': 500 + 
D.info = {"task": MULTICLASS_CLASSIFICATION, "is_sparse": False, "label_num": 500} + D.data = { + "X_train": X[:700], + "Y_train": Y[:700], + "X_valid": X[700:710], + "Y_valid": Y[700:710], + "X_test": X[710:], + "Y_test": Y[710:], } - D.data = {'X_train': X[:700], 'Y_train': Y[:700], - 'X_valid': X[700:710], 'Y_valid': Y[700:710], - 'X_test': X[710:], 'Y_test': Y[710:] - } - D.feat_type = {i: 'numerical' for i in range(20)} + D.feat_type = {i: "numerical" for i in range(20)} return D def get_dataset_getters(): - return [get_binary_classification_datamanager, - get_multiclass_classification_datamanager, - get_multilabel_classification_datamanager, - get_500_classes_datamanager, - get_abalone_datamanager, - get_regression_datamanager] + return [ + get_binary_classification_datamanager, + get_multiclass_classification_datamanager, + get_multilabel_classification_datamanager, + get_500_classes_datamanager, + get_abalone_datamanager, + get_regression_datamanager, + ] diff --git a/test/test_evaluation/test_abstract_evaluator.py b/test/test_evaluation/test_abstract_evaluator.py index f51820221b..c668a82ffd 100644 --- a/test/test_evaluation/test_abstract_evaluator.py +++ b/test/test_evaluation/test_abstract_evaluator.py @@ -3,19 +3,18 @@ import os import shutil import sys +import tempfile import unittest import unittest.mock -import tempfile import numpy as np import sklearn.dummy +from smac.tae import StatusType from autosklearn.automl_common.common.utils.backend import Backend, BackendContext - from autosklearn.evaluation.abstract_evaluator import AbstractEvaluator -from autosklearn.pipeline.components.base import _addons from autosklearn.metrics import accuracy -from smac.tae import StatusType +from autosklearn.pipeline.components.base import _addons this_directory = os.path.dirname(__file__) sys.path.append(this_directory) @@ -29,7 +28,7 @@ def setUp(self): """ Creates a backend mock """ - self.ev_path = os.path.join(this_directory, '.tmp_evaluations') + self.ev_path = os.path.join(this_directory, ".tmp_evaluations") if not os.path.exists(self.ev_path): os.mkdir(self.ev_path) dummy_model_files = [os.path.join(self.ev_path, str(n)) for n in range(100)] @@ -46,7 +45,7 @@ def setUp(self): self.port = logging.handlers.DEFAULT_TCP_LOGGING_PORT - self.working_directory = os.path.join(this_directory, '.tmp_%s' % self.id()) + self.working_directory = os.path.join(this_directory, ".tmp_%s" % self.id()) def tearDown(self): if os.path.exists(self.ev_path): @@ -56,16 +55,18 @@ def tearDown(self): pass def test_finish_up_model_predicts_NaN(self): - '''Tests by handing in predictions which contain NaNs''' + """Tests by handing in predictions which contain NaNs""" rs = np.random.RandomState(1) queue_mock = unittest.mock.Mock() - ae = AbstractEvaluator(backend=self.backend_mock, - port=self.port, - output_y_hat_optimization=False, - queue=queue_mock, metric=accuracy, - additional_components=dict(), - ) + ae = AbstractEvaluator( + backend=self.backend_mock, + port=self.port, + output_y_hat_optimization=False, + queue=queue_mock, + metric=accuracy, + additional_components=dict(), + ) ae.Y_optimization = rs.rand(33, 3) predictions_ensemble = rs.rand(33, 3) predictions_test = rs.rand(25, 3) @@ -85,9 +86,10 @@ def test_finish_up_model_predicts_NaN(self): status=StatusType.SUCCESS, ) self.assertEqual(loss, 1.0) - self.assertEqual(additional_run_info, - {'error': 'Model predictions for optimization set ' - 'contains NaNs.'}) + self.assertEqual( + additional_run_info, + {"error": "Model predictions for optimization 
set " "contains NaNs."}, + ) # NaNs in prediction validation predictions_ensemble[5, 2] = 0.5 @@ -104,9 +106,10 @@ def test_finish_up_model_predicts_NaN(self): status=StatusType.SUCCESS, ) self.assertEqual(loss, 1.0) - self.assertEqual(additional_run_info, - {'error': 'Model predictions for validation set ' - 'contains NaNs.'}) + self.assertEqual( + additional_run_info, + {"error": "Model predictions for validation set " "contains NaNs."}, + ) # NaNs in prediction test predictions_valid[5, 2] = 0.5 @@ -123,9 +126,10 @@ def test_finish_up_model_predicts_NaN(self): status=StatusType.SUCCESS, ) self.assertEqual(loss, 1.0) - self.assertEqual(additional_run_info, - {'error': 'Model predictions for test set contains ' - 'NaNs.'}) + self.assertEqual( + additional_run_info, + {"error": "Model predictions for test set contains " "NaNs."}, + ) self.assertEqual(self.backend_mock.save_predictions_as_npy.call_count, 0) @@ -147,12 +151,10 @@ def test_disable_file_output(self): predictions_test = rs.rand(25, 3) predictions_valid = rs.rand(25, 3) - loss_, additional_run_info_ = ( - ae.file_output( - predictions_ensemble, - predictions_valid, - predictions_test, - ) + loss_, additional_run_info_ = ae.file_output( + predictions_ensemble, + predictions_valid, + predictions_test, ) self.assertIsNone(loss_) @@ -160,7 +162,7 @@ def test_disable_file_output(self): # This function is never called as there is a return before self.assertEqual(self.backend_mock.save_numrun_to_dir.call_count, 0) - for call_count, disable in enumerate(['model', 'cv_model'], start=1): + for call_count, disable in enumerate(["model", "cv_model"], start=1): ae = AbstractEvaluator( backend=self.backend_mock, output_y_hat_optimization=False, @@ -174,38 +176,49 @@ def test_disable_file_output(self): ae.model = unittest.mock.Mock() ae.models = [unittest.mock.Mock()] - loss_, additional_run_info_ = ( - ae.file_output( - predictions_ensemble, - predictions_valid, - predictions_test, - ) + loss_, additional_run_info_ = ae.file_output( + predictions_ensemble, + predictions_valid, + predictions_test, ) self.assertIsNone(loss_) self.assertEqual(additional_run_info_, {}) - self.assertEqual(self.backend_mock.save_numrun_to_dir.call_count, call_count) - if disable == 'model': + self.assertEqual( + self.backend_mock.save_numrun_to_dir.call_count, call_count + ) + if disable == "model": self.assertIsNone( - self.backend_mock.save_numrun_to_dir.call_args_list[-1][1]['model']) + self.backend_mock.save_numrun_to_dir.call_args_list[-1][1]["model"] + ) self.assertIsNotNone( - self.backend_mock.save_numrun_to_dir.call_args_list[-1][1]['cv_model']) + self.backend_mock.save_numrun_to_dir.call_args_list[-1][1][ + "cv_model" + ] + ) else: self.assertIsNotNone( - self.backend_mock.save_numrun_to_dir.call_args_list[-1][1]['model']) + self.backend_mock.save_numrun_to_dir.call_args_list[-1][1]["model"] + ) self.assertIsNone( - self.backend_mock.save_numrun_to_dir.call_args_list[-1][1]['cv_model']) + self.backend_mock.save_numrun_to_dir.call_args_list[-1][1][ + "cv_model" + ] + ) self.assertIsNotNone( self.backend_mock.save_numrun_to_dir.call_args_list[-1][1][ - 'ensemble_predictions'] + "ensemble_predictions" + ] ) self.assertIsNotNone( self.backend_mock.save_numrun_to_dir.call_args_list[-1][1][ - 'valid_predictions'] + "valid_predictions" + ] ) self.assertIsNotNone( self.backend_mock.save_numrun_to_dir.call_args_list[-1][1][ - 'test_predictions'] + "test_predictions" + ] ) ae = AbstractEvaluator( @@ -213,20 +226,18 @@ def test_disable_file_output(self): 
output_y_hat_optimization=False, queue=queue_mock, metric=accuracy, - disable_file_output=['y_optimization'], + disable_file_output=["y_optimization"], port=self.port, additional_components=dict(), ) ae.Y_optimization = predictions_ensemble - ae.model = 'model' + ae.model = "model" ae.models = [unittest.mock.Mock()] - loss_, additional_run_info_ = ( - ae.file_output( - predictions_ensemble, - predictions_valid, - predictions_test, - ) + loss_, additional_run_info_ = ae.file_output( + predictions_ensemble, + predictions_valid, + predictions_test, ) self.assertIsNone(loss_) @@ -234,15 +245,18 @@ def test_disable_file_output(self): self.assertIsNone( self.backend_mock.save_numrun_to_dir.call_args_list[-1][1][ - 'ensemble_predictions'] + "ensemble_predictions" + ] ) self.assertIsNotNone( self.backend_mock.save_numrun_to_dir.call_args_list[-1][1][ - 'valid_predictions'] + "valid_predictions" + ] ) self.assertIsNotNone( self.backend_mock.save_numrun_to_dir.call_args_list[-1][1][ - 'test_predictions'] + "test_predictions" + ] ) def test_file_output(self): @@ -252,14 +266,18 @@ def test_file_output(self): queue_mock = unittest.mock.Mock() context = BackendContext( - temporary_directory=os.path.join(self.working_directory, 'tmp'), - output_directory=os.path.join(self.working_directory, 'tmp_output'), + temporary_directory=os.path.join(self.working_directory, "tmp"), + output_directory=os.path.join(self.working_directory, "tmp_output"), delete_tmp_folder_after_terminate=True, delete_output_folder_after_terminate=True, - prefix="auto-sklearn" + prefix="auto-sklearn", ) - with unittest.mock.patch.object(Backend, 'load_datamanager') as load_datamanager_mock: - load_datamanager_mock.return_value = get_multiclass_classification_datamanager() + with unittest.mock.patch.object( + Backend, "load_datamanager" + ) as load_datamanager_mock: + load_datamanager_mock.return_value = ( + get_multiclass_classification_datamanager() + ) backend = Backend(context, prefix="auto-sklearn") @@ -285,8 +303,17 @@ def test_file_output(self): Y_test_pred=predictions_test, ) - self.assertTrue(os.path.exists(os.path.join(self.working_directory, 'tmp', - '.auto-sklearn', 'runs', '1_0_None'))) + self.assertTrue( + os.path.exists( + os.path.join( + self.working_directory, + "tmp", + ".auto-sklearn", + "runs", + "1_0_None", + ) + ) + ) shutil.rmtree(self.working_directory, ignore_errors=True) @@ -297,26 +324,34 @@ def test_add_additional_components(self): queue_mock = unittest.mock.Mock() context = BackendContext( - temporary_directory=os.path.join(self.working_directory, 'tmp'), - output_directory=os.path.join(self.working_directory, 'tmp_output'), + temporary_directory=os.path.join(self.working_directory, "tmp"), + output_directory=os.path.join(self.working_directory, "tmp_output"), delete_tmp_folder_after_terminate=True, delete_output_folder_after_terminate=True, - prefix="auto-sklearn" + prefix="auto-sklearn", ) - with unittest.mock.patch.object(Backend, 'load_datamanager') as load_datamanager_mock: - load_datamanager_mock.return_value = get_multiclass_classification_datamanager() + with unittest.mock.patch.object( + Backend, "load_datamanager" + ) as load_datamanager_mock: + load_datamanager_mock.return_value = ( + get_multiclass_classification_datamanager() + ) backend = Backend(context, prefix="auto-sklearn") - with unittest.mock.patch.object(_addons['classification'], 'add_component') as _: + with unittest.mock.patch.object( + _addons["classification"], "add_component" + ) as _: - # If the components in the argument 
`additional_components` are an empty dict - # there is no call to `add_component`, if there's something in it, `add_component - # is called (2nd case) - for fixture, case in ((0, dict()), (1, dict(abc='def'))): + # If the components in the argument `additional_components` are an + # empty dict there is no call to `add_component`, + # if there's something in it, `add_component is called (2nd case) + for fixture, case in ((0, dict()), (1, dict(abc="def"))): thirdparty_components_patch = unittest.mock.Mock() thirdparty_components_patch.components = case - additional_components = dict(classification=thirdparty_components_patch) + additional_components = dict( + classification=thirdparty_components_patch + ) AbstractEvaluator( backend=backend, output_y_hat_optimization=False, @@ -325,4 +360,6 @@ def test_add_additional_components(self): port=self.port, additional_components=additional_components, ) - self.assertEqual(_addons['classification'].add_component.call_count, fixture) + self.assertEqual( + _addons["classification"].add_component.call_count, fixture + ) diff --git a/test/test_evaluation/test_custom_splitters.py b/test/test_evaluation/test_custom_splitters.py index 4922442228..64f9dc2f18 100644 --- a/test/test_evaluation/test_custom_splitters.py +++ b/test/test_evaluation/test_custom_splitters.py @@ -1,37 +1,44 @@ -import pytest - import numpy as np +import pytest -from autosklearn.evaluation.splitter import CustomStratifiedShuffleSplit from autosklearn.constants import ( - BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION, MULTILABEL_CLASSIFICATION + BINARY_CLASSIFICATION, + MULTICLASS_CLASSIFICATION, + MULTILABEL_CLASSIFICATION, ) +from autosklearn.evaluation.splitter import CustomStratifiedShuffleSplit -@pytest.mark.parametrize("task, X, y", [ - ( - BINARY_CLASSIFICATION, - np.asarray(10000 * [[1, 1, 1, 1, 1]]), - np.asarray(9999 * [0] + 1 * [1]) - ), - ( - MULTICLASS_CLASSIFICATION, - np.asarray(10000 * [[1, 1, 1, 1, 1]]), - np.asarray(4999 * [1] + 4999 * [2] + 1 * [3] + 1 * [4])), - ( - MULTILABEL_CLASSIFICATION, - np.asarray(10000 * [[1, 1, 1, 1, 1]]), - np.asarray(4999 * [[0, 1, 1]] + 4999 * [[1, 1, 0]] + 1 * [[1, 0, 1]] + 1 * [[0, 0, 0]]) - ) -]) -@pytest.mark.parametrize('train_size', [100, 0.5, 200, 0.75]) +@pytest.mark.parametrize( + "task, X, y", + [ + ( + BINARY_CLASSIFICATION, + np.asarray(10000 * [[1, 1, 1, 1, 1]]), + np.asarray(9999 * [0] + 1 * [1]), + ), + ( + MULTICLASS_CLASSIFICATION, + np.asarray(10000 * [[1, 1, 1, 1, 1]]), + np.asarray(4999 * [1] + 4999 * [2] + 1 * [3] + 1 * [4]), + ), + ( + MULTILABEL_CLASSIFICATION, + np.asarray(10000 * [[1, 1, 1, 1, 1]]), + np.asarray( + 4999 * [[0, 1, 1]] + + 4999 * [[1, 1, 0]] + + 1 * [[1, 0, 1]] + + 1 * [[0, 0, 0]] + ), + ), + ], +) +@pytest.mark.parametrize("train_size", [100, 0.5, 200, 0.75]) def test_custom_stratified_shuffle_split_returns_unique_labels_and_maintains_size( task, X, y, train_size ): - splitter = CustomStratifiedShuffleSplit( - train_size=train_size, - random_state=1 - ) + splitter = CustomStratifiedShuffleSplit(train_size=train_size, random_state=1) left_idxs, _ = next(splitter.split(X=X, y=y)) y_sampled = y[left_idxs] X_sampled = X[left_idxs] @@ -46,5 +53,6 @@ def test_custom_stratified_shuffle_split_returns_unique_labels_and_maintains_siz assert len(X_sampled) == n_samples # Assert all the unique labels are present in the training set - assert all(label in np.unique(y_sampled) for label in np.unique(y)), \ - f"{task} failed, {np.unique(y)} != {np.unique(y_sampled)}" + assert all( + label in 
np.unique(y_sampled) for label in np.unique(y) + ), f"{task} failed, {np.unique(y)} != {np.unique(y_sampled)}" diff --git a/test/test_evaluation/test_dummy_pipelines.py b/test/test_evaluation/test_dummy_pipelines.py index ed7c499711..3d5f1d0f59 100644 --- a/test/test_evaluation/test_dummy_pipelines.py +++ b/test/test_evaluation/test_dummy_pipelines.py @@ -1,20 +1,21 @@ import numpy as np - import pytest - from sklearn.base import clone from sklearn.datasets import make_classification, make_regression from sklearn.utils.validation import check_is_fitted -from autosklearn.evaluation.abstract_evaluator import MyDummyClassifier, MyDummyRegressor +from autosklearn.evaluation.abstract_evaluator import ( + MyDummyClassifier, + MyDummyRegressor, +) -@pytest.mark.parametrize("task_type", ['classification', 'regression']) +@pytest.mark.parametrize("task_type", ["classification", "regression"]) def test_dummy_pipeline(task_type): - if task_type == 'classification': + if task_type == "classification": estimator_class = MyDummyClassifier data_maker = make_classification - elif task_type == 'regression': + elif task_type == "regression": estimator_class = MyDummyRegressor data_maker = make_regression else: diff --git a/test/test_evaluation/test_evaluation.py b/test/test_evaluation/test_evaluation.py index 77f6e5c4bf..67d9e0ca8b 100644 --- a/test/test_evaluation/test_evaluation.py +++ b/test/test_evaluation/test_evaluation.py @@ -1,6 +1,6 @@ -import os import logging import logging.handlers +import os import shutil import sys import time @@ -19,20 +19,21 @@ this_directory = os.path.dirname(__file__) sys.path.append(this_directory) -from evaluation_util import get_multiclass_classification_datamanager, get_evaluation_backend # noqa E402 +from evaluation_util import ( # noqa E402 + get_evaluation_backend, + get_multiclass_classification_datamanager, +) def safe_eval_success_mock(*args, **kwargs): - queue = kwargs['queue'] - queue.put({'status': StatusType.SUCCESS, - 'loss': 0.5, - 'additional_run_info': ''}) + queue = kwargs["queue"] + queue.put({"status": StatusType.SUCCESS, "loss": 0.5, "additional_run_info": ""}) class EvaluationTest(unittest.TestCase): def setUp(self): self.datamanager = get_multiclass_classification_datamanager() - self.tmp = os.path.join(os.getcwd(), '.test_evaluation') + self.tmp = os.path.join(os.getcwd(), ".test_evaluation") self.logger = logging.getLogger() scenario_mock = unittest.mock.Mock() scenario_mock.wallclock_limit = 10 @@ -72,142 +73,211 @@ def test_pynisher_timeout(self): def run_over_time(): time.sleep(2) - safe_eval = pynisher.enforce_limits(wall_time_in_s=1, - grace_period_in_s=0)(run_over_time) + safe_eval = pynisher.enforce_limits(wall_time_in_s=1, grace_period_in_s=0)( + run_over_time + ) safe_eval() self.assertEqual(safe_eval.exit_status, pynisher.TimeoutException) ############################################################################ # Test ExecuteTaFuncWithQueue.run_wrapper() - @unittest.mock.patch('autosklearn.evaluation.train_evaluator.eval_holdout') + @unittest.mock.patch("autosklearn.evaluation.train_evaluator.eval_holdout") def test_eval_with_limits_holdout(self, pynisher_mock): pynisher_mock.side_effect = safe_eval_success_mock config = unittest.mock.Mock() config.config_id = 198 - ta = ExecuteTaFuncWithQueue(backend=self.backend, autosklearn_seed=1, - port=self.logger_port, - resampling_strategy='holdout', - stats=self.stats, - memory_limit=3072, - metric=accuracy, - cost_for_crash=get_cost_of_crash(accuracy), - abort_on_first_run_crash=False, - 
pynisher_context='fork', - ) - info = ta.run_wrapper(RunInfo(config=config, cutoff=30, instance=None, - instance_specific=None, seed=1, capped=False)) + ta = ExecuteTaFuncWithQueue( + backend=self.backend, + autosklearn_seed=1, + port=self.logger_port, + resampling_strategy="holdout", + stats=self.stats, + memory_limit=3072, + metric=accuracy, + cost_for_crash=get_cost_of_crash(accuracy), + abort_on_first_run_crash=False, + pynisher_context="fork", + ) + info = ta.run_wrapper( + RunInfo( + config=config, + cutoff=30, + instance=None, + instance_specific=None, + seed=1, + capped=False, + ) + ) self.assertEqual(info[0].config.config_id, 198) self.assertEqual(info[1].status, StatusType.SUCCESS) self.assertEqual(info[1].cost, 0.5) self.assertIsInstance(info[1].time, float) - @unittest.mock.patch('pynisher.enforce_limits') + @unittest.mock.patch("pynisher.enforce_limits") def test_zero_or_negative_cutoff(self, pynisher_mock): config = unittest.mock.Mock() config.config_id = 198 - ta = ExecuteTaFuncWithQueue(backend=self.backend, autosklearn_seed=1, - port=self.logger_port, - resampling_strategy='holdout', - stats=self.stats, - metric=accuracy, - cost_for_crash=get_cost_of_crash(accuracy), - abort_on_first_run_crash=False, - pynisher_context='forkserver', - ) + ta = ExecuteTaFuncWithQueue( + backend=self.backend, + autosklearn_seed=1, + port=self.logger_port, + resampling_strategy="holdout", + stats=self.stats, + metric=accuracy, + cost_for_crash=get_cost_of_crash(accuracy), + abort_on_first_run_crash=False, + pynisher_context="forkserver", + ) self.scenario.wallclock_limit = 5 self.stats.submitted_ta_runs += 1 - run_info, run_value = ta.run_wrapper(RunInfo(config=config, cutoff=9, instance=None, - instance_specific=None, seed=1, capped=False)) + run_info, run_value = ta.run_wrapper( + RunInfo( + config=config, + cutoff=9, + instance=None, + instance_specific=None, + seed=1, + capped=False, + ) + ) self.assertEqual(run_value.status, StatusType.STOP) - @unittest.mock.patch('pynisher.enforce_limits') + @unittest.mock.patch("pynisher.enforce_limits") def test_cutoff_lower_than_remaining_time(self, pynisher_mock): config = unittest.mock.Mock() config.config_id = 198 - ta = ExecuteTaFuncWithQueue(backend=self.backend, autosklearn_seed=1, - port=self.logger_port, - resampling_strategy='holdout', - stats=self.stats, - metric=accuracy, - cost_for_crash=get_cost_of_crash(accuracy), - abort_on_first_run_crash=False, - pynisher_context='forkserver', - ) + ta = ExecuteTaFuncWithQueue( + backend=self.backend, + autosklearn_seed=1, + port=self.logger_port, + resampling_strategy="holdout", + stats=self.stats, + metric=accuracy, + cost_for_crash=get_cost_of_crash(accuracy), + abort_on_first_run_crash=False, + pynisher_context="forkserver", + ) self.stats.ta_runs = 1 - ta.run_wrapper(RunInfo(config=config, cutoff=30, instance=None, instance_specific=None, - seed=1, capped=False)) - self.assertEqual(pynisher_mock.call_args[1]['wall_time_in_s'], 4) - self.assertIsInstance(pynisher_mock.call_args[1]['wall_time_in_s'], int) + ta.run_wrapper( + RunInfo( + config=config, + cutoff=30, + instance=None, + instance_specific=None, + seed=1, + capped=False, + ) + ) + self.assertEqual(pynisher_mock.call_args[1]["wall_time_in_s"], 4) + self.assertIsInstance(pynisher_mock.call_args[1]["wall_time_in_s"], int) - @unittest.mock.patch('autosklearn.evaluation.train_evaluator.eval_holdout') + @unittest.mock.patch("autosklearn.evaluation.train_evaluator.eval_holdout") def test_eval_with_limits_holdout_fail_silent(self, 
pynisher_mock): pynisher_mock.return_value = None config = unittest.mock.Mock() - config.origin = 'MOCK' + config.origin = "MOCK" config.config_id = 198 - ta = ExecuteTaFuncWithQueue(backend=self.backend, autosklearn_seed=1, - port=self.logger_port, - resampling_strategy='holdout', - stats=self.stats, - memory_limit=3072, - metric=accuracy, - cost_for_crash=get_cost_of_crash(accuracy), - abort_on_first_run_crash=False, - pynisher_context='fork', - ) + ta = ExecuteTaFuncWithQueue( + backend=self.backend, + autosklearn_seed=1, + port=self.logger_port, + resampling_strategy="holdout", + stats=self.stats, + memory_limit=3072, + metric=accuracy, + cost_for_crash=get_cost_of_crash(accuracy), + abort_on_first_run_crash=False, + pynisher_context="fork", + ) # The following should not fail because abort on first config crashed is false - info = ta.run_wrapper(RunInfo(config=config, cutoff=60, instance=None, - instance_specific=None, seed=1, capped=False)) + info = ta.run_wrapper( + RunInfo( + config=config, + cutoff=60, + instance=None, + instance_specific=None, + seed=1, + capped=False, + ) + ) self.assertEqual(info[1].status, StatusType.CRASHED) self.assertEqual(info[1].cost, 1.0) self.assertIsInstance(info[1].time, float) - self.assertEqual(info[1].additional_info, {'configuration_origin': 'MOCK', - 'error': "Result queue is empty", - 'exit_status': 0, - 'exitcode': 0, - 'subprocess_stdout': '', - 'subprocess_stderr': ''}) + self.assertEqual( + info[1].additional_info, + { + "configuration_origin": "MOCK", + "error": "Result queue is empty", + "exit_status": 0, + "exitcode": 0, + "subprocess_stdout": "", + "subprocess_stderr": "", + }, + ) self.stats.submitted_ta_runs += 1 - info = ta.run_wrapper(RunInfo(config=config, cutoff=30, instance=None, - instance_specific=None, seed=1, capped=False)) + info = ta.run_wrapper( + RunInfo( + config=config, + cutoff=30, + instance=None, + instance_specific=None, + seed=1, + capped=False, + ) + ) self.assertEqual(info[1].status, StatusType.CRASHED) self.assertEqual(info[1].cost, 1.0) self.assertIsInstance(info[1].time, float) - self.assertEqual(info[1].additional_info, {'configuration_origin': 'MOCK', - 'error': "Result queue is empty", - 'exit_status': 0, - 'exitcode': 0, - 'subprocess_stdout': '', - 'subprocess_stderr': ''}) - - @unittest.mock.patch('autosklearn.evaluation.train_evaluator.eval_holdout') + self.assertEqual( + info[1].additional_info, + { + "configuration_origin": "MOCK", + "error": "Result queue is empty", + "exit_status": 0, + "exitcode": 0, + "subprocess_stdout": "", + "subprocess_stderr": "", + }, + ) + + @unittest.mock.patch("autosklearn.evaluation.train_evaluator.eval_holdout") def test_eval_with_limits_holdout_fail_memory_error(self, pynisher_mock): pynisher_mock.side_effect = MemoryError config = unittest.mock.Mock() config.config_id = 198 - ta = ExecuteTaFuncWithQueue(backend=self.backend, autosklearn_seed=1, - port=self.logger_port, - resampling_strategy='holdout', - stats=self.stats, - memory_limit=3072, - metric=log_loss, - cost_for_crash=get_cost_of_crash(log_loss), - abort_on_first_run_crash=False, - pynisher_context='fork', - ) - info = ta.run_wrapper(RunInfo(config=config, cutoff=30, instance=None, - instance_specific=None, seed=1, capped=False)) + ta = ExecuteTaFuncWithQueue( + backend=self.backend, + autosklearn_seed=1, + port=self.logger_port, + resampling_strategy="holdout", + stats=self.stats, + memory_limit=3072, + metric=log_loss, + cost_for_crash=get_cost_of_crash(log_loss), + abort_on_first_run_crash=False, + 
pynisher_context="fork", + ) + info = ta.run_wrapper( + RunInfo( + config=config, + cutoff=30, + instance=None, + instance_specific=None, + seed=1, + capped=False, + ) + ) self.assertEqual(info[1].status, StatusType.MEMOUT) # For logloss, worst possible result is MAXINT worst_possible_result = MAXINT self.assertEqual(info[1].cost, worst_possible_result) self.assertIsInstance(info[1].time, float) - self.assertNotIn('exitcode', info[1].additional_info) + self.assertNotIn("exitcode", info[1].additional_info) - @unittest.mock.patch('pynisher.enforce_limits') + @unittest.mock.patch("pynisher.enforce_limits") def test_eval_with_limits_holdout_fail_timeout(self, pynisher_mock): config = unittest.mock.Mock() config.config_id = 198 @@ -218,33 +288,46 @@ def test_eval_with_limits_holdout_fail_timeout(self, pynisher_mock): pynisher_mock.return_value = m1 m2.exit_status = pynisher.TimeoutException m2.wall_clock_time = 30 - ta = ExecuteTaFuncWithQueue(backend=self.backend, autosklearn_seed=1, - port=self.logger_port, - resampling_strategy='holdout', - stats=self.stats, - memory_limit=3072, - metric=accuracy, - cost_for_crash=get_cost_of_crash(accuracy), - abort_on_first_run_crash=False, - pynisher_context='forkserver', - ) - info = ta.run_wrapper(RunInfo(config=config, cutoff=30, instance=None, - instance_specific=None, seed=1, capped=False)) + ta = ExecuteTaFuncWithQueue( + backend=self.backend, + autosklearn_seed=1, + port=self.logger_port, + resampling_strategy="holdout", + stats=self.stats, + memory_limit=3072, + metric=accuracy, + cost_for_crash=get_cost_of_crash(accuracy), + abort_on_first_run_crash=False, + pynisher_context="forkserver", + ) + info = ta.run_wrapper( + RunInfo( + config=config, + cutoff=30, + instance=None, + instance_specific=None, + seed=1, + capped=False, + ) + ) self.assertEqual(info[1].status, StatusType.TIMEOUT) self.assertEqual(info[1].cost, 1.0) self.assertIsInstance(info[1].time, float) - self.assertNotIn('exitcode', info[1].additional_info) + self.assertNotIn("exitcode", info[1].additional_info) - @unittest.mock.patch('pynisher.enforce_limits') - def test_eval_with_limits_holdout_timeout_with_results_in_queue(self, pynisher_mock): + @unittest.mock.patch("pynisher.enforce_limits") + def test_eval_with_limits_holdout_timeout_with_results_in_queue( + self, pynisher_mock + ): config = unittest.mock.Mock() config.config_id = 198 def side_effect(**kwargs): - queue = kwargs['queue'] - queue.put({'status': StatusType.SUCCESS, - 'loss': 0.5, - 'additional_run_info': {}}) + queue = kwargs["queue"] + queue.put( + {"status": StatusType.SUCCESS, "loss": 0.5, "additional_run_info": {}} + ) + m1 = unittest.mock.Mock() m2 = unittest.mock.Mock() m1.return_value = m2 @@ -254,137 +337,194 @@ def side_effect(**kwargs): m2.wall_clock_time = 30 # Test for a succesful run - ta = ExecuteTaFuncWithQueue(backend=self.backend, autosklearn_seed=1, - port=self.logger_port, - resampling_strategy='holdout', - stats=self.stats, - memory_limit=3072, - metric=accuracy, - cost_for_crash=get_cost_of_crash(accuracy), - abort_on_first_run_crash=False, - pynisher_context='forkserver', - ) - info = ta.run_wrapper(RunInfo(config=config, cutoff=30, instance=None, - instance_specific=None, seed=1, capped=False)) + ta = ExecuteTaFuncWithQueue( + backend=self.backend, + autosklearn_seed=1, + port=self.logger_port, + resampling_strategy="holdout", + stats=self.stats, + memory_limit=3072, + metric=accuracy, + cost_for_crash=get_cost_of_crash(accuracy), + abort_on_first_run_crash=False, + 
pynisher_context="forkserver", + ) + info = ta.run_wrapper( + RunInfo( + config=config, + cutoff=30, + instance=None, + instance_specific=None, + seed=1, + capped=False, + ) + ) self.assertEqual(info[1].status, StatusType.SUCCESS) self.assertEqual(info[1].cost, 0.5) self.assertIsInstance(info[1].time, float) - self.assertNotIn('exitcode', info[1].additional_info) + self.assertNotIn("exitcode", info[1].additional_info) # And a crashed run which is in the queue def side_effect(**kwargs): - queue = kwargs['queue'] - queue.put({'status': StatusType.CRASHED, - 'loss': 2.0, - 'additional_run_info': {}}) + queue = kwargs["queue"] + queue.put( + {"status": StatusType.CRASHED, "loss": 2.0, "additional_run_info": {}} + ) + m2.side_effect = side_effect - ta = ExecuteTaFuncWithQueue(backend=self.backend, autosklearn_seed=1, - port=self.logger_port, - resampling_strategy='holdout', - stats=self.stats, - memory_limit=3072, - metric=accuracy, - cost_for_crash=get_cost_of_crash(accuracy), - abort_on_first_run_crash=False, - pynisher_context='forkserver', - ) - info = ta.run_wrapper(RunInfo(config=config, cutoff=30, instance=None, - instance_specific=None, seed=1, capped=False)) + ta = ExecuteTaFuncWithQueue( + backend=self.backend, + autosklearn_seed=1, + port=self.logger_port, + resampling_strategy="holdout", + stats=self.stats, + memory_limit=3072, + metric=accuracy, + cost_for_crash=get_cost_of_crash(accuracy), + abort_on_first_run_crash=False, + pynisher_context="forkserver", + ) + info = ta.run_wrapper( + RunInfo( + config=config, + cutoff=30, + instance=None, + instance_specific=None, + seed=1, + capped=False, + ) + ) self.assertEqual(info[1].status, StatusType.CRASHED) self.assertEqual(info[1].cost, 1.0) self.assertIsInstance(info[1].time, float) - self.assertNotIn('exitcode', info[1].additional_info) + self.assertNotIn("exitcode", info[1].additional_info) - @unittest.mock.patch('autosklearn.evaluation.train_evaluator.eval_holdout') + @unittest.mock.patch("autosklearn.evaluation.train_evaluator.eval_holdout") def test_eval_with_limits_holdout_2(self, eval_houldout_mock): config = unittest.mock.Mock() config.config_id = 198 def side_effect(*args, **kwargs): - queue = kwargs['queue'] - queue.put({'status': StatusType.SUCCESS, - 'loss': 0.5, - 'additional_run_info': kwargs['instance']}) + queue = kwargs["queue"] + queue.put( + { + "status": StatusType.SUCCESS, + "loss": 0.5, + "additional_run_info": kwargs["instance"], + } + ) + eval_houldout_mock.side_effect = side_effect - ta = ExecuteTaFuncWithQueue(backend=self.backend, autosklearn_seed=1, - port=self.logger_port, - resampling_strategy='holdout', - stats=self.stats, - memory_limit=3072, - metric=accuracy, - cost_for_crash=get_cost_of_crash(accuracy), - abort_on_first_run_crash=False, - pynisher_context='fork', - ) + ta = ExecuteTaFuncWithQueue( + backend=self.backend, + autosklearn_seed=1, + port=self.logger_port, + resampling_strategy="holdout", + stats=self.stats, + memory_limit=3072, + metric=accuracy, + cost_for_crash=get_cost_of_crash(accuracy), + abort_on_first_run_crash=False, + pynisher_context="fork", + ) self.scenario.wallclock_limit = 180 instance = "{'subsample': 30}" - info = ta.run_wrapper(RunInfo(config=config, cutoff=30, instance=instance, - instance_specific=None, seed=1, capped=False)) + info = ta.run_wrapper( + RunInfo( + config=config, + cutoff=30, + instance=instance, + instance_specific=None, + seed=1, + capped=False, + ) + ) self.assertEqual(info[1].status, StatusType.SUCCESS) self.assertEqual(len(info[1].additional_info), 
2) - self.assertIn('configuration_origin', info[1].additional_info) - self.assertEqual(info[1].additional_info['message'], "{'subsample': 30}") + self.assertIn("configuration_origin", info[1].additional_info) + self.assertEqual(info[1].additional_info["message"], "{'subsample': 30}") - @unittest.mock.patch('autosklearn.evaluation.train_evaluator.eval_holdout') + @unittest.mock.patch("autosklearn.evaluation.train_evaluator.eval_holdout") def test_exception_in_target_function(self, eval_holdout_mock): config = unittest.mock.Mock() config.config_id = 198 eval_holdout_mock.side_effect = ValueError - ta = ExecuteTaFuncWithQueue(backend=self.backend, autosklearn_seed=1, - port=self.logger_port, - resampling_strategy='holdout', - stats=self.stats, - memory_limit=3072, - metric=accuracy, - cost_for_crash=get_cost_of_crash(accuracy), - abort_on_first_run_crash=False, - pynisher_context='fork', - ) + ta = ExecuteTaFuncWithQueue( + backend=self.backend, + autosklearn_seed=1, + port=self.logger_port, + resampling_strategy="holdout", + stats=self.stats, + memory_limit=3072, + metric=accuracy, + cost_for_crash=get_cost_of_crash(accuracy), + abort_on_first_run_crash=False, + pynisher_context="fork", + ) self.stats.submitted_ta_runs += 1 - info = ta.run_wrapper(RunInfo(config=config, cutoff=30, instance=None, - instance_specific=None, seed=1, capped=False)) + info = ta.run_wrapper( + RunInfo( + config=config, + cutoff=30, + instance=None, + instance_specific=None, + seed=1, + capped=False, + ) + ) self.assertEqual(info[1].status, StatusType.CRASHED) self.assertEqual(info[1].cost, 1.0) self.assertIsInstance(info[1].time, float) - self.assertEqual(info[1].additional_info['error'], 'ValueError()') - self.assertIn('traceback', info[1].additional_info) - self.assertNotIn('exitcode', info[1].additional_info) + self.assertEqual(info[1].additional_info["error"], "ValueError()") + self.assertIn("traceback", info[1].additional_info) + self.assertNotIn("exitcode", info[1].additional_info) def test_silent_exception_in_target_function(self): config = unittest.mock.Mock() config.config_id = 198 - delattr(self.backend, 'save_targets_ensemble') - ta = ExecuteTaFuncWithQueue(backend=self.backend, - port=self.logger_port, - autosklearn_seed=1, - resampling_strategy='holdout', - stats=self.stats, - memory_limit=3072, - metric=accuracy, - cost_for_crash=get_cost_of_crash(accuracy), - abort_on_first_run_crash=False, - iterative=False, - pynisher_context='fork', - ) + delattr(self.backend, "save_targets_ensemble") + ta = ExecuteTaFuncWithQueue( + backend=self.backend, + port=self.logger_port, + autosklearn_seed=1, + resampling_strategy="holdout", + stats=self.stats, + memory_limit=3072, + metric=accuracy, + cost_for_crash=get_cost_of_crash(accuracy), + abort_on_first_run_crash=False, + iterative=False, + pynisher_context="fork", + ) ta.pynisher_logger = unittest.mock.Mock() self.stats.submitted_ta_runs += 1 - info = ta.run_wrapper(RunInfo(config=config, cutoff=3000, instance=None, - instance_specific=None, seed=1, capped=False)) - self.assertEqual(info[1].status, StatusType.CRASHED, msg=str(info[1].additional_info)) + info = ta.run_wrapper( + RunInfo( + config=config, + cutoff=3000, + instance=None, + instance_specific=None, + seed=1, + capped=False, + ) + ) + self.assertEqual( + info[1].status, StatusType.CRASHED, msg=str(info[1].additional_info) + ) self.assertEqual(info[1].cost, 1.0) self.assertIsInstance(info[1].time, float) self.assertIn( - info[1].additional_info['error'], + info[1].additional_info["error"], ( 
"""AttributeError("'BackendMock' object has no attribute """ """'save_targets_ensemble'",)""", """AttributeError("'BackendMock' object has no attribute """ """'save_targets_ensemble'")""", - """AttributeError('save_targets_ensemble')""" - ) + """AttributeError('save_targets_ensemble')""", + ), ) - self.assertNotIn('exitcode', info[1].additional_info) - self.assertNotIn('exit_status', info[1].additional_info) - self.assertNotIn('traceback', info[1]) + self.assertNotIn("exitcode", info[1].additional_info) + self.assertNotIn("exit_status", info[1].additional_info) + self.assertNotIn("traceback", info[1]) diff --git a/test/test_evaluation/test_test_evaluator.py b/test/test_evaluation/test_test_evaluator.py index 93ea0c2265..0a1b67faa9 100644 --- a/test/test_evaluation/test_test_evaluator.py +++ b/test/test_evaluation/test_test_evaluator.py @@ -14,24 +14,26 @@ from smac.tae import StatusType from autosklearn.automl_common.common.utils.backend import Backend - -from autosklearn.constants import MULTILABEL_CLASSIFICATION, BINARY_CLASSIFICATION, \ - MULTICLASS_CLASSIFICATION, REGRESSION +from autosklearn.constants import ( + BINARY_CLASSIFICATION, + MULTICLASS_CLASSIFICATION, + MULTILABEL_CLASSIFICATION, + REGRESSION, +) from autosklearn.evaluation.test_evaluator import TestEvaluator, eval_t from autosklearn.evaluation.util import read_queue +from autosklearn.metrics import accuracy, f1_macro, r2 from autosklearn.util.pipeline import get_configuration_space -from autosklearn.metrics import accuracy, r2, f1_macro this_directory = os.path.dirname(__file__) sys.path.append(this_directory) from evaluation_util import ( # noqa (E402: module level import not at top of file) - get_evaluation_backend, - get_dataset_getters, + SCORER_LIST, BaseEvaluatorTest, + get_dataset_getters, + get_evaluation_backend, get_multiclass_classification_datamanager, - SCORER_LIST -) # noqa (E402: module level import not at top of file) - +) N_TEST_RUNS = 3 @@ -45,28 +47,31 @@ class TestEvaluator_Test(BaseEvaluatorTest, unittest.TestCase): def test_datasets(self): for getter in get_dataset_getters(): - testname = '%s_%s' % (os.path.basename(__file__). 
- replace('.pyc', '').replace('.py', ''), - getter.__name__) + testname = "%s_%s" % ( + os.path.basename(__file__).replace(".pyc", "").replace(".py", ""), + getter.__name__, + ) with self.subTest(testname): backend_mock = get_evaluation_backend() D = getter() D_ = copy.deepcopy(D) - y = D.data['Y_train'] + y = D.data["Y_train"] if len(y.shape) == 2 and y.shape[1] == 1: - D_.data['Y_train'] = y.flatten() + D_.data["Y_train"] = y.flatten() backend_mock.load_datamanager.return_value = D_ - metric_lookup = {MULTILABEL_CLASSIFICATION: f1_macro, - BINARY_CLASSIFICATION: accuracy, - MULTICLASS_CLASSIFICATION: accuracy, - REGRESSION: r2} + metric_lookup = { + MULTILABEL_CLASSIFICATION: f1_macro, + BINARY_CLASSIFICATION: accuracy, + MULTICLASS_CLASSIFICATION: accuracy, + REGRESSION: r2, + } queue_ = multiprocessing.Queue() evaluator = TestEvaluator( backend_mock, queue_, - metric=metric_lookup[D.info['task']], + metric=metric_lookup[D.info["task"]], port=logging.handlers.DEFAULT_TCP_LOGGING_PORT, additional_components=dict(), ) @@ -75,22 +80,21 @@ def test_datasets(self): rval = read_queue(evaluator.queue) self.assertEqual(len(rval), 1) self.assertEqual(len(rval[0]), 3) - self.assertTrue(np.isfinite(rval[0]['loss'])) + self.assertTrue(np.isfinite(rval[0]["loss"])) class FunctionsTest(unittest.TestCase): def setUp(self): self.queue = multiprocessing.Queue() self.configuration = get_configuration_space( - {'task': MULTICLASS_CLASSIFICATION, - 'is_sparse': False}).get_default_configuration() + {"task": MULTICLASS_CLASSIFICATION, "is_sparse": False} + ).get_default_configuration() self.data = get_multiclass_classification_datamanager() - self.tmp_dir = os.path.join(os.path.dirname(__file__), - '.test_cv_functions') + self.tmp_dir = os.path.join(os.path.dirname(__file__), ".test_cv_functions") self.backend = unittest.mock.Mock(spec=Backend) self.backend.temporary_directory = tempfile.gettempdir() self.backend.load_datamanager.return_value = self.data - self.dataset_name = json.dumps({'task_id': 'test'}) + self.dataset_name = json.dumps({"task_id": "test"}) self.port = logging.handlers.DEFAULT_TCP_LOGGING_PORT @@ -106,7 +110,8 @@ def test_eval_test(self): backend=self.backend, config=self.configuration, metric=accuracy, - seed=1, num_run=1, + seed=1, + num_run=1, scoring_functions=None, output_y_hat_optimization=False, include=None, @@ -118,9 +123,9 @@ def test_eval_test(self): ) rval = read_queue(self.queue) self.assertEqual(len(rval), 1) - self.assertAlmostEqual(rval[0]['loss'], 0.040000000000000036) - self.assertEqual(rval[0]['status'], StatusType.SUCCESS) - self.assertNotIn('bac_metric', rval[0]['additional_run_info']) + self.assertAlmostEqual(rval[0]["loss"], 0.040000000000000036) + self.assertEqual(rval[0]["status"], StatusType.SUCCESS) + self.assertNotIn("bac_metric", rval[0]["additional_run_info"]) def test_eval_test_all_loss_functions(self): eval_t( @@ -128,7 +133,8 @@ def test_eval_test_all_loss_functions(self): backend=self.backend, config=self.configuration, metric=accuracy, - seed=1, num_run=1, + seed=1, + num_run=1, scoring_functions=SCORER_LIST, output_y_hat_optimization=False, include=None, @@ -142,25 +148,30 @@ def test_eval_test_all_loss_functions(self): self.assertEqual(len(rval), 1) # Note: All metric here should be minimized - fixture = {'accuracy': 0.040000000000000036, - 'balanced_accuracy': 0.02777777777777779, - 'f1_macro': 0.0341005967604433, - 'f1_micro': 0.040000000000000036, - 'f1_weighted': 0.039693094629155934, - 'log_loss': 0.13966929787769913, - 'precision_macro': 
0.03703703703703709, - 'precision_micro': 0.040000000000000036, - 'precision_weighted': 0.03555555555555556, - 'recall_macro': 0.02777777777777779, - 'recall_micro': 0.040000000000000036, - 'recall_weighted': 0.040000000000000036, - 'num_run': -1} - - additional_run_info = rval[0]['additional_run_info'] + fixture = { + "accuracy": 0.040000000000000036, + "balanced_accuracy": 0.02777777777777779, + "f1_macro": 0.0341005967604433, + "f1_micro": 0.040000000000000036, + "f1_weighted": 0.039693094629155934, + "log_loss": 0.13966929787769913, + "precision_macro": 0.03703703703703709, + "precision_micro": 0.040000000000000036, + "precision_weighted": 0.03555555555555556, + "recall_macro": 0.02777777777777779, + "recall_micro": 0.040000000000000036, + "recall_weighted": 0.040000000000000036, + "num_run": -1, + } + + additional_run_info = rval[0]["additional_run_info"] for key, value in fixture.items(): self.assertAlmostEqual(additional_run_info[key], fixture[key], msg=key) - self.assertEqual(len(additional_run_info), len(fixture) + 1, - msg=sorted(additional_run_info.items())) - self.assertIn('duration', additional_run_info) - self.assertAlmostEqual(rval[0]['loss'], 0.040000000000000036) - self.assertEqual(rval[0]['status'], StatusType.SUCCESS) + self.assertEqual( + len(additional_run_info), + len(fixture) + 1, + msg=sorted(additional_run_info.items()), + ) + self.assertIn("duration", additional_run_info) + self.assertAlmostEqual(rval[0]["loss"], 0.040000000000000036) + self.assertEqual(rval[0]["status"], StatusType.SUCCESS) diff --git a/test/test_evaluation/test_train_evaluator.py b/test/test_evaluation/test_train_evaluator.py index 28bddcdb09..92e3cfcc10 100644 --- a/test/test_evaluation/test_train_evaluator.py +++ b/test/test_evaluation/test_train_evaluator.py @@ -1,49 +1,73 @@ import copy import json import logging.handlers -import queue import multiprocessing import os -import tempfile +import queue import shutil import sys +import tempfile import unittest import unittest.mock -from ConfigSpace import Configuration import numpy as np -from sklearn.model_selection import GroupKFold, GroupShuffleSplit, \ - KFold, LeaveOneGroupOut, LeavePGroupsOut, LeaveOneOut, LeavePOut, \ - PredefinedSplit, RepeatedKFold, RepeatedStratifiedKFold, ShuffleSplit, \ - StratifiedKFold, StratifiedShuffleSplit, TimeSeriesSplit import sklearn.model_selection +from ConfigSpace import Configuration +from sklearn.model_selection import ( + GroupKFold, + GroupShuffleSplit, + KFold, + LeaveOneGroupOut, + LeaveOneOut, + LeavePGroupsOut, + LeavePOut, + PredefinedSplit, + RepeatedKFold, + RepeatedStratifiedKFold, + ShuffleSplit, + StratifiedKFold, + StratifiedShuffleSplit, + TimeSeriesSplit, +) from smac.tae import StatusType, TAEAbortException -from autosklearn.automl_common.common.utils import backend - import autosklearn.evaluation.splitter +from autosklearn.automl_common.common.utils import backend +from autosklearn.constants import ( + BINARY_CLASSIFICATION, + MULTICLASS_CLASSIFICATION, + MULTILABEL_CLASSIFICATION, + MULTIOUTPUT_REGRESSION, + REGRESSION, +) from autosklearn.data.abstract_data_manager import AbstractDataManager +from autosklearn.evaluation.train_evaluator import ( + TrainEvaluator, + eval_cv, + eval_holdout, + eval_iterative_holdout, + eval_partial_cv, + subsample_indices, +) from autosklearn.evaluation.util import read_queue -from autosklearn.evaluation.train_evaluator import TrainEvaluator, \ - eval_holdout, eval_iterative_holdout, eval_cv, eval_partial_cv, subsample_indices +from autosklearn.metrics 
import accuracy, f1_macro, r2 from autosklearn.util.pipeline import get_configuration_space -from autosklearn.constants import BINARY_CLASSIFICATION, \ - MULTILABEL_CLASSIFICATION,\ - MULTICLASS_CLASSIFICATION,\ - REGRESSION,\ - MULTIOUTPUT_REGRESSION -from autosklearn.metrics import accuracy, r2, f1_macro this_directory = os.path.dirname(__file__) sys.path.append(this_directory) -from evaluation_util import get_regression_datamanager, BaseEvaluatorTest, \ - get_binary_classification_datamanager, get_dataset_getters, \ - get_multiclass_classification_datamanager, SCORER_LIST # noqa (E402: module level import not at top of file) +from evaluation_util import ( # noqa (E402: module level import not at top of file) + SCORER_LIST, + BaseEvaluatorTest, + get_binary_classification_datamanager, + get_dataset_getters, + get_multiclass_classification_datamanager, + get_regression_datamanager, +) class Dummy(object): def __init__(self): - self.name = 'dummy' + self.name = "dummy" class TestTrainEvaluator(BaseEvaluatorTest, unittest.TestCase): @@ -54,13 +78,15 @@ def setUp(self): Creates a backend mock """ tmp_dir_name = self.id() - self.ev_path = os.path.join(this_directory, '.tmp_evaluations', tmp_dir_name) + self.ev_path = os.path.join(this_directory, ".tmp_evaluations", tmp_dir_name) if os.path.exists(self.ev_path): shutil.rmtree(self.ev_path) os.makedirs(self.ev_path, exist_ok=False) dummy_model_files = [os.path.join(self.ev_path, str(n)) for n in range(100)] dummy_pred_files = [os.path.join(self.ev_path, str(n)) for n in range(100, 200)] - dummy_cv_model_files = [os.path.join(self.ev_path, str(n)) for n in range(200, 300)] + dummy_cv_model_files = [ + os.path.join(self.ev_path, str(n)) for n in range(200, 300) + ] backend_mock = unittest.mock.Mock() backend_mock.temporary_directory = tempfile.gettempdir() backend_mock.get_model_dir.return_value = self.ev_path @@ -70,7 +96,7 @@ def setUp(self): backend_mock.get_prediction_output_path.side_effect = dummy_pred_files self.backend_mock = backend_mock - self.tmp_dir = os.path.join(self.ev_path, 'tmp_dir') + self.tmp_dir = os.path.join(self.ev_path, "tmp_dir") self.port = logging.handlers.DEFAULT_TCP_LOGGING_PORT @@ -78,15 +104,18 @@ def tearDown(self): if os.path.exists(self.ev_path): shutil.rmtree(self.ev_path) - @unittest.mock.patch('autosklearn.pipeline.classification.SimpleClassificationPipeline') + @unittest.mock.patch( + "autosklearn.pipeline.classification.SimpleClassificationPipeline" + ) def test_holdout(self, pipeline_mock): # Binary iris, contains 69 train samples, 25 validation samples, # 6 test samples D = get_binary_classification_datamanager() - D.name = 'test' + D.name = "test" - pipeline_mock.predict_proba.side_effect = \ - lambda X, batch_size=None: np.tile([0.6, 0.4], (len(X), 1)) + pipeline_mock.predict_proba.side_effect = lambda X, batch_size=None: np.tile( + [0.6, 0.4], (len(X), 1) + ) pipeline_mock.side_effect = lambda **kwargs: pipeline_mock pipeline_mock.get_additional_run_info.return_value = None pipeline_mock.get_max_iter.return_value = 1 @@ -96,21 +125,23 @@ def test_holdout(self, pipeline_mock): backend_api = backend.create( temporary_directory=self.tmp_dir, output_directory=None, - prefix="auto-sklearn" + prefix="auto-sklearn", ) backend_api.load_datamanager = lambda: D queue_ = multiprocessing.Queue() - evaluator = TrainEvaluator(backend_api, queue_, - configuration=configuration, - resampling_strategy='holdout', - resampling_strategy_args={'train_size': 0.66}, - scoring_functions=None, - output_y_hat_optimization=True, 
- metric=accuracy, - port=self.port, - additional_components=dict(), - ) + evaluator = TrainEvaluator( + backend_api, + queue_, + configuration=configuration, + resampling_strategy="holdout", + resampling_strategy_args={"train_size": 0.66}, + scoring_functions=None, + output_y_hat_optimization=True, + metric=accuracy, + port=self.port, + additional_components=dict(), + ) evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output) evaluator.file_output.return_value = (None, {}) @@ -118,7 +149,7 @@ def test_holdout(self, pipeline_mock): rval = read_queue(evaluator.queue) self.assertEqual(len(rval), 1) - result = rval[0]['loss'] + result = rval[0]["loss"] self.assertEqual(len(rval[0]), 3) self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1) @@ -129,17 +160,21 @@ def test_holdout(self, pipeline_mock): self.assertEqual(pipeline_mock.predict_proba.call_count, 4) self.assertEqual(evaluator.file_output.call_count, 1) self.assertEqual(evaluator.file_output.call_args[0][0].shape[0], 24) - self.assertEqual(evaluator.file_output.call_args[0][1].shape[0], - D.data['Y_valid'].shape[0]) - self.assertEqual(evaluator.file_output.call_args[0][2].shape[0], - D.data['Y_test'].shape[0]) + self.assertEqual( + evaluator.file_output.call_args[0][1].shape[0], D.data["Y_valid"].shape[0] + ) + self.assertEqual( + evaluator.file_output.call_args[0][2].shape[0], D.data["Y_test"].shape[0] + ) self.assertEqual(evaluator.model.fit.call_count, 1) - @unittest.mock.patch('autosklearn.pipeline.classification.SimpleClassificationPipeline') + @unittest.mock.patch( + "autosklearn.pipeline.classification.SimpleClassificationPipeline" + ) def test_iterative_holdout(self, pipeline_mock): # Regular fitting D = get_binary_classification_datamanager() - D.name = 'test' + D.name = "test" class SideEffect(object): def __init__(self): @@ -152,55 +187,100 @@ def configuration_fully_fitted(self): # final call to iterative fit return self.fully_fitted_call_count > 18 - Xt_fixture = 'Xt_fixture' + Xt_fixture = "Xt_fixture" pipeline_mock.estimator_supports_iterative_fit.return_value = True - pipeline_mock.configuration_fully_fitted.side_effect = \ + pipeline_mock.configuration_fully_fitted.side_effect = ( SideEffect().configuration_fully_fitted + ) pipeline_mock.fit_transformer.return_value = Xt_fixture, {} - pipeline_mock.predict_proba.side_effect = \ - lambda X, batch_size=None: np.tile([0.6, 0.4], (len(X), 1)) + pipeline_mock.predict_proba.side_effect = lambda X, batch_size=None: np.tile( + [0.6, 0.4], (len(X), 1) + ) pipeline_mock.get_additional_run_info.return_value = None pipeline_mock.side_effect = lambda **kwargs: pipeline_mock pipeline_mock.get_max_iter.return_value = 512 - pipeline_mock.get_current_iter.side_effect = (2, 4, 8, 16, 32, 64, 128, 256, 512) + pipeline_mock.get_current_iter.side_effect = ( + 2, + 4, + 8, + 16, + 32, + 64, + 128, + 256, + 512, + ) configuration = unittest.mock.Mock(spec=Configuration) backend_api = backend.create( temporary_directory=self.tmp_dir, output_directory=None, - prefix="auto-sklearn" + prefix="auto-sklearn", ) backend_api.load_datamanager = lambda: D queue_ = multiprocessing.Queue() - evaluator = TrainEvaluator(backend_api, queue_, - port=self.port, - configuration=configuration, - resampling_strategy='holdout', - scoring_functions=None, - output_y_hat_optimization=True, - metric=accuracy, - budget=0.0, - additional_components=dict(),) + evaluator = TrainEvaluator( + backend_api, + queue_, + port=self.port, + configuration=configuration, + resampling_strategy="holdout", + 
scoring_functions=None, + output_y_hat_optimization=True, + metric=accuracy, + budget=0.0, + additional_components=dict(), + ) evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output) evaluator.file_output.return_value = (None, {}) class LossSideEffect(object): def __init__(self): - self.losses = [1.0, 1.0, 1.0, 1.0, - 0.9, 0.9, 0.9, 0.9, - 0.8, 0.8, 0.8, 0.8, - 0.7, 0.7, 0.7, 0.7, - 0.6, 0.6, 0.6, 0.6, - 0.5, 0.5, 0.5, 0.5, - 0.4, 0.4, 0.4, 0.4, - 0.3, 0.3, 0.3, 0.3, - 0.2, 0.2, 0.2, 0.2] + self.losses = [ + 1.0, + 1.0, + 1.0, + 1.0, + 0.9, + 0.9, + 0.9, + 0.9, + 0.8, + 0.8, + 0.8, + 0.8, + 0.7, + 0.7, + 0.7, + 0.7, + 0.6, + 0.6, + 0.6, + 0.6, + 0.5, + 0.5, + 0.5, + 0.5, + 0.4, + 0.4, + 0.4, + 0.4, + 0.3, + 0.3, + 0.3, + 0.3, + 0.2, + 0.2, + 0.2, + 0.2, + ] self.iteration = 0 def side_effect(self, *args, **kwargs): self.iteration += 1 return self.losses[self.iteration - 1] + evaluator._loss = unittest.mock.Mock() evaluator._loss.side_effect = LossSideEffect().side_effect @@ -209,38 +289,42 @@ def side_effect(self, *args, **kwargs): for i in range(1, 10): rval = evaluator.queue.get(timeout=1) - result = rval['loss'] + result = rval["loss"] self.assertAlmostEqual(result, 1.0 - (0.1 * (i - 1))) if i < 9: - self.assertEqual(rval['status'], StatusType.DONOTADVANCE) + self.assertEqual(rval["status"], StatusType.DONOTADVANCE) self.assertEqual(len(rval), 3) else: - self.assertEqual(rval['status'], StatusType.SUCCESS) + self.assertEqual(rval["status"], StatusType.SUCCESS) self.assertEqual(len(rval), 4) self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1) self.assertEqual(pipeline_mock.iterative_fit.call_count, 9) self.assertEqual( - [cal[1]['n_iter'] for cal in pipeline_mock.iterative_fit.call_args_list], - [2, 2, 4, 8, 16, 32, 64, 128, 256] + [cal[1]["n_iter"] for cal in pipeline_mock.iterative_fit.call_args_list], + [2, 2, 4, 8, 16, 32, 64, 128, 256], ) # 20 calls because of train, holdout, validation and test set # and a total of five calls because of five iterations of fitting self.assertEqual(evaluator.model.predict_proba.call_count, 36) # 1/3 of 69 self.assertEqual(evaluator.file_output.call_args[0][0].shape[0], 23) - self.assertEqual(evaluator.file_output.call_args[0][1].shape[0], - D.data['Y_valid'].shape[0]) - self.assertEqual(evaluator.file_output.call_args[0][2].shape[0], - D.data['Y_test'].shape[0]) + self.assertEqual( + evaluator.file_output.call_args[0][1].shape[0], D.data["Y_valid"].shape[0] + ) + self.assertEqual( + evaluator.file_output.call_args[0][2].shape[0], D.data["Y_test"].shape[0] + ) self.assertEqual(evaluator.file_output.call_count, 9) self.assertEqual(evaluator.model.fit.call_count, 0) - @unittest.mock.patch('autosklearn.pipeline.classification.SimpleClassificationPipeline') + @unittest.mock.patch( + "autosklearn.pipeline.classification.SimpleClassificationPipeline" + ) def test_iterative_holdout_interuption(self, pipeline_mock): # Regular fitting D = get_binary_classification_datamanager() - D.name = 'test' + D.name = "test" class SideEffect(object): def __init__(self): @@ -252,61 +336,93 @@ def configuration_fully_fitted(self): # if we need to add a special indicator to show that this is the # final call to iterative fit if self.fully_fitted_call_count == 5: - raise ValueError('fixture') + raise ValueError("fixture") return self.fully_fitted_call_count > 10 - Xt_fixture = 'Xt_fixture' + Xt_fixture = "Xt_fixture" pipeline_mock.estimator_supports_iterative_fit.return_value = True - pipeline_mock.configuration_fully_fitted.side_effect = \ + 
pipeline_mock.configuration_fully_fitted.side_effect = ( SideEffect().configuration_fully_fitted + ) pipeline_mock.fit_transformer.return_value = Xt_fixture, {} - pipeline_mock.predict_proba.side_effect = \ - lambda X, batch_size=None: np.tile([0.6, 0.4], (len(X), 1)) + pipeline_mock.predict_proba.side_effect = lambda X, batch_size=None: np.tile( + [0.6, 0.4], (len(X), 1) + ) pipeline_mock.side_effect = lambda **kwargs: pipeline_mock pipeline_mock.get_additional_run_info.return_value = None pipeline_mock.get_max_iter.return_value = 512 - pipeline_mock.get_current_iter.side_effect = (2, 4, 8, 16, 32, 64, 128, 256, 512) + pipeline_mock.get_current_iter.side_effect = ( + 2, + 4, + 8, + 16, + 32, + 64, + 128, + 256, + 512, + ) configuration = unittest.mock.Mock(spec=Configuration) backend_api = backend.create( temporary_directory=self.tmp_dir, output_directory=None, - prefix="auto-sklearn" + prefix="auto-sklearn", ) backend_api.load_datamanager = lambda: D queue_ = multiprocessing.Queue() - evaluator = TrainEvaluator(backend_api, queue_, - port=self.port, - configuration=configuration, - resampling_strategy='holdout-iterative-fit', - scoring_functions=None, - output_y_hat_optimization=True, - metric=accuracy, - budget=0.0, - additional_components=dict(), - ) + evaluator = TrainEvaluator( + backend_api, + queue_, + port=self.port, + configuration=configuration, + resampling_strategy="holdout-iterative-fit", + scoring_functions=None, + output_y_hat_optimization=True, + metric=accuracy, + budget=0.0, + additional_components=dict(), + ) evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output) evaluator.file_output.return_value = (None, {}) class LossSideEffect(object): def __init__(self): - self.losses = [0.8, 0.8, 0.8, 0.8, - 0.6, 0.6, 0.6, 0.6, - 0.4, 0.4, 0.4, 0.4, - 0.2, 0.2, 0.2, 0.2, - 0.0, 0.0, 0.0, 0.0] + self.losses = [ + 0.8, + 0.8, + 0.8, + 0.8, + 0.6, + 0.6, + 0.6, + 0.6, + 0.4, + 0.4, + 0.4, + 0.4, + 0.2, + 0.2, + 0.2, + 0.2, + 0.0, + 0.0, + 0.0, + 0.0, + ] self.iteration = 0 def side_effect(self, *args, **kwargs): self.iteration += 1 return self.losses[self.iteration - 1] + evaluator._loss = unittest.mock.Mock() evaluator._loss.side_effect = LossSideEffect().side_effect self.assertRaisesRegex( ValueError, - 'fixture', + "fixture", evaluator.fit_predict_and_loss, iterative=True, ) @@ -314,7 +430,7 @@ def side_effect(self, *args, **kwargs): for i in range(1, 3): rval = evaluator.queue.get(timeout=1) - self.assertAlmostEqual(rval['loss'], 1.0 - (0.2 * i)) + self.assertAlmostEqual(rval["loss"], 1.0 - (0.2 * i)) self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1) self.assertEqual(pipeline_mock.iterative_fit.call_count, 2) @@ -322,24 +438,29 @@ def side_effect(self, *args, **kwargs): # and a total of two calls each because of two iterations of fitting self.assertEqual(evaluator.model.predict_proba.call_count, 8) self.assertEqual(evaluator.file_output.call_args[0][0].shape[0], 23) - self.assertEqual(evaluator.file_output.call_args[0][1].shape[0], - D.data['Y_valid'].shape[0]) - self.assertEqual(evaluator.file_output.call_args[0][2].shape[0], - D.data['Y_test'].shape[0]) + self.assertEqual( + evaluator.file_output.call_args[0][1].shape[0], D.data["Y_valid"].shape[0] + ) + self.assertEqual( + evaluator.file_output.call_args[0][2].shape[0], D.data["Y_test"].shape[0] + ) self.assertEqual(evaluator.file_output.call_count, 2) self.assertEqual(evaluator.model.fit.call_count, 0) - @unittest.mock.patch('autosklearn.pipeline.classification.SimpleClassificationPipeline') + 
@unittest.mock.patch( + "autosklearn.pipeline.classification.SimpleClassificationPipeline" + ) def test_iterative_holdout_not_iterative(self, pipeline_mock): # Regular fitting D = get_binary_classification_datamanager() - D.name = 'test' + D.name = "test" - Xt_fixture = 'Xt_fixture' + Xt_fixture = "Xt_fixture" pipeline_mock.estimator_supports_iterative_fit.return_value = False pipeline_mock.fit_transformer.return_value = Xt_fixture, {} - pipeline_mock.predict_proba.side_effect = \ - lambda X, batch_size=None: np.tile([0.6, 0.4], (len(X), 1)) + pipeline_mock.predict_proba.side_effect = lambda X, batch_size=None: np.tile( + [0.6, 0.4], (len(X), 1) + ) pipeline_mock.side_effect = lambda **kwargs: pipeline_mock pipeline_mock.get_additional_run_info.return_value = None @@ -347,20 +468,22 @@ def test_iterative_holdout_not_iterative(self, pipeline_mock): backend_api = backend.create( temporary_directory=self.tmp_dir, output_directory=None, - prefix="auto-sklearn" + prefix="auto-sklearn", ) backend_api.load_datamanager = lambda: D queue_ = multiprocessing.Queue() - evaluator = TrainEvaluator(backend_api, queue_, - port=self.port, - configuration=configuration, - resampling_strategy='holdout-iterative-fit', - scoring_functions=None, - output_y_hat_optimization=True, - metric=accuracy, - additional_components=dict(), - ) + evaluator = TrainEvaluator( + backend_api, + queue_, + port=self.port, + configuration=configuration, + resampling_strategy="holdout-iterative-fit", + scoring_functions=None, + output_y_hat_optimization=True, + metric=accuracy, + additional_components=dict(), + ) evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output) evaluator.file_output.return_value = (None, {}) @@ -368,26 +491,31 @@ def test_iterative_holdout_not_iterative(self, pipeline_mock): self.assertEqual(evaluator.file_output.call_count, 1) rval = evaluator.queue.get(timeout=1) - self.assertAlmostEqual(rval['loss'], 0.47826086956521741) + self.assertAlmostEqual(rval["loss"], 0.47826086956521741) self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1) self.assertEqual(pipeline_mock.iterative_fit.call_count, 0) # four calls for train, opt, valid and test self.assertEqual(evaluator.model.predict_proba.call_count, 4) self.assertEqual(evaluator.file_output.call_args[0][0].shape[0], 23) - self.assertEqual(evaluator.file_output.call_args[0][1].shape[0], - D.data['Y_valid'].shape[0]) - self.assertEqual(evaluator.file_output.call_args[0][2].shape[0], - D.data['Y_test'].shape[0]) + self.assertEqual( + evaluator.file_output.call_args[0][1].shape[0], D.data["Y_valid"].shape[0] + ) + self.assertEqual( + evaluator.file_output.call_args[0][2].shape[0], D.data["Y_test"].shape[0] + ) self.assertEqual(evaluator.file_output.call_count, 1) self.assertEqual(evaluator.model.fit.call_count, 1) - @unittest.mock.patch('autosklearn.pipeline.classification.SimpleClassificationPipeline') + @unittest.mock.patch( + "autosklearn.pipeline.classification.SimpleClassificationPipeline" + ) def test_cv(self, pipeline_mock): D = get_binary_classification_datamanager() - pipeline_mock.predict_proba.side_effect = \ - lambda X, batch_size=None: np.tile([0.6, 0.4], (len(X), 1)) + pipeline_mock.predict_proba.side_effect = lambda X, batch_size=None: np.tile( + [0.6, 0.4], (len(X), 1) + ) pipeline_mock.side_effect = lambda **kwargs: pipeline_mock pipeline_mock.get_additional_run_info.return_value = None @@ -395,21 +523,23 @@ def test_cv(self, pipeline_mock): backend_api = backend.create( temporary_directory=self.tmp_dir, 
output_directory=None, - prefix="auto-sklearn" + prefix="auto-sklearn", ) backend_api.load_datamanager = lambda: D queue_ = multiprocessing.Queue() - evaluator = TrainEvaluator(backend_api, queue_, - port=self.port, - configuration=configuration, - resampling_strategy='cv', - resampling_strategy_args={'folds': 5}, - scoring_functions=None, - output_y_hat_optimization=True, - metric=accuracy, - additional_components=dict(), - ) + evaluator = TrainEvaluator( + backend_api, + queue_, + port=self.port, + configuration=configuration, + resampling_strategy="cv", + resampling_strategy_args={"folds": 5}, + scoring_functions=None, + output_y_hat_optimization=True, + metric=accuracy, + additional_components=dict(), + ) evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output) evaluator.file_output.return_value = (None, {}) @@ -417,7 +547,7 @@ def test_cv(self, pipeline_mock): rval = read_queue(evaluator.queue) self.assertEqual(len(rval), 1) - result = rval[0]['loss'] + result = rval[0]["loss"] self.assertEqual(len(rval[0]), 3) self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1) @@ -427,49 +557,57 @@ def test_cv(self, pipeline_mock): # Fifteen calls because of the training, holdout, validation and # test set (4 sets x 5 folds = 20) self.assertEqual(pipeline_mock.predict_proba.call_count, 20) - self.assertEqual(evaluator.file_output.call_args[0][0].shape[0], - D.data['Y_train'].shape[0]) - self.assertEqual(evaluator.file_output.call_args[0][1].shape[0], - D.data['Y_valid'].shape[0]) - self.assertEqual(evaluator.file_output.call_args[0][2].shape[0], - D.data['Y_test'].shape[0]) + self.assertEqual( + evaluator.file_output.call_args[0][0].shape[0], D.data["Y_train"].shape[0] + ) + self.assertEqual( + evaluator.file_output.call_args[0][1].shape[0], D.data["Y_valid"].shape[0] + ) + self.assertEqual( + evaluator.file_output.call_args[0][2].shape[0], D.data["Y_test"].shape[0] + ) # The model prior to fitting is saved, this cannot be directly tested # because of the way the mock module is used. 
Instead, we test whether # the if block in which model assignment is done is accessed self.assertTrue(evaluator._added_empty_model) - @unittest.mock.patch('autosklearn.pipeline.classification.SimpleClassificationPipeline') + @unittest.mock.patch( + "autosklearn.pipeline.classification.SimpleClassificationPipeline" + ) def test_partial_cv(self, pipeline_mock): D = get_binary_classification_datamanager() - pipeline_mock.predict_proba.side_effect = \ - lambda X, batch_size=None: np.tile([0.6, 0.4], (len(X), 1)) + pipeline_mock.predict_proba.side_effect = lambda X, batch_size=None: np.tile( + [0.6, 0.4], (len(X), 1) + ) pipeline_mock.side_effect = lambda **kwargs: pipeline_mock pipeline_mock.get_additional_run_info.return_value = None pipeline_mock.get_max_iter.return_value = 1 pipeline_mock.get_current_iter.return_value = 1 D = get_binary_classification_datamanager() - D.name = 'test' + D.name = "test" configuration = unittest.mock.Mock(spec=Configuration) backend_api = backend.create( temporary_directory=self.tmp_dir, output_directory=None, - prefix="auto-sklearn" + prefix="auto-sklearn", ) backend_api.load_datamanager = lambda: D queue_ = multiprocessing.Queue() - evaluator = TrainEvaluator(backend_api, queue_, - port=self.port, - configuration=configuration, - resampling_strategy='partial-cv', - resampling_strategy_args={'folds': 5}, - scoring_functions=None, - output_y_hat_optimization=True, - metric=accuracy, - additional_components=dict(), - ) + evaluator = TrainEvaluator( + backend_api, + queue_, + port=self.port, + configuration=configuration, + resampling_strategy="partial-cv", + resampling_strategy_args={"folds": 5}, + scoring_functions=None, + output_y_hat_optimization=True, + metric=accuracy, + additional_components=dict(), + ) evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output) evaluator.file_output.return_value = (None, {}) @@ -480,19 +618,21 @@ def test_partial_cv(self, pipeline_mock): self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1) self.assertEqual(evaluator.file_output.call_count, 0) - self.assertEqual(rval['loss'], 0.5) + self.assertEqual(rval["loss"], 0.5) self.assertEqual(pipeline_mock.fit.call_count, 1) self.assertEqual(pipeline_mock.predict_proba.call_count, 4) # The model prior to fitting is saved, this cannot be directly tested # because of the way the mock module is used. 
Instead, we test whether # the if block in which model assignment is done is accessed - self.assertTrue(hasattr(evaluator, 'model')) + self.assertTrue(hasattr(evaluator, "model")) - @unittest.mock.patch('autosklearn.pipeline.classification.SimpleClassificationPipeline') + @unittest.mock.patch( + "autosklearn.pipeline.classification.SimpleClassificationPipeline" + ) def test_iterative_partial_cv(self, pipeline_mock): # Regular fitting D = get_binary_classification_datamanager() - D.name = 'test' + D.name = "test" class SideEffect(object): def __init__(self): @@ -505,57 +645,101 @@ def configuration_fully_fitted(self): # final call to iterative fit return self.fully_fitted_call_count > 18 - Xt_fixture = 'Xt_fixture' + Xt_fixture = "Xt_fixture" pipeline_mock.estimator_supports_iterative_fit.return_value = True - pipeline_mock.configuration_fully_fitted.side_effect = \ + pipeline_mock.configuration_fully_fitted.side_effect = ( SideEffect().configuration_fully_fitted + ) pipeline_mock.fit_transformer.return_value = Xt_fixture, {} - pipeline_mock.predict_proba.side_effect = \ - lambda X, batch_size=None: np.tile([0.6, 0.4], (len(X), 1)) + pipeline_mock.predict_proba.side_effect = lambda X, batch_size=None: np.tile( + [0.6, 0.4], (len(X), 1) + ) pipeline_mock.get_additional_run_info.return_value = None pipeline_mock.side_effect = lambda **kwargs: pipeline_mock pipeline_mock.get_max_iter.return_value = 512 - pipeline_mock.get_current_iter.side_effect = (2, 4, 8, 16, 32, 64, 128, 256, 512) + pipeline_mock.get_current_iter.side_effect = ( + 2, + 4, + 8, + 16, + 32, + 64, + 128, + 256, + 512, + ) configuration = unittest.mock.Mock(spec=Configuration) backend_api = backend.create( temporary_directory=self.tmp_dir, output_directory=None, - prefix="auto-sklearn" + prefix="auto-sklearn", ) backend_api.load_datamanager = lambda: D queue_ = multiprocessing.Queue() - evaluator = TrainEvaluator(backend_api, queue_, - port=self.port, - configuration=configuration, - resampling_strategy='partial-cv-iterative-fit', - resampling_strategy_args={'folds': 5}, - scoring_functions=None, - output_y_hat_optimization=True, - metric=accuracy, - budget=0.0, - additional_components=dict(), - ) + evaluator = TrainEvaluator( + backend_api, + queue_, + port=self.port, + configuration=configuration, + resampling_strategy="partial-cv-iterative-fit", + resampling_strategy_args={"folds": 5}, + scoring_functions=None, + output_y_hat_optimization=True, + metric=accuracy, + budget=0.0, + additional_components=dict(), + ) evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output) evaluator.file_output.return_value = (None, {}) class LossSideEffect(object): def __init__(self): - self.losses = [1.0, 1.0, 1.0, 1.0, - 0.9, 0.9, 0.9, 0.9, - 0.8, 0.8, 0.8, 0.8, - 0.7, 0.7, 0.7, 0.7, - 0.6, 0.6, 0.6, 0.6, - 0.5, 0.5, 0.5, 0.5, - 0.4, 0.4, 0.4, 0.4, - 0.3, 0.3, 0.3, 0.3, - 0.2, 0.2, 0.2, 0.2] + self.losses = [ + 1.0, + 1.0, + 1.0, + 1.0, + 0.9, + 0.9, + 0.9, + 0.9, + 0.8, + 0.8, + 0.8, + 0.8, + 0.7, + 0.7, + 0.7, + 0.7, + 0.6, + 0.6, + 0.6, + 0.6, + 0.5, + 0.5, + 0.5, + 0.5, + 0.4, + 0.4, + 0.4, + 0.4, + 0.3, + 0.3, + 0.3, + 0.3, + 0.2, + 0.2, + 0.2, + 0.2, + ] self.iteration = 0 def side_effect(self, *args, **kwargs): self.iteration += 1 return self.losses[self.iteration - 1] + evaluator._loss = unittest.mock.Mock() evaluator._loss.side_effect = LossSideEffect().side_effect @@ -565,118 +749,145 @@ def side_effect(self, *args, **kwargs): for i in range(1, 10): rval = evaluator.queue.get(timeout=1) - 
self.assertAlmostEqual(rval['loss'], 1.0 - (0.1 * (i - 1))) + self.assertAlmostEqual(rval["loss"], 1.0 - (0.1 * (i - 1))) if i < 9: - self.assertEqual(rval['status'], StatusType.DONOTADVANCE) + self.assertEqual(rval["status"], StatusType.DONOTADVANCE) else: - self.assertEqual(rval['status'], StatusType.SUCCESS) + self.assertEqual(rval["status"], StatusType.SUCCESS) self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1) self.assertEqual(pipeline_mock.iterative_fit.call_count, 9) self.assertEqual( - [cal[1]['n_iter'] for cal in pipeline_mock.iterative_fit.call_args_list], - [2, 2, 4, 8, 16, 32, 64, 128, 256] + [cal[1]["n_iter"] for cal in pipeline_mock.iterative_fit.call_args_list], + [2, 2, 4, 8, 16, 32, 64, 128, 256], ) # fifteen calls because of the holdout, the validation and the test set # and a total of five calls because of five iterations of fitting - self.assertTrue(hasattr(evaluator, 'model')) + self.assertTrue(hasattr(evaluator, "model")) self.assertEqual(pipeline_mock.iterative_fit.call_count, 9) # 20 calls because of train, holdout, the validation and the test set # and a total of five calls because of five iterations of fitting self.assertEqual(pipeline_mock.predict_proba.call_count, 36) - @unittest.mock.patch.object(TrainEvaluator, '_loss') - @unittest.mock.patch.object(TrainEvaluator, '_get_model') + @unittest.mock.patch.object(TrainEvaluator, "_loss") + @unittest.mock.patch.object(TrainEvaluator, "_get_model") def test_file_output(self, loss_mock, model_mock): D = get_regression_datamanager() - D.name = 'test' + D.name = "test" self.backend_mock.load_datamanager.return_value = D configuration = unittest.mock.Mock(spec=Configuration) queue_ = multiprocessing.Queue() loss_mock.return_value = None model_mock.return_value = None - evaluator = TrainEvaluator(self.backend_mock, queue=queue_, - port=self.port, - configuration=configuration, - resampling_strategy='cv', - resampling_strategy_args={'folds': 5}, - scoring_functions=SCORER_LIST, - output_y_hat_optimization=True, - metric=accuracy, - additional_components=dict(),) + evaluator = TrainEvaluator( + self.backend_mock, + queue=queue_, + port=self.port, + configuration=configuration, + resampling_strategy="cv", + resampling_strategy_args={"folds": 5}, + scoring_functions=SCORER_LIST, + output_y_hat_optimization=True, + metric=accuracy, + additional_components=dict(), + ) self.backend_mock.get_model_dir.return_value = True - evaluator.model = 'model' - evaluator.Y_optimization = D.data['Y_train'] + evaluator.model = "model" + evaluator.Y_optimization = D.data["Y_train"] rval = evaluator.file_output( - D.data['Y_train'], - D.data['Y_valid'], - D.data['Y_test'], + D.data["Y_train"], + D.data["Y_valid"], + D.data["Y_test"], ) self.assertEqual(rval, (None, {})) self.assertEqual(self.backend_mock.save_targets_ensemble.call_count, 1) self.assertEqual(self.backend_mock.save_numrun_to_dir.call_count, 1) - self.assertEqual(self.backend_mock.save_numrun_to_dir.call_args_list[-1][1].keys(), - {'seed', 'idx', 'budget', 'model', 'cv_model', - 'ensemble_predictions', 'valid_predictions', 'test_predictions'}) - self.assertIsNotNone(self.backend_mock.save_numrun_to_dir.call_args_list[-1][1]['model']) - self.assertIsNone(self.backend_mock.save_numrun_to_dir.call_args_list[-1][1]['cv_model']) + self.assertEqual( + self.backend_mock.save_numrun_to_dir.call_args_list[-1][1].keys(), + { + "seed", + "idx", + "budget", + "model", + "cv_model", + "ensemble_predictions", + "valid_predictions", + "test_predictions", + }, + ) + 
self.assertIsNotNone( + self.backend_mock.save_numrun_to_dir.call_args_list[-1][1]["model"] + ) + self.assertIsNone( + self.backend_mock.save_numrun_to_dir.call_args_list[-1][1]["cv_model"] + ) - evaluator.models = ['model2', 'model2'] + evaluator.models = ["model2", "model2"] rval = evaluator.file_output( - D.data['Y_train'], - D.data['Y_valid'], - D.data['Y_test'], + D.data["Y_train"], + D.data["Y_valid"], + D.data["Y_test"], ) self.assertEqual(rval, (None, {})) self.assertEqual(self.backend_mock.save_targets_ensemble.call_count, 2) self.assertEqual(self.backend_mock.save_numrun_to_dir.call_count, 2) - self.assertEqual(self.backend_mock.save_numrun_to_dir.call_args_list[-1][1].keys(), - {'seed', 'idx', 'budget', 'model', 'cv_model', - 'ensemble_predictions', 'valid_predictions', 'test_predictions'}) - self.assertIsNotNone(self.backend_mock.save_numrun_to_dir.call_args_list[-1][1]['model']) - self.assertIsNotNone(self.backend_mock.save_numrun_to_dir.call_args_list[-1][1]['cv_model']) + self.assertEqual( + self.backend_mock.save_numrun_to_dir.call_args_list[-1][1].keys(), + { + "seed", + "idx", + "budget", + "model", + "cv_model", + "ensemble_predictions", + "valid_predictions", + "test_predictions", + }, + ) + self.assertIsNotNone( + self.backend_mock.save_numrun_to_dir.call_args_list[-1][1]["model"] + ) + self.assertIsNotNone( + self.backend_mock.save_numrun_to_dir.call_args_list[-1][1]["cv_model"] + ) # Check for not containing NaNs - that the models don't predict nonsense # for unseen data - D.data['Y_valid'][0] = np.NaN + D.data["Y_valid"][0] = np.NaN rval = evaluator.file_output( - D.data['Y_train'], - D.data['Y_valid'], - D.data['Y_test'], + D.data["Y_train"], + D.data["Y_valid"], + D.data["Y_test"], ) self.assertEqual( rval, ( 1.0, - { - 'error': - 'Model predictions for validation set contains NaNs.' - }, - ) + {"error": "Model predictions for validation set contains NaNs."}, + ), ) - D.data['Y_train'][0] = np.NaN + D.data["Y_train"][0] = np.NaN rval = evaluator.file_output( - D.data['Y_train'], - D.data['Y_valid'], - D.data['Y_test'], + D.data["Y_train"], + D.data["Y_valid"], + D.data["Y_test"], ) self.assertEqual( rval, ( 1.0, - { - 'error': - 'Model predictions for optimization set contains NaNs.' 
- }, - ) + {"error": "Model predictions for optimization set contains NaNs."}, + ), ) - @unittest.mock.patch('autosklearn.automl_common.common.utils.backend.Backend') - @unittest.mock.patch('autosklearn.pipeline.classification.SimpleClassificationPipeline') + @unittest.mock.patch("autosklearn.automl_common.common.utils.backend.Backend") + @unittest.mock.patch( + "autosklearn.pipeline.classification.SimpleClassificationPipeline" + ) def test_subsample_indices_classification(self, mock, backend_mock): configuration = unittest.mock.Mock(spec=Configuration) @@ -684,26 +895,32 @@ def test_subsample_indices_classification(self, mock, backend_mock): D = get_binary_classification_datamanager() backend_mock.load_datamanager.return_value = D backend_mock.temporary_directory = tempfile.gettempdir() - evaluator = TrainEvaluator(backend_mock, queue_, - port=self.port, - configuration=configuration, - resampling_strategy='cv', - resampling_strategy_args={'folds': 10}, - metric=accuracy, - additional_components=dict(), - ) + evaluator = TrainEvaluator( + backend_mock, + queue_, + port=self.port, + configuration=configuration, + resampling_strategy="cv", + resampling_strategy_args={"folds": 10}, + metric=accuracy, + additional_components=dict(), + ) train_indices = np.arange(69, dtype=int) train_indices1 = subsample_indices( - train_indices, 0.1449, evaluator.task_type, evaluator.Y_train) + train_indices, 0.1449, evaluator.task_type, evaluator.Y_train + ) evaluator.subsample = 20 train_indices2 = subsample_indices( - train_indices, 0.2898, evaluator.task_type, evaluator.Y_train) + train_indices, 0.2898, evaluator.task_type, evaluator.Y_train + ) evaluator.subsample = 30 train_indices3 = subsample_indices( - train_indices, 0.4347, evaluator.task_type, evaluator.Y_train) + train_indices, 0.4347, evaluator.task_type, evaluator.Y_train + ) evaluator.subsample = 67 train_indices4 = subsample_indices( - train_indices, 0.971, evaluator.task_type, evaluator.Y_train) + train_indices, 0.971, evaluator.task_type, evaluator.Y_train + ) # Common cases for ti in train_indices1: self.assertIn(ti, train_indices2) @@ -714,62 +931,98 @@ def test_subsample_indices_classification(self, mock, backend_mock): # Corner cases self.assertRaisesRegex( - ValueError, 'train_size=0.0 should be either positive and smaller than the ' - r'number of samples 69 or a float in the \(0, 1\) range', - subsample_indices, train_indices, 0.0, evaluator.task_type, evaluator.Y_train) + ValueError, + "train_size=0.0 should be either positive and smaller than the " + r"number of samples 69 or a float in the \(0, 1\) range", + subsample_indices, + train_indices, + 0.0, + evaluator.task_type, + evaluator.Y_train, + ) # With equal or greater it should return a non-shuffled array of indices train_indices5 = subsample_indices( - train_indices, 1.0, evaluator.task_type, evaluator.Y_train) + train_indices, 1.0, evaluator.task_type, evaluator.Y_train + ) self.assertTrue(np.all(train_indices5 == train_indices)) evaluator.subsample = 68 self.assertRaisesRegex( - ValueError, 'The test_size = 1 should be greater or equal to the number of ' - 'classes = 2', subsample_indices, train_indices, 0.9999, evaluator.task_type, - evaluator.Y_train) + ValueError, + "The test_size = 1 should be greater or equal to the number of " + "classes = 2", + subsample_indices, + train_indices, + 0.9999, + evaluator.task_type, + evaluator.Y_train, + ) - @unittest.mock.patch('autosklearn.automl_common.common.utils.backend.Backend') - 
@unittest.mock.patch('autosklearn.pipeline.classification.SimpleClassificationPipeline') + @unittest.mock.patch("autosklearn.automl_common.common.utils.backend.Backend") + @unittest.mock.patch( + "autosklearn.pipeline.classification.SimpleClassificationPipeline" + ) def test_subsample_indices_regression(self, mock, backend_mock): configuration = unittest.mock.Mock(spec=Configuration) queue_ = multiprocessing.Queue() backend_mock.temporary_directory = tempfile.gettempdir() - evaluator = TrainEvaluator(backend_mock, queue_, - port=self.port, - configuration=configuration, - resampling_strategy='cv', - resampling_strategy_args={'folds': 10}, - metric=accuracy, - additional_components=dict(), - ) + evaluator = TrainEvaluator( + backend_mock, + queue_, + port=self.port, + configuration=configuration, + resampling_strategy="cv", + resampling_strategy_args={"folds": 10}, + metric=accuracy, + additional_components=dict(), + ) train_indices = np.arange(69, dtype=int) - train_indices3 = subsample_indices(train_indices, subsample=0.4347, - task_type=evaluator.task_type, - Y_train=evaluator.Y_train) + train_indices3 = subsample_indices( + train_indices, + subsample=0.4347, + task_type=evaluator.task_type, + Y_train=evaluator.Y_train, + ) evaluator.subsample = 67 - train_indices4 = subsample_indices(train_indices, subsample=0.4347, - task_type=evaluator.task_type, - Y_train=evaluator.Y_train) + train_indices4 = subsample_indices( + train_indices, + subsample=0.4347, + task_type=evaluator.task_type, + Y_train=evaluator.Y_train, + ) # Common cases for ti in train_indices3: self.assertIn(ti, train_indices4) # Corner cases self.assertRaisesRegex( - ValueError, 'train_size=0.0 should be either positive and smaller than the ' - r'number of samples 69 or a float in the \(0, 1\) range', - subsample_indices, train_indices, 0.0, - evaluator.task_type, evaluator.Y_train) + ValueError, + "train_size=0.0 should be either positive and smaller than the " + r"number of samples 69 or a float in the \(0, 1\) range", + subsample_indices, + train_indices, + 0.0, + evaluator.task_type, + evaluator.Y_train, + ) self.assertRaisesRegex( - ValueError, 'Subsample must not be larger than 1, but is 1.000100', - subsample_indices, train_indices, 1.0001, - evaluator.task_type, evaluator.Y_train) + ValueError, + "Subsample must not be larger than 1, but is 1.000100", + subsample_indices, + train_indices, + 1.0001, + evaluator.task_type, + evaluator.Y_train, + ) # With equal or greater it should return a non-shuffled array of indices - train_indices6 = subsample_indices(train_indices, 1.0, evaluator.task_type, - evaluator.Y_train) + train_indices6 = subsample_indices( + train_indices, 1.0, evaluator.task_type, evaluator.Y_train + ) np.testing.assert_allclose(train_indices6, train_indices) - @unittest.mock.patch('autosklearn.pipeline.classification.SimpleClassificationPipeline') + @unittest.mock.patch( + "autosklearn.pipeline.classification.SimpleClassificationPipeline" + ) def test_predict_proba_binary_classification(self, mock): D = get_binary_classification_datamanager() self.backend_mock.load_datamanager.return_value = D @@ -781,30 +1034,38 @@ def test_predict_proba_binary_classification(self, mock): configuration = unittest.mock.Mock(spec=Configuration) queue_ = multiprocessing.Queue() - evaluator = TrainEvaluator(self.backend_mock, queue_, - port=self.port, - configuration=configuration, - resampling_strategy='cv', - resampling_strategy_args={'folds': 10}, - output_y_hat_optimization=False, - metric=accuracy, - 
additional_components=dict(), - ) + evaluator = TrainEvaluator( + self.backend_mock, + queue_, + port=self.port, + configuration=configuration, + resampling_strategy="cv", + resampling_strategy_args={"folds": 10}, + output_y_hat_optimization=False, + metric=accuracy, + additional_components=dict(), + ) evaluator.fit_predict_and_loss() Y_optimization_pred = self.backend_mock.save_numrun_to_dir.call_args_list[0][1][ - 'ensemble_predictions'] + "ensemble_predictions" + ] for i in range(7): self.assertEqual(0.9, Y_optimization_pred[i][1]) - @unittest.mock.patch.object(TrainEvaluator, 'file_output') - @unittest.mock.patch.object(TrainEvaluator, '_partial_fit_and_predict_standard') - @unittest.mock.patch('autosklearn.automl_common.common.utils.backend.Backend') - @unittest.mock.patch('autosklearn.pipeline.classification.SimpleClassificationPipeline') + @unittest.mock.patch.object(TrainEvaluator, "file_output") + @unittest.mock.patch.object(TrainEvaluator, "_partial_fit_and_predict_standard") + @unittest.mock.patch("autosklearn.automl_common.common.utils.backend.Backend") + @unittest.mock.patch( + "autosklearn.pipeline.classification.SimpleClassificationPipeline" + ) def test_fit_predict_and_loss_standard_additional_run_info( - self, mock, backend_mock, _partial_fit_and_predict_mock, - file_output_mock, + self, + mock, + backend_mock, + _partial_fit_and_predict_mock, + file_output_mock, ): D = get_binary_classification_datamanager() backend_mock.load_datamanager.return_value = D @@ -815,7 +1076,7 @@ def test_fit_predict_and_loss_standard_additional_run_info( np.array([[0.1, 0.9]] * 23), np.array([[0.1, 0.9]] * 25), np.array([[0.1, 0.9]] * 6), - {'a': 5}, + {"a": 5}, ) file_output_mock.return_value = (None, {}) @@ -823,10 +1084,11 @@ def test_fit_predict_and_loss_standard_additional_run_info( queue_ = multiprocessing.Queue() evaluator = TrainEvaluator( - backend_mock, queue_, + backend_mock, + queue_, port=self.port, configuration=configuration, - resampling_strategy='holdout', + resampling_strategy="holdout", output_y_hat_optimization=False, metric=accuracy, additional_components=dict(), @@ -840,8 +1102,8 @@ def test_fit_predict_and_loss_standard_additional_run_info( rval = evaluator.fit_predict_and_loss(iterative=False) self.assertIsNone(rval) element = queue_.get() - self.assertEqual(element['status'], StatusType.SUCCESS) - self.assertEqual(element['additional_run_info']['a'], 5) + self.assertEqual(element["status"], StatusType.SUCCESS) + self.assertEqual(element["additional_run_info"]["a"], 5) self.assertEqual(_partial_fit_and_predict_mock.call_count, 1) class SideEffect(object): @@ -856,7 +1118,7 @@ def __call__(self, *args, **kwargs): np.array([[0.1, 0.9]] * 35), np.array([[0.1, 0.9]] * 25), np.array([[0.1, 0.9]] * 6), - {'a': 5} + {"a": 5}, ) else: return ( @@ -864,15 +1126,17 @@ def __call__(self, *args, **kwargs): np.array([[0.1, 0.9]] * 34), np.array([[0.1, 0.9]] * 25), np.array([[0.1, 0.9]] * 6), - {'a': 5} + {"a": 5}, ) + _partial_fit_and_predict_mock.side_effect = SideEffect() evaluator = TrainEvaluator( - backend_mock, queue_, + backend_mock, + queue_, port=self.port, configuration=configuration, - resampling_strategy='cv', - resampling_strategy_args={'folds': 2}, + resampling_strategy="cv", + resampling_strategy_args={"folds": 2}, output_y_hat_optimization=False, metric=accuracy, additional_components=dict(), @@ -885,28 +1149,34 @@ def __call__(self, *args, **kwargs): self.assertRaisesRegex( TAEAbortException, - 'Found additional run info "{\'a\': 5}" in fold 1, ' - 'but cannot 
handle additional run info if fold >= 1.', + "Found additional run info \"{'a': 5}\" in fold 1, " + "but cannot handle additional run info if fold >= 1.", evaluator.fit_predict_and_loss, - iterative=False + iterative=False, ) - @unittest.mock.patch.object(TrainEvaluator, '_loss') - @unittest.mock.patch.object(TrainEvaluator, 'finish_up') - @unittest.mock.patch('autosklearn.automl_common.common.utils.backend.Backend') - @unittest.mock.patch('autosklearn.pipeline.classification.SimpleClassificationPipeline') + @unittest.mock.patch.object(TrainEvaluator, "_loss") + @unittest.mock.patch.object(TrainEvaluator, "finish_up") + @unittest.mock.patch("autosklearn.automl_common.common.utils.backend.Backend") + @unittest.mock.patch( + "autosklearn.pipeline.classification.SimpleClassificationPipeline" + ) def test_fit_predict_and_loss_iterative_additional_run_info( - self, mock, backend_mock, finish_up_mock, loss_mock, + self, + mock, + backend_mock, + finish_up_mock, + loss_mock, ): - class Counter: counter = 0 def __call__(self): self.counter += 1 return False if self.counter <= 1 else True + mock.estimator_supports_iterative_fit.return_value = True - mock.fit_transformer.return_value = ('Xt', {}) + mock.fit_transformer.return_value = ("Xt", {}) mock.configuration_fully_fitted.side_effect = Counter() mock.get_current_iter.side_effect = Counter() mock.get_max_iter.return_value = 1 @@ -922,10 +1192,11 @@ def __call__(self): queue_ = multiprocessing.Queue() evaluator = TrainEvaluator( - backend_mock, queue_, + backend_mock, + queue_, port=self.port, configuration=configuration, - resampling_strategy='holdout', + resampling_strategy="holdout", output_y_hat_optimization=False, metric=accuracy, budget=0.0, @@ -938,17 +1209,23 @@ def __call__(self): rval = evaluator.fit_predict_and_loss(iterative=True) self.assertIsNone(rval) self.assertEqual(finish_up_mock.call_count, 1) - self.assertEqual(finish_up_mock.call_args[1]['additional_run_info'], 14678) - - @unittest.mock.patch.object(TrainEvaluator, '_loss') - @unittest.mock.patch.object(TrainEvaluator, 'finish_up') - @unittest.mock.patch('autosklearn.automl_common.common.utils.backend.Backend') - @unittest.mock.patch('autosklearn.pipeline.classification.SimpleClassificationPipeline') + self.assertEqual(finish_up_mock.call_args[1]["additional_run_info"], 14678) + + @unittest.mock.patch.object(TrainEvaluator, "_loss") + @unittest.mock.patch.object(TrainEvaluator, "finish_up") + @unittest.mock.patch("autosklearn.automl_common.common.utils.backend.Backend") + @unittest.mock.patch( + "autosklearn.pipeline.classification.SimpleClassificationPipeline" + ) def test_fit_predict_and_loss_iterative_noniterativemodel_additional_run_info( - self, mock, backend_mock, finish_up_mock, loss_mock, + self, + mock, + backend_mock, + finish_up_mock, + loss_mock, ): mock.estimator_supports_iterative_fit.return_value = False - mock.fit_transformer.return_value = ('Xt', {}) + mock.fit_transformer.return_value = ("Xt", {}) mock.get_additional_run_info.return_value = 14678 loss_mock.return_value = 0.5 @@ -961,10 +1238,11 @@ def test_fit_predict_and_loss_iterative_noniterativemodel_additional_run_info( queue_ = multiprocessing.Queue() evaluator = TrainEvaluator( - backend_mock, queue_, + backend_mock, + queue_, port=self.port, configuration=configuration, - resampling_strategy='holdout', + resampling_strategy="holdout", output_y_hat_optimization=False, metric=accuracy, additional_components=dict(), @@ -977,14 +1255,20 @@ def 
test_fit_predict_and_loss_iterative_noniterativemodel_additional_run_info( rval = evaluator.fit_predict_and_loss(iterative=True) self.assertIsNone(rval) self.assertEqual(finish_up_mock.call_count, 1) - self.assertEqual(finish_up_mock.call_args[1]['additional_run_info'], 14678) - - @unittest.mock.patch.object(TrainEvaluator, '_loss') - @unittest.mock.patch.object(TrainEvaluator, 'finish_up') - @unittest.mock.patch('autosklearn.automl_common.common.utils.backend.Backend') - @unittest.mock.patch('autosklearn.pipeline.classification.SimpleClassificationPipeline') + self.assertEqual(finish_up_mock.call_args[1]["additional_run_info"], 14678) + + @unittest.mock.patch.object(TrainEvaluator, "_loss") + @unittest.mock.patch.object(TrainEvaluator, "finish_up") + @unittest.mock.patch("autosklearn.automl_common.common.utils.backend.Backend") + @unittest.mock.patch( + "autosklearn.pipeline.classification.SimpleClassificationPipeline" + ) def test_fit_predict_and_loss_budget_additional_run_info( - self, mock, backend_mock, finish_up_mock, loss_mock, + self, + mock, + backend_mock, + finish_up_mock, + loss_mock, ): class Counter: counter = 0 @@ -992,12 +1276,13 @@ class Counter: def __call__(self): self.counter += 1 return False if self.counter <= 1 else True + mock.configuration_fully_fitted.side_effect = Counter() mock.get_current_iter.side_effect = Counter() mock.get_max_iter.return_value = 1 mock.estimator_supports_iterative_fit.return_value = True - mock.fit_transformer.return_value = ('Xt', {}) - mock.get_additional_run_info.return_value = {'val': 14678} + mock.fit_transformer.return_value = ("Xt", {}) + mock.get_additional_run_info.return_value = {"val": 14678} mock.get_max_iter.return_value = 512 loss_mock.return_value = 0.5 @@ -1010,13 +1295,14 @@ def __call__(self): queue_ = multiprocessing.Queue() evaluator = TrainEvaluator( - backend_mock, queue_, + backend_mock, + queue_, port=self.port, configuration=configuration, - resampling_strategy='holdout', + resampling_strategy="holdout", output_y_hat_optimization=False, metric=accuracy, - budget_type='iterations', + budget_type="iterations", budget=50, additional_components=dict(), ) @@ -1028,18 +1314,26 @@ def __call__(self): rval = evaluator.fit_predict_and_loss(iterative=False) self.assertIsNone(rval) self.assertEqual(finish_up_mock.call_count, 1) - self.assertEqual(finish_up_mock.call_args[1]['additional_run_info'], {'val': 14678}) + self.assertEqual( + finish_up_mock.call_args[1]["additional_run_info"], {"val": 14678} + ) - @unittest.mock.patch.object(TrainEvaluator, '_loss') - @unittest.mock.patch.object(TrainEvaluator, 'finish_up') - @unittest.mock.patch('autosklearn.automl_common.common.utils.backend.Backend') - @unittest.mock.patch('autosklearn.pipeline.classification.SimpleClassificationPipeline') + @unittest.mock.patch.object(TrainEvaluator, "_loss") + @unittest.mock.patch.object(TrainEvaluator, "finish_up") + @unittest.mock.patch("autosklearn.automl_common.common.utils.backend.Backend") + @unittest.mock.patch( + "autosklearn.pipeline.classification.SimpleClassificationPipeline" + ) def test_fit_predict_and_loss_budget_2_additional_run_info( - self, mock, backend_mock, finish_up_mock, loss_mock, + self, + mock, + backend_mock, + finish_up_mock, + loss_mock, ): mock.estimator_supports_iterative_fit.return_value = False - mock.fit_transformer.return_value = ('Xt', {}) - mock.get_additional_run_info.return_value = {'val': 14678} + mock.fit_transformer.return_value = ("Xt", {}) + mock.get_additional_run_info.return_value = {"val": 14678} 
loss_mock.return_value = 0.5 D = get_binary_classification_datamanager() @@ -1051,13 +1345,14 @@ def test_fit_predict_and_loss_budget_2_additional_run_info( queue_ = multiprocessing.Queue() evaluator = TrainEvaluator( - backend_mock, queue_, + backend_mock, + queue_, port=self.port, configuration=configuration, - resampling_strategy='holdout', + resampling_strategy="holdout", output_y_hat_optimization=False, metric=accuracy, - budget_type='subsample', + budget_type="subsample", budget=50, additional_components=dict(), ) @@ -1069,7 +1364,9 @@ def test_fit_predict_and_loss_budget_2_additional_run_info( rval = evaluator.fit_predict_and_loss(iterative=False) self.assertIsNone(rval) self.assertEqual(finish_up_mock.call_count, 1) - self.assertEqual(finish_up_mock.call_args[1]['additional_run_info'], {'val': 14678}) + self.assertEqual( + finish_up_mock.call_args[1]["additional_run_info"], {"val": 14678} + ) def test_get_results(self): queue_ = multiprocessing.Queue() @@ -1082,33 +1379,39 @@ def test_get_results(self): def test_datasets(self): for getter in get_dataset_getters(): - testname = '%s_%s' % (os.path.basename(__file__). - replace('.pyc', '').replace('.py', ''), - getter.__name__) + testname = "%s_%s" % ( + os.path.basename(__file__).replace(".pyc", "").replace(".py", ""), + getter.__name__, + ) with self.subTest(testname): D = getter() D_ = copy.deepcopy(D) - y = D.data['Y_train'] + y = D.data["Y_train"] if len(y.shape) == 2 and y.shape[1] == 1: - D_.data['Y_train'] = y.flatten() + D_.data["Y_train"] = y.flatten() self.backend_mock.load_datamanager.return_value = D_ queue_ = multiprocessing.Queue() - metric_lookup = {MULTILABEL_CLASSIFICATION: f1_macro, - BINARY_CLASSIFICATION: accuracy, - MULTICLASS_CLASSIFICATION: accuracy, - REGRESSION: r2} - evaluator = TrainEvaluator(self.backend_mock, queue_, - port=self.port, - resampling_strategy='cv', - resampling_strategy_args={'folds': 2}, - output_y_hat_optimization=False, - metric=metric_lookup[D.info['task']], - additional_components=dict(),) + metric_lookup = { + MULTILABEL_CLASSIFICATION: f1_macro, + BINARY_CLASSIFICATION: accuracy, + MULTICLASS_CLASSIFICATION: accuracy, + REGRESSION: r2, + } + evaluator = TrainEvaluator( + self.backend_mock, + queue_, + port=self.port, + resampling_strategy="cv", + resampling_strategy_args={"folds": 2}, + output_y_hat_optimization=False, + metric=metric_lookup[D.info["task"]], + additional_components=dict(), + ) evaluator.fit_predict_and_loss() rval = evaluator.queue.get(timeout=1) - self.assertTrue(np.isfinite(rval['loss'])) + self.assertTrue(np.isfinite(rval["loss"])) ############################################################################ # Test obtaining a splitter object from scikit-learn @@ -1122,147 +1425,142 @@ def test_get_splitter(self, te_mock): # holdout, binary classification evaluator = TrainEvaluator() - evaluator.resampling_strategy = 'holdout' + evaluator.resampling_strategy = "holdout" evaluator.resampling_strategy_args = {} cv = evaluator.get_splitter(D) - self.assertIsInstance(cv, - sklearn.model_selection.StratifiedShuffleSplit) + self.assertIsInstance(cv, sklearn.model_selection.StratifiedShuffleSplit) # holdout, binary classification, no shuffle evaluator = TrainEvaluator() - evaluator.resampling_strategy = 'holdout' - evaluator.resampling_strategy_args = {'shuffle': False} + evaluator.resampling_strategy = "holdout" + evaluator.resampling_strategy_args = {"shuffle": False} cv = evaluator.get_splitter(D) - self.assertIsInstance(cv, - sklearn.model_selection.PredefinedSplit) + 
self.assertIsInstance(cv, sklearn.model_selection.PredefinedSplit) # holdout, binary classification, fallback to custom shuffle split - D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1, 2]) + D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1, 2]) evaluator = TrainEvaluator() - evaluator.resampling_strategy = 'holdout' + evaluator.resampling_strategy = "holdout" evaluator.resampling_strategy_args = {} cv = evaluator.get_splitter(D) - self.assertIsInstance(cv, - autosklearn.evaluation.splitter.CustomStratifiedShuffleSplit) + self.assertIsInstance( + cv, autosklearn.evaluation.splitter.CustomStratifiedShuffleSplit + ) # cv, binary classification - D.data['Y_train'] = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) + D.data["Y_train"] = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) evaluator = TrainEvaluator() - evaluator.resampling_strategy = 'cv' - evaluator.resampling_strategy_args = {'folds': 5} + evaluator.resampling_strategy = "cv" + evaluator.resampling_strategy_args = {"folds": 5} cv = evaluator.get_splitter(D) - self.assertIsInstance(cv, - sklearn.model_selection._split.StratifiedKFold) + self.assertIsInstance(cv, sklearn.model_selection._split.StratifiedKFold) # cv, binary classification, shuffle is True - D.data['Y_train'] = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) + D.data["Y_train"] = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) evaluator = TrainEvaluator() - evaluator.resampling_strategy = 'cv' - evaluator.resampling_strategy_args = {'folds': 5} + evaluator.resampling_strategy = "cv" + evaluator.resampling_strategy_args = {"folds": 5} cv = evaluator.get_splitter(D) - self.assertIsInstance(cv, - sklearn.model_selection._split.StratifiedKFold) + self.assertIsInstance(cv, sklearn.model_selection._split.StratifiedKFold) self.assertTrue(cv.shuffle) # cv, binary classification, shuffle is False - D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1]) + D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1]) evaluator = TrainEvaluator() - evaluator.resampling_strategy = 'cv' - evaluator.resampling_strategy_args = {'folds': 5, 'shuffle': False} + evaluator.resampling_strategy = "cv" + evaluator.resampling_strategy_args = {"folds": 5, "shuffle": False} cv = evaluator.get_splitter(D) - self.assertIsInstance(cv, - sklearn.model_selection._split.KFold) + self.assertIsInstance(cv, sklearn.model_selection._split.KFold) self.assertFalse(cv.shuffle) # cv, binary classification, fallback to custom splitter - D.data['Y_train'] = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2]) + D.data["Y_train"] = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2]) evaluator = TrainEvaluator() - evaluator.resampling_strategy = 'cv' - evaluator.resampling_strategy_args = {'folds': 5} + evaluator.resampling_strategy = "cv" + evaluator.resampling_strategy_args = {"folds": 5} cv = evaluator.get_splitter(D) - self.assertIsInstance(cv, - autosklearn.evaluation.splitter.CustomStratifiedKFold) + self.assertIsInstance(cv, autosklearn.evaluation.splitter.CustomStratifiedKFold) # regression, shuffle split - D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) - D.info['task'] = REGRESSION + D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) + D.info["task"] = REGRESSION evaluator = TrainEvaluator() - evaluator.resampling_strategy = 'holdout' + evaluator.resampling_strategy = "holdout" evaluator.resampling_strategy_args = {} cv = evaluator.get_splitter(D) - self.assertIsInstance(cv, - sklearn.model_selection._split.ShuffleSplit) + self.assertIsInstance(cv, sklearn.model_selection._split.ShuffleSplit) # regression, no shuffle - D.data['Y_train'] = 
np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) - D.info['task'] = REGRESSION + D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) + D.info["task"] = REGRESSION evaluator = TrainEvaluator() - evaluator.resampling_strategy = 'holdout' - evaluator.resampling_strategy_args = {'shuffle': False} + evaluator.resampling_strategy = "holdout" + evaluator.resampling_strategy_args = {"shuffle": False} cv = evaluator.get_splitter(D) - self.assertIsInstance(cv, - sklearn.model_selection._split.PredefinedSplit) + self.assertIsInstance(cv, sklearn.model_selection._split.PredefinedSplit) # regression cv, KFold - D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) - D.info['task'] = REGRESSION + D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) + D.info["task"] = REGRESSION evaluator = TrainEvaluator() - evaluator.resampling_strategy = 'cv' - evaluator.resampling_strategy_args = {'folds': 5} + evaluator.resampling_strategy = "cv" + evaluator.resampling_strategy_args = {"folds": 5} cv = evaluator.get_splitter(D) self.assertIsInstance(cv, sklearn.model_selection._split.KFold) self.assertTrue(cv.shuffle) # regression cv, KFold, no shuffling - D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) - D.info['task'] = REGRESSION + D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) + D.info["task"] = REGRESSION evaluator = TrainEvaluator() - evaluator.resampling_strategy = 'cv' - evaluator.resampling_strategy_args = {'folds': 5, 'shuffle': False} + evaluator.resampling_strategy = "cv" + evaluator.resampling_strategy_args = {"folds": 5, "shuffle": False} cv = evaluator.get_splitter(D) self.assertIsInstance(cv, sklearn.model_selection._split.KFold) self.assertFalse(cv.shuffle) # multioutput regression, shuffle split - D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], - [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]) - D.info['task'] = MULTIOUTPUT_REGRESSION + D.data["Y_train"] = np.array( + [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]] + ) + D.info["task"] = MULTIOUTPUT_REGRESSION evaluator = TrainEvaluator() - evaluator.resampling_strategy = 'holdout' + evaluator.resampling_strategy = "holdout" evaluator.resampling_strategy_args = {} cv = evaluator.get_splitter(D) - self.assertIsInstance(cv, - sklearn.model_selection._split.ShuffleSplit) + self.assertIsInstance(cv, sklearn.model_selection._split.ShuffleSplit) # multioutput regression, no shuffle - D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], - [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]) - D.info['task'] = MULTIOUTPUT_REGRESSION + D.data["Y_train"] = np.array( + [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]] + ) + D.info["task"] = MULTIOUTPUT_REGRESSION evaluator = TrainEvaluator() - evaluator.resampling_strategy = 'holdout' - evaluator.resampling_strategy_args = {'shuffle': False} + evaluator.resampling_strategy = "holdout" + evaluator.resampling_strategy_args = {"shuffle": False} cv = evaluator.get_splitter(D) - self.assertIsInstance(cv, - sklearn.model_selection._split.PredefinedSplit) + self.assertIsInstance(cv, sklearn.model_selection._split.PredefinedSplit) # multioutput regression cv, KFold - D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], - [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]) - D.info['task'] = MULTIOUTPUT_REGRESSION + D.data["Y_train"] = np.array( + [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]] + ) + D.info["task"] = MULTIOUTPUT_REGRESSION evaluator = TrainEvaluator() - evaluator.resampling_strategy = 
'cv' - evaluator.resampling_strategy_args = {'folds': 5} + evaluator.resampling_strategy = "cv" + evaluator.resampling_strategy_args = {"folds": 5} cv = evaluator.get_splitter(D) self.assertIsInstance(cv, sklearn.model_selection._split.KFold) self.assertTrue(cv.shuffle) # multioutput regression cv, KFold, no shuffling - D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], - [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]) - D.info['task'] = MULTIOUTPUT_REGRESSION + D.data["Y_train"] = np.array( + [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]] + ) + D.info["task"] = MULTIOUTPUT_REGRESSION evaluator = TrainEvaluator() - evaluator.resampling_strategy = 'cv' - evaluator.resampling_strategy_args = {'folds': 5, 'shuffle': False} + evaluator.resampling_strategy = "cv" + evaluator.resampling_strategy_args = {"folds": 5, "shuffle": False} cv = evaluator.get_splitter(D) self.assertIsInstance(cv, sklearn.model_selection._split.KFold) self.assertFalse(cv.shuffle) @@ -1276,19 +1574,26 @@ def test_get_splitter_cv_object(self, te_mock): D.feat_type = {} # GroupKFold, classification with args - D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1]) - D.data['X_train'] = np.array([0, 0, 0, 1, 1, 1]) + D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1]) + D.data["X_train"] = np.array([0, 0, 0, 1, 1, 1]) evaluator = TrainEvaluator() evaluator.resampling_strategy = GroupKFold(n_splits=2) - evaluator.resampling_strategy_args = {'groups': np.array([1, 1, 2, 1, 2, 2])} + evaluator.resampling_strategy_args = {"groups": np.array([1, 1, 2, 1, 2, 2])} cv = evaluator.get_splitter(D) self.assertIsInstance(cv, GroupKFold) - self.assertEqual(cv.get_n_splits(groups=evaluator.resampling_strategy_args['groups']), 2) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + self.assertEqual( + cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 2 + ) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # GroupKFold, classification no args - D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1]) + D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1]) evaluator = TrainEvaluator() evaluator.resampling_strategy = GroupKFold(n_splits=2) evaluator.resampling_strategy_args = None @@ -1296,23 +1601,31 @@ def test_get_splitter_cv_object(self, te_mock): ValueError, "The 'groups' parameter should not be None", evaluator.get_splitter, - D) + D, + ) # GroupKFold, regression with args - D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) - D.info['task'] = REGRESSION + D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) + D.info["task"] = REGRESSION evaluator = TrainEvaluator() evaluator.resampling_strategy = GroupKFold(n_splits=2) - evaluator.resampling_strategy_args = {'groups': np.array([1, 1, 2, 1, 2, 2])} + evaluator.resampling_strategy_args = {"groups": np.array([1, 1, 2, 1, 2, 2])} cv = evaluator.get_splitter(D) self.assertIsInstance(cv, GroupKFold) - self.assertEqual(cv.get_n_splits(groups=evaluator.resampling_strategy_args['groups']), 2) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + self.assertEqual( + cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 2 + ) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # GroupKFold, regression no args - D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) - 
D.info['task'] = REGRESSION + D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) + D.info["task"] = REGRESSION evaluator = TrainEvaluator() evaluator.resampling_strategy = GroupKFold(n_splits=2) evaluator.resampling_strategy_args = None @@ -1320,25 +1633,35 @@ def test_get_splitter_cv_object(self, te_mock): ValueError, "The 'groups' parameter should not be None", evaluator.get_splitter, - D) + D, + ) # GroupKFold, multi-output regression with args - D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], - [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]) - D.info['task'] = MULTIOUTPUT_REGRESSION + D.data["Y_train"] = np.array( + [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]] + ) + D.info["task"] = MULTIOUTPUT_REGRESSION evaluator = TrainEvaluator() evaluator.resampling_strategy = GroupKFold(n_splits=2) - evaluator.resampling_strategy_args = {'groups': np.array([1, 1, 2, 1, 2, 2])} + evaluator.resampling_strategy_args = {"groups": np.array([1, 1, 2, 1, 2, 2])} cv = evaluator.get_splitter(D) self.assertIsInstance(cv, GroupKFold) - self.assertEqual(cv.get_n_splits(groups=evaluator.resampling_strategy_args['groups']), 2) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + self.assertEqual( + cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 2 + ) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # GroupKFold, multi-output regression no args - D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], - [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]) - D.info['task'] = MULTIOUTPUT_REGRESSION + D.data["Y_train"] = np.array( + [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]] + ) + D.info["task"] = MULTIOUTPUT_REGRESSION evaluator = TrainEvaluator() evaluator.resampling_strategy = GroupKFold(n_splits=2) evaluator.resampling_strategy_args = None @@ -1346,110 +1669,154 @@ def test_get_splitter_cv_object(self, te_mock): ValueError, "The 'groups' parameter should not be None", evaluator.get_splitter, - D) + D, + ) # KFold, classification with args - D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1]) + D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1]) evaluator = TrainEvaluator() evaluator.resampling_strategy_args = None evaluator.resampling_strategy = KFold(n_splits=4, shuffle=True, random_state=5) cv = evaluator.get_splitter(D) self.assertIsInstance(cv, KFold) - self.assertEqual(cv.get_n_splits( - groups=evaluator.resampling_strategy_args['groups']), 4) + self.assertEqual( + cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 4 + ) self.assertTrue(cv.shuffle) self.assertEqual(cv.random_state, 5) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # KFold, classification no args - D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1]) + D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1]) evaluator = TrainEvaluator() evaluator.resampling_strategy_args = None evaluator.resampling_strategy = KFold(n_splits=3) cv = evaluator.get_splitter(D) self.assertIsInstance(cv, KFold) - self.assertEqual(cv.get_n_splits( - groups=evaluator.resampling_strategy_args['groups']), 3) + self.assertEqual( + cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 3 + ) self.assertFalse(cv.shuffle) 
self.assertIsNone(cv.random_state) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # KFold, regression with args - D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) - D.info['task'] = REGRESSION + D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) + D.info["task"] = REGRESSION evaluator = TrainEvaluator() evaluator.resampling_strategy_args = None evaluator.resampling_strategy = KFold(n_splits=4, shuffle=True, random_state=5) cv = evaluator.get_splitter(D) self.assertIsInstance(cv, KFold) - self.assertEqual(cv.get_n_splits( - groups=evaluator.resampling_strategy_args['groups']), 4) + self.assertEqual( + cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 4 + ) self.assertTrue(cv.shuffle) self.assertEqual(cv.random_state, 5) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # KFold, regression no args - D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) - D.info['task'] = REGRESSION + D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) + D.info["task"] = REGRESSION evaluator = TrainEvaluator() evaluator.resampling_strategy_args = None evaluator.resampling_strategy = KFold(n_splits=3) cv = evaluator.get_splitter(D) self.assertIsInstance(cv, KFold) - self.assertEqual(cv.get_n_splits( - groups=evaluator.resampling_strategy_args['groups']), 3) + self.assertEqual( + cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 3 + ) self.assertFalse(cv.shuffle) self.assertIsNone(cv.random_state) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # KFold, multi-output regression with args - D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], - [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]) - D.info['task'] = MULTIOUTPUT_REGRESSION + D.data["Y_train"] = np.array( + [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]] + ) + D.info["task"] = MULTIOUTPUT_REGRESSION evaluator = TrainEvaluator() evaluator.resampling_strategy_args = None evaluator.resampling_strategy = KFold(n_splits=4, shuffle=True, random_state=5) cv = evaluator.get_splitter(D) self.assertIsInstance(cv, KFold) - self.assertEqual(cv.get_n_splits( - groups=evaluator.resampling_strategy_args['groups']), 4) + self.assertEqual( + cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 4 + ) self.assertTrue(cv.shuffle) self.assertEqual(cv.random_state, 5) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # KFold, multi-output regression no args - D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], - [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]) - D.info['task'] = MULTIOUTPUT_REGRESSION + D.data["Y_train"] = np.array( + [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]] + ) + D.info["task"] = MULTIOUTPUT_REGRESSION evaluator = TrainEvaluator() evaluator.resampling_strategy_args = None 
evaluator.resampling_strategy = KFold(n_splits=3) cv = evaluator.get_splitter(D) self.assertIsInstance(cv, KFold) - self.assertEqual(cv.get_n_splits( - groups=evaluator.resampling_strategy_args['groups']), 3) + self.assertEqual( + cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 3 + ) self.assertFalse(cv.shuffle) self.assertIsNone(cv.random_state) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # LeaveOneGroupOut, classification with args - D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1]) + D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1]) evaluator = TrainEvaluator() evaluator.resampling_strategy = LeaveOneGroupOut() - evaluator.resampling_strategy_args = {'groups': np.array([1, 1, 2, 1, 2, 2])} + evaluator.resampling_strategy_args = {"groups": np.array([1, 1, 2, 1, 2, 2])} cv = evaluator.get_splitter(D) self.assertIsInstance(cv, LeaveOneGroupOut) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # LeaveOneGroupOut, classification no args - D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1]) + D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1]) evaluator = TrainEvaluator() evaluator.resampling_strategy = LeaveOneGroupOut() evaluator.resampling_strategy_args = None @@ -1457,22 +1824,28 @@ def test_get_splitter_cv_object(self, te_mock): ValueError, "The 'groups' parameter should not be None", evaluator.get_splitter, - D) + D, + ) # LeaveOneGroupOut, regression with args - D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) - D.info['task'] = REGRESSION + D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) + D.info["task"] = REGRESSION evaluator = TrainEvaluator() evaluator.resampling_strategy = LeaveOneGroupOut() - evaluator.resampling_strategy_args = {'groups': np.array([1, 1, 2, 1, 2, 2])} + evaluator.resampling_strategy_args = {"groups": np.array([1, 1, 2, 1, 2, 2])} cv = evaluator.get_splitter(D) self.assertIsInstance(cv, LeaveOneGroupOut) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # LeaveOneGroupOut, regression no args - D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) - D.info['task'] = REGRESSION + D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) + D.info["task"] = REGRESSION evaluator = TrainEvaluator() evaluator.resampling_strategy = LeaveOneGroupOut() evaluator.resampling_strategy_args = None @@ -1480,24 +1853,32 @@ def test_get_splitter_cv_object(self, te_mock): ValueError, "The 'groups' parameter should not be None", evaluator.get_splitter, - D) + D, + ) # LeaveOneGroupOut, multi-output regression with args - D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], - [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]) - D.info['task'] = MULTIOUTPUT_REGRESSION + D.data["Y_train"] = np.array( + [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]] + ) + D.info["task"] = MULTIOUTPUT_REGRESSION evaluator = TrainEvaluator() evaluator.resampling_strategy = LeaveOneGroupOut() - evaluator.resampling_strategy_args = {'groups': np.array([1, 1, 2, 1, 2, 2])} + 
evaluator.resampling_strategy_args = {"groups": np.array([1, 1, 2, 1, 2, 2])} cv = evaluator.get_splitter(D) self.assertIsInstance(cv, LeaveOneGroupOut) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # LeaveOneGroupOut, multi-output regression no args - D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], - [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]) - D.info['task'] = MULTIOUTPUT_REGRESSION + D.data["Y_train"] = np.array( + [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]] + ) + D.info["task"] = MULTIOUTPUT_REGRESSION evaluator = TrainEvaluator() evaluator.resampling_strategy = LeaveOneGroupOut() evaluator.resampling_strategy_args = None @@ -1505,21 +1886,27 @@ def test_get_splitter_cv_object(self, te_mock): ValueError, "The 'groups' parameter should not be None", evaluator.get_splitter, - D) + D, + ) # LeavePGroupsOut, classification with args - D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1]) + D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1]) evaluator = TrainEvaluator() evaluator.resampling_strategy = LeavePGroupsOut(n_groups=1) - evaluator.resampling_strategy_args = {'groups': np.array([1, 1, 2, 1, 2, 2])} + evaluator.resampling_strategy_args = {"groups": np.array([1, 1, 2, 1, 2, 2])} cv = evaluator.get_splitter(D) self.assertIsInstance(cv, LeavePGroupsOut) self.assertEqual(cv.n_groups, 1) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # LeavePGroupsOut, classification no args - D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1]) + D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1]) evaluator = TrainEvaluator() evaluator.resampling_strategy = LeaveOneGroupOut() evaluator.resampling_strategy_args = None @@ -1527,23 +1914,29 @@ def test_get_splitter_cv_object(self, te_mock): ValueError, "The 'groups' parameter should not be None", evaluator.get_splitter, - D) + D, + ) # LeavePGroupsOut, regression with args - D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) - D.info['task'] = REGRESSION + D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) + D.info["task"] = REGRESSION evaluator = TrainEvaluator() evaluator.resampling_strategy = LeavePGroupsOut(n_groups=1) - evaluator.resampling_strategy_args = {'groups': np.array([1, 1, 2, 1, 2, 2])} + evaluator.resampling_strategy_args = {"groups": np.array([1, 1, 2, 1, 2, 2])} cv = evaluator.get_splitter(D) self.assertIsInstance(cv, LeavePGroupsOut) self.assertEqual(cv.n_groups, 1) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # LeavePGroupsOut, regression no args - D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) - D.info['task'] = REGRESSION + D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) + D.info["task"] = REGRESSION evaluator = TrainEvaluator() evaluator.resampling_strategy = LeavePGroupsOut(n_groups=1) evaluator.resampling_strategy_args = None @@ -1551,25 +1944,33 @@ def test_get_splitter_cv_object(self, te_mock): ValueError, "The 'groups' parameter should not be None", evaluator.get_splitter, - D) + D, + ) # 
LeavePGroupsOut, multi-output regression with args - D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], - [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]) - D.info['task'] = MULTIOUTPUT_REGRESSION + D.data["Y_train"] = np.array( + [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]] + ) + D.info["task"] = MULTIOUTPUT_REGRESSION evaluator = TrainEvaluator() evaluator.resampling_strategy = LeavePGroupsOut(n_groups=1) - evaluator.resampling_strategy_args = {'groups': np.array([1, 1, 2, 1, 2, 2])} + evaluator.resampling_strategy_args = {"groups": np.array([1, 1, 2, 1, 2, 2])} cv = evaluator.get_splitter(D) self.assertIsInstance(cv, LeavePGroupsOut) self.assertEqual(cv.n_groups, 1) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # LeavePGroupsOut, multi-output regression no args - D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], - [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]) - D.info['task'] = MULTIOUTPUT_REGRESSION + D.data["Y_train"] = np.array( + [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]] + ) + D.info["task"] = MULTIOUTPUT_REGRESSION evaluator = TrainEvaluator() evaluator.resampling_strategy = LeavePGroupsOut(n_groups=1) evaluator.resampling_strategy_args = None @@ -1577,384 +1978,567 @@ def test_get_splitter_cv_object(self, te_mock): ValueError, "The 'groups' parameter should not be None", evaluator.get_splitter, - D) + D, + ) # LeaveOneOut, classification - D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1]) + D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1]) evaluator = TrainEvaluator() evaluator.resampling_strategy = LeaveOneOut() evaluator.resampling_strategy_args = None cv = evaluator.get_splitter(D) self.assertIsInstance(cv, LeaveOneOut) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # LeaveOneOut, regression - D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) - D.info['task'] = REGRESSION + D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) + D.info["task"] = REGRESSION evaluator = TrainEvaluator() evaluator.resampling_strategy = LeaveOneOut() evaluator.resampling_strategy_args = None cv = evaluator.get_splitter(D) self.assertIsInstance(cv, LeaveOneOut) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # LeaveOneOut, multi-output regression - D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], - [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]) - D.info['task'] = MULTIOUTPUT_REGRESSION + D.data["Y_train"] = np.array( + [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]] + ) + D.info["task"] = MULTIOUTPUT_REGRESSION evaluator = TrainEvaluator() evaluator.resampling_strategy = LeaveOneOut() evaluator.resampling_strategy_args = None cv = evaluator.get_splitter(D) self.assertIsInstance(cv, LeaveOneOut) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # 
LeavePOut, classification with args - D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1]) + D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1]) evaluator = TrainEvaluator() evaluator.resampling_strategy_args = None evaluator.resampling_strategy = LeavePOut(p=3) cv = evaluator.get_splitter(D) self.assertIsInstance(cv, LeavePOut) self.assertEqual(cv.p, 3) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # LeavePOut, classification no args - D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1]) + D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1]) evaluator = TrainEvaluator() evaluator.resampling_strategy = LeavePOut(p=2) evaluator.resampling_strategy_args = None cv = evaluator.get_splitter(D) self.assertIsInstance(cv, LeavePOut) self.assertEqual(cv.p, 2) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # LeavePOut, regression with args - D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) - D.info['task'] = REGRESSION + D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) + D.info["task"] = REGRESSION evaluator = TrainEvaluator() evaluator.resampling_strategy_args = None evaluator.resampling_strategy = LeavePOut(p=3) cv = evaluator.get_splitter(D) self.assertIsInstance(cv, LeavePOut) self.assertEqual(cv.p, 3) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # LeavePOut, regression no args - D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) - D.info['task'] = REGRESSION + D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) + D.info["task"] = REGRESSION evaluator = TrainEvaluator() evaluator.resampling_strategy = LeavePOut(p=2) evaluator.resampling_strategy_args = None cv = evaluator.get_splitter(D) self.assertIsInstance(cv, LeavePOut) self.assertEqual(cv.p, 2) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # LeavePOut, multi-output regression with args - D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], - [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]) - D.info['task'] = MULTIOUTPUT_REGRESSION + D.data["Y_train"] = np.array( + [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]] + ) + D.info["task"] = MULTIOUTPUT_REGRESSION evaluator = TrainEvaluator() evaluator.resampling_strategy_args = None evaluator.resampling_strategy = LeavePOut(p=3) cv = evaluator.get_splitter(D) self.assertIsInstance(cv, LeavePOut) self.assertEqual(cv.p, 3) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # LeavePOut, multi-output regression no args - D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], - [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]) - D.info['task'] = MULTIOUTPUT_REGRESSION + D.data["Y_train"] = np.array( + [[0.0, 0.1], [0.2, 0.3], 
[0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]] + ) + D.info["task"] = MULTIOUTPUT_REGRESSION evaluator = TrainEvaluator() evaluator.resampling_strategy = LeavePOut(p=2) evaluator.resampling_strategy_args = None cv = evaluator.get_splitter(D) self.assertIsInstance(cv, LeavePOut) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # PredefinedSplit, classification with args - D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1]) + D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1]) evaluator = TrainEvaluator() evaluator.resampling_strategy_args = None - evaluator.resampling_strategy = PredefinedSplit(test_fold=np.array([0, 1, 0, 1, 0, 1])) + evaluator.resampling_strategy = PredefinedSplit( + test_fold=np.array([0, 1, 0, 1, 0, 1]) + ) cv = evaluator.get_splitter(D) self.assertIsInstance(cv, PredefinedSplit) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # PredefinedSplit, regression with args - D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) - D.info['task'] = REGRESSION + D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) + D.info["task"] = REGRESSION evaluator = TrainEvaluator() evaluator.resampling_strategy_args = None - evaluator.resampling_strategy = PredefinedSplit(test_fold=np.array([0, 1, 0, 1, 0, 1])) + evaluator.resampling_strategy = PredefinedSplit( + test_fold=np.array([0, 1, 0, 1, 0, 1]) + ) cv = evaluator.get_splitter(D) self.assertIsInstance(cv, PredefinedSplit) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # PredefinedSplit, multi-output regression with args - D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], - [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]) - D.info['task'] = MULTIOUTPUT_REGRESSION + D.data["Y_train"] = np.array( + [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]] + ) + D.info["task"] = MULTIOUTPUT_REGRESSION evaluator = TrainEvaluator() evaluator.resampling_strategy_args = None - evaluator.resampling_strategy = PredefinedSplit(test_fold=np.array([0, 1, 0, 1, 0, 1])) + evaluator.resampling_strategy = PredefinedSplit( + test_fold=np.array([0, 1, 0, 1, 0, 1]) + ) cv = evaluator.get_splitter(D) self.assertIsInstance(cv, PredefinedSplit) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # RepeatedKFold, classification with args - D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1]) + D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1]) evaluator = TrainEvaluator() evaluator.resampling_strategy_args = None - evaluator.resampling_strategy = RepeatedKFold(n_splits=4, n_repeats=3, random_state=5) + evaluator.resampling_strategy = RepeatedKFold( + n_splits=4, n_repeats=3, random_state=5 + ) cv = evaluator.get_splitter(D) self.assertIsInstance(cv, RepeatedKFold) - self.assertEqual(cv.get_n_splits( - groups=evaluator.resampling_strategy_args['groups']), 4*3) + self.assertEqual( + 
cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 4 * 3 + ) self.assertEqual(cv.n_repeats, 3) self.assertEqual(cv.random_state, 5) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # RepeatedKFold, classification no args - D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1]) + D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1]) evaluator = TrainEvaluator() evaluator.resampling_strategy = RepeatedKFold(n_splits=5, n_repeats=10) evaluator.resampling_strategy_args = None cv = evaluator.get_splitter(D) self.assertIsInstance(cv, RepeatedKFold) - self.assertEqual(cv.get_n_splits( - groups=evaluator.resampling_strategy_args['groups']), 5*10) + self.assertEqual( + cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 5 * 10 + ) self.assertEqual(cv.n_repeats, 10) self.assertIsNone(cv.random_state) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # RepeatedKFold, regression with args - D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) - D.info['task'] = REGRESSION + D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) + D.info["task"] = REGRESSION evaluator = TrainEvaluator() evaluator.resampling_strategy_args = None - evaluator.resampling_strategy = RepeatedKFold(n_splits=4, n_repeats=3, random_state=5) + evaluator.resampling_strategy = RepeatedKFold( + n_splits=4, n_repeats=3, random_state=5 + ) cv = evaluator.get_splitter(D) self.assertIsInstance(cv, RepeatedKFold) - self.assertEqual(cv.get_n_splits( - groups=evaluator.resampling_strategy_args['groups']), 4*3) + self.assertEqual( + cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 4 * 3 + ) self.assertEqual(cv.n_repeats, 3) self.assertEqual(cv.random_state, 5) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # RepeatedKFold, regression no args - D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) - D.info['task'] = REGRESSION + D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) + D.info["task"] = REGRESSION evaluator = TrainEvaluator() evaluator.resampling_strategy_args = None evaluator.resampling_strategy = RepeatedKFold(n_splits=5, n_repeats=10) cv = evaluator.get_splitter(D) self.assertIsInstance(cv, RepeatedKFold) - self.assertEqual(cv.get_n_splits( - groups=evaluator.resampling_strategy_args['groups']), 5*10) + self.assertEqual( + cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 5 * 10 + ) self.assertEqual(cv.n_repeats, 10) self.assertIsNone(cv.random_state) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # RepeatedKFold, multi-output regression with args - D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], - [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]) - D.info['task'] = MULTIOUTPUT_REGRESSION + D.data["Y_train"] = np.array( + [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]] 
+ ) + D.info["task"] = MULTIOUTPUT_REGRESSION evaluator = TrainEvaluator() evaluator.resampling_strategy_args = None - evaluator.resampling_strategy = RepeatedKFold(n_splits=4, n_repeats=3, random_state=5) + evaluator.resampling_strategy = RepeatedKFold( + n_splits=4, n_repeats=3, random_state=5 + ) cv = evaluator.get_splitter(D) self.assertIsInstance(cv, RepeatedKFold) - self.assertEqual(cv.get_n_splits( - groups=evaluator.resampling_strategy_args['groups']), 4*3) + self.assertEqual( + cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 4 * 3 + ) self.assertEqual(cv.n_repeats, 3) self.assertEqual(cv.random_state, 5) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # RepeatedKFold, multi-output regression no args - D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], - [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]) - D.info['task'] = MULTIOUTPUT_REGRESSION + D.data["Y_train"] = np.array( + [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]] + ) + D.info["task"] = MULTIOUTPUT_REGRESSION evaluator = TrainEvaluator() evaluator.resampling_strategy_args = None evaluator.resampling_strategy = RepeatedKFold(n_splits=5, n_repeats=10) cv = evaluator.get_splitter(D) self.assertIsInstance(cv, RepeatedKFold) - self.assertEqual(cv.get_n_splits( - groups=evaluator.resampling_strategy_args['groups']), 5*10) + self.assertEqual( + cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 5 * 10 + ) self.assertEqual(cv.n_repeats, 10) self.assertIsNone(cv.random_state) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # RepeatedStratifiedKFold, classification with args - D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1]) + D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1]) evaluator = TrainEvaluator() evaluator.resampling_strategy_args = None evaluator.resampling_strategy = RepeatedStratifiedKFold( - n_splits=2, n_repeats=3, random_state=5) + n_splits=2, n_repeats=3, random_state=5 + ) cv = evaluator.get_splitter(D) self.assertIsInstance(cv, RepeatedStratifiedKFold) - self.assertEqual(cv.get_n_splits( - groups=evaluator.resampling_strategy_args['groups']), 2*3) + self.assertEqual( + cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 2 * 3 + ) self.assertEqual(cv.n_repeats, 3) self.assertEqual(cv.random_state, 5) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # RepeatedStratifiedKFold, classification no args - D.data['Y_train'] = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) - D.data['X_train'] = D.data['Y_train'] + D.data["Y_train"] = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) + D.data["X_train"] = D.data["Y_train"] evaluator = TrainEvaluator() - evaluator.resampling_strategy = RepeatedStratifiedKFold(n_splits=5, n_repeats=10) + evaluator.resampling_strategy = RepeatedStratifiedKFold( + n_splits=5, n_repeats=10 + ) evaluator.resampling_strategy_args = None cv = evaluator.get_splitter(D) self.assertIsInstance(cv, RepeatedStratifiedKFold) - self.assertEqual(cv.get_n_splits( - 
groups=evaluator.resampling_strategy_args['groups']), 5*10) + self.assertEqual( + cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 5 * 10 + ) self.assertEqual(cv.n_repeats, 10) self.assertIsNone(cv.random_state) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # StratifiedKFold, classification with args - D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1]) - D.data['X_train'] = D.data['Y_train'] + D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1]) + D.data["X_train"] = D.data["Y_train"] evaluator = TrainEvaluator() evaluator.resampling_strategy = StratifiedKFold evaluator.resampling_strategy_args = None - evaluator.resampling_strategy = StratifiedKFold(n_splits=2, shuffle=True, random_state=5) + evaluator.resampling_strategy = StratifiedKFold( + n_splits=2, shuffle=True, random_state=5 + ) cv = evaluator.get_splitter(D) self.assertIsInstance(cv, StratifiedKFold) - self.assertEqual(cv.get_n_splits( - groups=evaluator.resampling_strategy_args['groups']), 2) + self.assertEqual( + cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 2 + ) self.assertTrue(cv.shuffle) self.assertEqual(cv.random_state, 5) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # StratifiedKFold, classification no args - D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1]) + D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1]) evaluator = TrainEvaluator() evaluator.resampling_strategy = StratifiedKFold(n_splits=3, shuffle=False) evaluator.resampling_strategy_args = None cv = evaluator.get_splitter(D) self.assertIsInstance(cv, StratifiedKFold) - self.assertEqual(cv.get_n_splits( - groups=evaluator.resampling_strategy_args['groups']), 3) + self.assertEqual( + cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 3 + ) self.assertFalse(cv.shuffle) self.assertIsNone(cv.random_state) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # TimeSeriesSplit, multi-output regression with args - D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], - [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]) - D.info['task'] = MULTIOUTPUT_REGRESSION + D.data["Y_train"] = np.array( + [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]] + ) + D.info["task"] = MULTIOUTPUT_REGRESSION evaluator = TrainEvaluator() evaluator.resampling_strategy_args = None evaluator.resampling_strategy = TimeSeriesSplit(n_splits=4, max_train_size=3) cv = evaluator.get_splitter(D) self.assertIsInstance(cv, TimeSeriesSplit) - self.assertEqual(cv.get_n_splits( - groups=evaluator.resampling_strategy_args['groups']), 4) + self.assertEqual( + cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 4 + ) self.assertEqual(cv.max_train_size, 3) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # TimeSeriesSplit, multi-output regression with args - D.data['Y_train'] = 
np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], - [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]) - D.info['task'] = MULTIOUTPUT_REGRESSION + D.data["Y_train"] = np.array( + [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]] + ) + D.info["task"] = MULTIOUTPUT_REGRESSION evaluator = TrainEvaluator() evaluator.resampling_strategy = TimeSeriesSplit(n_splits=3) evaluator.resampling_strategy_args = None cv = evaluator.get_splitter(D) self.assertIsInstance(cv, TimeSeriesSplit) - self.assertEqual(cv.get_n_splits( - groups=evaluator.resampling_strategy_args['groups']), 3) + self.assertEqual( + cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 3 + ) self.assertIsNone(cv.max_train_size) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # TimeSeriesSplit, regression with args - D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) - D.info['task'] = REGRESSION + D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) + D.info["task"] = REGRESSION evaluator = TrainEvaluator() evaluator.resampling_strategy_args = None evaluator.resampling_strategy = TimeSeriesSplit(n_splits=4, max_train_size=3) cv = evaluator.get_splitter(D) self.assertIsInstance(cv, TimeSeriesSplit) - self.assertEqual(cv.get_n_splits( - groups=evaluator.resampling_strategy_args['groups']), 4) + self.assertEqual( + cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 4 + ) self.assertEqual(cv.max_train_size, 3) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # TimeSeriesSplit, regression no args - D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) - D.info['task'] = REGRESSION + D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) + D.info["task"] = REGRESSION evaluator = TrainEvaluator() evaluator.resampling_strategy = TimeSeriesSplit(n_splits=3) evaluator.resampling_strategy_args = None cv = evaluator.get_splitter(D) self.assertIsInstance(cv, TimeSeriesSplit) - self.assertEqual(cv.get_n_splits( - groups=evaluator.resampling_strategy_args['groups']), 3) + self.assertEqual( + cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 3 + ) self.assertIsNone(cv.max_train_size) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # StratifiedKFold, classification no args - D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1]) + D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1]) evaluator = TrainEvaluator() evaluator.resampling_strategy = StratifiedKFold(n_splits=3) evaluator.resampling_strategy_args = None cv = evaluator.get_splitter(D) self.assertIsInstance(cv, StratifiedKFold) - self.assertEqual(cv.get_n_splits( - groups=evaluator.resampling_strategy_args['groups']), 3) + self.assertEqual( + cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 3 + ) self.assertFalse(cv.shuffle) self.assertIsNone(cv.random_state) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + 
groups=evaluator.resampling_strategy_args["groups"], + ) + ) # GroupShuffleSplit, classification with args - D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1]) + D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1]) evaluator = TrainEvaluator() - evaluator.resampling_strategy_args = {'groups': np.array([1, 1, 2, 1, 2, 2])} - evaluator.resampling_strategy = GroupShuffleSplit(n_splits=2, test_size=0.3, - random_state=5) + evaluator.resampling_strategy_args = {"groups": np.array([1, 1, 2, 1, 2, 2])} + evaluator.resampling_strategy = GroupShuffleSplit( + n_splits=2, test_size=0.3, random_state=5 + ) cv = evaluator.get_splitter(D) self.assertIsInstance(cv, GroupShuffleSplit) - self.assertEqual(cv.get_n_splits( - groups=evaluator.resampling_strategy_args['groups']), 2) + self.assertEqual( + cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 2 + ) self.assertEqual(cv.test_size, 0.3) self.assertEqual(cv.random_state, 5) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # GroupShuffleSplit, classification no args - D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1]) + D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1]) evaluator = TrainEvaluator() evaluator.resampling_strategy = GroupShuffleSplit(n_splits=5) evaluator.resampling_strategy_args = None @@ -1962,27 +2546,35 @@ def test_get_splitter_cv_object(self, te_mock): ValueError, "The 'groups' parameter should not be None", evaluator.get_splitter, - D) + D, + ) # GroupShuffleSplit, regression with args - D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) - D.info['task'] = REGRESSION + D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) + D.info["task"] = REGRESSION evaluator = TrainEvaluator() - evaluator.resampling_strategy_args = {'groups': np.array([1, 1, 2, 1, 2, 2])} - evaluator.resampling_strategy = GroupShuffleSplit(n_splits=2, test_size=0.3, - random_state=5) + evaluator.resampling_strategy_args = {"groups": np.array([1, 1, 2, 1, 2, 2])} + evaluator.resampling_strategy = GroupShuffleSplit( + n_splits=2, test_size=0.3, random_state=5 + ) cv = evaluator.get_splitter(D) self.assertIsInstance(cv, GroupShuffleSplit) - self.assertEqual(cv.get_n_splits( - groups=evaluator.resampling_strategy_args['groups']), 2) + self.assertEqual( + cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 2 + ) self.assertEqual(cv.test_size, 0.3) self.assertEqual(cv.random_state, 5) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # GroupShuffleSplit, regression no args - D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) - D.info['task'] = REGRESSION + D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) + D.info["task"] = REGRESSION evaluator = TrainEvaluator() evaluator.resampling_strategy = GroupShuffleSplit(n_splits=5) evaluator.resampling_strategy_args = None @@ -1990,29 +2582,39 @@ def test_get_splitter_cv_object(self, te_mock): ValueError, "The 'groups' parameter should not be None", evaluator.get_splitter, - D) + D, + ) # GroupShuffleSplit, multi-output regression with args - D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], - [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]) - D.info['task'] = MULTIOUTPUT_REGRESSION + 
D.data["Y_train"] = np.array( + [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]] + ) + D.info["task"] = MULTIOUTPUT_REGRESSION evaluator = TrainEvaluator() - evaluator.resampling_strategy_args = {'groups': np.array([1, 1, 2, 1, 2, 2])} - evaluator.resampling_strategy = GroupShuffleSplit(n_splits=2, test_size=0.3, - random_state=5) + evaluator.resampling_strategy_args = {"groups": np.array([1, 1, 2, 1, 2, 2])} + evaluator.resampling_strategy = GroupShuffleSplit( + n_splits=2, test_size=0.3, random_state=5 + ) cv = evaluator.get_splitter(D) self.assertIsInstance(cv, GroupShuffleSplit) - self.assertEqual(cv.get_n_splits( - groups=evaluator.resampling_strategy_args['groups']), 2) + self.assertEqual( + cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 2 + ) self.assertEqual(cv.test_size, 0.3) self.assertEqual(cv.random_state, 5) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # GroupShuffleSplit, multi-output regression no args - D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], - [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]) - D.info['task'] = MULTIOUTPUT_REGRESSION + D.data["Y_train"] = np.array( + [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]] + ) + D.info["task"] = MULTIOUTPUT_REGRESSION evaluator = TrainEvaluator() evaluator.resampling_strategy = GroupShuffleSplit(n_splits=5) evaluator.resampling_strategy_args = None @@ -2020,129 +2622,188 @@ def test_get_splitter_cv_object(self, te_mock): ValueError, "The 'groups' parameter should not be None", evaluator.get_splitter, - D) + D, + ) # StratifiedShuffleSplit, classification with args - D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1]) + D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1]) evaluator = TrainEvaluator() evaluator.resampling_strategy_args = None evaluator.resampling_strategy = StratifiedShuffleSplit( - n_splits=2, test_size=0.3, random_state=5) + n_splits=2, test_size=0.3, random_state=5 + ) cv = evaluator.get_splitter(D) self.assertIsInstance(cv, StratifiedShuffleSplit) - self.assertEqual(cv.get_n_splits( - groups=evaluator.resampling_strategy_args['groups']), 2) + self.assertEqual( + cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 2 + ) self.assertEqual(cv.test_size, 0.3) self.assertEqual(cv.random_state, 5) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # StratifiedShuffleSplit, classification no args - D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, - 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1]) - D.data['X_train'] = D.data['Y_train'] + D.data["Y_train"] = np.array( + [0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1] + ) + D.data["X_train"] = D.data["Y_train"] evaluator = TrainEvaluator() evaluator.resampling_strategy = StratifiedShuffleSplit(n_splits=10) evaluator.resampling_strategy_args = None cv = evaluator.get_splitter(D) self.assertIsInstance(cv, StratifiedShuffleSplit) - self.assertEqual(cv.get_n_splits( - groups=evaluator.resampling_strategy_args['groups']), 10) + self.assertEqual( + cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 10 + ) self.assertIsNone(cv.test_size) 
self.assertIsNone(cv.random_state) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # ShuffleSplit, classification with args - D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1]) - D.data['X_train'] = D.data['Y_train'] + D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1]) + D.data["X_train"] = D.data["Y_train"] evaluator = TrainEvaluator() evaluator.resampling_strategy_args = None - evaluator.resampling_strategy = ShuffleSplit(n_splits=2, test_size=0.3, random_state=5) + evaluator.resampling_strategy = ShuffleSplit( + n_splits=2, test_size=0.3, random_state=5 + ) cv = evaluator.get_splitter(D) self.assertIsInstance(cv, ShuffleSplit) - self.assertEqual(cv.get_n_splits( - groups=evaluator.resampling_strategy_args['groups']), 2) + self.assertEqual( + cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 2 + ) self.assertEqual(cv.test_size, 0.3) self.assertEqual(cv.random_state, 5) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # ShuffleSplit, classification no args - D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1]) + D.data["Y_train"] = np.array([0, 0, 0, 1, 1, 1]) evaluator = TrainEvaluator() evaluator.resampling_strategy = ShuffleSplit(n_splits=10) evaluator.resampling_strategy_args = None cv = evaluator.get_splitter(D) self.assertIsInstance(cv, ShuffleSplit) - self.assertEqual(cv.get_n_splits( - groups=evaluator.resampling_strategy_args['groups']), 10) + self.assertEqual( + cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 10 + ) self.assertIsNone(cv.test_size) self.assertIsNone(cv.random_state) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # ShuffleSplit, regression with args - D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) - D.info['task'] = REGRESSION + D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) + D.info["task"] = REGRESSION evaluator = TrainEvaluator() evaluator.resampling_strategy_args = None - evaluator.resampling_strategy = ShuffleSplit(n_splits=2, test_size=0.3, random_state=5) + evaluator.resampling_strategy = ShuffleSplit( + n_splits=2, test_size=0.3, random_state=5 + ) cv = evaluator.get_splitter(D) self.assertIsInstance(cv, ShuffleSplit) - self.assertEqual(cv.get_n_splits( - groups=evaluator.resampling_strategy_args['groups']), 2) + self.assertEqual( + cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 2 + ) self.assertEqual(cv.test_size, 0.3) self.assertEqual(cv.random_state, 5) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # ShuffleSplit, regression no args - D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) - D.info['task'] = REGRESSION + D.data["Y_train"] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) + D.info["task"] = REGRESSION evaluator = TrainEvaluator() evaluator.resampling_strategy = ShuffleSplit(n_splits=10) evaluator.resampling_strategy_args 
= None cv = evaluator.get_splitter(D) self.assertIsInstance(cv, ShuffleSplit) - self.assertEqual(cv.get_n_splits( - groups=evaluator.resampling_strategy_args['groups']), 10) + self.assertEqual( + cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 10 + ) self.assertIsNone(cv.test_size) self.assertIsNone(cv.random_state) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # ShuffleSplit, multi-output regression with args - D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], - [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]) - D.info['task'] = MULTIOUTPUT_REGRESSION + D.data["Y_train"] = np.array( + [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]] + ) + D.info["task"] = MULTIOUTPUT_REGRESSION evaluator = TrainEvaluator() evaluator.resampling_strategy_args = None - evaluator.resampling_strategy = ShuffleSplit(n_splits=2, test_size=0.3, random_state=5) + evaluator.resampling_strategy = ShuffleSplit( + n_splits=2, test_size=0.3, random_state=5 + ) cv = evaluator.get_splitter(D) self.assertIsInstance(cv, ShuffleSplit) - self.assertEqual(cv.get_n_splits( - groups=evaluator.resampling_strategy_args['groups']), 2) + self.assertEqual( + cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 2 + ) self.assertEqual(cv.test_size, 0.3) self.assertEqual(cv.random_state, 5) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) # ShuffleSplit, multi-output regression no args - D.data['Y_train'] = np.array([[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], - [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]]) - D.info['task'] = MULTIOUTPUT_REGRESSION + D.data["Y_train"] = np.array( + [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5], [1.0, 1.1], [1.2, 1.3], [1.4, 1.5]] + ) + D.info["task"] = MULTIOUTPUT_REGRESSION evaluator = TrainEvaluator() evaluator.resampling_strategy = ShuffleSplit(n_splits=10) evaluator.resampling_strategy_args = None cv = evaluator.get_splitter(D) self.assertIsInstance(cv, ShuffleSplit) - self.assertEqual(cv.get_n_splits( - groups=evaluator.resampling_strategy_args['groups']), 10) + self.assertEqual( + cv.get_n_splits(groups=evaluator.resampling_strategy_args["groups"]), 10 + ) self.assertIsNone(cv.test_size) self.assertIsNone(cv.random_state) - next(cv.split(D.data['Y_train'], D.data['Y_train'], - groups=evaluator.resampling_strategy_args['groups'])) + next( + cv.split( + D.data["Y_train"], + D.data["Y_train"], + groups=evaluator.resampling_strategy_args["groups"], + ) + ) @unittest.mock.patch.object(TrainEvaluator, "__init__") def test_holdout_split_size(self, te_mock): @@ -2151,102 +2812,119 @@ def test_holdout_split_size(self, te_mock): D.feat_type = {} evaluator = TrainEvaluator() - evaluator.resampling_strategy = 'holdout' + evaluator.resampling_strategy = "holdout" # Exact Ratio D.data = dict(Y_train=np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])) D.info = dict(task=BINARY_CLASSIFICATION) - evaluator.resampling_strategy_args = {'shuffle': True, - 'train_size': 0.7} + evaluator.resampling_strategy_args = {"shuffle": True, "train_size": 0.7} cv = evaluator.get_splitter(D) self.assertEqual(cv.get_n_splits(), 1) - train_samples, test_samples = next(cv.split(D.data['Y_train'], - D.data['Y_train'])) + train_samples, 
test_samples = next( + cv.split(D.data["Y_train"], D.data["Y_train"]) + ) self.assertEqual(len(train_samples), 7) self.assertEqual(len(test_samples), 3) # No Shuffle - evaluator.resampling_strategy_args = {'shuffle': False, - 'train_size': 0.7} + evaluator.resampling_strategy_args = {"shuffle": False, "train_size": 0.7} cv = evaluator.get_splitter(D) self.assertEqual(cv.get_n_splits(), 1) - train_samples, test_samples = next(cv.split(D.data['Y_train'], - D.data['Y_train'])) + train_samples, test_samples = next( + cv.split(D.data["Y_train"], D.data["Y_train"]) + ) self.assertEqual(len(train_samples), 7) self.assertEqual(len(test_samples), 3) # Rounded Ratio D.data = dict(Y_train=np.array([0, 0, 0, 0, 0, 1, 1, 1, 1])) - evaluator.resampling_strategy_args = {'shuffle': True, - 'train_size': 0.7} + evaluator.resampling_strategy_args = {"shuffle": True, "train_size": 0.7} cv = evaluator.get_splitter(D) self.assertEqual(cv.get_n_splits(), 1) - train_samples, test_samples = next(cv.split(D.data['Y_train'], - D.data['Y_train'])) + train_samples, test_samples = next( + cv.split(D.data["Y_train"], D.data["Y_train"]) + ) self.assertEqual(len(train_samples), 6) self.assertEqual(len(test_samples), 3) # Rounded Ratio No Shuffle - evaluator.resampling_strategy_args = {'shuffle': False, - 'train_size': 0.7} + evaluator.resampling_strategy_args = {"shuffle": False, "train_size": 0.7} cv = evaluator.get_splitter(D) self.assertEqual(cv.get_n_splits(), 1) - train_samples, test_samples = next(cv.split(D.data['Y_train'], - D.data['Y_train'])) + train_samples, test_samples = next( + cv.split(D.data["Y_train"], D.data["Y_train"]) + ) self.assertEqual(len(train_samples), 6) self.assertEqual(len(test_samples), 3) # More data - evaluator.resampling_strategy_args = {'shuffle': True, - 'train_size': 0.7} + evaluator.resampling_strategy_args = {"shuffle": True, "train_size": 0.7} D.data = dict(Y_train=np.zeros((900, 1))) cv = evaluator.get_splitter(D) self.assertEqual(cv.get_n_splits(), 1) - train_samples, test_samples = next(cv.split(D.data['Y_train'], - D.data['Y_train'])) + train_samples, test_samples = next( + cv.split(D.data["Y_train"], D.data["Y_train"]) + ) self.assertEqual(len(train_samples), 630) self.assertEqual(len(test_samples), 270) - evaluator.resampling_strategy_args = {'train_size': 0.752} + evaluator.resampling_strategy_args = {"train_size": 0.752} D.data = dict(Y_train=np.zeros((900, 1))) cv = evaluator.get_splitter(D) self.assertEqual(cv.get_n_splits(), 1) - train_samples, test_samples = next(cv.split(D.data['Y_train'], - D.data['Y_train'])) + train_samples, test_samples = next( + cv.split(D.data["Y_train"], D.data["Y_train"]) + ) self.assertEqual(len(train_samples), 676) self.assertEqual(len(test_samples), 224) # Multilabel Exact Ratio - D.data = dict(Y_train=np.array([[0, 0], [0, 1], [1, 1], [1, 0], [1, 1], - [1, 1], [1, 1], [1, 0], [1, 1], [1, 1]] - )) + D.data = dict( + Y_train=np.array( + [ + [0, 0], + [0, 1], + [1, 1], + [1, 0], + [1, 1], + [1, 1], + [1, 1], + [1, 0], + [1, 1], + [1, 1], + ] + ) + ) D.info = dict(task=MULTILABEL_CLASSIFICATION) - evaluator.resampling_strategy_args = {'shuffle': True, - 'train_size': 0.7} + evaluator.resampling_strategy_args = {"shuffle": True, "train_size": 0.7} cv = evaluator.get_splitter(D) self.assertEqual(cv.get_n_splits(), 1) - train_samples, test_samples = next(cv.split(D.data['Y_train'], - D.data['Y_train'])) + train_samples, test_samples = next( + cv.split(D.data["Y_train"], D.data["Y_train"]) + ) self.assertEqual(len(train_samples), 7) 
self.assertEqual(len(test_samples), 3) # Multilabel No Shuffle - D.data = dict(Y_train=np.array([[0, 0], [0, 1], [1, 1], [1, 0], [1, 1], - [1, 1], [1, 1], [1, 0], [1, 1]])) - evaluator.resampling_strategy_args = {'shuffle': False, - 'train_size': 0.7} + D.data = dict( + Y_train=np.array( + [[0, 0], [0, 1], [1, 1], [1, 0], [1, 1], [1, 1], [1, 1], [1, 0], [1, 1]] + ) + ) + evaluator.resampling_strategy_args = {"shuffle": False, "train_size": 0.7} cv = evaluator.get_splitter(D) self.assertEqual(cv.get_n_splits(), 1) - train_samples, test_samples = next(cv.split(D.data['Y_train'], - D.data['Y_train'])) + train_samples, test_samples = next( + cv.split(D.data["Y_train"], D.data["Y_train"]) + ) self.assertEqual(len(train_samples), 6) self.assertEqual(len(test_samples), 3) @@ -2255,16 +2933,17 @@ class FunctionsTest(unittest.TestCase): def setUp(self): self.queue = multiprocessing.Queue() self.configuration = get_configuration_space( - {'task': MULTICLASS_CLASSIFICATION, - 'is_sparse': False}).get_default_configuration() + {"task": MULTICLASS_CLASSIFICATION, "is_sparse": False} + ).get_default_configuration() self.data = get_multiclass_classification_datamanager() - self.tmp_dir = os.path.join(os.path.dirname(__file__), - '.test_holdout_functions') - self.n = len(self.data.data['Y_train']) - self.y = self.data.data['Y_train'].flatten() + self.tmp_dir = os.path.join( + os.path.dirname(__file__), ".test_holdout_functions" + ) + self.n = len(self.data.data["Y_train"]) + self.y = self.data.data["Y_train"].flatten() tmp_dir_name = self.id() - self.ev_path = os.path.join(this_directory, '.tmp_evaluations', tmp_dir_name) + self.ev_path = os.path.join(this_directory, ".tmp_evaluations", tmp_dir_name) if os.path.exists(self.ev_path): shutil.rmtree(self.ev_path) os.makedirs(self.ev_path, exist_ok=False) @@ -2274,12 +2953,14 @@ def setUp(self): self.backend.get_cv_model_dir.return_value = self.ev_path dummy_model_files = [os.path.join(self.ev_path, str(n)) for n in range(100)] dummy_pred_files = [os.path.join(self.ev_path, str(n)) for n in range(100, 200)] - dummy_cv_model_files = [os.path.join(self.ev_path, str(n)) for n in range(200, 300)] + dummy_cv_model_files = [ + os.path.join(self.ev_path, str(n)) for n in range(200, 300) + ] self.backend.get_model_path.side_effect = dummy_model_files self.backend.get_cv_model_path.side_effect = dummy_cv_model_files self.backend.get_prediction_output_path.side_effect = dummy_pred_files self.backend.load_datamanager.return_value = self.data - self.dataset_name = json.dumps({'task_id': 'test'}) + self.dataset_name = json.dumps({"task_id": "test"}) self.port = logging.handlers.DEFAULT_TCP_LOGGING_PORT def tearDown(self): @@ -2292,7 +2973,7 @@ def test_eval_holdout(self): port=self.port, config=self.configuration, backend=self.backend, - resampling_strategy='holdout', + resampling_strategy="holdout", resampling_strategy_args=None, seed=1, num_run=1, @@ -2307,9 +2988,9 @@ def test_eval_holdout(self): ) info = read_queue(self.queue) self.assertEqual(len(info), 1) - self.assertAlmostEqual(info[0]['loss'], 0.030303030303030276, places=3) - self.assertEqual(info[0]['status'], StatusType.SUCCESS) - self.assertNotIn('bac_metric', info[0]['additional_run_info']) + self.assertAlmostEqual(info[0]["loss"], 0.030303030303030276, places=3) + self.assertEqual(info[0]["status"], StatusType.SUCCESS) + self.assertNotIn("bac_metric", info[0]["additional_run_info"]) def test_eval_holdout_all_loss_functions(self): eval_holdout( @@ -2317,7 +2998,7 @@ def 
test_eval_holdout_all_loss_functions(self): port=self.port, config=self.configuration, backend=self.backend, - resampling_strategy='holdout', + resampling_strategy="holdout", resampling_strategy_args=None, seed=1, num_run=1, @@ -2334,34 +3015,36 @@ def test_eval_holdout_all_loss_functions(self): self.assertEqual(len(rval), 1) fixture = { - 'accuracy': 0.030303030303030276, - 'balanced_accuracy': 0.033333333333333326, - 'f1_macro': 0.032036613272311221, - 'f1_micro': 0.030303030303030276, - 'f1_weighted': 0.030441716940572849, - 'log_loss': 0.06376745642134637, - 'precision_macro': 0.02777777777777779, - 'precision_micro': 0.030303030303030276, - 'precision_weighted': 0.027777777777777901, - 'recall_macro': 0.033333333333333326, - 'recall_micro': 0.030303030303030276, - 'recall_weighted': 0.030303030303030276, - 'num_run': 1, - 'validation_loss': 0.0, - 'test_loss': 0.04, - 'train_loss': 0.0, + "accuracy": 0.030303030303030276, + "balanced_accuracy": 0.033333333333333326, + "f1_macro": 0.032036613272311221, + "f1_micro": 0.030303030303030276, + "f1_weighted": 0.030441716940572849, + "log_loss": 0.06376745642134637, + "precision_macro": 0.02777777777777779, + "precision_micro": 0.030303030303030276, + "precision_weighted": 0.027777777777777901, + "recall_macro": 0.033333333333333326, + "recall_micro": 0.030303030303030276, + "recall_weighted": 0.030303030303030276, + "num_run": 1, + "validation_loss": 0.0, + "test_loss": 0.04, + "train_loss": 0.0, } - additional_run_info = rval[0]['additional_run_info'] + additional_run_info = rval[0]["additional_run_info"] for key, value in fixture.items(): - self.assertAlmostEqual(additional_run_info[key], fixture[key], - msg=key) - self.assertIn('duration', additional_run_info) - self.assertEqual(len(additional_run_info), len(fixture) + 1, - msg=sorted(additional_run_info.items())) + self.assertAlmostEqual(additional_run_info[key], fixture[key], msg=key) + self.assertIn("duration", additional_run_info) + self.assertEqual( + len(additional_run_info), + len(fixture) + 1, + msg=sorted(additional_run_info.items()), + ) - self.assertAlmostEqual(rval[0]['loss'], 0.030303030303030276, places=3) - self.assertEqual(rval[0]['status'], StatusType.SUCCESS) + self.assertAlmostEqual(rval[0]["loss"], 0.030303030303030276, places=3) + self.assertEqual(rval[0]["status"], StatusType.SUCCESS) def test_eval_holdout_iterative_fit_no_timeout(self): eval_iterative_holdout( @@ -2369,7 +3052,7 @@ def test_eval_holdout_iterative_fit_no_timeout(self): port=self.port, config=self.configuration, backend=self.backend, - resampling_strategy='holdout', + resampling_strategy="holdout", resampling_strategy_args=None, seed=1, num_run=1, @@ -2384,9 +3067,9 @@ def test_eval_holdout_iterative_fit_no_timeout(self): ) rval = read_queue(self.queue) self.assertEqual(len(rval), 9) - self.assertAlmostEqual(rval[-1]['loss'], 0.030303030303030276) - self.assertEqual(rval[0]['status'], StatusType.DONOTADVANCE) - self.assertEqual(rval[-1]['status'], StatusType.SUCCESS) + self.assertAlmostEqual(rval[-1]["loss"], 0.030303030303030276) + self.assertEqual(rval[0]["status"], StatusType.DONOTADVANCE) + self.assertEqual(rval[-1]["status"], StatusType.SUCCESS) def test_eval_holdout_budget_iterations(self): eval_holdout( @@ -2394,7 +3077,7 @@ def test_eval_holdout_budget_iterations(self): port=self.port, config=self.configuration, backend=self.backend, - resampling_strategy='holdout', + resampling_strategy="holdout", resampling_strategy_args=None, seed=1, num_run=1, @@ -2406,45 +3089,45 @@ def 
test_eval_holdout_budget_iterations(self): instance=self.dataset_name, metric=accuracy, budget=1, - budget_type='iterations', + budget_type="iterations", additional_components=dict(), ) info = read_queue(self.queue) self.assertEqual(len(info), 1) - self.assertAlmostEqual(info[0]['loss'], 0.06060606060606055, places=3) - self.assertEqual(info[0]['status'], StatusType.SUCCESS) - self.assertNotIn('bac_metric', info[0]['additional_run_info']) + self.assertAlmostEqual(info[0]["loss"], 0.06060606060606055, places=3) + self.assertEqual(info[0]["status"], StatusType.SUCCESS) + self.assertNotIn("bac_metric", info[0]["additional_run_info"]) def test_eval_holdout_budget_iterations_converged(self): configuration = get_configuration_space( - exclude={'classifier': ['random_forest', 'liblinear_svc']}, - info={'task': MULTICLASS_CLASSIFICATION, 'is_sparse': False}, + exclude={"classifier": ["random_forest", "liblinear_svc"]}, + info={"task": MULTICLASS_CLASSIFICATION, "is_sparse": False}, ).get_default_configuration() eval_holdout( queue=self.queue, port=self.port, config=configuration, backend=self.backend, - resampling_strategy='holdout', + resampling_strategy="holdout", resampling_strategy_args=None, seed=1, num_run=1, scoring_functions=None, output_y_hat_optimization=True, include=None, - exclude={'classifier': ['random_forest', 'liblinear_svc']}, + exclude={"classifier": ["random_forest", "liblinear_svc"]}, disable_file_output=False, instance=self.dataset_name, metric=accuracy, budget=80, - budget_type='iterations', + budget_type="iterations", additional_components=dict(), ) info = read_queue(self.queue) self.assertEqual(len(info), 1) - self.assertAlmostEqual(info[0]['loss'], 0.18181818181818177, places=3) - self.assertEqual(info[0]['status'], StatusType.DONOTADVANCE) - self.assertNotIn('bac_metric', info[0]['additional_run_info']) + self.assertAlmostEqual(info[0]["loss"], 0.18181818181818177, places=3) + self.assertEqual(info[0]["status"], StatusType.DONOTADVANCE) + self.assertNotIn("bac_metric", info[0]["additional_run_info"]) def test_eval_holdout_budget_subsample(self): eval_holdout( @@ -2452,7 +3135,7 @@ def test_eval_holdout_budget_subsample(self): port=self.port, config=self.configuration, backend=self.backend, - resampling_strategy='holdout', + resampling_strategy="holdout", resampling_strategy_args=None, seed=1, num_run=1, @@ -2464,14 +3147,14 @@ def test_eval_holdout_budget_subsample(self): instance=self.dataset_name, metric=accuracy, budget=30, - budget_type='subsample', + budget_type="subsample", additional_components=dict(), ) info = read_queue(self.queue) self.assertEqual(len(info), 1) - self.assertAlmostEqual(info[0]['loss'], 0.0) - self.assertEqual(info[0]['status'], StatusType.SUCCESS) - self.assertNotIn('bac_metric', info[0]['additional_run_info']) + self.assertAlmostEqual(info[0]["loss"], 0.0) + self.assertEqual(info[0]["status"], StatusType.SUCCESS) + self.assertNotIn("bac_metric", info[0]["additional_run_info"]) def test_eval_holdout_budget_mixed_iterations(self): print(self.configuration) @@ -2480,7 +3163,7 @@ def test_eval_holdout_budget_mixed_iterations(self): port=self.port, config=self.configuration, backend=self.backend, - resampling_strategy='holdout', + resampling_strategy="holdout", resampling_strategy_args=None, seed=1, num_run=1, @@ -2492,44 +3175,44 @@ def test_eval_holdout_budget_mixed_iterations(self): instance=self.dataset_name, metric=accuracy, budget=1, - budget_type='mixed', - additional_components=dict() + budget_type="mixed", + additional_components=dict(), ) 
info = read_queue(self.queue) self.assertEqual(len(info), 1) - self.assertAlmostEqual(info[0]['loss'], 0.06060606060606055) + self.assertAlmostEqual(info[0]["loss"], 0.06060606060606055) def test_eval_holdout_budget_mixed_subsample(self): configuration = get_configuration_space( - exclude={'classifier': ['random_forest']}, - info={'task': MULTICLASS_CLASSIFICATION, 'is_sparse': False}, + exclude={"classifier": ["random_forest"]}, + info={"task": MULTICLASS_CLASSIFICATION, "is_sparse": False}, ).get_default_configuration() - self.assertEqual(configuration['classifier:__choice__'], 'liblinear_svc') + self.assertEqual(configuration["classifier:__choice__"], "liblinear_svc") eval_holdout( queue=self.queue, port=self.port, config=configuration, backend=self.backend, - resampling_strategy='holdout', + resampling_strategy="holdout", resampling_strategy_args=None, seed=1, num_run=1, scoring_functions=None, output_y_hat_optimization=True, include=None, - exclude={'classifier': ['random_forest']}, + exclude={"classifier": ["random_forest"]}, disable_file_output=False, instance=self.dataset_name, metric=accuracy, budget=40, - budget_type='mixed', + budget_type="mixed", additional_components=dict(), ) info = read_queue(self.queue) self.assertEqual(len(info), 1) - self.assertAlmostEqual(info[0]['loss'], 0.06060606060606055) - self.assertEqual(info[0]['status'], StatusType.SUCCESS) - self.assertNotIn('bac_metric', info[0]['additional_run_info']) + self.assertAlmostEqual(info[0]["loss"], 0.06060606060606055) + self.assertEqual(info[0]["status"], StatusType.SUCCESS) + self.assertNotIn("bac_metric", info[0]["additional_run_info"]) def test_eval_cv(self): eval_cv( @@ -2539,8 +3222,8 @@ def test_eval_cv(self): backend=self.backend, seed=1, num_run=1, - resampling_strategy='cv', - resampling_strategy_args={'folds': 3}, + resampling_strategy="cv", + resampling_strategy_args={"folds": 3}, scoring_functions=None, output_y_hat_optimization=True, include=None, @@ -2552,9 +3235,9 @@ def test_eval_cv(self): ) rval = read_queue(self.queue) self.assertEqual(len(rval), 1) - self.assertAlmostEqual(rval[0]['loss'], 0.04999999999999997) - self.assertEqual(rval[0]['status'], StatusType.SUCCESS) - self.assertNotIn('bac_metric', rval[0]['additional_run_info']) + self.assertAlmostEqual(rval[0]["loss"], 0.04999999999999997) + self.assertEqual(rval[0]["status"], StatusType.SUCCESS) + self.assertNotIn("bac_metric", rval[0]["additional_run_info"]) def test_eval_cv_all_loss_functions(self): eval_cv( @@ -2564,8 +3247,8 @@ def test_eval_cv_all_loss_functions(self): backend=self.backend, seed=1, num_run=1, - resampling_strategy='cv', - resampling_strategy_args={'folds': 3}, + resampling_strategy="cv", + resampling_strategy_args={"folds": 3}, scoring_functions=SCORER_LIST, output_y_hat_optimization=True, include=None, @@ -2579,33 +3262,36 @@ def test_eval_cv_all_loss_functions(self): self.assertEqual(len(rval), 1) fixture = { - 'accuracy': 0.04999999999999997, - 'balanced_accuracy': 0.05130303030303027, - 'f1_macro': 0.052793650793650775, - 'f1_micro': 0.04999999999999997, - 'f1_weighted': 0.050090909090909096, - 'log_loss': 0.12108563414774837, - 'precision_macro': 0.04963636363636359, - 'precision_micro': 0.04999999999999997, - 'precision_weighted': 0.045757575757575664, - 'recall_macro': 0.05130303030303027, - 'recall_micro': 0.04999999999999997, - 'recall_weighted': 0.04999999999999997, - 'num_run': 1, - 'validation_loss': 0.04, - 'test_loss': 0.04, - 'train_loss': 0.0, + "accuracy": 0.04999999999999997, + "balanced_accuracy": 
0.05130303030303027, + "f1_macro": 0.052793650793650775, + "f1_micro": 0.04999999999999997, + "f1_weighted": 0.050090909090909096, + "log_loss": 0.12108563414774837, + "precision_macro": 0.04963636363636359, + "precision_micro": 0.04999999999999997, + "precision_weighted": 0.045757575757575664, + "recall_macro": 0.05130303030303027, + "recall_micro": 0.04999999999999997, + "recall_weighted": 0.04999999999999997, + "num_run": 1, + "validation_loss": 0.04, + "test_loss": 0.04, + "train_loss": 0.0, } - additional_run_info = rval[0]['additional_run_info'] + additional_run_info = rval[0]["additional_run_info"] for key, value in fixture.items(): self.assertAlmostEqual(additional_run_info[key], fixture[key], msg=key) - self.assertIn('duration', additional_run_info) - self.assertEqual(len(additional_run_info), len(fixture) + 1, - msg=sorted(additional_run_info.items())) + self.assertIn("duration", additional_run_info) + self.assertEqual( + len(additional_run_info), + len(fixture) + 1, + msg=sorted(additional_run_info.items()), + ) - self.assertAlmostEqual(rval[0]['loss'], 0.04999999999999997) - self.assertEqual(rval[0]['status'], StatusType.SUCCESS) + self.assertAlmostEqual(rval[0]["loss"], 0.04999999999999997) + self.assertEqual(rval[0]["status"], StatusType.SUCCESS) # def test_eval_cv_on_subset(self): # backend_api = backend.create(self.tmp_dir, self.tmp_dir) @@ -2619,13 +3305,15 @@ def test_eval_cv_all_loss_functions(self): # self.assertEqual(info[2], 1) def test_eval_partial_cv(self): - results = [0.050000000000000044, - 0.0, - 0.09999999999999998, - 0.09999999999999998, - 0.050000000000000044] + results = [ + 0.050000000000000044, + 0.0, + 0.09999999999999998, + 0.09999999999999998, + 0.050000000000000044, + ] for fold in range(5): - instance = json.dumps({'task_id': 'data', 'fold': fold}) + instance = json.dumps({"task_id": "data", "fold": fold}) eval_partial_cv( port=self.port, queue=self.queue, @@ -2634,8 +3322,8 @@ def test_eval_partial_cv(self): seed=1, num_run=1, instance=instance, - resampling_strategy='partial-cv', - resampling_strategy_args={'folds': 5}, + resampling_strategy="partial-cv", + resampling_strategy_args={"folds": 5}, scoring_functions=None, output_y_hat_optimization=True, include=None, @@ -2646,5 +3334,5 @@ def test_eval_partial_cv(self): ) rval = read_queue(self.queue) self.assertEqual(len(rval), 1) - self.assertAlmostEqual(rval[0]['loss'], results[fold]) - self.assertEqual(rval[0]['status'], StatusType.SUCCESS) + self.assertAlmostEqual(rval[0]["loss"], results[fold]) + self.assertEqual(rval[0]["status"], StatusType.SUCCESS) diff --git a/test/test_metalearning/__init__.py b/test/test_metalearning/__init__.py index cc3cd7becd..e298f0f075 100644 --- a/test/test_metalearning/__init__.py +++ b/test/test_metalearning/__init__.py @@ -1,2 +1,2 @@ # -*- encoding: utf-8 -*- -__author__ = 'feurerm' +__author__ = "feurerm" diff --git a/test/test_metalearning/pyMetaLearn/__init__.py b/test/test_metalearning/pyMetaLearn/__init__.py index 8f0ce6cb7c..92bf78f389 100644 --- a/test/test_metalearning/pyMetaLearn/__init__.py +++ b/test/test_metalearning/pyMetaLearn/__init__.py @@ -1 +1 @@ -__author__ = 'feurerm' +__author__ = "feurerm" diff --git a/test/test_metalearning/pyMetaLearn/metalearning/test_kND.py b/test/test_metalearning/pyMetaLearn/metalearning/test_kND.py index 6733dca93f..4877379440 100644 --- a/test/test_metalearning/pyMetaLearn/metalearning/test_kND.py +++ b/test/test_metalearning/pyMetaLearn/metalearning/test_kND.py @@ -1,8 +1,9 @@ import logging import unittest -import 
numpy as np +import numpy as np import pandas as pd + from autosklearn.metalearning.metalearning.kNearestDatasets.kND import KNearestDatasets from autosklearn.metalearning.metalearning.metrics.misc import get_random_metric @@ -11,15 +12,35 @@ class kNDTest(unittest.TestCase): _multiprocess_can_split_ = True def setUp(self): - self.anneal = pd.Series({"number_of_instances": 898., "number_of_classes": 5., - "number_of_features": 38.}, name=232) - self.krvskp = pd.Series({"number_of_instances": 3196., "number_of_classes": - 2., "number_of_features": 36.}, name=233) - self.labor = pd.Series({"number_of_instances": 57., "number_of_classes": - 2., "number_of_features": 16.}, name=234) - self.runs = {232: [0.1, 0.5, 0.7], - 233: [np.NaN, 0.1, 0.7], - 234: [0.5, 0.7, 0.1]} + self.anneal = pd.Series( + { + "number_of_instances": 898.0, + "number_of_classes": 5.0, + "number_of_features": 38.0, + }, + name=232, + ) + self.krvskp = pd.Series( + { + "number_of_instances": 3196.0, + "number_of_classes": 2.0, + "number_of_features": 36.0, + }, + name=233, + ) + self.labor = pd.Series( + { + "number_of_instances": 57.0, + "number_of_classes": 2.0, + "number_of_features": 16.0, + }, + name=234, + ) + self.runs = { + 232: [0.1, 0.5, 0.7], + 233: [np.NaN, 0.1, 0.7], + 234: [0.5, 0.7, 0.1], + } self.runs = pd.DataFrame(self.runs) self.logger = logging.getLogger() @@ -30,43 +51,47 @@ def test_fit_l1_distance(self): self.assertEqual(kND.best_configuration_per_dataset[232], 0) self.assertEqual(kND.best_configuration_per_dataset[233], 1) self.assertEqual(kND.best_configuration_per_dataset[234], 2) - self.assertTrue((kND.metafeatures == - pd.DataFrame([self.anneal, self.krvskp, self.labor])).all().all()) + self.assertTrue( + (kND.metafeatures == pd.DataFrame([self.anneal, self.krvskp, self.labor])) + .all() + .all() + ) # TODO: rename to kNearestTasks or something def test_kNearestDatasets(self): kND = KNearestDatasets(logger=self.logger) - kND.fit(pd.DataFrame([self.krvskp, self.labor]), - self.runs.loc[:, [233, 234]]) + kND.fit(pd.DataFrame([self.krvskp, self.labor]), self.runs.loc[:, [233, 234]]) neighbor = kND.kNearestDatasets(self.anneal, 1) self.assertEqual([233], neighbor) - neighbor, distance = kND.kNearestDatasets(self.anneal, 1, - return_distance=True) + neighbor, distance = kND.kNearestDatasets(self.anneal, 1, return_distance=True) self.assertEqual([233], neighbor) np.testing.assert_array_almost_equal([3.8320802803440586], distance) neighbors = kND.kNearestDatasets(self.anneal, 2) self.assertEqual([233, 234], neighbors) - neighbors, distance = kND.kNearestDatasets(self.anneal, 2, - return_distance=True) + neighbors, distance = kND.kNearestDatasets(self.anneal, 2, return_distance=True) self.assertEqual([233, 234], neighbors) - np.testing.assert_array_almost_equal([3.8320802803440586, 4.367919719655942], distance) + np.testing.assert_array_almost_equal( + [3.8320802803440586, 4.367919719655942], distance + ) neighbors = kND.kNearestDatasets(self.anneal, -1) self.assertEqual([233, 234], neighbors) - neighbors, distance = kND.kNearestDatasets(self.anneal, -1, - return_distance=True) + neighbors, distance = kND.kNearestDatasets( + self.anneal, -1, return_distance=True + ) self.assertEqual([233, 234], neighbors) - np.testing.assert_array_almost_equal([3.8320802803440586, 4.367919719655942], distance) + np.testing.assert_array_almost_equal( + [3.8320802803440586, 4.367919719655942], distance + ) self.assertRaises(ValueError, kND.kNearestDatasets, self.anneal, 0) self.assertRaises(ValueError, 
kND.kNearestDatasets, self.anneal, -2) def test_kBestSuggestions(self): kND = KNearestDatasets(logger=self.logger) - kND.fit(pd.DataFrame([self.krvskp, self.labor]), - self.runs.loc[:, [233, 234]]) + kND.fit(pd.DataFrame([self.krvskp, self.labor]), self.runs.loc[:, [233, 234]]) neighbor = kND.kBestSuggestions(self.anneal, 1) np.testing.assert_array_almost_equal( [(233, 3.8320802803440586, 1)], @@ -87,10 +112,10 @@ def test_kBestSuggestions(self): self.assertRaises(ValueError, kND.kBestSuggestions, self.anneal, -2) def test_random_metric(self): - kND = KNearestDatasets(logger=self.logger, - metric=get_random_metric(random_state=1)) - kND.fit(pd.DataFrame([self.krvskp, self.labor]), - self.runs.loc[:, [233, 234]]) + kND = KNearestDatasets( + logger=self.logger, metric=get_random_metric(random_state=1) + ) + kND.fit(pd.DataFrame([self.krvskp, self.labor]), self.runs.loc[:, [233, 234]]) distances = [] for i in range(20): neighbor = kND.kBestSuggestions(self.anneal, 1) diff --git a/test/test_metalearning/pyMetaLearn/test_meta_base.py b/test/test_metalearning/pyMetaLearn/test_meta_base.py index b1ac39ee2a..1c6788e816 100644 --- a/test/test_metalearning/pyMetaLearn/test_meta_base.py +++ b/test/test_metalearning/pyMetaLearn/test_meta_base.py @@ -14,7 +14,7 @@ class MetaBaseTest(unittest.TestCase): def setUp(self): self.cwd = os.getcwd() data_dir = os.path.dirname(__file__) - data_dir = os.path.join(data_dir, 'test_meta_base_data') + data_dir = os.path.join(data_dir, "test_meta_base_data") os.chdir(data_dir) pipeline = autosklearn.pipeline.classification.SimpleClassificationPipeline() @@ -33,33 +33,34 @@ def test_get_all_runs(self): self.assertEqual((125, 125), runs.shape) def test_get_runs(self): - runs = self.base.get_runs('233') + runs = self.base.get_runs("233") # TODO update this ASAP self.assertEqual(125, len(runs)) self.assertIsInstance(runs, pd.Series) def test_get_metafeatures_single_dataset(self): - mf = self.base.get_metafeatures('233') + mf = self.base.get_metafeatures("233") self.assertIsInstance(mf, pd.Series) - self.assertEqual(mf.name, '233') - self.assertEqual(mf.loc['NumberOfInstances'], 2142.0) + self.assertEqual(mf.name, "233") + self.assertEqual(mf.loc["NumberOfInstances"], 2142.0) def test_get_metafeatures_single_feature(self): - mf = self.base.get_metafeatures(features='NumberOfInstances') + mf = self.base.get_metafeatures(features="NumberOfInstances") self.assertIsInstance(mf, pd.Series) - self.assertEqual(mf.shape, (132, )) + self.assertEqual(mf.shape, (132,)) def test_get_metafeatures_single_dataset_and_single_feature(self): - mf = self.base.get_metafeatures('233', features='NumberOfInstances') + mf = self.base.get_metafeatures("233", features="NumberOfInstances") self.assertEqual(mf.shape, ()) def test_get_metafeatures_multiple_datasets(self): - mf = self.base.get_metafeatures(['233', '236']) + mf = self.base.get_metafeatures(["233", "236"]) self.assertIsInstance(mf, pd.DataFrame) self.assertEqual(mf.shape, (2, 46)) def test_get_metafeatures_multiple_features(self): - mf = self.base.get_metafeatures(features=['NumberOfInstances', - 'NumberOfClasses']) + mf = self.base.get_metafeatures( + features=["NumberOfInstances", "NumberOfClasses"] + ) self.assertIsInstance(mf, pd.DataFrame) self.assertEqual(mf.shape, (132, 2)) diff --git a/test/test_metalearning/pyMetaLearn/test_meta_features.py b/test/test_metalearning/pyMetaLearn/test_meta_features.py index d31f3d0227..6a9bec4dcf 100644 --- a/test/test_metalearning/pyMetaLearn/test_meta_features.py +++ 
b/test/test_metalearning/pyMetaLearn/test_meta_features.py @@ -3,25 +3,21 @@ import tempfile import unittest +import arff +import numpy as np import pandas as pd - import pytest - -import arff from joblib import Memory -import numpy as np -from sklearn.datasets import make_multilabel_classification, fetch_openml +from sklearn.datasets import fetch_openml, make_multilabel_classification -from autosklearn.pipeline.components.data_preprocessing.feature_type \ - import FeatTypeSplit -from autosklearn.metalearning.metafeatures.metafeature import MetaFeatureValue import autosklearn.metalearning.metafeatures.metafeatures as meta_features +from autosklearn.metalearning.metafeatures.metafeature import MetaFeatureValue +from autosklearn.pipeline.components.data_preprocessing.feature_type import ( + FeatTypeSplit, +) -@pytest.fixture( - scope='class', - params=('pandas', 'numpy') -) +@pytest.fixture(scope="class", params=("pandas", "numpy")) def multilabel_train_data(request): cache = Memory(location=tempfile.gettempdir()) cached_func = cache.cache(make_multilabel_classification) @@ -31,20 +27,17 @@ def multilabel_train_data(request): n_classes=5, n_labels=5, return_indicator=True, - random_state=1 + random_state=1, ) - if request.param == 'numpy': + if request.param == "numpy": return X, y - elif request.param == 'pandas': + elif request.param == "pandas": return pd.DataFrame(X), y else: raise ValueError(request.param) -@pytest.fixture( - scope='class', - params=('pandas', 'numpy') -) +@pytest.fixture(scope="class", params=("pandas", "numpy")) def meta_train_data(request): tests_dir = __file__ os.chdir(os.path.dirname(tests_dir)) @@ -55,40 +48,41 @@ def meta_train_data(request): # -1 because the last attribute is the class attribute_types = [ - 'numeric' if type(type_) != list else 'nominal' - for name, type_ in dataset['attributes'][:-1]] + "numeric" if type(type_) != list else "nominal" + for name, type_ in dataset["attributes"][:-1] + ] - categorical = {i: True if attribute == 'nominal' else False - for i, attribute in enumerate(attribute_types)} + categorical = { + i: True if attribute == "nominal" else False + for i, attribute in enumerate(attribute_types) + } - data = np.array(dataset['data'], dtype=np.float64) + data = np.array(dataset["data"], dtype=np.float64) X = data[:, :-1] y = data[:, -1].reshape((-1,)) - logger = logging.getLogger('Meta') + logger = logging.getLogger("Meta") meta_features.helper_functions.set_value( - "MissingValues", meta_features.helper_functions["MissingValues"](X, y, logger, categorical), - ) + "MissingValues", + meta_features.helper_functions["MissingValues"](X, y, logger, categorical), + ) meta_features.helper_functions.set_value( "NumSymbols", - meta_features.helper_functions["NumSymbols"](X, y, logger, categorical), + meta_features.helper_functions["NumSymbols"](X, y, logger, categorical), ) meta_features.helper_functions.set_value( "ClassOccurences", meta_features.helper_functions["ClassOccurences"](X, y, logger), ) - if request.param == 'numpy': + if request.param == "numpy": return X, y, categorical - elif request.param == 'pandas': + elif request.param == "pandas": return pd.DataFrame(X), y, categorical else: raise ValueError(request.param) -@pytest.fixture( - scope='class', - params=('pandas', 'numpy') -) +@pytest.fixture(scope="class", params=("pandas", "numpy")) def meta_train_data_transformed(request): tests_dir = __file__ os.chdir(os.path.dirname(tests_dir)) @@ -99,53 +93,67 @@ def meta_train_data_transformed(request): # -1 because the last attribute 
is the class attribute_types = [ - 'numeric' if type(type_) != list else 'nominal' - for name, type_ in dataset['attributes'][:-1]] - categorical = {i: True if attribute == 'nominal' else False - for i, attribute in enumerate(attribute_types)} + "numeric" if type(type_) != list else "nominal" + for name, type_ in dataset["attributes"][:-1] + ] + categorical = { + i: True if attribute == "nominal" else False + for i, attribute in enumerate(attribute_types) + } - data = np.array(dataset['data'], dtype=np.float64) + data = np.array(dataset["data"], dtype=np.float64) X = data[:, :-1] y = data[:, -1].reshape((-1,)) - logger = logging.getLogger('Meta') + logger = logging.getLogger("Meta") meta_features.helper_functions.set_value( - "MissingValues", meta_features.helper_functions["MissingValues"](X, y, logger, categorical), - ) + "MissingValues", + meta_features.helper_functions["MissingValues"](X, y, logger, categorical), + ) meta_features.helper_functions.set_value( "NumSymbols", - meta_features.helper_functions["NumSymbols"](X, y, logger, categorical), + meta_features.helper_functions["NumSymbols"](X, y, logger, categorical), ) meta_features.helper_functions.set_value( "ClassOccurences", meta_features.helper_functions["ClassOccurences"](X, y, logger), ) - DPP = FeatTypeSplit(feat_type={ - col: 'categorical' if category else 'numerical' for col, category in categorical.items() - }) + DPP = FeatTypeSplit( + feat_type={ + col: "categorical" if category else "numerical" + for col, category in categorical.items() + } + ) X_transformed = DPP.fit_transform(X) number_numerical = np.sum(~np.array(list(categorical.values()))) - categorical_transformed = {i: True if i < (X_transformed.shape[1] - number_numerical) else False - for i in range(X_transformed.shape[1])} + categorical_transformed = { + i: True if i < (X_transformed.shape[1] - number_numerical) else False + for i in range(X_transformed.shape[1]) + } # pre-compute values for transformed inputs meta_features.helper_functions.set_value( - "PCA", meta_features.helper_functions["PCA"](X_transformed, y, logger), + "PCA", + meta_features.helper_functions["PCA"](X_transformed, y, logger), ) meta_features.helper_functions.set_value( - "Skewnesses", meta_features.helper_functions["Skewnesses"]( - X_transformed, y, logger, categorical_transformed), + "Skewnesses", + meta_features.helper_functions["Skewnesses"]( + X_transformed, y, logger, categorical_transformed + ), ) meta_features.helper_functions.set_value( - "Kurtosisses", meta_features.helper_functions["Kurtosisses"]( - X_transformed, y, logger, categorical_transformed) + "Kurtosisses", + meta_features.helper_functions["Kurtosisses"]( + X_transformed, y, logger, categorical_transformed + ), ) - if request.param == 'numpy': + if request.param == "numpy": return X_transformed, y, categorical_transformed - elif request.param == 'pandas': + elif request.param == "pandas": return pd.DataFrame(X_transformed), y, categorical_transformed else: raise ValueError(request.param) @@ -154,7 +162,8 @@ def meta_train_data_transformed(request): def test_number_of_instance(meta_train_data): X, y, categorical = meta_train_data mf = meta_features.metafeatures["NumberOfInstances"]( - X, y, logging.getLogger('Meta'), categorical) + X, y, logging.getLogger("Meta"), categorical + ) assert mf.value == 898 assert isinstance(mf, MetaFeatureValue) @@ -162,7 +171,8 @@ def test_number_of_instance(meta_train_data): def test_number_of_classes(meta_train_data): X, y, categorical = meta_train_data mf = 
meta_features.metafeatures["NumberOfClasses"]( - X, y, logging.getLogger('Meta'), categorical) + X, y, logging.getLogger("Meta"), categorical + ) assert mf.value == 5 assert isinstance(mf, MetaFeatureValue) @@ -170,7 +180,8 @@ def test_number_of_classes(meta_train_data): def test_number_of_features(meta_train_data): X, y, categorical = meta_train_data mf = meta_features.metafeatures["NumberOfFeatures"]( - X, y, logging.getLogger('Meta'), categorical) + X, y, logging.getLogger("Meta"), categorical + ) assert mf.value == 38 assert isinstance(mf, MetaFeatureValue) @@ -178,8 +189,9 @@ def test_number_of_features(meta_train_data): def test_missing_values(meta_train_data): X, y, categorical = meta_train_data mf = meta_features.helper_functions["MissingValues"]( - X, y, logging.getLogger('Meta'), categorical) - assert isinstance(mf.value, pd.DataFrame if hasattr(X, 'iloc') else np.ndarray) + X, y, logging.getLogger("Meta"), categorical + ) + assert isinstance(mf.value, pd.DataFrame if hasattr(X, "iloc") else np.ndarray) assert mf.value.shape == X.shape assert 22175 == np.count_nonzero(mf.value) @@ -187,7 +199,8 @@ def test_missing_values(meta_train_data): def test_number_of_Instances_with_missing_values(meta_train_data): X, y, categorical = meta_train_data mf = meta_features.metafeatures["NumberOfInstancesWithMissingValues"]( - X, y, logging.getLogger('Meta'), categorical) + X, y, logging.getLogger("Meta"), categorical + ) assert mf.value == 898 assert isinstance(mf, MetaFeatureValue) @@ -197,10 +210,12 @@ def test_percentage_of_Instances_with_missing_values(meta_train_data): meta_features.metafeatures.set_value( "NumberOfInstancesWithMissingValues", meta_features.metafeatures["NumberOfInstancesWithMissingValues"]( - X, y, logging.getLogger('Meta'), categorical), - ) + X, y, logging.getLogger("Meta"), categorical + ), + ) mf = meta_features.metafeatures["PercentageOfInstancesWithMissingValues"]( - X, y, logging.getLogger('Meta'), categorical) + X, y, logging.getLogger("Meta"), categorical + ) assert pytest.approx(mf.value) == 1.0 assert isinstance(mf, MetaFeatureValue) @@ -208,7 +223,8 @@ def test_percentage_of_Instances_with_missing_values(meta_train_data): def test_number_of_features_with_missing_values(meta_train_data): X, y, categorical = meta_train_data mf = meta_features.metafeatures["NumberOfFeaturesWithMissingValues"]( - X, y, logging.getLogger('Meta'), categorical) + X, y, logging.getLogger("Meta"), categorical + ) assert mf.value == 29 assert isinstance(mf, MetaFeatureValue) @@ -218,18 +234,22 @@ def test_percentage_of_features_with_missing_values(meta_train_data): meta_features.metafeatures.set_value( "NumberOfFeaturesWithMissingValues", meta_features.metafeatures["NumberOfFeaturesWithMissingValues"]( - X, y, logging.getLogger('Meta'), categorical)) + X, y, logging.getLogger("Meta"), categorical + ), + ) mf = meta_features.metafeatures["PercentageOfFeaturesWithMissingValues"]( - X, y, logging.getLogger('Meta'), categorical) - assert pytest.approx(mf.value) == float(29)/float(38) + X, y, logging.getLogger("Meta"), categorical + ) + assert pytest.approx(mf.value) == float(29) / float(38) assert isinstance(mf, MetaFeatureValue) def test_number_of_missing_values(meta_train_data): X, y, categorical = meta_train_data - np.save('/tmp/debug', X) + np.save("/tmp/debug", X) mf = meta_features.metafeatures["NumberOfMissingValues"]( - X, y, logging.getLogger('Meta'), categorical) + X, y, logging.getLogger("Meta"), categorical + ) assert mf.value == 22175 assert isinstance(mf, MetaFeatureValue) @@ 
-237,18 +257,23 @@ def test_number_of_missing_values(meta_train_data): def test_percentage_missing_values(meta_train_data): X, y, categorical = meta_train_data meta_features.metafeatures.set_value( - "NumberOfMissingValues", meta_features.metafeatures["NumberOfMissingValues"]( - X, y, logging.getLogger('Meta'), categorical)) + "NumberOfMissingValues", + meta_features.metafeatures["NumberOfMissingValues"]( + X, y, logging.getLogger("Meta"), categorical + ), + ) mf = meta_features.metafeatures["PercentageOfMissingValues"]( - X, y, logging.getLogger('Meta'), categorical) - assert pytest.approx(mf.value) == (float(22175)/float(38*898)) + X, y, logging.getLogger("Meta"), categorical + ) + assert pytest.approx(mf.value) == (float(22175) / float(38 * 898)) assert isinstance(mf, MetaFeatureValue) def test_number_of_numeric_features(meta_train_data): X, y, categorical = meta_train_data mf = meta_features.metafeatures["NumberOfNumericFeatures"]( - X, y, logging.getLogger('Meta'), categorical) + X, y, logging.getLogger("Meta"), categorical + ) assert mf.value == 6 assert isinstance(mf, MetaFeatureValue) @@ -256,7 +281,8 @@ def test_number_of_numeric_features(meta_train_data): def test_number_of_categorical_features(meta_train_data): X, y, categorical = meta_train_data mf = meta_features.metafeatures["NumberOfCategoricalFeatures"]( - X, y, logging.getLogger('Meta'), categorical) + X, y, logging.getLogger("Meta"), categorical + ) assert mf.value == 32 assert isinstance(mf, MetaFeatureValue) @@ -264,62 +290,70 @@ def test_number_of_categorical_features(meta_train_data): def test_ratio_numerical_to_categorical(meta_train_data): X, y, categorical = meta_train_data mf = meta_features.metafeatures["RatioNumericalToNominal"]( - X, y, logging.getLogger('Meta'), categorical) - assert pytest.approx(mf.value) == (float(6)/float(32)) + X, y, logging.getLogger("Meta"), categorical + ) + assert pytest.approx(mf.value) == (float(6) / float(32)) assert isinstance(mf, MetaFeatureValue) def test_ratio_categorical_to_numerical(meta_train_data): X, y, categorical = meta_train_data mf = meta_features.metafeatures["RatioNominalToNumerical"]( - X, y, logging.getLogger('Meta'), categorical) - assert pytest.approx(mf.value) == (float(32)/float(6)) + X, y, logging.getLogger("Meta"), categorical + ) + assert pytest.approx(mf.value) == (float(32) / float(6)) assert isinstance(mf, MetaFeatureValue) def test_dataset_ratio(meta_train_data): X, y, categorical = meta_train_data mf = meta_features.metafeatures["DatasetRatio"]( - X, y, logging.getLogger('Meta'), categorical) - assert pytest.approx(mf.value) == (float(38)/float(898)) + X, y, logging.getLogger("Meta"), categorical + ) + assert pytest.approx(mf.value) == (float(38) / float(898)) assert isinstance(mf, MetaFeatureValue) def test_inverse_dataset_ratio(meta_train_data): X, y, categorical = meta_train_data mf = meta_features.metafeatures["InverseDatasetRatio"]( - X, y, logging.getLogger('Meta'), categorical) - assert pytest.approx(mf.value) == (float(898)/float(38)) + X, y, logging.getLogger("Meta"), categorical + ) + assert pytest.approx(mf.value) == (float(898) / float(38)) assert isinstance(mf, MetaFeatureValue) def test_class_occurences(meta_train_data): X, y, categorical = meta_train_data mf = meta_features.helper_functions["ClassOccurences"]( - X, y, logging.getLogger('Meta'), categorical) + X, y, logging.getLogger("Meta"), categorical + ) assert mf.value == {0.0: 8.0, 1.0: 99.0, 2.0: 684.0, 4.0: 67.0, 5.0: 40.0} def test_class_probability_min(meta_train_data): X, y, 
categorical = meta_train_data mf = meta_features.metafeatures["ClassProbabilityMin"]( - X, y, logging.getLogger('Meta'), categorical) - assert pytest.approx(mf.value) == (float(8)/float(898)) + X, y, logging.getLogger("Meta"), categorical + ) + assert pytest.approx(mf.value) == (float(8) / float(898)) assert isinstance(mf, MetaFeatureValue) def test_class_probability_max(meta_train_data): X, y, categorical = meta_train_data mf = meta_features.metafeatures["ClassProbabilityMax"]( - X, y, logging.getLogger('Meta'), categorical) - assert pytest.approx(mf.value) == (float(684)/float(898)) + X, y, logging.getLogger("Meta"), categorical + ) + assert pytest.approx(mf.value) == (float(684) / float(898)) assert isinstance(mf, MetaFeatureValue) def test_class_probability_mean(meta_train_data): X, y, categorical = meta_train_data mf = meta_features.metafeatures["ClassProbabilityMean"]( - X, y, logging.getLogger('Meta'), categorical) + X, y, logging.getLogger("Meta"), categorical + ) classes = np.array((8, 99, 684, 67, 40), dtype=np.float64) prob_mean = (classes / float(898)).mean() assert pytest.approx(mf.value) == prob_mean @@ -329,7 +363,8 @@ def test_class_probability_mean(meta_train_data): def test_class_probability_std(meta_train_data): X, y, categorical = meta_train_data mf = meta_features.metafeatures["ClassProbabilitySTD"]( - X, y, logging.getLogger('Meta'), categorical) + X, y, logging.getLogger("Meta"), categorical + ) classes = np.array((8, 99, 684, 67, 40), dtype=np.float64) prob_std = (classes / float(898)).std() assert pytest.approx(mf.value) == prob_std @@ -339,53 +374,148 @@ def test_class_probability_std(meta_train_data): def test_num_symbols(meta_train_data): X, y, categorical = meta_train_data mf = meta_features.helper_functions["NumSymbols"]( - X, y, logging.getLogger('Meta'), categorical) - symbol_frequency = [2, 1, 7, 1, 2, 4, 1, 1, 4, 2, 1, 1, 1, 2, 1, 0, - 1, 1, 1, 0, 1, 1, 0, 3, 1, 0, 0, 0, 2, 2, 3, 2] + X, y, logging.getLogger("Meta"), categorical + ) + symbol_frequency = [ + 2, + 1, + 7, + 1, + 2, + 4, + 1, + 1, + 4, + 2, + 1, + 1, + 1, + 2, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 0, + 3, + 1, + 0, + 0, + 0, + 2, + 2, + 3, + 2, + ] assert mf.value == symbol_frequency def test_symbols_min(meta_train_data): X, y, categorical = meta_train_data - mf = meta_features.metafeatures["SymbolsMin"](X, y, logging.getLogger('Meta'), categorical) + mf = meta_features.metafeatures["SymbolsMin"]( + X, y, logging.getLogger("Meta"), categorical + ) assert mf.value == 1 def test_symbols_max(meta_train_data): X, y, categorical = meta_train_data # this is attribute steel - mf = meta_features.metafeatures["SymbolsMax"](X, y, logging.getLogger('Meta'), categorical) + mf = meta_features.metafeatures["SymbolsMax"]( + X, y, logging.getLogger("Meta"), categorical + ) assert mf.value == 7 def test_symbols_mean(meta_train_data): X, y, categorical = meta_train_data mf = meta_features.metafeatures["SymbolsMean"]( - X, y, logging.getLogger('Meta'), categorical) + X, y, logging.getLogger("Meta"), categorical + ) # Empty looking spaces denote empty attributes - symbol_frequency = [2, 1, 7, 1, 2, 4, 1, 1, 4, 2, 1, 1, 1, 2, 1, # - 1, 1, 1, 1, 1, 3, 1, 2, 2, 3, 2] + symbol_frequency = [ + 2, + 1, + 7, + 1, + 2, + 4, + 1, + 1, + 4, + 2, + 1, + 1, + 1, + 2, + 1, # + 1, + 1, + 1, + 1, + 1, + 3, + 1, + 2, + 2, + 3, + 2, + ] assert pytest.approx(mf.value) == np.mean(symbol_frequency) def test_symbols_std(meta_train_data): X, y, categorical = meta_train_data - mf = meta_features.metafeatures["SymbolsSTD"](X, y, 
logging.getLogger('Meta'), categorical) - symbol_frequency = [2, 1, 7, 1, 2, 4, 1, 1, 4, 2, 1, 1, 1, 2, 1, # - 1, 1, 1, 1, 1, 3, 1, 2, 2, 3, 2] + mf = meta_features.metafeatures["SymbolsSTD"]( + X, y, logging.getLogger("Meta"), categorical + ) + symbol_frequency = [ + 2, + 1, + 7, + 1, + 2, + 4, + 1, + 1, + 4, + 2, + 1, + 1, + 1, + 2, + 1, # + 1, + 1, + 1, + 1, + 1, + 3, + 1, + 2, + 2, + 3, + 2, + ] assert pytest.approx(mf.value) == np.std(symbol_frequency) def test_symbols_sum(meta_train_data): X, y, categorical = meta_train_data - mf = meta_features.metafeatures["SymbolsSum"](X, y, logging.getLogger('Meta'), categorical) + mf = meta_features.metafeatures["SymbolsSum"]( + X, y, logging.getLogger("Meta"), categorical + ) assert mf.value == 49 def test_class_entropy(meta_train_data): X, y, categorical = meta_train_data mf = meta_features.metafeatures["ClassEntropy"]( - X, y, logging.getLogger('Meta'), categorical) + X, y, logging.getLogger("Meta"), categorical + ) classes = np.array((8, 99, 684, 67, 40), dtype=np.float64) classes = classes / sum(classes) entropy = -np.sum([c * np.log2(c) for c in classes]) @@ -396,15 +526,17 @@ def test_class_entropy(meta_train_data): def test_calculate_all_metafeatures(meta_train_data): X, y, categorical = meta_train_data mf = meta_features.calculate_all_metafeatures( - X, y, categorical, "2", logger=logging.getLogger('Meta')) + X, y, categorical, "2", logger=logging.getLogger("Meta") + ) assert 52 == len(mf.metafeature_values) - assert mf.metafeature_values['NumberOfCategoricalFeatures'].value == 32 + assert mf.metafeature_values["NumberOfCategoricalFeatures"].value == 32 def test_kurtosisses(meta_train_data_transformed): X_transformed, y, categorical_transformed = meta_train_data_transformed mf = meta_features.helper_functions["Kurtosisses"]( - X_transformed, y, logging.getLogger('Meta'), categorical_transformed) + X_transformed, y, logging.getLogger("Meta"), categorical_transformed + ) assert 6 == len(mf.value) @@ -412,34 +544,39 @@ def test_kurtosis_min(meta_train_data_transformed): X_transformed, y, categorical_transformed = meta_train_data_transformed # TODO: somehow compute the expected output? meta_features.metafeatures["KurtosisMin"]( - X_transformed, y, logging.getLogger('Meta'), categorical_transformed) + X_transformed, y, logging.getLogger("Meta"), categorical_transformed + ) def test_kurtosis_max(meta_train_data_transformed): X_transformed, y, categorical_transformed = meta_train_data_transformed # TODO: somehow compute the expected output? meta_features.metafeatures["KurtosisMax"]( - X_transformed, y, logging.getLogger('Meta'), categorical_transformed) + X_transformed, y, logging.getLogger("Meta"), categorical_transformed + ) def test_kurtosis_mean(meta_train_data_transformed): X_transformed, y, categorical_transformed = meta_train_data_transformed # TODO: somehow compute the expected output? meta_features.metafeatures["KurtosisMean"]( - X_transformed, y, logging.getLogger('Meta'), categorical_transformed) + X_transformed, y, logging.getLogger("Meta"), categorical_transformed + ) def test_kurtosis_std(meta_train_data_transformed): X_transformed, y, categorical_transformed = meta_train_data_transformed # TODO: somehow compute the expected output? 
meta_features.metafeatures["KurtosisSTD"]( - X_transformed, y, logging.getLogger('Meta'), categorical_transformed) + X_transformed, y, logging.getLogger("Meta"), categorical_transformed + ) def test_skewnesses(meta_train_data_transformed): X_transformed, y, categorical_transformed = meta_train_data_transformed mf = meta_features.helper_functions["Skewnesses"]( - X_transformed, y, logging.getLogger('Meta'), categorical_transformed) + X_transformed, y, logging.getLogger("Meta"), categorical_transformed + ) assert 6 == len(mf.value) @@ -447,62 +584,72 @@ def test_skewness_min(meta_train_data_transformed): X_transformed, y, categorical_transformed = meta_train_data_transformed # TODO: somehow compute the expected output? meta_features.metafeatures["SkewnessMin"]( - X_transformed, y, logging.getLogger('Meta'), categorical_transformed) + X_transformed, y, logging.getLogger("Meta"), categorical_transformed + ) def test_skewness_max(meta_train_data_transformed): X_transformed, y, categorical_transformed = meta_train_data_transformed # TODO: somehow compute the expected output? meta_features.metafeatures["SkewnessMax"]( - X_transformed, y, logging.getLogger('Meta'), categorical_transformed) + X_transformed, y, logging.getLogger("Meta"), categorical_transformed + ) def test_skewness_mean(meta_train_data_transformed): X_transformed, y, categorical_transformed = meta_train_data_transformed # TODO: somehow compute the expected output? meta_features.metafeatures["SkewnessMean"]( - X_transformed, y, logging.getLogger('Meta'), categorical_transformed) + X_transformed, y, logging.getLogger("Meta"), categorical_transformed + ) def test_skewness_std(meta_train_data_transformed): X_transformed, y, categorical_transformed = meta_train_data_transformed # TODO: somehow compute the expected output? meta_features.metafeatures["SkewnessSTD"]( - X_transformed, y, logging.getLogger('Meta'), categorical_transformed) + X_transformed, y, logging.getLogger("Meta"), categorical_transformed + ) def test_landmark_lda(meta_train_data_transformed): X_transformed, y, categorical_transformed = meta_train_data_transformed # TODO: somehow compute the expected output? - meta_features.metafeatures["LandmarkLDA"](X_transformed, y, logging.getLogger('Meta')) + meta_features.metafeatures["LandmarkLDA"]( + X_transformed, y, logging.getLogger("Meta") + ) def test_landmark_naive_bayes(meta_train_data_transformed): X_transformed, y, categorical_transformed = meta_train_data_transformed # TODO: somehow compute the expected output? meta_features.metafeatures["LandmarkNaiveBayes"]( - X_transformed, y, logging.getLogger('Meta')) + X_transformed, y, logging.getLogger("Meta") + ) def test_landmark_decision_tree(meta_train_data_transformed): X_transformed, y, categorical_transformed = meta_train_data_transformed # TODO: somehow compute the expected output? meta_features.metafeatures["LandmarkDecisionTree"]( - X_transformed, y, logging.getLogger('Meta')) + X_transformed, y, logging.getLogger("Meta") + ) def test_decision_node(meta_train_data_transformed): X_transformed, y, categorical_transformed = meta_train_data_transformed # TODO: somehow compute the expected output? meta_features.metafeatures["LandmarkDecisionNodeLearner"]( - X_transformed, y, logging.getLogger('Meta')) + X_transformed, y, logging.getLogger("Meta") + ) def test_random_node(meta_train_data_transformed): X_transformed, y, categorical_transformed = meta_train_data_transformed # TODO: somehow compute the expected output? 
meta_features.metafeatures["LandmarkRandomNodeLearner"]( - X_transformed, y, logging.getLogger('Meta')) + X_transformed, y, logging.getLogger("Meta") + ) @unittest.skip("Currently not implemented!") @@ -510,57 +657,72 @@ def test_worst_node(meta_train_data_transformed): X_transformed, y, categorical_transformed = meta_train_data_transformed # TODO: somehow compute the expected output? meta_features.metafeatures["LandmarkWorstNodeLearner"]( - X_transformed, y, logging.getLogger('Meta')) + X_transformed, y, logging.getLogger("Meta") + ) def test_1NN(meta_train_data_transformed): X_transformed, y, categorical_transformed = meta_train_data_transformed # TODO: somehow compute the expected output? - meta_features.metafeatures["Landmark1NN"](X_transformed, y, logging.getLogger('Meta')) + meta_features.metafeatures["Landmark1NN"]( + X_transformed, y, logging.getLogger("Meta") + ) def test_pca(meta_train_data_transformed): X_transformed, y, categorical_transformed = meta_train_data_transformed - meta_features.helper_functions["PCA"](X_transformed, y, logging.getLogger('Meta')) + meta_features.helper_functions["PCA"](X_transformed, y, logging.getLogger("Meta")) def test_pca_95percent(meta_train_data_transformed): X_transformed, y, categorical_transformed = meta_train_data_transformed mf = meta_features.metafeatures["PCAFractionOfComponentsFor95PercentVariance"]( - X_transformed, y, logging.getLogger('Meta')) + X_transformed, y, logging.getLogger("Meta") + ) assert pytest.approx(0.2716049382716049) == mf.value def test_pca_kurtosis_first_pc(meta_train_data_transformed): X_transformed, y, categorical_transformed = meta_train_data_transformed mf = meta_features.metafeatures["PCAKurtosisFirstPC"]( - X_transformed, y, logging.getLogger('Meta')) + X_transformed, y, logging.getLogger("Meta") + ) assert pytest.approx(-0.702850) != mf.value def test_pca_skewness_first_pc(meta_train_data_transformed): X_transformed, y, categorical_transformed = meta_train_data_transformed mf = meta_features.metafeatures["PCASkewnessFirstPC"]( - X_transformed, y, logging.getLogger('Meta')) + X_transformed, y, logging.getLogger("Meta") + ) assert pytest.approx(0.051210) != mf.value def test_class_occurences_multilabel(multilabel_train_data): X, y = multilabel_train_data - mf = meta_features.helper_functions["ClassOccurences"](X, y, logging.getLogger('Meta')) - assert mf.value == [{0: 16.0, 1: 84.0}, - {0: 8.0, 1: 92.0}, - {0: 68.0, 1: 32.0}, - {0: 15.0, 1: 85.0}, - {0: 28.0, 1: 72.0}] + mf = meta_features.helper_functions["ClassOccurences"]( + X, y, logging.getLogger("Meta") + ) + assert mf.value == [ + {0: 16.0, 1: 84.0}, + {0: 8.0, 1: 92.0}, + {0: 68.0, 1: 32.0}, + {0: 15.0, 1: 85.0}, + {0: 28.0, 1: 72.0}, + ] def test_class_probability_min_multilabel(multilabel_train_data): X, y = multilabel_train_data meta_features.helper_functions.set_value( - "ClassOccurences", meta_features.helper_functions["ClassOccurences"]( - X, y, logging.getLogger('Meta'))) - mf = meta_features.metafeatures["ClassProbabilityMin"](X, y, logging.getLogger('Meta')) + "ClassOccurences", + meta_features.helper_functions["ClassOccurences"]( + X, y, logging.getLogger("Meta") + ), + ) + mf = meta_features.metafeatures["ClassProbabilityMin"]( + X, y, logging.getLogger("Meta") + ) assert pytest.approx(mf.value) == (float(8) / float(100)) assert isinstance(mf, MetaFeatureValue) @@ -568,9 +730,14 @@ def test_class_probability_min_multilabel(multilabel_train_data): def test_class_probability_max_multilabel(multilabel_train_data): X, y = multilabel_train_data 
meta_features.helper_functions.set_value( - "ClassOccurences", meta_features.helper_functions["ClassOccurences"]( - X, y, logging.getLogger('Meta'))) - mf = meta_features.metafeatures["ClassProbabilityMax"](X, y, logging.getLogger('Meta')) + "ClassOccurences", + meta_features.helper_functions["ClassOccurences"]( + X, y, logging.getLogger("Meta") + ), + ) + mf = meta_features.metafeatures["ClassProbabilityMax"]( + X, y, logging.getLogger("Meta") + ) assert pytest.approx(mf.value) == (float(92) / float(100)) assert isinstance(mf, MetaFeatureValue) @@ -578,9 +745,14 @@ def test_class_probability_max_multilabel(multilabel_train_data): def test_class_probability_mean_multilabel(multilabel_train_data): X, y = multilabel_train_data meta_features.helper_functions.set_value( - "ClassOccurences", meta_features.helper_functions["ClassOccurences"]( - X, y, logging.getLogger('Meta'))) - mf = meta_features.metafeatures["ClassProbabilityMean"](X, y, logging.getLogger('Meta')) + "ClassOccurences", + meta_features.helper_functions["ClassOccurences"]( + X, y, logging.getLogger("Meta") + ), + ) + mf = meta_features.metafeatures["ClassProbabilityMean"]( + X, y, logging.getLogger("Meta") + ) classes = [(16, 84), (8, 92), (68, 32), (15, 85), (28, 72)] probas = np.mean([np.mean(np.array(cls_)) / 100 for cls_ in classes]) assert mf.value == pytest.approx(probas) @@ -589,7 +761,7 @@ def test_class_probability_mean_multilabel(multilabel_train_data): def test_number_of_classes_multilabel(multilabel_train_data): X, y = multilabel_train_data - mf = meta_features.metafeatures["NumberOfClasses"](X, y, logging.getLogger('Meta')) + mf = meta_features.metafeatures["NumberOfClasses"](X, y, logging.getLogger("Meta")) assert mf.value == 5 assert isinstance(mf, MetaFeatureValue) @@ -597,18 +769,23 @@ def test_number_of_classes_multilabel(multilabel_train_data): def test_class_probability_std_multilabel(multilabel_train_data): X, y = multilabel_train_data meta_features.helper_functions.set_value( - "ClassOccurences", meta_features.helper_functions["ClassOccurences"]( - X, y, logging.getLogger('Meta'))) - mf = meta_features.metafeatures["ClassProbabilitySTD"](X, y, logging.getLogger('Meta')) + "ClassOccurences", + meta_features.helper_functions["ClassOccurences"]( + X, y, logging.getLogger("Meta") + ), + ) + mf = meta_features.metafeatures["ClassProbabilitySTD"]( + X, y, logging.getLogger("Meta") + ) classes = [(16, 84), (8, 92), (68, 32), (15, 85), (28, 72)] - probas = np.mean([np.std(np.array(cls_) / 100.) 
for cls_ in classes]) + probas = np.mean([np.std(np.array(cls_) / 100.0) for cls_ in classes]) assert pytest.approx(mf.value) == probas assert isinstance(mf, MetaFeatureValue) def test_class_entropy_multilabel(multilabel_train_data): X, y = multilabel_train_data - mf = meta_features.metafeatures["ClassEntropy"](X, y, logging.getLogger('Meta')) + mf = meta_features.metafeatures["ClassEntropy"](X, y, logging.getLogger("Meta")) classes = [(16, 84), (8, 92), (68, 32), (15, 85), (28, 72)] entropies = [] @@ -623,39 +800,45 @@ def test_class_entropy_multilabel(multilabel_train_data): def test_landmark_lda_multilabel(multilabel_train_data): X, y = multilabel_train_data - mf = meta_features.metafeatures["LandmarkLDA"](X, y, logging.getLogger('Meta')) + mf = meta_features.metafeatures["LandmarkLDA"](X, y, logging.getLogger("Meta")) assert np.isfinite(mf.value) def test_landmark_naive_bayes_multilabel(multilabel_train_data): X, y = multilabel_train_data - mf = meta_features.metafeatures["LandmarkNaiveBayes"](X, y, logging.getLogger('Meta')) + mf = meta_features.metafeatures["LandmarkNaiveBayes"]( + X, y, logging.getLogger("Meta") + ) assert np.isfinite(mf.value) def test_landmark_decision_tree_multilabel(multilabel_train_data): X, y = multilabel_train_data - mf = meta_features.metafeatures["LandmarkDecisionTree"](X, y, logging.getLogger('Meta')) + mf = meta_features.metafeatures["LandmarkDecisionTree"]( + X, y, logging.getLogger("Meta") + ) assert np.isfinite(mf.value) def test_landmark_decision_node_multilabel(multilabel_train_data): X, y = multilabel_train_data mf = meta_features.metafeatures["LandmarkDecisionNodeLearner"]( - X, y, logging.getLogger('Meta')) + X, y, logging.getLogger("Meta") + ) assert np.isfinite(mf.value) def test_landmark_random_node_multilabel(multilabel_train_data): X, y = multilabel_train_data mf = meta_features.metafeatures["LandmarkRandomNodeLearner"]( - X, y, logging.getLogger('Meta')) + X, y, logging.getLogger("Meta") + ) assert np.isfinite(mf.value) def test_1NN_multilabel(multilabel_train_data): X, y = multilabel_train_data - mf = meta_features.metafeatures["Landmark1NN"](X, y, logging.getLogger('TestMeta')) + mf = meta_features.metafeatures["Landmark1NN"](X, y, logging.getLogger("TestMeta")) assert np.isfinite(mf.value) @@ -664,7 +847,8 @@ def test_calculate_all_metafeatures_multilabel(multilabel_train_data): X, y = multilabel_train_data categorical = {i: False for i in range(10)} mf = meta_features.calculate_all_metafeatures( - X, y, categorical, "Generated", logger=logging.getLogger('TestMeta')) + X, y, categorical, "Generated", logger=logging.getLogger("TestMeta") + ) assert 52 == len(mf.metafeature_values) @@ -675,77 +859,84 @@ def test_calculate_all_metafeatures_same_results_across_datatypes(): all metafeatures work in this complex dataset """ X, y = fetch_openml(data_id=2, return_X_y=True, as_frame=True) - categorical = {col: True if X[col].dtype.name == 'category' else False - for col in X.columns} + categorical = { + col: True if X[col].dtype.name == "category" else False for col in X.columns + } mf = meta_features.calculate_all_metafeatures( - X, y, categorical, "2", logger=logging.getLogger('Meta')) + X, y, categorical, "2", logger=logging.getLogger("Meta") + ) assert 52 == len(mf.metafeature_values) expected = { - 'PCASkewnessFirstPC': 0.41897660337677867, - 'PCAKurtosisFirstPC': -0.677692541156901, - 'PCAFractionOfComponentsFor95PercentVariance': 0.2716049382716049, - 'ClassEntropy': 1.1898338562043977, - 'SkewnessSTD': 7.540418815675546, - 'SkewnessMean': 
1.47397188548894, - 'SkewnessMax': 29.916569235579203, - 'SkewnessMin': -29.916569235579203, - 'KurtosisSTD': 153.0563504598898, - 'KurtosisMean': 56.998860939761165, - 'KurtosisMax': 893.0011148272025, - 'KurtosisMin': -3.0, - 'SymbolsSum': 49, - 'SymbolsSTD': 1.3679553264445183, - 'SymbolsMean': 1.8846153846153846, - 'SymbolsMax': 7, - 'SymbolsMin': 1, - 'ClassProbabilitySTD': 0.28282850691819206, - 'ClassProbabilityMean': 0.2, - 'ClassProbabilityMax': 0.7616926503340757, - 'ClassProbabilityMin': 0.008908685968819599, - 'InverseDatasetRatio': 23.63157894736842, - 'DatasetRatio': 0.042316258351893093, - 'RatioNominalToNumerical': 5.333333333333333, - 'RatioNumericalToNominal': 0.1875, - 'NumberOfCategoricalFeatures': 32, - 'NumberOfNumericFeatures': 6, - 'NumberOfMissingValues': 22175.0, - 'NumberOfFeaturesWithMissingValues': 29.0, - 'NumberOfInstancesWithMissingValues': 898.0, - 'NumberOfFeatures': 38.0, - 'NumberOfClasses': 5.0, - 'NumberOfInstances': 898.0, - 'LogInverseDatasetRatio': 3.162583908575814, - 'LogDatasetRatio': -3.162583908575814, - 'PercentageOfMissingValues': 0.6498358926268901, - 'PercentageOfFeaturesWithMissingValues': 0.7631578947368421, - 'PercentageOfInstancesWithMissingValues': 1.0, - 'LogNumberOfFeatures': 3.6375861597263857, - 'LogNumberOfInstances': 6.8001700683022, + "PCASkewnessFirstPC": 0.41897660337677867, + "PCAKurtosisFirstPC": -0.677692541156901, + "PCAFractionOfComponentsFor95PercentVariance": 0.2716049382716049, + "ClassEntropy": 1.1898338562043977, + "SkewnessSTD": 7.540418815675546, + "SkewnessMean": 1.47397188548894, + "SkewnessMax": 29.916569235579203, + "SkewnessMin": -29.916569235579203, + "KurtosisSTD": 153.0563504598898, + "KurtosisMean": 56.998860939761165, + "KurtosisMax": 893.0011148272025, + "KurtosisMin": -3.0, + "SymbolsSum": 49, + "SymbolsSTD": 1.3679553264445183, + "SymbolsMean": 1.8846153846153846, + "SymbolsMax": 7, + "SymbolsMin": 1, + "ClassProbabilitySTD": 0.28282850691819206, + "ClassProbabilityMean": 0.2, + "ClassProbabilityMax": 0.7616926503340757, + "ClassProbabilityMin": 0.008908685968819599, + "InverseDatasetRatio": 23.63157894736842, + "DatasetRatio": 0.042316258351893093, + "RatioNominalToNumerical": 5.333333333333333, + "RatioNumericalToNominal": 0.1875, + "NumberOfCategoricalFeatures": 32, + "NumberOfNumericFeatures": 6, + "NumberOfMissingValues": 22175.0, + "NumberOfFeaturesWithMissingValues": 29.0, + "NumberOfInstancesWithMissingValues": 898.0, + "NumberOfFeatures": 38.0, + "NumberOfClasses": 5.0, + "NumberOfInstances": 898.0, + "LogInverseDatasetRatio": 3.162583908575814, + "LogDatasetRatio": -3.162583908575814, + "PercentageOfMissingValues": 0.6498358926268901, + "PercentageOfFeaturesWithMissingValues": 0.7631578947368421, + "PercentageOfInstancesWithMissingValues": 1.0, + "LogNumberOfFeatures": 3.6375861597263857, + "LogNumberOfInstances": 6.8001700683022, } assert {k: mf[k].value for k in expected.keys()} == pytest.approx(expected) expected_landmarks = { - 'Landmark1NN': 0.9721601489757914, - 'LandmarkRandomNodeLearner': 0.7616945996275606, - 'LandmarkDecisionNodeLearner': 0.7827932960893855, - 'LandmarkDecisionTree': 0.9899875853507139, - 'LandmarkNaiveBayes': 0.9287150837988827, - 'LandmarkLDA': 0.9610242085661079, + "Landmark1NN": 0.9721601489757914, + "LandmarkRandomNodeLearner": 0.7616945996275606, + "LandmarkDecisionNodeLearner": 0.7827932960893855, + "LandmarkDecisionTree": 0.9899875853507139, + "LandmarkNaiveBayes": 0.9287150837988827, + "LandmarkLDA": 0.9610242085661079, } assert {k: mf[k].value for k in 
expected_landmarks.keys()} == pytest.approx( - expected_landmarks, rel=1e-5) + expected_landmarks, rel=1e-5 + ) # Then do numpy! X, y = fetch_openml(data_id=2, return_X_y=True, as_frame=False) - categorical = {i: True if category else False - for i, category in enumerate(categorical.values())} + categorical = { + i: True if category else False + for i, category in enumerate(categorical.values()) + } mf = meta_features.calculate_all_metafeatures( - X, y, categorical, "2", logger=logging.getLogger('Meta')) + X, y, categorical, "2", logger=logging.getLogger("Meta") + ) assert {k: mf[k].value for k in expected.keys()} == pytest.approx(expected) # The column-reorder of pandas and numpy array are different after # the data preprocessing. So we cannot directly compare, and landmarking is # sensible to column order - expected_landmarks['LandmarkDecisionTree'] = 0.9922098075729361 + expected_landmarks["LandmarkDecisionTree"] = 0.9922098075729361 assert {k: mf[k].value for k in expected_landmarks.keys()} == pytest.approx( - expected_landmarks, rel=1e-5) + expected_landmarks, rel=1e-5 + ) diff --git a/test/test_metalearning/pyMetaLearn/test_meta_features_sparse.py b/test/test_metalearning/pyMetaLearn/test_meta_features_sparse.py index 3239184469..856fd595cb 100644 --- a/test/test_metalearning/pyMetaLearn/test_meta_features_sparse.py +++ b/test/test_metalearning/pyMetaLearn/test_meta_features_sparse.py @@ -2,19 +2,16 @@ import os import arff - import numpy as np - import pytest - from scipy import sparse - from sklearn.impute import SimpleImputer from sklearn.preprocessing import StandardScaler -from autosklearn.pipeline.components.data_preprocessing.feature_type \ - import FeatTypeSplit import autosklearn.metalearning.metafeatures.metafeatures as meta_features +from autosklearn.pipeline.components.data_preprocessing.feature_type import ( + FeatTypeSplit, +) @pytest.fixture @@ -28,12 +25,15 @@ def sparse_data(): # -1 because the last attribute is the class attribute_types = [ - 'numeric' if type(type_) != list else 'nominal' - for name, type_ in dataset['attributes'][:-1]] - categorical = {i: True if attribute == 'nominal' else False - for i, attribute in enumerate(attribute_types)} + "numeric" if type(type_) != list else "nominal" + for name, type_ in dataset["attributes"][:-1] + ] + categorical = { + i: True if attribute == "nominal" else False + for i, attribute in enumerate(attribute_types) + } - data = np.array(dataset['data'], dtype=np.float64) + data = np.array(dataset["data"], dtype=np.float64) X = data[:, :-1] y = data[:, -1].reshape((-1,)) @@ -53,19 +53,19 @@ def sparse_data(): helpers.set_value( "MissingValues", helpers["MissingValues"](X, y, logger, categorical), - ) + ) mf.set_value( "NumberOfMissingValues", mf["NumberOfMissingValues"](X, y, logger, categorical), - ) + ) helpers.set_value( "NumSymbols", helpers["NumSymbols"](X, y, logger, categorical), - ) + ) helpers.set_value( "ClassOccurences", helpers["ClassOccurences"](X, y, logger), - ) + ) return X, y, categorical @@ -80,12 +80,15 @@ def sparse_data_transformed(): # -1 because the last attribute is the class attribute_types = [ - 'numeric' if type(type_) != list else 'nominal' - for name, type_ in dataset['attributes'][:-1]] - categorical = {i: True if attribute == 'nominal' else False - for i, attribute in enumerate(attribute_types)} + "numeric" if type(type_) != list else "nominal" + for name, type_ in dataset["attributes"][:-1] + ] + categorical = { + i: True if attribute == "nominal" else False + for i, attribute in 
enumerate(attribute_types) + } - data = np.array(dataset['data'], dtype=np.float64) + data = np.array(dataset["data"], dtype=np.float64) X = data[:, :-1] y = data[:, -1].reshape((-1,)) @@ -96,10 +99,12 @@ def sparse_data_transformed(): X_sparse[NaNs] = 0 X_sparse = sparse.csr_matrix(X_sparse) - ohe = FeatTypeSplit(feat_type={ - col: 'categorical' if category else 'numerical' - for col, category in categorical.items() - }) + ohe = FeatTypeSplit( + feat_type={ + col: "categorical" if category else "numerical" + for col, category in categorical.items() + } + ) X_transformed = X_sparse.copy() X_transformed = ohe.fit_transform(X_transformed) imp = SimpleImputer(copy=False) @@ -109,8 +114,10 @@ def sparse_data_transformed(): # Transform the array which indicates the categorical metafeatures number_numerical = np.sum(~np.array(list(categorical.values()))) - categorical_transformed = {i: True if i < (X_transformed.shape[1] - number_numerical) else False - for i in range(X_transformed.shape[1])} + categorical_transformed = { + i: True if i < (X_transformed.shape[1] - number_numerical) else False + for i in range(X_transformed.shape[1]) + } X = X_sparse X_transformed = X_transformed @@ -123,28 +130,27 @@ def sparse_data_transformed(): helpers.set_value( "PCA", helpers["PCA"](X_transformed, y, logger), - ) + ) helpers.set_value( "MissingValues", helpers["MissingValues"](X, y, logger, categorical), - ) + ) mf.set_value( "NumberOfMissingValues", mf["NumberOfMissingValues"](X, y, logger, categorical), - ) + ) helpers.set_value( "NumSymbols", helpers["NumSymbols"](X, y, logger, categorical), - ) + ) helpers.set_value( "ClassOccurences", helpers["ClassOccurences"](X, y, logger), - ) + ) helpers.set_value( "Skewnesses", - helpers["Skewnesses"](X_transformed, y, logger, - categorical_transformed), - ) + helpers["Skewnesses"](X_transformed, y, logger, categorical_transformed), + ) helpers.set_value( "Kurtosisses", helpers["Kurtosisses"](X_transformed, y, logger, categorical_transformed), @@ -155,7 +161,8 @@ def sparse_data_transformed(): def test_missing_values(sparse_data): X, y, categorical = sparse_data mf = meta_features.helper_functions["MissingValues"]( - X, y, logging.getLogger('Meta'), categorical) + X, y, logging.getLogger("Meta"), categorical + ) assert sparse.issparse(mf.value) assert mf.value.shape == X.shape assert mf.value.dtype == bool @@ -165,21 +172,24 @@ def test_missing_values(sparse_data): def test_number_of_missing_values(sparse_data): X, y, categorical = sparse_data mf = meta_features.metafeatures["NumberOfMissingValues"]( - X, y, logging.getLogger('Meta'), categorical) + X, y, logging.getLogger("Meta"), categorical + ) assert 0 == mf.value def test_percentage_missing_values(sparse_data): X, y, categorical = sparse_data mf = meta_features.metafeatures["PercentageOfMissingValues"]( - X, y, logging.getLogger('Meta'), categorical) + X, y, logging.getLogger("Meta"), categorical + ) assert 0 == mf.value def test_number_of_Instances_with_missing_values(sparse_data): X, y, categorical = sparse_data mf = meta_features.metafeatures["NumberOfInstancesWithMissingValues"]( - X, y, logging.getLogger('Meta'), categorical) + X, y, logging.getLogger("Meta"), categorical + ) assert 0 == mf.value @@ -188,16 +198,20 @@ def test_percentage_of_Instances_with_missing_values(sparse_data): meta_features.metafeatures.set_value( "NumberOfInstancesWithMissingValues", meta_features.metafeatures["NumberOfInstancesWithMissingValues"]( - X, y, logging.getLogger('Meta'), categorical)) + X, y, 
logging.getLogger("Meta"), categorical + ), + ) mf = meta_features.metafeatures["PercentageOfInstancesWithMissingValues"]( - X, y, logging.getLogger('Meta'), categorical) + X, y, logging.getLogger("Meta"), categorical + ) assert pytest.approx(0) == mf.value def test_number_of_features_with_missing_values(sparse_data): X, y, categorical = sparse_data mf = meta_features.metafeatures["NumberOfFeaturesWithMissingValues"]( - X, y, logging.getLogger('Meta'), categorical) + X, y, logging.getLogger("Meta"), categorical + ) assert 0 == mf.value @@ -206,33 +220,72 @@ def test_percentage_of_features_with_missing_values(sparse_data): meta_features.metafeatures.set_value( "NumberOfFeaturesWithMissingValues", meta_features.metafeatures["NumberOfFeaturesWithMissingValues"]( - X, y, logging.getLogger('Meta'), categorical)) + X, y, logging.getLogger("Meta"), categorical + ), + ) mf = meta_features.metafeatures["PercentageOfFeaturesWithMissingValues"]( - X, y, logging.getLogger('Meta'), categorical) + X, y, logging.getLogger("Meta"), categorical + ) assert pytest.approx(0, mf.value) def test_num_symbols(sparse_data): X, y, categorical = sparse_data mf = meta_features.helper_functions["NumSymbols"]( - X, y, logging.getLogger('Meta'), categorical) + X, y, logging.getLogger("Meta"), categorical + ) - symbol_frequency = [2, 0, 6, 0, 1, 3, 0, 0, 3, 1, 0, 0, 0, 1, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 1, 2, 2] + symbol_frequency = [ + 2, + 0, + 6, + 0, + 1, + 3, + 0, + 0, + 3, + 1, + 0, + 0, + 0, + 1, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 2, + 0, + 0, + 0, + 0, + 1, + 1, + 2, + 2, + ] assert mf.value == symbol_frequency def test_symbols_max(sparse_data): X, y, categorical = sparse_data # this is attribute steel - mf = meta_features.metafeatures["SymbolsMax"](X, y, logging.getLogger('Meta'), categorical) + mf = meta_features.metafeatures["SymbolsMax"]( + X, y, logging.getLogger("Meta"), categorical + ) assert mf.value == 6 def test_symbols_mean(sparse_data): X, y, categorical = sparse_data mf = meta_features.metafeatures["SymbolsMean"]( - X, y, logging.getLogger('Meta'), categorical) + X, y, logging.getLogger("Meta"), categorical + ) # Empty looking spaces denote empty attributes symbol_frequency = [2, 6, 1, 3, 3, 1, 1, 2, 1, 1, 2, 2] assert pytest.approx(mf.value) == np.mean(symbol_frequency) @@ -241,7 +294,8 @@ def test_symbols_mean(sparse_data): def test_symbols_std(sparse_data): X, y, categorical = sparse_data mf = meta_features.metafeatures["SymbolsSTD"]( - X, y, logging.getLogger('Meta'), categorical) + X, y, logging.getLogger("Meta"), categorical + ) symbol_frequency = [2, 6, 1, 3, 3, 1, 1, 2, 1, 1, 2, 2] assert pytest.approx(mf.value) == np.std(symbol_frequency) @@ -249,19 +303,49 @@ def test_symbols_std(sparse_data): def test_symbols_sum(sparse_data): X, y, categorical = sparse_data mf = meta_features.metafeatures["SymbolsSum"]( - X, y, logging.getLogger('Meta'), categorical) + X, y, logging.getLogger("Meta"), categorical + ) assert mf.value == 25 def test_skewnesses(sparse_data_transformed): X_transformed, y, categorical_transformed = sparse_data_transformed fixture = [ - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, - -0.696970849903357, 0.626346013011262, 0.38099875966240554, - 1.4762248835141032, 0.07687661087633788, 0.3688979783036015 + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 
0.0, + -0.696970849903357, + 0.626346013011262, + 0.38099875966240554, + 1.4762248835141032, + 0.07687661087633788, + 0.3688979783036015, ] - mf = meta_features.helper_functions["Skewnesses"](X_transformed, y, logging.getLogger('Meta')) + mf = meta_features.helper_functions["Skewnesses"]( + X_transformed, y, logging.getLogger("Meta") + ) print(mf.value) print(fixture) np.testing.assert_allclose(mf.value, fixture) @@ -269,13 +353,42 @@ def test_skewnesses(sparse_data_transformed): def test_kurtosisses(sparse_data_transformed): fixture = [ - -3.0, -3.0, -3.0, -3.0, -3.0, -3.0, -3.0, -3.0, -3.0, -3.0, -3.0, -3.0, - -3.0, -3.0, -3.0, -3.0, -3.0, -3.0, -3.0, -3.0, -3.0, -3.0, -3.0, -3.0, - -3.0, -1.1005836114255763, -1.1786325509475744, -1.23879983823279, - 1.3934382644137013, -0.9768209837948336, -1.7937072296512784 + -3.0, + -3.0, + -3.0, + -3.0, + -3.0, + -3.0, + -3.0, + -3.0, + -3.0, + -3.0, + -3.0, + -3.0, + -3.0, + -3.0, + -3.0, + -3.0, + -3.0, + -3.0, + -3.0, + -3.0, + -3.0, + -3.0, + -3.0, + -3.0, + -3.0, + -1.1005836114255763, + -1.1786325509475744, + -1.23879983823279, + 1.3934382644137013, + -0.9768209837948336, + -1.7937072296512784, ] X_transformed, y, categorical_transformed = sparse_data_transformed - mf = meta_features.helper_functions["Kurtosisses"](X_transformed, y, logging.getLogger('Meta')) + mf = meta_features.helper_functions["Kurtosisses"]( + X_transformed, y, logging.getLogger("Meta") + ) print(mf.value) np.testing.assert_allclose(mf.value, fixture) @@ -283,26 +396,30 @@ def test_kurtosisses(sparse_data_transformed): def test_pca_95percent(sparse_data_transformed): X_transformed, y, categorical_transformed = sparse_data_transformed mf = meta_features.metafeatures["PCAFractionOfComponentsFor95PercentVariance"]( - X_transformed, y, logging.getLogger('Meta')) + X_transformed, y, logging.getLogger("Meta") + ) assert pytest.approx(0.7741935483870968) == mf.value def test_pca_kurtosis_first_pc(sparse_data_transformed): X_transformed, y, categorical_transformed = sparse_data_transformed mf = meta_features.metafeatures["PCAKurtosisFirstPC"]( - X_transformed, y, logging.getLogger('Meta')) + X_transformed, y, logging.getLogger("Meta") + ) assert pytest.approx(-0.15444516166802469) == mf.value def test_pca_skewness_first_pc(sparse_data_transformed): X_transformed, y, categorical_transformed = sparse_data_transformed mf = meta_features.metafeatures["PCASkewnessFirstPC"]( - X_transformed, y, logging.getLogger('Meta')) + X_transformed, y, logging.getLogger("Meta") + ) assert pytest.approx(0.026514792083623905) == mf.value def test_calculate_all_metafeatures(sparse_data): X, y, categorical = sparse_data mf = meta_features.calculate_all_metafeatures( - X, y, categorical, "2", logger=logging.getLogger('Meta')) + X, y, categorical, "2", logger=logging.getLogger("Meta") + ) assert 52 == len(mf.metafeature_values) diff --git a/test/test_metalearning/pyMetaLearn/test_metalearner.py b/test/test_metalearning/pyMetaLearn/test_metalearner.py index 58f2ce800a..a8b7d604cb 100644 --- a/test/test_metalearning/pyMetaLearn/test_metalearner.py +++ b/test/test_metalearning/pyMetaLearn/test_metalearner.py @@ -1,14 +1,13 @@ import logging -import numpy as np import os import unittest +import numpy as np import pandas as pd - from ConfigSpace.configuration_space import Configuration -import autosklearn.pipeline.classification -import autosklearn.metalearning.optimizers.metalearn_optimizer.metalearner as metalearner +import autosklearn.metalearning.optimizers.metalearn_optimizer.metalearner as metalearner # 
noqa: E501 +import autosklearn.pipeline.classification from autosklearn.metalearning.metalearning.meta_base import MetaBase logging.basicConfig() @@ -20,7 +19,7 @@ class MetaLearnerTest(unittest.TestCase): def setUp(self): self.cwd = os.getcwd() data_dir = os.path.dirname(__file__) - data_dir = os.path.join(data_dir, 'test_meta_base_data') + data_dir = os.path.join(data_dir, "test_meta_base_data") os.chdir(data_dir) pipeline = autosklearn.pipeline.classification.SimpleClassificationPipeline() @@ -29,7 +28,8 @@ def setUp(self): self.logger = logging.getLogger() meta_base = MetaBase(self.cs, data_dir, logger=self.logger) self.meta_optimizer = metalearner.MetaLearningOptimizer( - '233', self.cs, meta_base, logger=self.logger) + "233", self.cs, meta_base, logger=self.logger + ) def tearDown(self): os.chdir(self.cwd) @@ -38,8 +38,8 @@ def test_metalearning_suggest_all(self): ret = self.meta_optimizer.metalearning_suggest_all() self.assertEqual(124, len(ret)) # Reduced to 17 as we changed QDA searchspace - self.assertEqual('gradient_boosting', ret[0]['classifier:__choice__']) - self.assertEqual('adaboost', ret[1]['classifier:__choice__']) + self.assertEqual("gradient_boosting", ret[0]["classifier:__choice__"]) + self.assertEqual("adaboost", ret[1]["classifier:__choice__"]) # There is no test for exclude_double_configuration as it's not present # in the test data @@ -48,17 +48,17 @@ def test_metalearning_suggest_all_nan_metafeatures(self): ret = self.meta_optimizer.metalearning_suggest_all() self.assertEqual(124, len(ret)) # Reduced to 17 as we changed QDA searchspace - self.assertEqual('gradient_boosting', ret[0]['classifier:__choice__']) - self.assertEqual('gradient_boosting', ret[1]['classifier:__choice__']) + self.assertEqual("gradient_boosting", ret[0]["classifier:__choice__"]) + self.assertEqual("gradient_boosting", ret[1]["classifier:__choice__"]) def test_metalearning_suggest(self): ret = self.meta_optimizer.metalearning_suggest([]) self.assertIsInstance(ret, Configuration) - self.assertEqual('gradient_boosting', ret['classifier:__choice__']) + self.assertEqual("gradient_boosting", ret["classifier:__choice__"]) ret2 = self.meta_optimizer.metalearning_suggest([ret]) self.assertIsInstance(ret2, Configuration) - self.assertEqual('adaboost', ret2['classifier:__choice__']) + self.assertEqual("adaboost", ret2["classifier:__choice__"]) def test_learn(self): # Test only some special cases which are probably not yet handled @@ -67,8 +67,10 @@ def test_learn(self): self.meta_optimizer._learn() def test_split_metafeature_array(self): - ds_metafeatures, other_metafeatures = self.meta_optimizer. 
\ - _split_metafeature_array() + ( + ds_metafeatures, + other_metafeatures, + ) = self.meta_optimizer._split_metafeature_array() self.assertIsInstance(ds_metafeatures, pd.Series) self.assertEqual(ds_metafeatures.shape, (46,)) self.assertIsInstance(other_metafeatures, pd.DataFrame) diff --git a/test/test_metalearning/pyMetaLearn/test_optimizer_base.py b/test/test_metalearning/pyMetaLearn/test_optimizer_base.py index a78a6a7f61..63dc2184da 100644 --- a/test/test_metalearning/pyMetaLearn/test_optimizer_base.py +++ b/test/test_metalearning/pyMetaLearn/test_optimizer_base.py @@ -1,5 +1,5 @@ -from collections import OrderedDict import unittest +from collections import OrderedDict from autosklearn.metalearning.optimizers import optimizer_base @@ -14,8 +14,9 @@ def setUp(self): def test_parse_hyperopt_string(self): hyperparameter_string = "x {-5, 0, 5, 10}\ny {0, 5, 10, 15}" - expected = OrderedDict([["x", ["-5", "0", "5", "10"]], - ["y", ["0", "5", "10", "15"]]]) + expected = OrderedDict( + [["x", ["-5", "0", "5", "10"]], ["y", ["0", "5", "10", "15"]]] + ) ret = optimizer_base.parse_hyperparameter_string(hyperparameter_string) self.assertEqual(ret, expected) @@ -28,8 +29,11 @@ def test_parse_hyperopt_string(self): self.assertEqual(ret, expected) hyperparameter_string = "x {-5, 0, 5, 10}\ny 0, 5, 10, 15} [5]" - self.assertRaises(ValueError, optimizer_base.parse_hyperparameter_string, - hyperparameter_string) + self.assertRaises( + ValueError, + optimizer_base.parse_hyperparameter_string, + hyperparameter_string, + ) def test_construct_cli_call(self): cli_call = optimizer_base.construct_cli_call("cv.py", {"x": -5, "y": 0}) diff --git a/test/test_metalearning/test_metalearning.py b/test/test_metalearning/test_metalearning.py index 6a7e87511d..3ec847a8f5 100644 --- a/test/test_metalearning/test_metalearning.py +++ b/test/test_metalearning/test_metalearning.py @@ -1,18 +1,17 @@ # -*- encoding: utf-8 -*- import unittest -from autosklearn.pipeline.util import get_dataset -from autosklearn.classification import AutoSklearnClassifier +from sklearn.datasets import load_breast_cancer -from autosklearn.smbo import _calculate_metafeatures, _calculate_metafeatures_encoded -from autosklearn.constants import REGRESSION, MULTICLASS_CLASSIFICATION +from autosklearn.classification import AutoSklearnClassifier +from autosklearn.constants import MULTICLASS_CLASSIFICATION, REGRESSION from autosklearn.metalearning.mismbo import suggest_via_metalearning +from autosklearn.pipeline.util import get_dataset +from autosklearn.smbo import _calculate_metafeatures, _calculate_metafeatures_encoded from autosklearn.util.pipeline import get_configuration_space -from sklearn.datasets import load_breast_cancer class MetafeatureValueDummy(object): - def __init__(self, name, value): self.name = name self.value = value @@ -22,83 +21,93 @@ class Test(unittest.TestCase): _multiprocess_can_split_ = True def setUp(self): - self.X_train, self.Y_train, self.X_test, self.Y_test = \ - get_dataset('iris') + self.X_train, self.Y_train, self.X_test, self.Y_test = get_dataset("iris") eliminate_class_two = self.Y_train != 2 self.X_train = self.X_train[eliminate_class_two] self.Y_train = self.Y_train[eliminate_class_two] - @unittest.skip('TODO refactor!') + @unittest.skip("TODO refactor!") def test_metalearning(self): - dataset_name_classification = 'digits' + dataset_name_classification = "digits" initial_challengers_classification = { - "ACC_METRIC": "--initial-challengers \" " - "-balancing:strategy 'weighting' " - "-classifier:__choice__ 
'proj_logit'", - "AUC_METRIC": "--initial-challengers \" " - "-balancing:strategy 'weighting' " - "-classifier:__choice__ 'liblinear_svc'", - "BAC_METRIC": "--initial-challengers \" " - "-balancing:strategy 'weighting' " - "-classifier:__choice__ 'proj_logit'", - "F1_METRIC": "--initial-challengers \" " - "-balancing:strategy 'weighting' " - "-classifier:__choice__ 'proj_logit'", - "PAC_METRIC": "--initial-challengers \" " - "-balancing:strategy 'none' " - "-classifier:__choice__ 'random_forest'" + "ACC_METRIC": '--initial-challengers " ' + "-balancing:strategy 'weighting' " + "-classifier:__choice__ 'proj_logit'", + "AUC_METRIC": '--initial-challengers " ' + "-balancing:strategy 'weighting' " + "-classifier:__choice__ 'liblinear_svc'", + "BAC_METRIC": '--initial-challengers " ' + "-balancing:strategy 'weighting' " + "-classifier:__choice__ 'proj_logit'", + "F1_METRIC": '--initial-challengers " ' + "-balancing:strategy 'weighting' " + "-classifier:__choice__ 'proj_logit'", + "PAC_METRIC": '--initial-challengers " ' + "-balancing:strategy 'none' " + "-classifier:__choice__ 'random_forest'", } - dataset_name_regression = 'diabetes' + dataset_name_regression = "diabetes" initial_challengers_regression = { - "A_METRIC": "--initial-challengers \" " - "-imputation:strategy 'mean' " - "-one_hot_encoding:minimum_fraction '0.01' " - "-one_hot_encoding:use_minimum_fraction 'True' " - "-preprocessor:__choice__ 'no_preprocessing' " - "-regressor:__choice__ 'random_forest'", - "R2_METRIC": "--initial-challengers \" " - "-imputation:strategy 'mean' " - "-one_hot_encoding:minimum_fraction '0.01' " - "-one_hot_encoding:use_minimum_fraction 'True' " - "-preprocessor:__choice__ 'no_preprocessing' " - "-regressor:__choice__ 'random_forest'", + "A_METRIC": '--initial-challengers " ' + "-imputation:strategy 'mean' " + "-one_hot_encoding:minimum_fraction '0.01' " + "-one_hot_encoding:use_minimum_fraction 'True' " + "-preprocessor:__choice__ 'no_preprocessing' " + "-regressor:__choice__ 'random_forest'", + "R2_METRIC": '--initial-challengers " ' + "-imputation:strategy 'mean' " + "-one_hot_encoding:minimum_fraction '0.01' " + "-one_hot_encoding:use_minimum_fraction 'True' " + "-preprocessor:__choice__ 'no_preprocessing' " + "-regressor:__choice__ 'random_forest'", } for dataset_name, task, initial_challengers in [ (dataset_name_regression, REGRESSION, initial_challengers_regression), - (dataset_name_classification, MULTICLASS_CLASSIFICATION, - initial_challengers_classification)]: + ( + dataset_name_classification, + MULTICLASS_CLASSIFICATION, + initial_challengers_classification, + ), + ]: for metric in initial_challengers: configuration_space = get_configuration_space( - { - 'metric': metric, - 'task': task, - 'is_sparse': False - }, - include={'feature_preprocessor': ['no_preprocessing']}) + {"metric": metric, "task": task, "is_sparse": False}, + include={"feature_preprocessor": ["no_preprocessing"]}, + ) X_train, Y_train, X_test, Y_test = get_dataset(dataset_name) categorical = {i: False for i in range(X_train.shape[1])} meta_features_label = _calculate_metafeatures( - X_train, Y_train, categorical, dataset_name, task) + X_train, Y_train, categorical, dataset_name, task + ) meta_features_encoded_label = _calculate_metafeatures_encoded( - X_train, Y_train, categorical, dataset_name, task) - - initial_configuration_strings_for_smac = \ - suggest_via_metalearning( - meta_features_label, - meta_features_encoded_label, - configuration_space, dataset_name, metric, - task, False, 1, None) + X_train, Y_train, 
categorical, dataset_name, task + ) + + initial_configuration_strings_for_smac = suggest_via_metalearning( + meta_features_label, + meta_features_encoded_label, + configuration_space, + dataset_name, + metric, + task, + False, + 1, + None, + ) print(metric) print(initial_configuration_strings_for_smac[0]) - self.assertTrue(initial_configuration_strings_for_smac[ - 0].startswith(initial_challengers[metric])) + self.assertTrue( + initial_configuration_strings_for_smac[0].startswith( + initial_challengers[metric] + ) + ) def test_metadata_directory(self): # Test that metadata directory is set correctly (if user specifies, @@ -108,11 +117,10 @@ def test_metadata_directory(self): automl1 = AutoSklearnClassifier( time_left_for_this_task=30, per_run_time_limit=5, - metadata_directory="pyMetaLearn/metadata_dir", # user specified metadata_dir + metadata_directory="pyMetaLearn/metadata_dir", # user metadata_dir dask_client=dask_client, ) - self.assertEqual(automl1.metadata_directory, - "pyMetaLearn/metadata_dir") + self.assertEqual(automl1.metadata_directory, "pyMetaLearn/metadata_dir") automl2 = AutoSklearnClassifier( # default metadata_dir time_left_for_this_task=30, @@ -130,6 +138,11 @@ def test_metadata_directory(self): ensemble_size=0, ) X, y = load_breast_cancer(return_X_y=True) - self.assertRaisesRegex(ValueError, "The specified metadata directory " - "\'%s\' does not exist!" % nonexistent_dir, - automl3.fit, X=X, y=y) + self.assertRaisesRegex( + ValueError, + "The specified metadata directory " + "'%s' does not exist!" % nonexistent_dir, + automl3.fit, + X=X, + y=y, + ) diff --git a/test/test_metric/__init__.py b/test/test_metric/__init__.py index cc3cd7becd..e298f0f075 100644 --- a/test/test_metric/__init__.py +++ b/test/test_metric/__init__.py @@ -1,2 +1,2 @@ # -*- encoding: utf-8 -*- -__author__ = 'feurerm' +__author__ = "feurerm" diff --git a/test/test_metric/test_metrics.py b/test/test_metric/test_metrics.py index 3c6ff73c2b..334a485fe3 100644 --- a/test/test_metric/test_metrics.py +++ b/test/test_metric/test_metrics.py @@ -1,27 +1,24 @@ import unittest import warnings -import pytest - import numpy as np +import pytest import sklearn.metrics +from smac.utils.constants import MAXINT import autosklearn.metrics - -from autosklearn.metrics import calculate_score, calculate_loss, calculate_metric from autosklearn.constants import BINARY_CLASSIFICATION, REGRESSION - -from smac.utils.constants import MAXINT +from autosklearn.metrics import calculate_loss, calculate_metric, calculate_score class TestScorer(unittest.TestCase): - def test_predict_scorer_binary(self): y_true = np.array([0, 0, 1, 1]) y_pred = np.array([[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]]) scorer = autosklearn.metrics._PredictScorer( - 'accuracy', sklearn.metrics.accuracy_score, 1, 0, 1, {}) + "accuracy", sklearn.metrics.accuracy_score, 1, 0, 1, {} + ) score = scorer(y_true, y_pred) self.assertAlmostEqual(score, 1.0) @@ -35,15 +32,20 @@ def test_predict_scorer_binary(self): self.assertAlmostEqual(score, 0.5) scorer = autosklearn.metrics._PredictScorer( - 'bac', sklearn.metrics.balanced_accuracy_score, - 1, 0, 1, {}) + "bac", sklearn.metrics.balanced_accuracy_score, 1, 0, 1, {} + ) score = scorer(y_true, y_pred) self.assertAlmostEqual(score, 0.5) scorer = autosklearn.metrics._PredictScorer( - name='accuracy', score_func=sklearn.metrics.accuracy_score, - optimum=1, worst_possible_result=0, sign=-1, kwargs={}) + name="accuracy", + score_func=sklearn.metrics.accuracy_score, + optimum=1, + worst_possible_result=0, + sign=-1, 
+ kwargs={}, + ) y_pred = np.array([[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]]) score = scorer(y_true, y_pred) @@ -54,7 +56,8 @@ def test_predict_scorer_multiclass(self): y_pred = np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]) scorer = autosklearn.metrics._PredictScorer( - 'accuracy', sklearn.metrics.accuracy_score, 1, 0, 1, {}) + "accuracy", sklearn.metrics.accuracy_score, 1, 0, 1, {} + ) score = scorer(y_true, y_pred) self.assertAlmostEqual(score, 1.0) @@ -68,14 +71,15 @@ def test_predict_scorer_multiclass(self): self.assertAlmostEqual(score, 0.333333333) scorer = autosklearn.metrics._PredictScorer( - 'bac', sklearn.metrics.balanced_accuracy_score, - 1, 0, 1, {}) + "bac", sklearn.metrics.balanced_accuracy_score, 1, 0, 1, {} + ) score = scorer(y_true, y_pred) self.assertAlmostEqual(score, 0.333333333) scorer = autosklearn.metrics._PredictScorer( - 'accuracy', sklearn.metrics.accuracy_score, 1, 0, -1, {}) + "accuracy", sklearn.metrics.accuracy_score, 1, 0, -1, {} + ) y_pred = np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]) score = scorer(y_true, y_pred) @@ -86,7 +90,8 @@ def test_predict_scorer_multilabel(self): y_pred = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]]) scorer = autosklearn.metrics._PredictScorer( - 'accuracy', sklearn.metrics.accuracy_score, 1, 0, 1, {}) + "accuracy", sklearn.metrics.accuracy_score, 1, 0, 1, {} + ) score = scorer(y_true, y_pred) self.assertAlmostEqual(score, 1.0) @@ -100,7 +105,8 @@ def test_predict_scorer_multilabel(self): self.assertAlmostEqual(score, 0.25) scorer = autosklearn.metrics._PredictScorer( - 'accuracy', sklearn.metrics.accuracy_score, 1, 0, -1, {}) + "accuracy", sklearn.metrics.accuracy_score, 1, 0, -1, {} + ) y_pred = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]]) score = scorer(y_true, y_pred) @@ -111,7 +117,8 @@ def test_predict_scorer_regression(self): y_pred = y_true.copy() scorer = autosklearn.metrics._PredictScorer( - 'r2', sklearn.metrics.r2_score, 1, 0, 1, {}) + "r2", sklearn.metrics.r2_score, 1, 0, 1, {} + ) score = scorer(y_true, y_pred) self.assertAlmostEqual(score, 1.0) @@ -125,7 +132,8 @@ def test_proba_scorer_binary(self): y_pred = [[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]] scorer = autosklearn.metrics._ProbaScorer( - 'log_loss', sklearn.metrics.log_loss, 0, MAXINT, 1, {}) + "log_loss", sklearn.metrics.log_loss, 0, MAXINT, 1, {} + ) score = scorer(y_true, y_pred) self.assertAlmostEqual(score, 0.0) @@ -139,7 +147,8 @@ def test_proba_scorer_binary(self): self.assertAlmostEqual(score, 0.69314718055994529) scorer = autosklearn.metrics._ProbaScorer( - 'log_loss', sklearn.metrics.log_loss, 0, MAXINT, -1, {}) + "log_loss", sklearn.metrics.log_loss, 0, MAXINT, -1, {} + ) y_pred = [[1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0]] score = scorer(y_true, y_pred) @@ -150,7 +159,8 @@ def test_proba_scorer_multiclass(self): y_pred = [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]] scorer = autosklearn.metrics._ProbaScorer( - 'log_loss', sklearn.metrics.log_loss, 0, MAXINT, 1, {}) + "log_loss", sklearn.metrics.log_loss, 0, MAXINT, 1, {} + ) score = scorer(y_true, y_pred) self.assertAlmostEqual(score, 0.0) @@ -164,7 +174,8 @@ def test_proba_scorer_multiclass(self): self.assertAlmostEqual(score, 1.0986122886681096) scorer = autosklearn.metrics._ProbaScorer( - 'log_loss', sklearn.metrics.log_loss, 0, MAXINT, -1, {}) + "log_loss", sklearn.metrics.log_loss, 0, MAXINT, -1, {} + ) y_pred = [[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0]] score = scorer(y_true, y_pred) @@ -175,7 
+186,8 @@ def test_proba_scorer_multilabel(self): y_pred = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]]) scorer = autosklearn.metrics._ProbaScorer( - 'log_loss', sklearn.metrics.log_loss, 0, MAXINT, 1, {}) + "log_loss", sklearn.metrics.log_loss, 0, MAXINT, 1, {} + ) score = scorer(y_true, y_pred) self.assertAlmostEqual(score, 0.34657359027997314) @@ -189,7 +201,8 @@ def test_proba_scorer_multilabel(self): self.assertAlmostEqual(score, 0.69314718055994529) scorer = autosklearn.metrics._ProbaScorer( - 'log_loss', sklearn.metrics.log_loss, 0, MAXINT, -1, {}) + "log_loss", sklearn.metrics.log_loss, 0, MAXINT, -1, {} + ) y_pred = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]]) score = scorer(y_true, y_pred) @@ -200,7 +213,8 @@ def test_threshold_scorer_binary(self): y_pred = np.array([[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]]) scorer = autosklearn.metrics._ThresholdScorer( - 'roc_auc', sklearn.metrics.roc_auc_score, 1, 0, 1, {}) + "roc_auc", sklearn.metrics.roc_auc_score, 1, 0, 1, {} + ) score = scorer(y_true, y_pred) self.assertAlmostEqual(score, 1.0) @@ -214,7 +228,8 @@ def test_threshold_scorer_binary(self): self.assertAlmostEqual(score, 0.5) scorer = autosklearn.metrics._ThresholdScorer( - 'roc_auc', sklearn.metrics.roc_auc_score, 1, 0, -1, {}) + "roc_auc", sklearn.metrics.roc_auc_score, 1, 0, -1, {} + ) y_pred = np.array([[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]]) score = scorer(y_true, y_pred) @@ -225,7 +240,8 @@ def test_threshold_scorer_multilabel(self): y_pred = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]]) scorer = autosklearn.metrics._ThresholdScorer( - 'roc_auc', sklearn.metrics.roc_auc_score, 1, 0, 1, {}) + "roc_auc", sklearn.metrics.roc_auc_score, 1, 0, 1, {} + ) score = scorer(y_true, y_pred) self.assertAlmostEqual(score, 1.0) @@ -239,7 +255,8 @@ def test_threshold_scorer_multilabel(self): self.assertAlmostEqual(score, 0.5) scorer = autosklearn.metrics._ThresholdScorer( - 'roc_auc', sklearn.metrics.roc_auc_score, 1, 0, -1, {}) + "roc_auc", sklearn.metrics.roc_auc_score, 1, 0, -1, {} + ) y_pred = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]]) score = scorer(y_true, y_pred) @@ -250,7 +267,8 @@ def test_sign_flip(self): y_pred = y_true.copy() scorer = autosklearn.metrics.make_scorer( - 'r2', sklearn.metrics.r2_score, greater_is_better=True) + "r2", sklearn.metrics.r2_score, greater_is_better=True + ) score = scorer(y_true, y_pred + 1.0) self.assertAlmostEqual(score, -9.0) @@ -262,7 +280,8 @@ def test_sign_flip(self): self.assertAlmostEqual(score, 1.0) scorer = autosklearn.metrics.make_scorer( - 'r2', sklearn.metrics.r2_score, greater_is_better=False) + "r2", sklearn.metrics.r2_score, greater_is_better=False + ) score = scorer(y_true, y_pred + 1.0) self.assertAlmostEqual(score, 9.0) @@ -275,49 +294,44 @@ def test_sign_flip(self): class TestMetricsDoNotAlterInput(unittest.TestCase): - def test_regression_metrics(self): for metric, scorer in autosklearn.metrics.REGRESSION_METRICS.items(): y_true = np.random.random(100).reshape((-1, 1)) y_pred = y_true.copy() + np.random.randn(100, 1) * 0.1 - if metric == 'mean_squared_log_error': + if metric == "mean_squared_log_error": y_true = np.abs(y_true) y_pred = np.abs(y_pred) y_true_2 = y_true.copy() y_pred_2 = y_pred.copy() self.assertTrue(np.isfinite(scorer(y_true_2, y_pred_2))) - np.testing.assert_array_almost_equal(y_true, y_true_2, - err_msg=metric) - np.testing.assert_array_almost_equal(y_pred, y_pred_2, - err_msg=metric) + np.testing.assert_array_almost_equal(y_true, y_true_2, 
err_msg=metric) + np.testing.assert_array_almost_equal(y_pred, y_pred_2, err_msg=metric) def test_classification_metrics(self): for metric, scorer in autosklearn.metrics.CLASSIFICATION_METRICS.items(): y_true = np.random.randint(0, 2, size=(100, 1)) y_pred = np.random.random(200).reshape((-1, 2)) - y_pred = np.array([y_pred[i] / np.sum(y_pred[i]) - for i in range(100)]) + y_pred = np.array([y_pred[i] / np.sum(y_pred[i]) for i in range(100)]) y_true_2 = y_true.copy() y_pred_2 = y_pred.copy() try: self.assertTrue(np.isfinite(scorer(y_true_2, y_pred_2))) - np.testing.assert_array_almost_equal(y_true, y_true_2, - err_msg=metric) - np.testing.assert_array_almost_equal(y_pred, y_pred_2, - err_msg=metric) + np.testing.assert_array_almost_equal(y_true, y_true_2, err_msg=metric) + np.testing.assert_array_almost_equal(y_pred, y_pred_2, err_msg=metric) except ValueError as e: - if e.args[0] == 'Samplewise metrics are not available outside' \ - ' of multilabel classification.': + if ( + e.args[0] == "Samplewise metrics are not available outside" + " of multilabel classification." + ): pass else: raise e class TestMetric(unittest.TestCase): - def test_regression_all(self): for metric, scorer in autosklearn.metrics.REGRESSION_METRICS.items(): @@ -331,7 +345,7 @@ def test_regression_all(self): current_score = scorer(y_true, y_pred) self.assertLess(current_score, previous_score) - if scorer.name == 'mean_squared_log_error': + if scorer.name == "mean_squared_log_error": continue y_pred = np.array([-1, 0, -1, 0]) @@ -352,31 +366,39 @@ def test_classification_binary(self): # TODO: but its behavior is not right. When y_pred is completely # TODO: wrong, it does return 0.5, but when it is not completely # TODO: wrong, it returns value smaller than 0.5. - if metric in ['average_precision', - 'precision_samples', 'recall_samples', 'f1_samples']: + if metric in [ + "average_precision", + "precision_samples", + "recall_samples", + "f1_samples", + ]: continue y_true = np.array([1.0, 1.0, 1.0, 0.0, 0.0, 0.0]) - y_pred = \ - np.array([[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0]]) + y_pred = np.array( + [[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0]] + ) previous_score = scorer._optimum current_score = scorer(y_true, y_pred) self.assertAlmostEqual(current_score, previous_score) - y_pred = \ - np.array([[0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0]]) + y_pred = np.array( + [[0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0]] + ) previous_score = current_score current_score = scorer(y_true, y_pred) self.assertLess(current_score, previous_score) - y_pred = \ - np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0]]) + y_pred = np.array( + [[0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0]] + ) previous_score = current_score current_score = scorer(y_true, y_pred) self.assertLess(current_score, previous_score) - y_pred = \ - np.array([[1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0]]) + y_pred = np.array( + [[1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0]] + ) previous_score = current_score current_score = scorer(y_true, y_pred) self.assertLess(current_score, previous_score) @@ -390,76 +412,86 @@ def test_classification_multiclass(self): # # This test should be parameterized so we can identify which metrics # cause which warning specifically and rectify if needed. 
- ignored_warnings = [ - (UserWarning, 'y_pred contains classes not in y_true') - ] + ignored_warnings = [(UserWarning, "y_pred contains classes not in y_true")] for metric, scorer in autosklearn.metrics.CLASSIFICATION_METRICS.items(): # Skip functions not applicable for multiclass classification. - if metric in ['roc_auc', 'average_precision', - 'precision', 'recall', 'f1', 'precision_samples', - 'recall_samples', 'f1_samples']: + if metric in [ + "roc_auc", + "average_precision", + "precision", + "recall", + "f1", + "precision_samples", + "recall_samples", + "f1_samples", + ]: continue - y_true = np.array( - [0.0, 0.0, 1.0, 1.0, 2.0] - ) + y_true = np.array([0.0, 0.0, 1.0, 1.0, 2.0]) - y_pred = np.array([ - [1.0, 0.0, 0.0], - [1.0, 0.0, 0.0], - [0.0, 1.0, 0.0], - [0.0, 1.0, 0.0], - [0.0, 0.0, 1.0] - ]) + y_pred = np.array( + [ + [1.0, 0.0, 0.0], + [1.0, 0.0, 0.0], + [0.0, 1.0, 0.0], + [0.0, 1.0, 0.0], + [0.0, 0.0, 1.0], + ] + ) previous_score = scorer._optimum current_score = scorer(y_true, y_pred) self.assertAlmostEqual(current_score, previous_score) - y_pred = np.array([ - [1.0, 0.0, 0.0], - [1.0, 0.0, 0.0], - [1.0, 0.0, 0.0], - [0.0, 1.0, 0.0], - [0.0, 0.0, 1.0], - ]) + y_pred = np.array( + [ + [1.0, 0.0, 0.0], + [1.0, 0.0, 0.0], + [1.0, 0.0, 0.0], + [0.0, 1.0, 0.0], + [0.0, 0.0, 1.0], + ] + ) previous_score = current_score current_score = scorer(y_true, y_pred) self.assertLess(current_score, previous_score) - y_pred = np.array([ - [0.0, 0.0, 1.0], - [0.0, 1.0, 0.0], - [1.0, 0.0, 0.0], - [0.0, 1.0, 0.0], - [0.0, 1.0, 0.0] - ]) + y_pred = np.array( + [ + [0.0, 0.0, 1.0], + [0.0, 1.0, 0.0], + [1.0, 0.0, 0.0], + [0.0, 1.0, 0.0], + [0.0, 1.0, 0.0], + ] + ) previous_score = current_score current_score = scorer(y_true, y_pred) self.assertLess(current_score, previous_score) - y_pred = np.array([ - [0.0, 0.0, 1.0], - [0.0, 0.0, 1.0], - [1.0, 0.0, 0.0], - [1.0, 0.0, 0.0], - [0.0, 1.0, 0.0] - ]) + y_pred = np.array( + [ + [0.0, 0.0, 1.0], + [0.0, 0.0, 1.0], + [1.0, 0.0, 0.0], + [1.0, 0.0, 0.0], + [0.0, 1.0, 0.0], + ] + ) previous_score = current_score current_score = scorer(y_true, y_pred) self.assertLess(current_score, previous_score) # less labels in the targets than in the predictions y_true = np.array([0.0, 0.0, 1.0, 1.0]) - y_pred = np.array([ - [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], - [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]] + y_pred = np.array( + [[1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]] ) with warnings.catch_warnings(): for category, message in ignored_warnings: warnings.filterwarnings( - 'ignore', category=category, message=message + "ignore", category=category, message=message ) score = scorer(y_true, y_pred) @@ -469,8 +501,14 @@ def test_classification_multilabel(self): for metric, scorer in autosklearn.metrics.CLASSIFICATION_METRICS.items(): # Skip functions not applicable for multi-label classification. 
- if metric in ['roc_auc', 'log_loss', - 'precision', 'recall', 'f1', 'balanced_accuracy']: + if metric in [ + "roc_auc", + "log_loss", + "precision", + "recall", + "f1", + "balanced_accuracy", + ]: continue y_true = np.array([[1, 0, 0], [1, 1, 0], [0, 1, 1], [1, 1, 1]]) y_pred = y_true.copy() @@ -495,11 +533,11 @@ def test_classification_multilabel(self): class TestCalculateScore(unittest.TestCase): - def test_unsupported_task_type(self): y_true = np.array([1.0, 1.0, 1.0, 0.0, 0.0, 0.0]) - y_pred = \ - np.array([[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0]]) + y_pred = np.array( + [[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0]] + ) scorer = autosklearn.metrics.accuracy raised = False @@ -513,17 +551,22 @@ def test_classification_scoring_functions(self): scoring_functions = list(autosklearn.metrics.CLASSIFICATION_METRICS.values()) scoring_functions.remove(autosklearn.metrics.accuracy) - fail_metrics = ['precision_samples', 'recall_samples', 'f1_samples'] + fail_metrics = ["precision_samples", "recall_samples", "f1_samples"] success_metrics = list(autosklearn.metrics.CLASSIFICATION_METRICS.keys()) for metric in fail_metrics: success_metrics.remove(metric) y_true = np.array([1.0, 1.0, 1.0, 0.0, 0.0, 0.0]) - y_pred = \ - np.array([[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0]]) - score_dict = calculate_score(y_true, y_pred, BINARY_CLASSIFICATION, - autosklearn.metrics.accuracy, - scoring_functions) + y_pred = np.array( + [[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0]] + ) + score_dict = calculate_score( + y_true, + y_pred, + BINARY_CLASSIFICATION, + autosklearn.metrics.accuracy, + scoring_functions, + ) self.assertIsInstance(score_dict, dict) self.assertTrue(len(success_metrics), len(score_dict)) @@ -531,8 +574,10 @@ def test_classification_scoring_functions(self): self.assertNotIn(metric, score_dict.keys()) for metric in success_metrics: self.assertIn(metric, score_dict.keys()) - self.assertAlmostEqual(autosklearn.metrics.CLASSIFICATION_METRICS[metric]._optimum, - score_dict[metric]) + self.assertAlmostEqual( + autosklearn.metrics.CLASSIFICATION_METRICS[metric]._optimum, + score_dict[metric], + ) def test_regression_scoring_functions(self): @@ -540,26 +585,33 @@ def test_regression_scoring_functions(self): scoring_functions.remove(autosklearn.metrics.root_mean_squared_error) metrics = list(autosklearn.metrics.REGRESSION_METRICS.keys()) - metrics.remove('mean_squared_log_error') + metrics.remove("mean_squared_log_error") y_true = np.array([1, 2, 3, -4]) y_pred = y_true.copy() - score_dict = calculate_score(y_true, y_pred, REGRESSION, - autosklearn.metrics.root_mean_squared_error, - scoring_functions) + score_dict = calculate_score( + y_true, + y_pred, + REGRESSION, + autosklearn.metrics.root_mean_squared_error, + scoring_functions, + ) self.assertIsInstance(score_dict, dict) self.assertTrue(len(metrics), len(score_dict)) for metric in metrics: self.assertIn(metric, score_dict.keys()) - self.assertAlmostEqual(autosklearn.metrics.REGRESSION_METRICS[metric]._optimum, - score_dict[metric]) + self.assertAlmostEqual( + autosklearn.metrics.REGRESSION_METRICS[metric]._optimum, + score_dict[metric], + ) def test_classification_only_metric(self): y_true = np.array([1.0, 1.0, 1.0, 0.0, 0.0, 0.0]) - y_pred = \ - np.array([[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0]]) + y_pred = np.array( + [[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0]] + ) scorer = 
autosklearn.metrics.accuracy score = calculate_score(y_true, y_pred, BINARY_CLASSIFICATION, scorer) @@ -602,22 +654,28 @@ def test_calculate_loss(): prediction=y_pred, task_type=BINARY_CLASSIFICATION, metric=autosklearn.metrics.accuracy, - scoring_functions=[autosklearn.metrics.accuracy, autosklearn.metrics.balanced_accuracy] + scoring_functions=[ + autosklearn.metrics.accuracy, + autosklearn.metrics.balanced_accuracy, + ], ) expected_score_dict = { - 'accuracy': 0.9, - 'balanced_accuracy': 0.9285714285714286, + "accuracy": 0.9, + "balanced_accuracy": 0.9285714285714286, } loss_dict = calculate_loss( solution=y_true, prediction=y_pred, task_type=BINARY_CLASSIFICATION, metric=autosklearn.metrics.accuracy, - scoring_functions=[autosklearn.metrics.accuracy, autosklearn.metrics.balanced_accuracy] + scoring_functions=[ + autosklearn.metrics.accuracy, + autosklearn.metrics.balanced_accuracy, + ], ) for expected_metric, expected_score in expected_score_dict.items(): assert pytest.approx(expected_score) == score_dict[expected_metric] - assert pytest.approx(1-expected_score) == loss_dict[expected_metric] + assert pytest.approx(1 - expected_score) == loss_dict[expected_metric] # Lastly make sure that metrics whose optimum is zero # are also properly working diff --git a/test/test_optimizer/test_smbo.py b/test/test_optimizer/test_smbo.py index 4b7f0ffd79..fafd7b5a42 100644 --- a/test/test_optimizer/test_smbo.py +++ b/test/test_optimizer/test_smbo.py @@ -1,36 +1,39 @@ import logging.handlers -from ConfigSpace.configuration_space import Configuration - import pytest +from ConfigSpace.configuration_space import Configuration import autosklearn.metrics -from autosklearn.smbo import AutoMLSMBO import autosklearn.pipeline.util as putil from autosklearn.automl import AutoML from autosklearn.constants import BINARY_CLASSIFICATION from autosklearn.data.xy_data_manager import XYDataManager +from autosklearn.smbo import AutoMLSMBO from autosklearn.util.stopwatch import StopWatch -@pytest.mark.parametrize("context", ['fork', 'forkserver']) +@pytest.mark.parametrize("context", ["fork", "forkserver"]) def test_smbo_metalearning_configurations(backend, context, dask_client): # Get the inputs to the optimizer - X_train, Y_train, X_test, Y_test = putil.get_dataset('iris') - config_space = AutoML(delete_tmp_folder_after_terminate=False, - metric=autosklearn.metrics.accuracy, - time_left_for_this_task=20, - per_run_time_limit=5).fit( - X_train, Y_train, - task=BINARY_CLASSIFICATION, - only_return_configuration_space=True) + X_train, Y_train, X_test, Y_test = putil.get_dataset("iris") + config_space = AutoML( + delete_tmp_folder_after_terminate=False, + metric=autosklearn.metrics.accuracy, + time_left_for_this_task=20, + per_run_time_limit=5, + ).fit( + X_train, + Y_train, + task=BINARY_CLASSIFICATION, + only_return_configuration_space=True, + ) watcher = StopWatch() # Create an optimizer smbo = AutoMLSMBO( config_space=config_space, - dataset_name='iris', + dataset_name="iris", backend=backend, total_walltime_limit=10, func_eval_time_limit=5, @@ -49,11 +52,13 @@ def test_smbo_metalearning_configurations(backend, context, dask_client): # Create the inputs to metalearning datamanager = XYDataManager( - X_train, Y_train, - X_test, Y_test, + X_train, + Y_train, + X_test, + Y_test, task=BINARY_CLASSIFICATION, - dataset_name='iris', - feat_type={i: 'numerical' for i in range(X_train.shape[1])}, + dataset_name="iris", + feat_type={i: "numerical" for i in range(X_train.shape[1])}, ) backend.save_datamanager(datamanager) 
smbo.task = BINARY_CLASSIFICATION diff --git a/test/test_pipeline/components/__init__.py b/test/test_pipeline/components/__init__.py index 8f0ce6cb7c..92bf78f389 100644 --- a/test/test_pipeline/components/__init__.py +++ b/test/test_pipeline/components/__init__.py @@ -1 +1 @@ -__author__ = 'feurerm' +__author__ = "feurerm" diff --git a/test/test_pipeline/components/classification/__init__.py b/test/test_pipeline/components/classification/__init__.py index 8f0ce6cb7c..92bf78f389 100644 --- a/test/test_pipeline/components/classification/__init__.py +++ b/test/test_pipeline/components/classification/__init__.py @@ -1 +1 @@ -__author__ = 'feurerm' +__author__ = "feurerm" diff --git a/test/test_pipeline/components/classification/test_adaboost.py b/test/test_pipeline/components/classification/test_adaboost.py index 3c0d96f9a6..f41ba3319f 100644 --- a/test/test_pipeline/components/classification/test_adaboost.py +++ b/test/test_pipeline/components/classification/test_adaboost.py @@ -1,7 +1,7 @@ import sklearn.ensemble -from autosklearn.pipeline.components.classification.adaboost import \ - AdaboostClassifier +from autosklearn.pipeline.components.classification.adaboost import AdaboostClassifier + from .test_base import BaseClassificationComponentTest diff --git a/test/test_pipeline/components/classification/test_base.py b/test/test_pipeline/components/classification/test_base.py index 4fc381af56..a524759bc5 100644 --- a/test/test_pipeline/components/classification/test_base.py +++ b/test/test_pipeline/components/classification/test_base.py @@ -1,15 +1,18 @@ -from typing import Optional, Dict +from typing import Dict, Optional import unittest -from autosklearn.pipeline.util import _test_classifier, \ - _test_classifier_predict_proba, _test_classifier_iterative_fit -from autosklearn.pipeline.constants import SPARSE - -import sklearn.metrics import numpy as np +import sklearn.metrics -from test.test_pipeline.ignored_warnings import ignore_warnings, classifier_warnings +from autosklearn.pipeline.constants import SPARSE +from autosklearn.pipeline.util import ( + _test_classifier, + _test_classifier_iterative_fit, + _test_classifier_predict_proba, +) + +from test.test_pipeline.ignored_warnings import classifier_warnings, ignore_warnings class BaseClassificationComponentTest(unittest.TestCase): @@ -29,14 +32,14 @@ def test_default_iris(self): return for i in range(2): - predictions, targets, n_calls = \ - _test_classifier(dataset="iris", - classifier=self.module) - self.assertAlmostEqual(self.res["default_iris"], - sklearn.metrics.accuracy_score(targets, - predictions), - places=self.res.get( - "default_iris_places", 7)) + predictions, targets, n_calls = _test_classifier( + dataset="iris", classifier=self.module + ) + self.assertAlmostEqual( + self.res["default_iris"], + sklearn.metrics.accuracy_score(targets, predictions), + places=self.res.get("default_iris_places", 7), + ) if self.res.get("iris_n_calls"): self.assertEqual(self.res["iris_n_calls"], n_calls) @@ -45,7 +48,7 @@ def test_get_max_iter(self): if self.__class__ == BaseClassificationComponentTest: return - if not hasattr(self.module, 'iterative_fit'): + if not hasattr(self.module, "iterative_fit"): return self.module.get_max_iter() @@ -55,23 +58,25 @@ def test_default_iris_iterative_fit(self): if self.__class__ == BaseClassificationComponentTest: return - if not hasattr(self.module, 'iterative_fit'): + if not hasattr(self.module, "iterative_fit"): return for i in range(2): - predictions, targets, classifier = \ - 
_test_classifier_iterative_fit(dataset="iris", - classifier=self.module) - self.assertAlmostEqual(self.res["default_iris_iterative"], - sklearn.metrics.accuracy_score(targets, - predictions), - places=self.res.get( - "default_iris_iterative_places", 7)) + predictions, targets, classifier = _test_classifier_iterative_fit( + dataset="iris", classifier=self.module + ) + self.assertAlmostEqual( + self.res["default_iris_iterative"], + sklearn.metrics.accuracy_score(targets, predictions), + places=self.res.get("default_iris_iterative_places", 7), + ) if self.step_hyperparameter is not None: self.assertEqual( - getattr(classifier.estimator, self.step_hyperparameter['name']), - self.res.get("iris_iterative_n_iter", self.step_hyperparameter['value']) + getattr(classifier.estimator, self.step_hyperparameter["name"]), + self.res.get( + "iris_iterative_n_iter", self.step_hyperparameter["value"] + ), ) def test_default_iris_predict_proba(self): @@ -86,7 +91,7 @@ def test_default_iris_predict_proba(self): self.assertAlmostEqual( self.res["default_iris_proba"], sklearn.metrics.log_loss(targets, predictions), - places=self.res.get("default_iris_proba_places", 7) + places=self.res.get("default_iris_proba_places", 7), ) def test_default_iris_sparse(self): @@ -98,15 +103,14 @@ def test_default_iris_sparse(self): return for i in range(2): - predictions, targets, _ = \ - _test_classifier(dataset="iris", - classifier=self.module, - sparse=True) - self.assertAlmostEqual(self.res["default_iris_sparse"], - sklearn.metrics.accuracy_score(targets, - predictions), - places=self.res.get( - "default_iris_sparse_places", 7)) + predictions, targets, _ = _test_classifier( + dataset="iris", classifier=self.module, sparse=True + ) + self.assertAlmostEqual( + self.res["default_iris_sparse"], + sklearn.metrics.accuracy_score(targets, predictions), + places=self.res.get("default_iris_sparse_places", 7), + ) def test_default_digits_binary(self): @@ -114,15 +118,14 @@ def test_default_digits_binary(self): return for i in range(2): - predictions, targets, _ = \ - _test_classifier(classifier=self.module, - dataset='digits', sparse=False, - make_binary=True) - self.assertAlmostEqual(self.res["default_digits_binary"], - sklearn.metrics.accuracy_score( - targets, predictions), - places=self.res.get( - "default_digits_binary_places", 7)) + predictions, targets, _ = _test_classifier( + classifier=self.module, dataset="digits", sparse=False, make_binary=True + ) + self.assertAlmostEqual( + self.res["default_digits_binary"], + sklearn.metrics.accuracy_score(targets, predictions), + places=self.res.get("default_digits_binary_places", 7), + ) def test_default_digits(self): @@ -130,14 +133,14 @@ def test_default_digits(self): return for i in range(2): - predictions, targets, n_calls = \ - _test_classifier(dataset="digits", - classifier=self.module) - self.assertAlmostEqual(self.res["default_digits"], - sklearn.metrics.accuracy_score(targets, - predictions), - places=self.res.get( - "default_digits_places", 7)) + predictions, targets, n_calls = _test_classifier( + dataset="digits", classifier=self.module + ) + self.assertAlmostEqual( + self.res["default_digits"], + sklearn.metrics.accuracy_score(targets, predictions), + places=self.res.get("default_digits_places", 7), + ) if self.res.get("digits_n_calls"): self.assertEqual(self.res["digits_n_calls"], n_calls) @@ -147,23 +150,25 @@ def test_default_digits_iterative_fit(self): if self.__class__ == BaseClassificationComponentTest: return - if not hasattr(self.module, 'iterative_fit'): + if not 
hasattr(self.module, "iterative_fit"): return for i in range(2): - predictions, targets, classifier = \ - _test_classifier_iterative_fit(dataset="digits", - classifier=self.module) - self.assertAlmostEqual(self.res["default_digits_iterative"], - sklearn.metrics.accuracy_score(targets, - predictions), - places=self.res.get( - "default_digits_iterative_places", 7)) + predictions, targets, classifier = _test_classifier_iterative_fit( + dataset="digits", classifier=self.module + ) + self.assertAlmostEqual( + self.res["default_digits_iterative"], + sklearn.metrics.accuracy_score(targets, predictions), + places=self.res.get("default_digits_iterative_places", 7), + ) if self.step_hyperparameter is not None: self.assertEqual( - getattr(classifier.estimator, self.step_hyperparameter['name']), - self.res.get("digits_iterative_n_iter", self.step_hyperparameter['value']) + getattr(classifier.estimator, self.step_hyperparameter["name"]), + self.res.get( + "digits_iterative_n_iter", self.step_hyperparameter["value"] + ), ) def test_default_digits_multilabel(self): @@ -176,15 +181,16 @@ def test_default_digits_multilabel(self): for _ in range(2): predictions, targets, _ = _test_classifier( - classifier=self.module, dataset='digits', make_multilabel=True + classifier=self.module, dataset="digits", make_multilabel=True ) score = sklearn.metrics.precision_score( - targets, predictions, average='macro', zero_division=0 + targets, predictions, average="macro", zero_division=0 ) self.assertAlmostEqual( - self.res["default_digits_multilabel"], score, - places=self.res.get("default_digits_multilabel_places", 7) + self.res["default_digits_multilabel"], + score, + places=self.res.get("default_digits_multilabel_places", 7), ) def test_default_digits_multilabel_predict_proba(self): @@ -196,15 +202,15 @@ def test_default_digits_multilabel_predict_proba(self): return for i in range(2): - predictions, targets = \ - _test_classifier_predict_proba(classifier=self.module, - make_multilabel=True) + predictions, targets = _test_classifier_predict_proba( + classifier=self.module, make_multilabel=True + ) self.assertEqual(predictions.shape, ((50, 3))) - self.assertAlmostEqual(self.res["default_digits_multilabel_proba"], - sklearn.metrics.roc_auc_score( - targets, predictions, average='macro'), - places=self.res.get( - "default_digits_multilabel_proba_places", 7)) + self.assertAlmostEqual( + self.res["default_digits_multilabel_proba"], + sklearn.metrics.roc_auc_score(targets, predictions, average="macro"), + places=self.res.get("default_digits_multilabel_proba_places", 7), + ) def test_target_algorithm_multioutput_multiclass_support(self): @@ -218,42 +224,66 @@ def test_target_algorithm_multioutput_multiclass_support(self): X = np.random.random((10, 10)) y = np.random.randint(0, 1, size=(10, 10)) self.assertRaisesRegex( - ValueError, - 'bad input shape \\(10, 10\\)', - cls.fit, - X, - y + ValueError, "bad input shape \\(10, 10\\)", cls.fit, X, y ) else: return def test_module_idempotent(self): - """ Fitting twice with the same config gives the same model params. + """Fitting twice with the same config gives the same model params. - This is only valid when the random_state passed is an int. If a - RandomState object is passed then repeated calls to fit will have - different results. See the section on "Controlling Randomness" in the - sklearn docs. + This is only valid when the random_state passed is an int. If a + RandomState object is passed then repeated calls to fit will have + different results. 
See the section on "Controlling Randomness" in the + sklearn docs. - https://scikit-learn.org/0.24/common_pitfalls.html#controlling-randomness + https://scikit-learn.org/0.24/common_pitfalls.html#controlling-randomness """ if self.__class__ == BaseClassificationComponentTest: return classifier_cls = self.module - X = np.array([ - [0, 0], [0, 1], [1, 0], [1, 1], - [0, 0], [0, 1], [1, 0], [1, 1], - [0, 0], [0, 1], [1, 0], [1, 1], - [0, 0], [0, 1], [1, 0], [1, 1], - ]) - y = np.array([ - 0, 1, 1, 0, - 0, 1, 1, 0, - 0, 1, 1, 0, - 0, 1, 1, 0, - ]) + X = np.array( + [ + [0, 0], + [0, 1], + [1, 0], + [1, 1], + [0, 0], + [0, 1], + [1, 0], + [1, 1], + [0, 0], + [0, 1], + [1, 0], + [1, 1], + [0, 0], + [0, 1], + [1, 0], + [1, 1], + ] + ) + y = np.array( + [ + 0, + 1, + 1, + 0, + 0, + 1, + 1, + 0, + 0, + 1, + 1, + 0, + 0, + 1, + 1, + 0, + ] + ) # There are certain errors we ignore so we wrap this in a function def fitted_params(model) -> Optional[Dict]: @@ -268,12 +298,18 @@ def is_QDA_error(err): # We are okay if the BaseClassifier in AdaBoostClassifier is worse # than random so no ensemble can be fit def is_AdaBoostClassifier_error(err): - return ("BaseClassifier in AdaBoostClassifier ensemble is worse" - + " than random, ensemble can not be fit." in err.args[0]) + return ( + "BaseClassifier in AdaBoostClassifier ensemble is worse" + + " than random, ensemble can not be fit." + in err.args[0] + ) def is_unset_param_raw_predictions_val_error(err): - return ("local variable 'raw_predictions_val' referenced before" - + " assignment" in err.args[0]) + return ( + "local variable 'raw_predictions_val' referenced before" + + " assignment" + in err.args[0] + ) try: with ignore_warnings(classifier_warnings): @@ -288,7 +324,7 @@ def is_unset_param_raw_predictions_val_error(err): return model.estimator.get_params() # We ignore certain keys when comparing - param_keys_ignored = ['base_estimator'] + param_keys_ignored = ["base_estimator"] # We use the default config + sampled ones configuration_space = classifier_cls.get_hyperparameter_search_space() @@ -302,12 +338,12 @@ def is_unset_param_raw_predictions_val_error(err): # Get the parameters on the first and second fit with config params params_first = fitted_params(classifier) - if hasattr(classifier.estimator, 'random_state'): + if hasattr(classifier.estimator, "random_state"): rs_1 = classifier.random_state rs_estimator_1 = classifier.estimator.random_state params_second = fitted_params(classifier) - if hasattr(classifier.estimator, 'random_state'): + if hasattr(classifier.estimator, "random_state"): rs_2 = classifier.random_state rs_estimator_2 = classifier.estimator.random_state @@ -322,10 +358,13 @@ def is_unset_param_raw_predictions_val_error(err): del params[key] # They should have equal parameters - self.assertEqual(params_first, params_second, - f"Failed with model args {model_args}") - if hasattr(classifier.estimator, 'random_state'): - assert all([ - seed == random_state - for random_state in [rs_1, rs_estimator_1, rs_2, rs_estimator_2] - ]) + self.assertEqual( + params_first, params_second, f"Failed with model args {model_args}" + ) + if hasattr(classifier.estimator, "random_state"): + assert all( + [ + seed == random_state + for random_state in [rs_1, rs_estimator_1, rs_2, rs_estimator_2] + ] + ) diff --git a/test/test_pipeline/components/classification/test_bernoulli_nb.py b/test/test_pipeline/components/classification/test_bernoulli_nb.py index 8384119393..2def3a385f 100644 --- a/test/test_pipeline/components/classification/test_bernoulli_nb.py 
+++ b/test/test_pipeline/components/classification/test_bernoulli_nb.py @@ -1,7 +1,6 @@ import sklearn.naive_bayes -from autosklearn.pipeline.components.classification.bernoulli_nb import \ - BernoulliNB +from autosklearn.pipeline.components.classification.bernoulli_nb import BernoulliNB from .test_base import BaseClassificationComponentTest diff --git a/test/test_pipeline/components/classification/test_decision_tree.py b/test/test_pipeline/components/classification/test_decision_tree.py index e32a6536c7..546040e645 100644 --- a/test/test_pipeline/components/classification/test_decision_tree.py +++ b/test/test_pipeline/components/classification/test_decision_tree.py @@ -1,7 +1,6 @@ import sklearn.tree -from autosklearn.pipeline.components.classification.decision_tree import \ - DecisionTree +from autosklearn.pipeline.components.classification.decision_tree import DecisionTree from .test_base import BaseClassificationComponentTest diff --git a/test/test_pipeline/components/classification/test_extra_trees.py b/test/test_pipeline/components/classification/test_extra_trees.py index e7b1935db0..213bfbd916 100644 --- a/test/test_pipeline/components/classification/test_extra_trees.py +++ b/test/test_pipeline/components/classification/test_extra_trees.py @@ -1,6 +1,8 @@ import sklearn.ensemble -from autosklearn.pipeline.components.classification.extra_trees import ExtraTreesClassifier +from autosklearn.pipeline.components.classification.extra_trees import ( + ExtraTreesClassifier, +) from .test_base import BaseClassificationComponentTest @@ -12,12 +14,12 @@ class ExtraTreesComponentTest(BaseClassificationComponentTest): res = dict() res["default_iris"] = 0.96 res["iris_n_calls"] = 9 - res["default_iris_iterative"] = res['default_iris'] + res["default_iris_iterative"] = res["default_iris"] res["default_iris_proba"] = 0.10053485167017469 res["default_iris_sparse"] = 0.74 res["default_digits"] = 0.9216757741347905 res["digits_n_calls"] = 9 - res["default_digits_iterative"] = res['default_digits'] + res["default_digits_iterative"] = res["default_digits"] res["default_digits_iterative_places"] = 3 res["default_digits_binary"] = 0.994535519125683 res["default_digits_multilabel"] = 0.9983621593291405 @@ -26,6 +28,6 @@ class ExtraTreesComponentTest(BaseClassificationComponentTest): sk_mod = sklearn.ensemble.ExtraTreesClassifier module = ExtraTreesClassifier step_hyperparameter = { - 'name': 'n_estimators', - 'value': module.get_max_iter(), + "name": "n_estimators", + "value": module.get_max_iter(), } diff --git a/test/test_pipeline/components/classification/test_gaussian_nb.py b/test/test_pipeline/components/classification/test_gaussian_nb.py index ea5ce7cc5b..2f813b4293 100644 --- a/test/test_pipeline/components/classification/test_gaussian_nb.py +++ b/test/test_pipeline/components/classification/test_gaussian_nb.py @@ -1,7 +1,6 @@ import sklearn.naive_bayes -from autosklearn.pipeline.components.classification.gaussian_nb import \ - GaussianNB +from autosklearn.pipeline.components.classification.gaussian_nb import GaussianNB from .test_base import BaseClassificationComponentTest diff --git a/test/test_pipeline/components/classification/test_gradient_boosting.py b/test/test_pipeline/components/classification/test_gradient_boosting.py index efa3a3cca8..4bfadfa74c 100644 --- a/test/test_pipeline/components/classification/test_gradient_boosting.py +++ b/test/test_pipeline/components/classification/test_gradient_boosting.py @@ -1,7 +1,8 @@ import sklearn.ensemble -from 
autosklearn.pipeline.components.classification.gradient_boosting import \ - GradientBoostingClassifier +from autosklearn.pipeline.components.classification.gradient_boosting import ( + GradientBoostingClassifier, +) from .test_base import BaseClassificationComponentTest @@ -24,6 +25,6 @@ class GradientBoostingComponentTest(BaseClassificationComponentTest): sk_mod = sklearn.ensemble.ExtraTreesClassifier module = GradientBoostingClassifier step_hyperparameter = { - 'name': 'max_iter', - 'value': module.get_max_iter(), + "name": "max_iter", + "value": module.get_max_iter(), } diff --git a/test/test_pipeline/components/classification/test_k_nearest_neighbor.py b/test/test_pipeline/components/classification/test_k_nearest_neighbor.py index 8209e2a674..d09512d07d 100644 --- a/test/test_pipeline/components/classification/test_k_nearest_neighbor.py +++ b/test/test_pipeline/components/classification/test_k_nearest_neighbor.py @@ -1,7 +1,8 @@ import sklearn.neighbors -from autosklearn.pipeline.components.classification.k_nearest_neighbors import \ - KNearestNeighborsClassifier +from autosklearn.pipeline.components.classification.k_nearest_neighbors import ( + KNearestNeighborsClassifier, +) from .test_base import BaseClassificationComponentTest diff --git a/test/test_pipeline/components/classification/test_liblinear.py b/test/test_pipeline/components/classification/test_liblinear.py index bb2d2a1894..1aec8e227e 100644 --- a/test/test_pipeline/components/classification/test_liblinear.py +++ b/test/test_pipeline/components/classification/test_liblinear.py @@ -1,7 +1,6 @@ import sklearn.svm -from autosklearn.pipeline.components.classification.liblinear_svc import \ - LibLinear_SVC +from autosklearn.pipeline.components.classification.liblinear_svc import LibLinear_SVC from .test_base import BaseClassificationComponentTest diff --git a/test/test_pipeline/components/classification/test_libsvm_svc.py b/test/test_pipeline/components/classification/test_libsvm_svc.py index dcab429fc1..6fe95f5b62 100644 --- a/test/test_pipeline/components/classification/test_libsvm_svc.py +++ b/test/test_pipeline/components/classification/test_libsvm_svc.py @@ -2,8 +2,7 @@ import sklearn.svm from autosklearn.pipeline.components.classification.libsvm_svc import LibSVM_SVC -from autosklearn.pipeline.util import get_dataset, \ - _test_classifier_predict_proba +from autosklearn.pipeline.util import _test_classifier_predict_proba, get_dataset from .test_base import BaseClassificationComponentTest @@ -30,22 +29,23 @@ def test_default_configuration_predict_proba_individual(self): # Leave this additional test here for i in range(2): predictions, targets = _test_classifier_predict_proba( - LibSVM_SVC, sparse=True, dataset='digits', - train_size_maximum=500) - self.assertAlmostEqual(5.273502056835706, - sklearn.metrics.log_loss(targets, - predictions)) + LibSVM_SVC, sparse=True, dataset="digits", train_size_maximum=500 + ) + self.assertAlmostEqual( + 5.273502056835706, sklearn.metrics.log_loss(targets, predictions) + ) for i in range(2): predictions, targets = _test_classifier_predict_proba( - LibSVM_SVC, sparse=True, dataset='iris') - self.assertAlmostEqual(0.8408320837510618, - sklearn.metrics.log_loss(targets, - predictions)) + LibSVM_SVC, sparse=True, dataset="iris" + ) + self.assertAlmostEqual( + 0.8408320837510618, sklearn.metrics.log_loss(targets, predictions) + ) # 2 class for i in range(2): - X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris') + X_train, Y_train, X_test, Y_test = get_dataset(dataset="iris") 
remove_training_data = Y_train == 2 remove_test_data = Y_test == 2 X_train = X_train[~remove_training_data] @@ -57,11 +57,19 @@ def test_default_configuration_predict_proba_individual(self): configuration_space = LibSVM_SVC.get_hyperparameter_search_space() default = configuration_space.get_default_configuration() - cls = LibSVM_SVC(random_state=1, **{hp_name: default[hp_name] - for hp_name in default - if default[hp_name] is not None}) + cls = LibSVM_SVC( + random_state=1, + **{ + hp_name: default[hp_name] + for hp_name in default + if default[hp_name] is not None + }, + ) cls = cls.fit(X_train, Y_train) prediction = cls.predict_proba(X_test) - self.assertAlmostEqual(sklearn.metrics.log_loss(Y_test, prediction), - 0.6927962762794081, places=4) + self.assertAlmostEqual( + sklearn.metrics.log_loss(Y_test, prediction), + 0.6927962762794081, + places=4, + ) diff --git a/test/test_pipeline/components/classification/test_mlp.py b/test/test_pipeline/components/classification/test_mlp.py index b8d559b1bc..e1c4286d83 100644 --- a/test/test_pipeline/components/classification/test_mlp.py +++ b/test/test_pipeline/components/classification/test_mlp.py @@ -43,6 +43,6 @@ class MLPComponentTest(BaseClassificationComponentTest): sk_mod = sklearn.neural_network.MLPClassifier module = MLPClassifier step_hyperparameter = { - 'name': 'n_iter_', - 'value': module.get_max_iter(), + "name": "n_iter_", + "value": module.get_max_iter(), } diff --git a/test/test_pipeline/components/classification/test_multinomial_nb.py b/test/test_pipeline/components/classification/test_multinomial_nb.py index 2c982c41ef..c82b938679 100644 --- a/test/test_pipeline/components/classification/test_multinomial_nb.py +++ b/test/test_pipeline/components/classification/test_multinomial_nb.py @@ -1,10 +1,8 @@ import numpy as np - import sklearn.naive_bayes import sklearn.preprocessing -from autosklearn.pipeline.components.classification.multinomial_nb import \ - MultinomialNB +from autosklearn.pipeline.components.classification.multinomial_nb import MultinomialNB from autosklearn.pipeline.util import get_dataset from .test_base import BaseClassificationComponentTest @@ -32,17 +30,21 @@ class MultinomialNBComponentTest(BaseClassificationComponentTest): def test_default_configuration_negative_values(self): # Custon preprocessing test to check if clipping to zero works - X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits') + X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits") ss = sklearn.preprocessing.StandardScaler() X_train = ss.fit_transform(X_train) configuration_space = MultinomialNB.get_hyperparameter_search_space() default = configuration_space.get_default_configuration() - cls = MultinomialNB(random_state=1, **{hp_name: default[hp_name] - for hp_name in default - if default[hp_name] is not None}) + cls = MultinomialNB( + random_state=1, + **{ + hp_name: default[hp_name] + for hp_name in default + if default[hp_name] is not None + }, + ) cls = cls.fit(X_train, Y_train) prediction = cls.predict(X_test) - self.assertAlmostEqual(np.nanmean(prediction == Y_test), - 0.88888888888888884) + self.assertAlmostEqual(np.nanmean(prediction == Y_test), 0.88888888888888884) diff --git a/test/test_pipeline/components/classification/test_passive_aggressive.py b/test/test_pipeline/components/classification/test_passive_aggressive.py index d904f9e569..b83dbaf120 100644 --- a/test/test_pipeline/components/classification/test_passive_aggressive.py +++ b/test/test_pipeline/components/classification/test_passive_aggressive.py @@ 
-1,7 +1,8 @@ import sklearn.linear_model -from autosklearn.pipeline.components.classification.passive_aggressive import \ - PassiveAggressive +from autosklearn.pipeline.components.classification.passive_aggressive import ( + PassiveAggressive, +) from .test_base import BaseClassificationComponentTest @@ -13,13 +14,13 @@ class PassiveAggressiveComponentTest(BaseClassificationComponentTest): res = dict() res["default_iris"] = 0.98 res["iris_n_calls"] = 6 - res["default_iris_iterative"] = res['default_iris'] + res["default_iris_iterative"] = res["default_iris"] res["iris_iterative_n_iter"] = 64 res["default_iris_proba"] = 0.27840521921952033 res["default_iris_sparse"] = 0.48 res["default_digits"] = 0.9162112932604736 res["digits_n_calls"] = 6 - res["default_digits_iterative"] = res['default_digits'] + res["default_digits_iterative"] = res["default_digits"] res["digits_iterative_n_iter"] = 64 res["default_digits_binary"] = 0.99210686095932 res["default_digits_multilabel"] = 0.910908768565592 @@ -29,6 +30,6 @@ class PassiveAggressiveComponentTest(BaseClassificationComponentTest): module = PassiveAggressive step_hyperparameter = { - 'name': 'max_iter', - 'value': module.get_max_iter(), + "name": "max_iter", + "value": module.get_max_iter(), } diff --git a/test/test_pipeline/components/classification/test_random_forest.py b/test/test_pipeline/components/classification/test_random_forest.py index 8e2c1136d3..f96869c270 100644 --- a/test/test_pipeline/components/classification/test_random_forest.py +++ b/test/test_pipeline/components/classification/test_random_forest.py @@ -1,7 +1,6 @@ import sklearn.ensemble -from autosklearn.pipeline.components.classification.random_forest import \ - RandomForest +from autosklearn.pipeline.components.classification.random_forest import RandomForest from .test_base import BaseClassificationComponentTest @@ -13,12 +12,12 @@ class RandomForestComponentTest(BaseClassificationComponentTest): res = dict() res["default_iris"] = 0.96 res["iris_n_calls"] = 9 - res["default_iris_iterative"] = res['default_iris'] + res["default_iris_iterative"] = res["default_iris"] res["default_iris_proba"] = 0.0996785324703419 res["default_iris_sparse"] = 0.85999999999999999 res["default_digits"] = 0.8998178506375227 res["digits_n_calls"] = 9 - res["default_digits_iterative"] = res['default_digits'] + res["default_digits_iterative"] = res["default_digits"] res["default_digits_binary"] = 0.9896782027929569 res["default_digits_multilabel"] = 0.9973653110879388 res["default_digits_multilabel_proba"] = 0.9965660960196189 @@ -26,6 +25,6 @@ class RandomForestComponentTest(BaseClassificationComponentTest): sk_mod = sklearn.ensemble.RandomForestClassifier module = RandomForest step_hyperparameter = { - 'name': 'n_estimators', - 'value': module.get_max_iter(), + "name": "n_estimators", + "value": module.get_max_iter(), } diff --git a/test/test_pipeline/components/classification/test_sgd.py b/test/test_pipeline/components/classification/test_sgd.py index defe8af81d..8f1d7821e1 100644 --- a/test/test_pipeline/components/classification/test_sgd.py +++ b/test/test_pipeline/components/classification/test_sgd.py @@ -1,6 +1,7 @@ import sklearn.linear_model from autosklearn.pipeline.components.classification.sgd import SGD + from .test_base import BaseClassificationComponentTest @@ -11,12 +12,12 @@ class SGDComponentTest(BaseClassificationComponentTest): res = dict() res["default_iris"] = 0.69999999999999996 res["iris_n_calls"] = 9 - res["default_iris_iterative"] = res['default_iris'] + 
res["default_iris_iterative"] = res["default_iris"] res["default_iris_proba"] = 0.5996114465819011 res["default_iris_sparse"] = 0.54 res["default_digits"] = 0.9198542805100182 res["digits_n_calls"] = 7 - res["default_digits_iterative"] = res['default_digits'] + res["default_digits_iterative"] = res["default_digits"] res["default_digits_binary"] = 0.9951426836672739 res["default_digits_multilabel"] = -1 res["default_digits_multilabel_proba"] = -1 diff --git a/test/test_pipeline/components/data_preprocessing/__init__.py b/test/test_pipeline/components/data_preprocessing/__init__.py index 8f0ce6cb7c..92bf78f389 100644 --- a/test/test_pipeline/components/data_preprocessing/__init__.py +++ b/test/test_pipeline/components/data_preprocessing/__init__.py @@ -1 +1 @@ -__author__ = 'feurerm' +__author__ = "feurerm" diff --git a/test/test_pipeline/components/data_preprocessing/test_balancing.py b/test/test_pipeline/components/data_preprocessing/test_balancing.py index 268a8ea542..cf8dc103b8 100644 --- a/test/test_pipeline/components/data_preprocessing/test_balancing.py +++ b/test/test_pipeline/components/data_preprocessing/test_balancing.py @@ -1,4 +1,4 @@ -__author__ = 'feurerm' +__author__ = "feurerm" import copy import unittest @@ -7,86 +7,111 @@ import sklearn.datasets import sklearn.metrics -from autosklearn.pipeline.components.data_preprocessing.balancing.balancing \ - import Balancing from autosklearn.pipeline.classification import SimpleClassificationPipeline from autosklearn.pipeline.components.classification.adaboost import AdaboostClassifier from autosklearn.pipeline.components.classification.decision_tree import DecisionTree -from autosklearn.pipeline.components.classification.extra_trees import ExtraTreesClassifier -from autosklearn.pipeline.components.classification.random_forest import RandomForest +from autosklearn.pipeline.components.classification.extra_trees import ( + ExtraTreesClassifier, +) +from autosklearn.pipeline.components.classification.gradient_boosting import ( + GradientBoostingClassifier, +) from autosklearn.pipeline.components.classification.liblinear_svc import LibLinear_SVC from autosklearn.pipeline.components.classification.libsvm_svc import LibSVM_SVC +from autosklearn.pipeline.components.classification.passive_aggressive import ( + PassiveAggressive, +) +from autosklearn.pipeline.components.classification.random_forest import RandomForest from autosklearn.pipeline.components.classification.sgd import SGD -from autosklearn.pipeline.components.classification.gradient_boosting \ - import GradientBoostingClassifier -from autosklearn.pipeline.components.classification.passive_aggressive import PassiveAggressive -from autosklearn.pipeline.components.feature_preprocessing\ - .extra_trees_preproc_for_classification import ExtraTreesPreprocessorClassification -from autosklearn.pipeline.components.feature_preprocessing.liblinear_svc_preprocessor import \ - LibLinear_Preprocessor +from autosklearn.pipeline.components.data_preprocessing.balancing.balancing import ( + Balancing, +) +from autosklearn.pipeline.components.feature_preprocessing.extra_trees_preproc_for_classification import ( # noqa: E501 + ExtraTreesPreprocessorClassification, +) +from autosklearn.pipeline.components.feature_preprocessing.liblinear_svc_preprocessor import ( # noqa: E501 + LibLinear_Preprocessor, +) class BalancingComponentTest(unittest.TestCase): def test_balancing_get_weights_treed_single_label(self): Y = np.array([0] * 80 + [1] * 20) - balancing = Balancing(strategy='weighting') - init_params, 
fit_params = balancing.get_weights( - Y, 'adaboost', None, None, None) + balancing = Balancing(strategy="weighting") + init_params, fit_params = balancing.get_weights(Y, "adaboost", None, None, None) self.assertAlmostEqual( - np.mean(fit_params['classifier:sample_weight']), 1, + np.mean(fit_params["classifier:sample_weight"]), + 1, ) np.testing.assert_allclose( - fit_params['classifier:sample_weight'], + fit_params["classifier:sample_weight"], np.array([0.625] * 80 + [2.5] * 20), ) def test_balancing_get_weights_treed_multilabel(self): - Y = np.array([[0, 0, 0]] * 100 + [[1, 0, 0]] * 100 + [[0, 1, 0]] * 100 + - [[1, 1, 0]] * 100 + [[0, 0, 1]] * 100 + [[1, 0, 1]] * 10) - balancing = Balancing(strategy='weighting') - init_params, fit_params = balancing.get_weights( - Y, 'adaboost', None, None, None) - print(fit_params['classifier:sample_weight']) + Y = np.array( + [[0, 0, 0]] * 100 + + [[1, 0, 0]] * 100 + + [[0, 1, 0]] * 100 + + [[1, 1, 0]] * 100 + + [[0, 0, 1]] * 100 + + [[1, 0, 1]] * 10 + ) + balancing = Balancing(strategy="weighting") + init_params, fit_params = balancing.get_weights(Y, "adaboost", None, None, None) + print(fit_params["classifier:sample_weight"]) self.assertAlmostEqual( - np.mean(fit_params['classifier:sample_weight']), 1, + np.mean(fit_params["classifier:sample_weight"]), + 1, ) np.testing.assert_allclose( - fit_params['classifier:sample_weight'], + fit_params["classifier:sample_weight"], np.array([0.85] * 500 + [8.5] * 10), ) def test_balancing_get_weights_svm_sgd(self): Y = np.array([0] * 80 + [1] * 20) - balancing = Balancing(strategy='weighting') + balancing = Balancing(strategy="weighting") init_params, fit_params = balancing.get_weights( - Y, 'libsvm_svc', None, None, None) - self.assertEqual(("classifier:class_weight", "balanced"), - list(init_params.items())[0]) + Y, "libsvm_svc", None, None, None + ) + self.assertEqual( + ("classifier:class_weight", "balanced"), list(init_params.items())[0] + ) init_params, fit_params = balancing.get_weights( - Y, None, 'liblinear_svc_preprocessor', None, None) - self.assertEqual(("feature_preprocessor:class_weight", "balanced"), - list(init_params.items())[0]) + Y, None, "liblinear_svc_preprocessor", None, None + ) + self.assertEqual( + ("feature_preprocessor:class_weight", "balanced"), + list(init_params.items())[0], + ) def test_weighting_effect(self): data = sklearn.datasets.make_classification( - n_samples=200, n_features=10, n_redundant=2, n_informative=2, - n_repeated=2, n_clusters_per_class=2, weights=[0.8, 0.2], - random_state=1) + n_samples=200, + n_features=10, + n_redundant=2, + n_informative=2, + n_repeated=2, + n_clusters_per_class=2, + weights=[0.8, 0.2], + random_state=1, + ) for name, clf, acc_no_weighting, acc_weighting, places in [ - ('adaboost', AdaboostClassifier, 0.810, 0.735, 3), - ('decision_tree', DecisionTree, 0.780, 0.643, 3), - ('extra_trees', ExtraTreesClassifier, 0.78, 0.8, 3), - ('random_forest', RandomForest, 0.75, 0.789, 3), - ('libsvm_svc', LibSVM_SVC, 0.769, 0.72, 3), - ('liblinear_svc', LibLinear_SVC, 0.762, 0.735, 3), - ('passive_aggressive', PassiveAggressive, 0.16, 0.222, 3), - ('sgd', SGD, 0.818, 0.567, 2), - ('gradient_boosting', GradientBoostingClassifier, 0.666, 0.682, 2) - ]: + ("adaboost", AdaboostClassifier, 0.810, 0.735, 3), + ("decision_tree", DecisionTree, 0.780, 0.643, 3), + ("extra_trees", ExtraTreesClassifier, 0.78, 0.8, 3), + ("random_forest", RandomForest, 0.75, 0.789, 3), + ("libsvm_svc", LibSVM_SVC, 0.769, 0.72, 3), + ("liblinear_svc", LibLinear_SVC, 0.762, 0.735, 3), + 
("passive_aggressive", PassiveAggressive, 0.16, 0.222, 3), + ("sgd", SGD, 0.818, 0.567, 2), + ("gradient_boosting", GradientBoostingClassifier, 0.666, 0.682, 2), + ]: for strategy, acc in [ - ('none', acc_no_weighting), - ('weighting', acc_weighting) + ("none", acc_no_weighting), + ("weighting", acc_weighting), ]: # Fit data_ = copy.copy(data) @@ -98,23 +123,25 @@ def test_weighting_effect(self): model_args = { "random_state": 1, "include": { - 'classifier': [name], - 'feature_preprocessor': ['no_preprocessing'] - } + "classifier": [name], + "feature_preprocessor": ["no_preprocessing"], + }, } classifier = SimpleClassificationPipeline(**model_args) cs = classifier.get_hyperparameter_search_space() default = cs.get_default_configuration() - default._values['balancing:strategy'] = strategy + default._values["balancing:strategy"] = strategy classifier = SimpleClassificationPipeline(config=default, **model_args) classifier.fit(X_train, Y_train) predictions1 = classifier.predict(X_test) self.assertAlmostEqual( - sklearn.metrics.f1_score(predictions1, Y_test), acc, - places=places, msg=(name, strategy) + sklearn.metrics.f1_score(predictions1, Y_test), + acc, + places=places, + msg=(name, strategy), ) # fit_transformer and fit_estimator @@ -130,39 +157,53 @@ def test_weighting_effect(self): predictions2 = classifier.predict(X_test) np.testing.assert_allclose( - predictions1, predictions2, - err_msg=f"name = {name}, strategy = {strategy}" + predictions1, + predictions2, + err_msg=f"name = {name}, strategy = {strategy}", ) self.assertAlmostEqual( - sklearn.metrics.f1_score(predictions2, Y_test), acc, - places=places, msg=(name, strategy) + sklearn.metrics.f1_score(predictions2, Y_test), + acc, + places=places, + msg=(name, strategy), ) - for name, pre, acc_no_weighting, acc_weighting in \ - [('extra_trees_preproc_for_classification', - ExtraTreesPreprocessorClassification, 0.810, 0.590), - ('liblinear_svc_preprocessor', LibLinear_Preprocessor, - 0.837, 0.562)]: - for strategy, acc in [('none', acc_no_weighting), - ('weighting', acc_weighting)]: + for name, pre, acc_no_weighting, acc_weighting in [ + ( + "extra_trees_preproc_for_classification", + ExtraTreesPreprocessorClassification, + 0.810, + 0.590, + ), + ("liblinear_svc_preprocessor", LibLinear_Preprocessor, 0.837, 0.562), + ]: + for strategy, acc in [ + ("none", acc_no_weighting), + ("weighting", acc_weighting), + ]: data_ = copy.copy(data) X_train = data_[0][:100] Y_train = data_[1][:100] X_test = data_[0][100:] Y_test = data_[1][100:] - include = {'classifier': ['sgd'], 'feature_preprocessor': [name]} + include = {"classifier": ["sgd"], "feature_preprocessor": [name]} - classifier = SimpleClassificationPipeline(random_state=1, include=include) + classifier = SimpleClassificationPipeline( + random_state=1, include=include + ) cs = classifier.get_hyperparameter_search_space() default = cs.get_default_configuration() - default._values['balancing:strategy'] = strategy + default._values["balancing:strategy"] = strategy classifier.set_hyperparameters(default) predictor = classifier.fit(X_train, Y_train) predictions = predictor.predict(X_test) self.assertAlmostEqual( - sklearn.metrics.f1_score(predictions, Y_test), acc, - places=3, msg=(name, strategy)) + sklearn.metrics.f1_score(predictions, Y_test), + acc, + places=3, + msg=(name, strategy), + ) # fit_transformer and fit_estimator data_ = copy.copy(data) @@ -171,11 +212,13 @@ def test_weighting_effect(self): X_test = data_[0][100:] Y_test = data_[1][100:] - default._values['balancing:strategy'] = 
strategy - classifier = SimpleClassificationPipeline(default, random_state=1, include=include) + default._values["balancing:strategy"] = strategy + classifier = SimpleClassificationPipeline( + default, random_state=1, include=include + ) Xt, fit_params = classifier.fit_transformer(X_train, Y_train) classifier.fit_estimator(Xt, Y_train, **fit_params) predictions = classifier.predict(X_test) self.assertAlmostEqual( - sklearn.metrics.f1_score(predictions, Y_test), acc, - places=3) + sklearn.metrics.f1_score(predictions, Y_test), acc, places=3 + ) diff --git a/test/test_pipeline/components/data_preprocessing/test_categorical_imputation.py b/test/test_pipeline/components/data_preprocessing/test_categorical_imputation.py index 2767093179..d50e8cf842 100644 --- a/test/test_pipeline/components/data_preprocessing/test_categorical_imputation.py +++ b/test/test_pipeline/components/data_preprocessing/test_categorical_imputation.py @@ -1,11 +1,11 @@ import numpy as np -from scipy import sparse - import pandas as pd import pytest +from scipy import sparse -from autosklearn.pipeline.components.data_preprocessing.imputation.categorical_imputation\ - import CategoricalImputation +from autosklearn.pipeline.components.data_preprocessing.imputation.categorical_imputation import ( # noqa: E501 + CategoricalImputation, +) @pytest.fixture @@ -14,15 +14,15 @@ def input_data_imputation(request): X = np.array(np.random.randint(3, 10, size=size), dtype=float) mask = np.logical_not(np.random.randint(0, 5, size=size), dtype=bool) X[mask] = np.nan - if request.param == 'numpy': + if request.param == "numpy": pass - elif request.param == 'pandas': + elif request.param == "pandas": X = pd.DataFrame(X) return X, mask -@pytest.mark.parametrize('input_data_imputation', ('numpy', 'pandas'), indirect=True) -@pytest.mark.parametrize('categorical', (True, False)) +@pytest.mark.parametrize("input_data_imputation", ("numpy", "pandas"), indirect=True) +@pytest.mark.parametrize("categorical", (True, False)) def test_default_imputation(input_data_imputation, categorical): """ Makes sure that imputation works for both numerical and categorical data. @@ -30,8 +30,8 @@ def test_default_imputation(input_data_imputation, categorical): """ X, mask = input_data_imputation if categorical: - imputation_value = 'missing_value' - X = X.astype('str').astype('object') + imputation_value = "missing_value" + X = X.astype("str").astype("object") X[mask] = np.nan else: imputation_value = min(np.unique(X)) - 1 @@ -42,15 +42,15 @@ def test_default_imputation(input_data_imputation, categorical): assert np.array_equal(Y != imputation_value, ~mask) -@pytest.mark.parametrize('format_type', ('numpy', 'pandas')) +@pytest.mark.parametrize("format_type", ("numpy", "pandas")) def test_nonzero_numerical_imputation(format_type): # First try with an array with 0 as only valid category. 
The imputation should # happen with -1 X = np.full(fill_value=np.nan, shape=(10, 10)) X[0, :] = 0 - if 'pandas' in format_type: + if "pandas" in format_type: X = pd.DataFrame(X) - elif 'numpy' in format_type: + elif "numpy" in format_type: pass else: pytest.fail(format_type) @@ -61,13 +61,13 @@ def test_nonzero_numerical_imputation(format_type): X = np.full(fill_value=np.nan, shape=(10, 10)) X[0, :] = 0 X[1, :] = -1 - if 'pandas' in format_type: + if "pandas" in format_type: X = pd.DataFrame(X) Y = CategoricalImputation().fit_transform(X.copy()) np.testing.assert_equal(np.nan_to_num(X, nan=-2, copy=True), Y) -@pytest.mark.parametrize('input_data_imputation', ('numpy'), indirect=True) +@pytest.mark.parametrize("input_data_imputation", ("numpy"), indirect=True) def test_default_sparse(input_data_imputation): X, mask = input_data_imputation X = sparse.csr_matrix(X) diff --git a/test/test_pipeline/components/data_preprocessing/test_category_shift.py b/test/test_pipeline/components/data_preprocessing/test_category_shift.py index d49e6a84f0..ce637f50d4 100644 --- a/test/test_pipeline/components/data_preprocessing/test_category_shift.py +++ b/test/test_pipeline/components/data_preprocessing/test_category_shift.py @@ -1,19 +1,21 @@ import unittest + import numpy as np import scipy.sparse -from autosklearn.pipeline.components.data_preprocessing.category_shift.\ - category_shift import CategoryShift +from autosklearn.pipeline.components.data_preprocessing.category_shift.category_shift import ( # noqa: E501 + CategoryShift, +) class CategoryShiftTest(unittest.TestCase): - def test_data_type_consistency(self): X = np.random.randint(0, 255, (3, 4)) Y = CategoryShift().fit_transform(X) self.assertFalse(scipy.sparse.issparse(Y)) X = scipy.sparse.csc_matrix( - ([1, 2, 0, 4], ([0, 1, 2, 1], [3, 2, 1, 0])), shape=(3, 4)) + ([1, 2, 0, 4], ([0, 1, 2, 1], [3, 2, 1, 0])), shape=(3, 4) + ) Y = CategoryShift().fit_transform(X) self.assertTrue(scipy.sparse.issparse(Y)) diff --git a/test/test_pipeline/components/data_preprocessing/test_data_preprocessing.py b/test/test_pipeline/components/data_preprocessing/test_data_preprocessing.py index 5e6f89ad3a..ac8e9abbe2 100644 --- a/test/test_pipeline/components/data_preprocessing/test_data_preprocessing.py +++ b/test/test_pipeline/components/data_preprocessing/test_data_preprocessing.py @@ -1,13 +1,14 @@ import unittest + import numpy as np from scipy import sparse -from autosklearn.pipeline.components.data_preprocessing.feature_type \ - import FeatTypeSplit +from autosklearn.pipeline.components.data_preprocessing.feature_type import ( + FeatTypeSplit, +) class PreprocessingPipelineTest(unittest.TestCase): - def do_a_fit_transform(self, sparse_input): # X will be the input and Y is what we expect after transform. categ_feat stores # indicators of feature type (True if categorical, False if numerical) @@ -21,58 +22,57 @@ def do_a_fit_transform(self, sparse_input): # This feature should be normalized by having its mean subtracted from all # elements and by having them divided by the standard deviation. categ_feat.append(False) - nf = np.array([1., 2., 3.]).reshape(3, 1) # mean = 2. - sdev = np.sqrt(2. / 3.) - shift = 0 if sparse_input else 2. # if sparse_input, there is no mean subtraction + nf = np.array([1.0, 2.0, 3.0]).reshape(3, 1) # mean = 2. 
+ sdev = np.sqrt(2.0 / 3.0) + shift = ( + 0 if sparse_input else 2.0 + ) # if sparse_input, there is no mean subtraction nft = (nf - shift) / sdev X.append(nf) Y.append(nft) # Feature 3 (numerical): - # This feature has a missing value that should be imputed by the mean of the other - # values (2.). This feature should also be normalized as in the previous feature. + # This feature has a missing value that should be imputed by the mean of the + # other values (2.). + # This feature should also be normalized as in the previous feature. categ_feat.append(False) - X.append(np.array([1., np.nan, 3.]).reshape(3, 1)) + X.append(np.array([1.0, np.nan, 3.0]).reshape(3, 1)) Y.append(nft.copy()) # Feature 4 (categorical) # This feature should be one hot encoded. categ_feat.append(True) X.append(np.array([1, 3, 2]).reshape(3, 1)) - Y.append(np.array([ - [1, 0, 0], - [0, 0, 1], - [0, 1, 0]])) + Y.append(np.array([[1, 0, 0], [0, 0, 1], [0, 1, 0]])) # Feature 5 (categorical) # This feature should be one hot encoded. (A discontinuous category set or # a category 0 shouldn't be problems.) categ_feat.append(True) X.append(np.array([2, 1, 9]).reshape(3, 1)) - Y.append(np.array([ - [0, 1, 0], - [1, 0, 0], - [0, 0, 1]])) + Y.append(np.array([[0, 1, 0], [1, 0, 0], [0, 0, 1]])) # Feature 6 (categorical) # This feature should be one hot encoded. The missing value gets imputed as # a category on its own. categ_feat.append(True) X.append(np.array([1, 1, np.nan]).reshape(3, 1)) - Y.append(np.array([ - [0, 1], - [0, 1], - [1, 0]])) + Y.append(np.array([[0, 1], [0, 1], [1, 0]])) # Combine datasets and shuffle columns: n_feats = len(categ_feat) random_order = np.random.choice(np.arange(n_feats), size=n_feats, replace=False) # Shuffle X according to random_order X = np.array(X)[random_order] X_comb = np.hstack(X) - # Shuffle Y according to random_order and reorder it as the PreprocessingPipeline - # does (i.e. categorical features come first in Y). + # Shuffle Y according to random_order and reorder it as the + # PreprocessingPipeline does (i.e. categorical features come first in Y). 
- categ_feat = {i: 'categorical' if categ_feat[order] else 'numerical' - for i, order in enumerate(random_order)} - cat_to_left_order = [index for col, index in sorted( - [(col_type, i) for i, col_type in categ_feat.items()] - )] + categ_feat = { + i: "categorical" if categ_feat[order] else "numerical" + for i, order in enumerate(random_order) + } + cat_to_left_order = [ + index + for col, index in sorted( + [(col_type, i) for i, col_type in categ_feat.items()] + ) + ] # Sort so that Y Matches the random ordering Y = [Y[n] for n in random_order] # Then move the categorical columns to the left @@ -101,15 +101,21 @@ def test_fit_transform_sparse(self): def test_string_categories(self): # Numerical dataset (as used in NumericalPreprocessingPipelineTest) - X_num = np.array([ - [3.14, 1., 1.], # noqa : matrix legibility - [3.14, 2., np.nan], # noqa : matrix legibility - [3.14, 3., 3.]]) # noqa : matrix legibility + X_num = np.array( + [ + [3.14, 1.0, 1.0], # noqa : matrix legibility + [3.14, 2.0, np.nan], # noqa : matrix legibility + [3.14, 3.0, 3.0], + ] + ) # noqa : matrix legibility # Categorical string dataset - X_cat = np.array([ - ['red', 'medium', 'small'], - ['blue', 'short', 'big'], - ['white', 'tall', np.nan]]) + X_cat = np.array( + [ + ["red", "medium", "small"], + ["blue", "short", "big"], + ["white", "tall", np.nan], + ] + ) # Combined dataset with shuffled columns: X_comb = np.hstack((X_num, X_cat)) categ_feat = [False] * 3 + [True] * 3 @@ -118,6 +124,8 @@ def test_string_categories(self): categ_feat = [categ_feat[order] for order in random_order] # Strings are not allowed, therefore: with self.assertRaises(ValueError): - categ_feat = {i: 'categorical' if feat else 'numerical' - for i, feat in enumerate(categ_feat)} + categ_feat = { + i: "categorical" if feat else "numerical" + for i, feat in enumerate(categ_feat) + } FeatTypeSplit(feat_type=categ_feat).fit_transform(X_comb) diff --git a/test/test_pipeline/components/data_preprocessing/test_data_preprocessing_categorical.py b/test/test_pipeline/components/data_preprocessing/test_data_preprocessing_categorical.py index dbffe26f51..1d693eb150 100644 --- a/test/test_pipeline/components/data_preprocessing/test_data_preprocessing_categorical.py +++ b/test/test_pipeline/components/data_preprocessing/test_data_preprocessing_categorical.py @@ -1,34 +1,35 @@ import unittest -import numpy as np -from scipy import sparse +import numpy as np import pytest +from scipy import sparse -from autosklearn.pipeline.components.data_preprocessing.feature_type_categorical \ - import CategoricalPreprocessingPipeline +from autosklearn.pipeline.components.data_preprocessing.feature_type_categorical import ( # noqa: E501 + CategoricalPreprocessingPipeline, +) class CategoricalPreprocessingPipelineTest(unittest.TestCase): - def test_data_type_consistency(self): X = np.random.randint(3, 6, (3, 4)) Y = CategoricalPreprocessingPipeline().fit_transform(X) self.assertFalse(sparse.issparse(Y)) X = sparse.csc_matrix( - ([3, 6, 4, 5], ([0, 1, 2, 1], [3, 2, 1, 0])), shape=(3, 4)) + ([3, 6, 4, 5], ([0, 1, 2, 1], [3, 2, 1, 0])), shape=(3, 4) + ) Y = CategoricalPreprocessingPipeline().fit_transform(X) self.assertTrue(sparse.issparse(Y)) def test_fit_transform(self): - X = np.array([ - [1, 2, 1], - [3, 1, 1], - [2, 9, np.nan]]) - Y = np.array([ - [1, 0, 0, 0, 1, 0, 0, 1], - [0, 0, 1, 1, 0, 0, 0, 1], - [0, 1, 0, 0, 0, 1, 1, 0]]) + X = np.array([[1, 2, 1], [3, 1, 1], [2, 9, np.nan]]) + Y = np.array( + [ + [1, 0, 0, 0, 1, 0, 0, 1], + [0, 0, 1, 1, 0, 0, 0, 1], + [0, 1, 0, 0, 
0, 1, 1, 0], + ] + ) # dense input # Notice the X.copy() here as the imputation # is in place to save resources @@ -41,30 +42,30 @@ def test_fit_transform(self): np.testing.assert_array_equal(Yt, Y) def test_transform(self): - X1 = np.array([ - [1, 2, 0], - [3, 0, 0], - [2, 9, np.nan]]) - Y1 = np.array([ - [1, 0, 0, 0, 1, 0, 0, 1], - [0, 0, 1, 1, 0, 0, 0, 1], - [0, 1, 0, 0, 0, 1, 1, 0]]) - X2 = np.array([ - [2, 2, 1], - [3, 0, 0], - [2, np.nan, np.nan]]) - Y2 = np.array([ - [0, 1, 0, 0, 1, 0, 0, 0], - [0, 0, 1, 1, 0, 0, 0, 1], - [0, 1, 0, 0, 0, 0, 1, 0]]) - X3 = np.array([ - [3, np.nan, 0], - [3, 9, np.nan], - [2, 2, 5]]) - Y3 = np.array([ - [0, 0, 1, 0, 0, 0, 0, 1], - [0, 0, 1, 0, 0, 1, 1, 0], - [0, 1, 0, 0, 1, 0, 0, 0]]) + X1 = np.array([[1, 2, 0], [3, 0, 0], [2, 9, np.nan]]) + Y1 = np.array( + [ + [1, 0, 0, 0, 1, 0, 0, 1], + [0, 0, 1, 1, 0, 0, 0, 1], + [0, 1, 0, 0, 0, 1, 1, 0], + ] + ) + X2 = np.array([[2, 2, 1], [3, 0, 0], [2, np.nan, np.nan]]) + Y2 = np.array( + [ + [0, 1, 0, 0, 1, 0, 0, 0], + [0, 0, 1, 1, 0, 0, 0, 1], + [0, 1, 0, 0, 0, 0, 1, 0], + ] + ) + X3 = np.array([[3, np.nan, 0], [3, 9, np.nan], [2, 2, 5]]) + Y3 = np.array( + [ + [0, 0, 1, 0, 0, 0, 0, 1], + [0, 0, 1, 0, 0, 1, 1, 0], + [0, 1, 0, 0, 1, 0, 0, 0], + ] + ) # "fit" CPPL = CategoricalPreprocessingPipeline() CPPL.fit_transform(X1) @@ -81,13 +82,15 @@ def test_transform(self): def test_transform_with_coalescence(self): # Generates an array with categories 0, 20, 5, 6, 10, and occurences of 60%, # 30%, 19% 0.5% and 0.5% respectively - X = np.vstack(( - np.ones((120, 10)) * 0, - np.ones((60, 10)) * 20, - np.ones((18, 10)) * 5, - np.ones((1, 10)) * 6, - np.ones((1, 10)) * 10, - )) + X = np.vstack( + ( + np.ones((120, 10)) * 0, + np.ones((60, 10)) * 20, + np.ones((18, 10)) * 5, + np.ones((1, 10)) * 6, + np.ones((1, 10)) * 10, + ) + ) for col in range(X.shape[1]): np.random.shuffle(X[:, col]) @@ -100,10 +103,12 @@ def test_transform_with_coalescence(self): Y2t = CPPL.transform(X) np.testing.assert_array_equal(Y1t, Y2t) - @pytest.mark.xfail(reason=( - "Encoding step does not support sparse matrices to convert negative labels to" - " positive ones as it does with non-sparse matrices" - )) + @pytest.mark.xfail( + reason=( + "Encoding step does not support sparse matrices to convert negative labels" + " to positive ones as it does with non-sparse matrices" + ) + ) def test_transform_with_sparse_column_with_negative_labels(self): X = sparse.csr_matrix([[0], [-1]]) CategoricalPreprocessingPipeline().fit_transform(X) diff --git a/test/test_pipeline/components/data_preprocessing/test_data_preprocessing_numerical.py b/test/test_pipeline/components/data_preprocessing/test_data_preprocessing_numerical.py index 6a0b9d37fc..5a0a840501 100644 --- a/test/test_pipeline/components/data_preprocessing/test_data_preprocessing_numerical.py +++ b/test/test_pipeline/components/data_preprocessing/test_data_preprocessing_numerical.py @@ -1,69 +1,64 @@ import unittest -import numpy as np +import numpy as np from scipy import sparse -from autosklearn.pipeline.components.data_preprocessing.feature_type_numerical \ - import NumericalPreprocessingPipeline +from autosklearn.pipeline.components.data_preprocessing.feature_type_numerical import ( + NumericalPreprocessingPipeline, +) class NumericalPreprocessingPipelineTest(unittest.TestCase): - def test_data_type_consistency(self): X = np.random.rand(3, 4) Y = NumericalPreprocessingPipeline().fit_transform(X) self.assertFalse(sparse.issparse(Y)) X = sparse.csc_matrix( - ([3., 6., 4., 5.], ([0, 1, 2, 1], [3, 
2, 1, 0])), shape=(3, 4)) + ([3.0, 6.0, 4.0, 5.0], ([0, 1, 2, 1], [3, 2, 1, 0])), shape=(3, 4) + ) Y = NumericalPreprocessingPipeline().fit_transform(X) self.assertTrue(sparse.issparse(Y)) def test_fit_transform(self): - X = np.array([ - [3.14, 1., 1.], - [3.14, 2., np.nan], - [3.14, 3., 3.]]) # noqa : matrix legibility + X = np.array( + [[3.14, 1.0, 1.0], [3.14, 2.0, np.nan], [3.14, 3.0, 3.0]] + ) # noqa : matrix legibility # 1st column should be droped due to low variance # The 2nd should be be standardized (default rescaling algorithm) - # The 3rd will get a value imputed by the mean (2.), therefore the transformation - # here will have the same effect as on the the 2nd column + # The 3rd will get a value imputed by the mean (2.), therefore the + # transformation here will have the same effect as on the the 2nd column sdev = np.sqrt(2 / 3) - Y1 = np.array([ - [-1/sdev, -1/sdev], - [ 0., 0.], # noqa : matrix legibility - [ 1/sdev, 1/sdev]]) # noqa : matrix legibility + Y1 = np.array( + [ + [-1 / sdev, -1 / sdev], + [0.0, 0.0], # noqa : matrix legibility + [1 / sdev, 1 / sdev], + ] + ) # noqa : matrix legibility # dense input Yt = NumericalPreprocessingPipeline().fit_transform(X) np.testing.assert_array_almost_equal(Yt, Y1) # sparse input (uses with_mean=False) - Y2 = np.array([ - [1., 1.], - [2., 2.], - [3., 3.]]) / sdev + Y2 = np.array([[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]]) / sdev X_sparse = sparse.csc_matrix(X) Yt = NumericalPreprocessingPipeline().fit_transform(X_sparse) np.testing.assert_array_almost_equal(Yt.todense(), Y2) def test_transform(self): - X1 = np.array([ - [3.14, 1., 1.], - [3.14, 2., np.nan], - [3.14, 3., 3.]]) # noqa : matrix legibility + X1 = np.array( + [[3.14, 1.0, 1.0], [3.14, 2.0, np.nan], [3.14, 3.0, 3.0]] + ) # noqa : matrix legibility sdev = np.sqrt(2 / 3) # fit NPP = NumericalPreprocessingPipeline() NPP.fit_transform(X1) # transform - X2 = np.array([ - [1., 5., 8.], - [2., 6., 9.], - [3., 7., np.nan]]) + X2 = np.array([[1.0, 5.0, 8.0], [2.0, 6.0, 9.0], [3.0, 7.0, np.nan]]) Yt = NPP.transform(X2) # imputation, variance_threshold and rescaling are done using the data already # fitted, therefore: - Y2 = np.array([ - [3/sdev, 6/sdev], - [4/sdev, 7/sdev], - [5/sdev, 0.]]) # noqa : matrix legibility + Y2 = np.array( + [[3 / sdev, 6 / sdev], [4 / sdev, 7 / sdev], [5 / sdev, 0.0]] + ) # noqa : matrix legibility np.testing.assert_array_almost_equal(Yt, Y2) diff --git a/test/test_pipeline/components/data_preprocessing/test_data_preprocessing_text.py b/test/test_pipeline/components/data_preprocessing/test_data_preprocessing_text.py index 0e39c5d7e9..0a2e3d5188 100644 --- a/test/test_pipeline/components/data_preprocessing/test_data_preprocessing_text.py +++ b/test/test_pipeline/components/data_preprocessing/test_data_preprocessing_text.py @@ -1,33 +1,57 @@ import unittest + import numpy as np import pandas as pd -from autosklearn.pipeline.components.data_preprocessing.text_encoding.bag_of_word_encoding import \ - BagOfWordEncoder as BOW -from autosklearn.pipeline.components.data_preprocessing.\ - text_encoding.bag_of_word_encoding_distinct import BagOfWordEncoder as BOW_distinct +from autosklearn.pipeline.components.data_preprocessing.text_encoding.bag_of_word_encoding import ( # noqa: E501 + BagOfWordEncoder as BOW, +) +from autosklearn.pipeline.components.data_preprocessing.text_encoding.bag_of_word_encoding_distinct import ( # noqa: E501 + BagOfWordEncoder as BOW_distinct, +) class TextPreprocessingPipelineTest(unittest.TestCase): - def test_fit_transform(self): - X = 
pd.DataFrame({"col1": ["hello world", - "This is a test"], - "col2": ["hello mars", - "This is the second column"]}).astype({"col1": "string", - "col2": "string"}) - BOW_fitted = BOW(ngram_range=1, min_df_choice="min_df_absolute", min_df_absolute=0, - min_df_relative=0, random_state=1).fit(X.copy()) + X = pd.DataFrame( + { + "col1": ["hello world", "This is a test"], + "col2": ["hello mars", "This is the second column"], + } + ).astype({"col1": "string", "col2": "string"}) + BOW_fitted = BOW( + ngram_range=1, + min_df_choice="min_df_absolute", + min_df_absolute=0, + min_df_relative=0, + random_state=1, + ).fit(X.copy()) Yt = BOW_fitted.preprocessor.vocabulary_ - words = sorted(["hello", "world", "this", "is", "test", # "a" is not added, len(...)=1 - "mars", "the", "second", "column"]) # is ignored by CountVectorizer + words = sorted( + [ + "hello", + "world", + "this", + "is", + "test", # "a" is not added, len(...)=1 + "mars", + "the", + "second", + "column", + ] + ) # is ignored by CountVectorizer Y = {key: idx for idx, key in enumerate(words)} np.testing.assert_array_equal(Yt, Y) - BOW_fitted = BOW_distinct(ngram_range=1, min_df_choice="min_df_absolute", min_df_absolute=0, - min_df_relative=0, random_state=1).fit(X.copy()) + BOW_fitted = BOW_distinct( + ngram_range=1, + min_df_choice="min_df_absolute", + min_df_absolute=0, + min_df_relative=0, + random_state=1, + ).fit(X.copy()) for key in BOW_fitted.preprocessor: y = [] @@ -38,58 +62,89 @@ def test_fit_transform(self): np.testing.assert_array_equal(yt, y) def test_transform(self): - X = pd.DataFrame({"col1": ["hello world", - "this is a test"], - "col2": ["hello mars", - "this is the second column"]}).astype({"col1": "string", - "col2": "string"}) - X_t = BOW(ngram_range=1, min_df_choice="min_df_absolute", min_df_absolute=0, - min_df_relative=0, random_state=1).fit_transform(X.copy()) + X = pd.DataFrame( + { + "col1": ["hello world", "this is a test"], + "col2": ["hello mars", "this is the second column"], + } + ).astype({"col1": "string", "col2": "string"}) + X_t = BOW( + ngram_range=1, + min_df_choice="min_df_absolute", + min_df_absolute=0, + min_df_relative=0, + random_state=1, + ).fit_transform(X.copy()) # ['column', 'hello', 'is', 'mars', 'second', 'test', 'the', 'this', 'world'] - y = np.array([[0, 2, 0, 1, 0, 0, 0, 0, 1], - [1, 0, 2, 0, 1, 1, 1, 2, 0]]) + y = np.array([[0, 2, 0, 1, 0, 0, 0, 0, 1], [1, 0, 2, 0, 1, 1, 1, 2, 0]]) np.testing.assert_array_equal(X_t.toarray(), y) - X_t = BOW_distinct(ngram_range=1, min_df_choice="min_df_absolute", min_df_absolute=0, - min_df_relative=0, random_state=1).fit_transform(X.copy()) + X_t = BOW_distinct( + ngram_range=1, + min_df_choice="min_df_absolute", + min_df_absolute=0, + min_df_relative=0, + random_state=1, + ).fit_transform(X.copy()) # 'hello', 'is', 'test', 'this', 'world', # 'column', 'hello', 'is', 'mars', 'second', 'the', 'this' - y = np.array([[1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0], - [0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1]]) + y = np.array( + [[1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0], [0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1]] + ) np.testing.assert_array_equal(X_t.toarray(), y) def test_check_shape(self): - X = pd.DataFrame({"col1": ["hello world", - "this is test"], - "col2": ["test test", - "test test"]}).astype({"col1": "string", - "col2": "string"}) - X_t = BOW(ngram_range=1, min_df_choice="min_df_absolute", min_df_absolute=0, - min_df_relative=0, random_state=1).fit_transform(X.copy()) + X = pd.DataFrame( + { + "col1": ["hello world", "this is test"], + "col2": ["test test", "test 
test"], + } + ).astype({"col1": "string", "col2": "string"}) + X_t = BOW( + ngram_range=1, + min_df_choice="min_df_absolute", + min_df_absolute=0, + min_df_relative=0, + random_state=1, + ).fit_transform(X.copy()) self.assertEqual(X_t.shape, (2, 5)) - X_t = BOW_distinct(ngram_range=1, min_df_choice="min_df_absolute", min_df_absolute=0, - min_df_relative=0, random_state=1).fit_transform(X.copy()) + X_t = BOW_distinct( + ngram_range=1, + min_df_choice="min_df_absolute", + min_df_absolute=0, + min_df_relative=0, + random_state=1, + ).fit_transform(X.copy()) self.assertEqual(X_t.shape, (2, 6)) def test_check_nan(self): - X = pd.DataFrame({"col1": ["hello world", - "this is test", - None], - "col2": ["test test", - "test test", - "test"]}).astype({"col1": "string", - "col2": "string"}) - X_t = BOW(ngram_range=1, min_df_choice="min_df_absolute", min_df_absolute=0, - min_df_relative=0, random_state=1).fit_transform(X.copy()) + X = pd.DataFrame( + { + "col1": ["hello world", "this is test", None], + "col2": ["test test", "test test", "test"], + } + ).astype({"col1": "string", "col2": "string"}) + X_t = BOW( + ngram_range=1, + min_df_choice="min_df_absolute", + min_df_absolute=0, + min_df_relative=0, + random_state=1, + ).fit_transform(X.copy()) self.assertEqual(X_t.shape, (3, 5)) - X_t = BOW_distinct(ngram_range=1, min_df_choice="min_df_absolute", min_df_absolute=0, - min_df_relative=0, random_state=1).fit_transform(X.copy()) + X_t = BOW_distinct( + ngram_range=1, + min_df_choice="min_df_absolute", + min_df_absolute=0, + min_df_relative=0, + random_state=1, + ).fit_transform(X.copy()) self.assertEqual(X_t.shape, (3, 6)) diff --git a/test/test_pipeline/components/data_preprocessing/test_minority_coalescence.py b/test/test_pipeline/components/data_preprocessing/test_minority_coalescence.py index 7b3e35763e..8e73e963ab 100644 --- a/test/test_pipeline/components/data_preprocessing/test_minority_coalescence.py +++ b/test/test_pipeline/components/data_preprocessing/test_minority_coalescence.py @@ -1,23 +1,25 @@ import unittest -import numpy as np +import numpy as np import scipy.sparse -from autosklearn.pipeline.components.data_preprocessing.minority_coalescense\ - .minority_coalescer import MinorityCoalescer -from autosklearn.pipeline.components.data_preprocessing.minority_coalescense\ - .no_coalescense import NoCoalescence +from autosklearn.pipeline.components.data_preprocessing.minority_coalescense.minority_coalescer import ( # noqa: E501 + MinorityCoalescer, +) +from autosklearn.pipeline.components.data_preprocessing.minority_coalescense.no_coalescense import ( # noqa: E501 + NoCoalescence, +) class MinorityCoalescerTest(unittest.TestCase): - def test_data_type_consistency(self): X = np.random.randint(3, 6, (3, 4)) Y = MinorityCoalescer().fit_transform(X) self.assertFalse(scipy.sparse.issparse(Y)) X = scipy.sparse.csc_matrix( - ([3, 6, 4, 5], ([0, 1, 2, 1], [3, 2, 1, 0])), shape=(3, 4)) + ([3, 6, 4, 5], ([0, 1, 2, 1], [3, 2, 1, 0])), shape=(3, 4) + ) Y = MinorityCoalescer().fit_transform(X) self.assertTrue(scipy.sparse.issparse(Y)) diff --git a/test/test_pipeline/components/data_preprocessing/test_numerical_imputation.py b/test/test_pipeline/components/data_preprocessing/test_numerical_imputation.py index 35d9d23a6d..d3354c3730 100644 --- a/test/test_pipeline/components/data_preprocessing/test_numerical_imputation.py +++ b/test/test_pipeline/components/data_preprocessing/test_numerical_imputation.py @@ -1,8 +1,9 @@ from scipy import sparse -from 
autosklearn.pipeline.components.data_preprocessing.imputation.numerical_imputation\ - import NumericalImputation -from autosklearn.pipeline.util import _test_preprocessing, PreprocessingTestCase +from autosklearn.pipeline.components.data_preprocessing.imputation.numerical_imputation import ( # noqa: E501 + NumericalImputation, +) +from autosklearn.pipeline.util import PreprocessingTestCase, _test_preprocessing class NumericalImputationTest(PreprocessingTestCase): @@ -14,13 +15,13 @@ def test_default_configuration(self): self.assertTrue((transformation == original).all()) transformations.append(transformation) if len(transformations) > 1: - self.assertTrue( - (transformations[-1] == transformations[-2]).all()) + self.assertTrue((transformations[-1] == transformations[-2]).all()) def test_default_configuration_sparse_data(self): transformations = [] - transformation, original = _test_preprocessing(NumericalImputation, - make_sparse=True) + transformation, original = _test_preprocessing( + NumericalImputation, make_sparse=True + ) self.assertEqual(transformation.shape, original.shape) self.assertTrue((transformation.data == original.data).all()) self.assertIsInstance(transformation, sparse.csc_matrix) @@ -28,4 +29,5 @@ def test_default_configuration_sparse_data(self): def test_preprocessing_dtype(self): super(NumericalImputationTest, self)._test_preprocessing_dtype( - NumericalImputation, add_NaNs=True) + NumericalImputation, add_NaNs=True + ) diff --git a/test/test_pipeline/components/data_preprocessing/test_one_hot_encoding.py b/test/test_pipeline/components/data_preprocessing/test_one_hot_encoding.py index ba99724964..08d2cadd9e 100644 --- a/test/test_pipeline/components/data_preprocessing/test_one_hot_encoding.py +++ b/test/test_pipeline/components/data_preprocessing/test_one_hot_encoding.py @@ -3,10 +3,12 @@ import numpy as np from scipy import sparse -from autosklearn.pipeline.components.data_preprocessing.categorical_encoding.\ - one_hot_encoding import OneHotEncoder -from autosklearn.pipeline.components.data_preprocessing.categorical_encoding.\ - no_encoding import NoEncoding +from autosklearn.pipeline.components.data_preprocessing.categorical_encoding.no_encoding import ( # noqa: E501 + NoEncoding, +) +from autosklearn.pipeline.components.data_preprocessing.categorical_encoding.one_hot_encoding import ( # noqa: E501 + OneHotEncoder, +) from autosklearn.pipeline.util import _test_preprocessing @@ -18,7 +20,6 @@ def create_X(instances=1000, n_feats=10, categs_per_feat=5, seed=0): class OneHotEncoderTest(unittest.TestCase): - def setUp(self): self.X_train = create_X() @@ -28,7 +29,8 @@ def test_data_type_consistency(self): self.assertFalse(sparse.issparse(Y)) X = sparse.csc_matrix( - ([3, 6, 4, 5], ([0, 1, 2, 1], [3, 2, 1, 0])), shape=(3, 4)) + ([3, 6, 4, 5], ([0, 1, 2, 1], [3, 2, 1, 0])), shape=(3, 4) + ) Y = OneHotEncoder().fit_transform(X) self.assertTrue(sparse.issparse(Y)) @@ -54,8 +56,7 @@ def test_default_configuration_no_encoding(self): self.assertTrue((transformation == original).all()) transformations.append(transformation) if len(transformations) > 1: - self.assertTrue( - (transformations[-1] == transformations[-2]).all()) + self.assertTrue((transformations[-1] == transformations[-2]).all()) def test_default_configuration_sparse_data(self): transformations = [] @@ -74,17 +75,18 @@ def test_default_configuration_sparse_data(self): transformations.append(Xt) if len(transformations) > 1: self.assertEqual( - (transformations[-1] != transformations[-2]).count_nonzero(), 0) + 
(transformations[-1] != transformations[-2]).count_nonzero(), 0 + ) def test_default_configuration_sparse_no_encoding(self): transformations = [] for i in range(2): - transformation, original = _test_preprocessing(NoEncoding, - make_sparse=True) + transformation, original = _test_preprocessing(NoEncoding, make_sparse=True) self.assertEqual(transformation.shape, original.shape) self.assertTrue((transformation.todense() == original.todense()).all()) transformations.append(transformation) if len(transformations) > 1: self.assertEqual( - (transformations[-1] != transformations[-2]).count_nonzero(), 0) + (transformations[-1] != transformations[-2]).count_nonzero(), 0 + ) diff --git a/test/test_pipeline/components/data_preprocessing/test_scaling.py b/test/test_pipeline/components/data_preprocessing/test_scaling.py index f800930dda..7f8249e3f1 100644 --- a/test/test_pipeline/components/data_preprocessing/test_scaling.py +++ b/test/test_pipeline/components/data_preprocessing/test_scaling.py @@ -12,13 +12,14 @@ def _test_helper(self, Preprocessor, dataset=None, make_sparse=False): X_train, Y_train, X_test, Y_test = get_dataset( dataset=dataset, make_sparse=make_sparse, - ) + ) - dataset_properties = {'sparse': make_sparse} + dataset_properties = {"sparse": make_sparse} original_X_train = X_train.copy() - configuration_space = Preprocessor(dataset_properties).\ - get_hyperparameter_search_space(dataset_properties) + configuration_space = Preprocessor( + dataset_properties + ).get_hyperparameter_search_space(dataset_properties) default = configuration_space.get_default_configuration() preprocessor = Preprocessor(dataset_properties, random_state=1) @@ -28,31 +29,31 @@ def _test_helper(self, Preprocessor, dataset=None, make_sparse=False): return transformer.transform(X_train), original_X_train def test_boston_is_not_scaled(self): - data = sklearn.datasets.load_boston()['data'] + data = sklearn.datasets.load_boston()["data"] self.assertGreaterEqual(np.max(data), 100) def test_default_configuration(self): transformations = [] for i in range(2): - transformation, original = self._test_helper(RescalingChoice, - dataset='boston') + transformation, original = self._test_helper( + RescalingChoice, dataset="boston" + ) # The maximum is around 1.95 for the transformed array... 
self.assertAlmostEqual(np.mean(transformation), 0, places=5) self.assertAlmostEqual(np.std(transformation), 1, places=5) self.assertFalse((original == transformation).all()) transformations.append(transformation) if len(transformations) > 1: - self.assertTrue( - (transformations[-1] == transformations[-2]).all()) + self.assertTrue((transformations[-1] == transformations[-2]).all()) def test_default_configuration_with_sparse_data(self): - preprocessing = self._test_helper(RescalingChoice, dataset='boston', - make_sparse=True) + preprocessing = self._test_helper( + RescalingChoice, dataset="boston", make_sparse=True + ) transformation, original = preprocessing self.assertEqual(original.getnnz(), transformation.getnnz()) self.assertTrue(~np.allclose(original.data, transformation.data)) @unittest.skip("Does not work at the moment.") def test_preprocessing_dtype(self): - super(ScalingComponentTest, self)._test_helper( - RescalingChoice) + super(ScalingComponentTest, self)._test_helper(RescalingChoice) diff --git a/test/test_pipeline/components/data_preprocessing/test_variance_threshold.py b/test/test_pipeline/components/data_preprocessing/test_variance_threshold.py index 4da441828d..a9ba4083ca 100644 --- a/test/test_pipeline/components/data_preprocessing/test_variance_threshold.py +++ b/test/test_pipeline/components/data_preprocessing/test_variance_threshold.py @@ -1,8 +1,9 @@ from scipy import sparse -from autosklearn.pipeline.components.data_preprocessing.variance_threshold.variance_threshold \ - import VarianceThreshold -from autosklearn.pipeline.util import _test_preprocessing, PreprocessingTestCase +from autosklearn.pipeline.components.data_preprocessing.variance_threshold.variance_threshold import ( # noqa: E501 + VarianceThreshold, +) +from autosklearn.pipeline.util import PreprocessingTestCase, _test_preprocessing class VarianceThresholdTest(PreprocessingTestCase): @@ -14,13 +15,13 @@ def test_default_configuration(self): self.assertTrue((transformation == original).all()) transformations.append(transformation) if len(transformations) > 1: - self.assertTrue( - (transformations[-1] == transformations[-2]).all()) + self.assertTrue((transformations[-1] == transformations[-2]).all()) def test_default_configuration_sparse_data(self): transformations = [] - transformation, original = _test_preprocessing(VarianceThreshold, - make_sparse=True) + transformation, original = _test_preprocessing( + VarianceThreshold, make_sparse=True + ) self.assertEqual(transformation.shape, (100, 3)) self.assertTrue((transformation.toarray() == original.toarray()[:, 1:]).all()) self.assertIsInstance(transformation, sparse.csr_matrix) diff --git a/test/test_pipeline/components/dummy_components/dummy_component_1.py b/test/test_pipeline/components/dummy_components/dummy_component_1.py index 06074db983..0af3466787 100644 --- a/test/test_pipeline/components/dummy_components/dummy_component_1.py +++ b/test/test_pipeline/components/dummy_components/dummy_component_1.py @@ -5,7 +5,7 @@ # Add the parent directory to the path to import the parent component this_directory = os.path.dirname(os.path.abspath(__file__)) -parent_directory = os.path.abspath(os.path.join(this_directory, '..')) +parent_directory = os.path.abspath(os.path.join(this_directory, "..")) sys.path.append(parent_directory) diff --git a/test/test_pipeline/components/dummy_components/dummy_component_2.py b/test/test_pipeline/components/dummy_components/dummy_component_2.py index 9b67230e4c..f941dcdb40 100644 --- 
a/test/test_pipeline/components/dummy_components/dummy_component_2.py +++ b/test/test_pipeline/components/dummy_components/dummy_component_2.py @@ -6,7 +6,7 @@ # Add the parent directory to the path to import the parent component as # dummy_components.dummy_component_2.DummyComponent1 this_directory = os.path.dirname(os.path.abspath(__file__)) -parent_directory = os.path.abspath(os.path.join(this_directory, '..')) +parent_directory = os.path.abspath(os.path.join(this_directory, "..")) sys.path.append(parent_directory) diff --git a/test/test_pipeline/components/dummy_components/dummy_component_import.py b/test/test_pipeline/components/dummy_components/dummy_component_import.py index f7981a40a3..a4cb764215 100644 --- a/test/test_pipeline/components/dummy_components/dummy_component_import.py +++ b/test/test_pipeline/components/dummy_components/dummy_component_import.py @@ -1,2 +1,4 @@ -from autosklearn.pipeline.components.base import find_components # noqa -from autosklearn.pipeline.components.base import AutoSklearnClassificationAlgorithm # noqa +from autosklearn.pipeline.components.base import find_components # noqa +from autosklearn.pipeline.components.base import ( # noqa + AutoSklearnClassificationAlgorithm, +) diff --git a/test/test_pipeline/components/feature_preprocessing/__init__.py b/test/test_pipeline/components/feature_preprocessing/__init__.py index 8f0ce6cb7c..92bf78f389 100644 --- a/test/test_pipeline/components/feature_preprocessing/__init__.py +++ b/test/test_pipeline/components/feature_preprocessing/__init__.py @@ -1 +1 @@ -__author__ = 'feurerm' +__author__ = "feurerm" diff --git a/test/test_pipeline/components/feature_preprocessing/test_NoPreprocessing.py b/test/test_pipeline/components/feature_preprocessing/test_NoPreprocessing.py index 22811e75bb..440f2fd50d 100644 --- a/test/test_pipeline/components/feature_preprocessing/test_NoPreprocessing.py +++ b/test/test_pipeline/components/feature_preprocessing/test_NoPreprocessing.py @@ -1,7 +1,9 @@ import numpy as np -from autosklearn.pipeline.components.feature_preprocessing.no_preprocessing import NoPreprocessing -from autosklearn.pipeline.util import _test_preprocessing, PreprocessingTestCase +from autosklearn.pipeline.components.feature_preprocessing.no_preprocessing import ( + NoPreprocessing, +) +from autosklearn.pipeline.util import PreprocessingTestCase, _test_preprocessing class NoneComponentTest(PreprocessingTestCase): diff --git a/test/test_pipeline/components/feature_preprocessing/test_choice.py b/test/test_pipeline/components/feature_preprocessing/test_choice.py index 525ec38356..516cf318bf 100644 --- a/test/test_pipeline/components/feature_preprocessing/test_choice.py +++ b/test/test_pipeline/components/feature_preprocessing/test_choice.py @@ -6,27 +6,27 @@ class FeatureProcessingTest(unittest.TestCase): def test_get_available_components(self): # Target type - for target_type, num_values in [('classification', 15), - ('regression', 14)]: - data_properties = {'target_type': target_type} + for target_type, num_values in [("classification", 15), ("regression", 14)]: + data_properties = {"target_type": target_type} - available_components = fp.FeaturePreprocessorChoice(data_properties)\ - .get_available_components(data_properties) + available_components = fp.FeaturePreprocessorChoice( + data_properties + ).get_available_components(data_properties) self.assertEqual(len(available_components), num_values) # Multiclass - data_properties = {'target_type': 'classification', - 'multiclass': True} - available_components = 
fp.FeaturePreprocessorChoice(data_properties) \ - .get_available_components(data_properties) + data_properties = {"target_type": "classification", "multiclass": True} + available_components = fp.FeaturePreprocessorChoice( + data_properties + ).get_available_components(data_properties) self.assertEqual(len(available_components), 15) # Multilabel - data_properties = {'target_type': 'classification', - 'multilabel': True} - available_components = fp.FeaturePreprocessorChoice(data_properties) \ - .get_available_components(data_properties) + data_properties = {"target_type": "classification", "multilabel": True} + available_components = fp.FeaturePreprocessorChoice( + data_properties + ).get_available_components(data_properties) self.assertEqual(len(available_components), 12) diff --git a/test/test_pipeline/components/feature_preprocessing/test_densifier.py b/test/test_pipeline/components/feature_preprocessing/test_densifier.py index 6f02ee0e5b..9831a53e57 100644 --- a/test/test_pipeline/components/feature_preprocessing/test_densifier.py +++ b/test/test_pipeline/components/feature_preprocessing/test_densifier.py @@ -1,7 +1,7 @@ import numpy as np from autosklearn.pipeline.components.feature_preprocessing.densifier import Densifier -from autosklearn.pipeline.util import _test_preprocessing, PreprocessingTestCase +from autosklearn.pipeline.util import PreprocessingTestCase, _test_preprocessing class DensifierComponentTest(PreprocessingTestCase): diff --git a/test/test_pipeline/components/feature_preprocessing/test_extra_trees_classification.py b/test/test_pipeline/components/feature_preprocessing/test_extra_trees_classification.py index 6b69462fec..2db52679c7 100644 --- a/test/test_pipeline/components/feature_preprocessing/test_extra_trees_classification.py +++ b/test/test_pipeline/components/feature_preprocessing/test_extra_trees_classification.py @@ -1,29 +1,36 @@ -from sklearn.linear_model import RidgeClassifier -from autosklearn.pipeline.components.feature_preprocessing.\ - extra_trees_preproc_for_classification import \ - ExtraTreesPreprocessorClassification -from autosklearn.pipeline.util import _test_preprocessing, \ - PreprocessingTestCase, get_dataset import sklearn.metrics +from sklearn.linear_model import RidgeClassifier + +from autosklearn.pipeline.components.feature_preprocessing.extra_trees_preproc_for_classification import ( # noqa: E501 + ExtraTreesPreprocessorClassification, +) +from autosklearn.pipeline.util import ( + PreprocessingTestCase, + _test_preprocessing, + get_dataset, +) class ExtreTreesClassificationComponentTest(PreprocessingTestCase): def test_default_configuration(self): transformation, original = _test_preprocessing( - ExtraTreesPreprocessorClassification) + ExtraTreesPreprocessorClassification + ) self.assertEqual(transformation.shape[0], original.shape[0]) self.assertFalse((transformation == 0).all()) def test_default_configuration_classify(self): for i in range(2): - X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits', - make_sparse=False) - configuration_space = ExtraTreesPreprocessorClassification.\ - get_hyperparameter_search_space() + X_train, Y_train, X_test, Y_test = get_dataset( + dataset="digits", make_sparse=False + ) + configuration_space = ( + ExtraTreesPreprocessorClassification.get_hyperparameter_search_space() + ) default = configuration_space.get_default_configuration() preprocessor = ExtraTreesPreprocessorClassification( - random_state=1, - **{hp_name: default[hp_name] for hp_name in default}) + random_state=1, **{hp_name: 
default[hp_name] for hp_name in default} + ) preprocessor.fit(X_train, Y_train) X_train_trans = preprocessor.transform(X_train) X_test_trans = preprocessor.transform(X_test) @@ -37,14 +44,16 @@ def test_default_configuration_classify(self): def test_default_configuration_classify_sparse(self): for i in range(2): - X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits', - make_sparse=True) - configuration_space = ExtraTreesPreprocessorClassification.\ - get_hyperparameter_search_space() + X_train, Y_train, X_test, Y_test = get_dataset( + dataset="digits", make_sparse=True + ) + configuration_space = ( + ExtraTreesPreprocessorClassification.get_hyperparameter_search_space() + ) default = configuration_space.get_default_configuration() preprocessor = ExtraTreesPreprocessorClassification( - random_state=1, - **{hp_name: default[hp_name] for hp_name in default}) + random_state=1, **{hp_name: default[hp_name] for hp_name in default} + ) preprocessor.fit(X_train, Y_train) X_train_trans = preprocessor.transform(X_train) X_test_trans = preprocessor.transform(X_test) @@ -57,5 +66,6 @@ def test_default_configuration_classify_sparse(self): self.assertAlmostEqual(accuracy, 0.43715846994535518, places=2) def test_preprocessing_dtype(self): - super(ExtreTreesClassificationComponentTest, self).\ - _test_preprocessing_dtype(ExtraTreesPreprocessorClassification) + super(ExtreTreesClassificationComponentTest, self)._test_preprocessing_dtype( + ExtraTreesPreprocessorClassification + ) diff --git a/test/test_pipeline/components/feature_preprocessing/test_extra_trees_regression.py b/test/test_pipeline/components/feature_preprocessing/test_extra_trees_regression.py index b850d5aa99..cd6ae3dd21 100644 --- a/test/test_pipeline/components/feature_preprocessing/test_extra_trees_regression.py +++ b/test/test_pipeline/components/feature_preprocessing/test_extra_trees_regression.py @@ -1,29 +1,34 @@ -from sklearn.ensemble import ExtraTreesRegressor -from autosklearn.pipeline.components.feature_preprocessing.\ - extra_trees_preproc_for_regression import \ - ExtraTreesPreprocessorRegression -from autosklearn.pipeline.util import _test_preprocessing, \ - PreprocessingTestCase, get_dataset import sklearn.metrics +from sklearn.ensemble import ExtraTreesRegressor + +from autosklearn.pipeline.components.feature_preprocessing.extra_trees_preproc_for_regression import ( # noqa: E501 + ExtraTreesPreprocessorRegression, +) +from autosklearn.pipeline.util import ( + PreprocessingTestCase, + _test_preprocessing, + get_dataset, +) class ExtraTreesRegressionComponentTest(PreprocessingTestCase): def test_default_configuration(self): - transformation, original = _test_preprocessing( - ExtraTreesPreprocessorRegression) + transformation, original = _test_preprocessing(ExtraTreesPreprocessorRegression) self.assertEqual(transformation.shape[0], original.shape[0]) self.assertFalse((transformation == 0).all()) def test_default_configuration_regression(self): for i in range(2): - X_train, Y_train, X_test, Y_test = get_dataset(dataset='boston', - make_sparse=False) - configuration_space = ExtraTreesPreprocessorRegression.\ - get_hyperparameter_search_space() + X_train, Y_train, X_test, Y_test = get_dataset( + dataset="boston", make_sparse=False + ) + configuration_space = ( + ExtraTreesPreprocessorRegression.get_hyperparameter_search_space() + ) default = configuration_space.get_default_configuration() preprocessor = ExtraTreesPreprocessorRegression( - random_state=1, - **{hp_name: default[hp_name] for hp_name in default}) + 
random_state=1, **{hp_name: default[hp_name] for hp_name in default} + ) preprocessor.fit(X_train, Y_train) X_train_trans = preprocessor.transform(X_train) X_test_trans = preprocessor.transform(X_test) @@ -37,14 +42,16 @@ def test_default_configuration_regression(self): def test_default_configuration_classify_sparse(self): for i in range(2): - X_train, Y_train, X_test, Y_test = get_dataset(dataset='boston', - make_sparse=True) - configuration_space = ExtraTreesPreprocessorRegression.\ - get_hyperparameter_search_space() + X_train, Y_train, X_test, Y_test = get_dataset( + dataset="boston", make_sparse=True + ) + configuration_space = ( + ExtraTreesPreprocessorRegression.get_hyperparameter_search_space() + ) default = configuration_space.get_default_configuration() preprocessor = ExtraTreesPreprocessorRegression( - random_state=1, - **{hp_name: default[hp_name] for hp_name in default}) + random_state=1, **{hp_name: default[hp_name] for hp_name in default} + ) preprocessor.fit(X_train, Y_train) X_train_trans = preprocessor.transform(X_train) X_test_trans = preprocessor.transform(X_test) @@ -57,5 +64,6 @@ def test_default_configuration_classify_sparse(self): self.assertAlmostEqual(error, 55.69613978965742, places=2) def test_preprocessing_dtype(self): - super(ExtraTreesRegressionComponentTest, self).\ - _test_preprocessing_dtype(ExtraTreesPreprocessorRegression) + super(ExtraTreesRegressionComponentTest, self)._test_preprocessing_dtype( + ExtraTreesPreprocessorRegression + ) diff --git a/test/test_pipeline/components/feature_preprocessing/test_fast_ica.py b/test/test_pipeline/components/feature_preprocessing/test_fast_ica.py index ae22d65c54..a38097a60e 100644 --- a/test/test_pipeline/components/feature_preprocessing/test_fast_ica.py +++ b/test/test_pipeline/components/feature_preprocessing/test_fast_ica.py @@ -1,28 +1,30 @@ import unittest -from sklearn.linear_model import Ridge -from autosklearn.pipeline.components.feature_preprocessing.fast_ica import \ - FastICA -from autosklearn.pipeline.util import _test_preprocessing, PreprocessingTestCase, \ - get_dataset import sklearn.metrics +from sklearn.linear_model import Ridge + +from autosklearn.pipeline.components.feature_preprocessing.fast_ica import FastICA +from autosklearn.pipeline.util import ( + PreprocessingTestCase, + _test_preprocessing, + get_dataset, +) class FastICAComponentTest(PreprocessingTestCase): def test_default_configuration(self): - transformation, original = _test_preprocessing(FastICA, - dataset="diabetes") + transformation, original = _test_preprocessing(FastICA, dataset="diabetes") self.assertEqual(transformation.shape[0], original.shape[0]) self.assertFalse((transformation == 0).all()) def test_default_configuration_regression(self): for i in range(5): - X_train, Y_train, X_test, Y_test = get_dataset(dataset='diabetes') + X_train, Y_train, X_test, Y_test = get_dataset(dataset="diabetes") configuration_space = FastICA.get_hyperparameter_search_space() default = configuration_space.get_default_configuration() - preprocessor = FastICA(random_state=1, - **{hp_name: default[hp_name] for hp_name in - default}) + preprocessor = FastICA( + random_state=1, **{hp_name: default[hp_name] for hp_name in default} + ) preprocessor.fit(X_train, Y_train) X_train_trans = preprocessor.transform(X_train) X_test_trans = preprocessor.transform(X_test) @@ -36,5 +38,6 @@ def test_default_configuration_regression(self): @unittest.skip("Always returns float64") def test_preprocessing_dtype(self): - super(FastICAComponentTest, - 
self)._test_preprocessing_dtype(FastICA, dataset='diabetes') + super(FastICAComponentTest, self)._test_preprocessing_dtype( + FastICA, dataset="diabetes" + ) diff --git a/test/test_pipeline/components/feature_preprocessing/test_feature_agglomeration.py b/test/test_pipeline/components/feature_preprocessing/test_feature_agglomeration.py index 0cac9426d2..afccd79c31 100644 --- a/test/test_pipeline/components/feature_preprocessing/test_feature_agglomeration.py +++ b/test/test_pipeline/components/feature_preprocessing/test_feature_agglomeration.py @@ -1,9 +1,14 @@ -from sklearn.ensemble import RandomForestClassifier -from autosklearn.pipeline.components.feature_preprocessing.feature_agglomeration import \ - FeatureAgglomeration -from autosklearn.pipeline.util import _test_preprocessing, PreprocessingTestCase, \ - get_dataset import sklearn.metrics +from sklearn.ensemble import RandomForestClassifier + +from autosklearn.pipeline.components.feature_preprocessing.feature_agglomeration import ( # noqa: E501 + FeatureAgglomeration, +) +from autosklearn.pipeline.util import ( + PreprocessingTestCase, + _test_preprocessing, + get_dataset, +) class FeatureAgglomerationComponentTest(PreprocessingTestCase): @@ -14,13 +19,14 @@ def test_default_configuration(self): def test_default_configuration_classify(self): for i in range(3): - X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits', - make_sparse=False) + X_train, Y_train, X_test, Y_test = get_dataset( + dataset="digits", make_sparse=False + ) configuration_space = FeatureAgglomeration.get_hyperparameter_search_space() default = configuration_space.get_default_configuration() - preprocessor = FeatureAgglomeration(random_state=1, - **{hp_name: default[hp_name] for - hp_name in default}) + preprocessor = FeatureAgglomeration( + random_state=1, **{hp_name: default[hp_name] for hp_name in default} + ) preprocessor.fit(X_train, Y_train) X_train_trans = preprocessor.transform(X_train) X_test_trans = preprocessor.transform(X_test) @@ -33,6 +39,6 @@ def test_default_configuration_classify(self): self.assertAlmostEqual(accuracy, 0.8761384335154827) def test_preprocessing_dtype(self): - super(FeatureAgglomerationComponentTest, - self)._test_preprocessing_dtype(FeatureAgglomeration, - test_sparse=False) + super(FeatureAgglomerationComponentTest, self)._test_preprocessing_dtype( + FeatureAgglomeration, test_sparse=False + ) diff --git a/test/test_pipeline/components/feature_preprocessing/test_kernel_pca.py b/test/test_pipeline/components/feature_preprocessing/test_kernel_pca.py index 19b1368a49..2c5a8c865b 100644 --- a/test/test_pipeline/components/feature_preprocessing/test_kernel_pca.py +++ b/test/test_pipeline/components/feature_preprocessing/test_kernel_pca.py @@ -1,38 +1,46 @@ import unittest -from sklearn.linear_model import RidgeClassifier -from autosklearn.pipeline.components.feature_preprocessing.kernel_pca import \ - KernelPCA -from autosklearn.pipeline.util import _test_preprocessing, PreprocessingTestCase, \ - get_dataset import sklearn.metrics +from sklearn.linear_model import RidgeClassifier + +from autosklearn.pipeline.components.feature_preprocessing.kernel_pca import KernelPCA +from autosklearn.pipeline.util import ( + PreprocessingTestCase, + _test_preprocessing, + get_dataset, +) class KernelPCAComponentTest(PreprocessingTestCase): def test_default_configuration(self): - transformation, original = _test_preprocessing(KernelPCA, - dataset='digits', - train_size_maximum=2000) + transformation, original = _test_preprocessing( + KernelPCA, 
dataset="digits", train_size_maximum=2000 + ) self.assertEqual(transformation.shape[0], original.shape[0]) self.assertFalse((transformation == 0).all()) def test_default_configuration_sparse(self): - transformation, original = _test_preprocessing(KernelPCA, - make_sparse=True, - dataset='digits') + transformation, original = _test_preprocessing( + KernelPCA, make_sparse=True, dataset="digits" + ) self.assertEqual(transformation.shape[0], original.shape[0]) self.assertFalse((transformation == 0).all()) def test_default_configuration_classify(self): for i in range(5): - X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits', - make_sparse=False, - train_size_maximum=1000) + X_train, Y_train, X_test, Y_test = get_dataset( + dataset="digits", make_sparse=False, train_size_maximum=1000 + ) configuration_space = KernelPCA.get_hyperparameter_search_space() default = configuration_space.get_default_configuration() - preprocessor = KernelPCA(random_state=1, - **{hp_name: default[hp_name] for hp_name in - default if default[hp_name] is not None}) + preprocessor = KernelPCA( + random_state=1, + **{ + hp_name: default[hp_name] + for hp_name in default + if default[hp_name] is not None + }, + ) preprocessor.fit(X_train, Y_train) X_train_trans = preprocessor.transform(X_train) X_test_trans = preprocessor.transform(X_test) @@ -46,5 +54,4 @@ def test_default_configuration_classify(self): @unittest.skip("Always returns float64") def test_preprocessing_dtype(self): - super(KernelPCAComponentTest, - self)._test_preprocessing_dtype(KernelPCA) + super(KernelPCAComponentTest, self)._test_preprocessing_dtype(KernelPCA) diff --git a/test/test_pipeline/components/feature_preprocessing/test_kitchen_sinks.py b/test/test_pipeline/components/feature_preprocessing/test_kitchen_sinks.py index c94e6f9a55..16ef41198d 100644 --- a/test/test_pipeline/components/feature_preprocessing/test_kitchen_sinks.py +++ b/test/test_pipeline/components/feature_preprocessing/test_kitchen_sinks.py @@ -1,7 +1,9 @@ import unittest -from autosklearn.pipeline.components.feature_preprocessing.kitchen_sinks import RandomKitchenSinks -from autosklearn.pipeline.util import _test_preprocessing, PreprocessingTestCase +from autosklearn.pipeline.components.feature_preprocessing.kitchen_sinks import ( + RandomKitchenSinks, +) +from autosklearn.pipeline.util import PreprocessingTestCase, _test_preprocessing class KitchenSinkComponent(PreprocessingTestCase): @@ -13,5 +15,4 @@ def test_default_configuration(self): @unittest.skip("Right now, the RBFSampler returns a float64 array!") def test_preprocessing_dtype(self): - super(KitchenSinkComponent, - self)._test_preprocessing_dtype(RandomKitchenSinks) + super(KitchenSinkComponent, self)._test_preprocessing_dtype(RandomKitchenSinks) diff --git a/test/test_pipeline/components/feature_preprocessing/test_liblinear.py b/test/test_pipeline/components/feature_preprocessing/test_liblinear.py index 19b56b6eac..0195dfb701 100644 --- a/test/test_pipeline/components/feature_preprocessing/test_liblinear.py +++ b/test/test_pipeline/components/feature_preprocessing/test_liblinear.py @@ -1,15 +1,22 @@ -from sklearn.linear_model import RidgeClassifier -from autosklearn.pipeline.components.feature_preprocessing.liblinear_svc_preprocessor import \ - LibLinear_Preprocessor -from autosklearn.pipeline.util import _test_preprocessing, PreprocessingTestCase, \ - get_dataset import sklearn.metrics +from sklearn.linear_model import RidgeClassifier -from test.test_pipeline.ignored_warnings import ignore_warnings, 
feature_preprocessing_warnings +from autosklearn.pipeline.components.feature_preprocessing.liblinear_svc_preprocessor import ( # noqa: E501 + LibLinear_Preprocessor, +) +from autosklearn.pipeline.util import ( + PreprocessingTestCase, + _test_preprocessing, + get_dataset, +) +from test.test_pipeline.ignored_warnings import ( + feature_preprocessing_warnings, + ignore_warnings, +) -class LiblinearComponentTest(PreprocessingTestCase): +class LiblinearComponentTest(PreprocessingTestCase): def test_default_configuration(self): with ignore_warnings(feature_preprocessing_warnings): transformation, original = _test_preprocessing(LibLinear_Preprocessor) @@ -19,15 +26,21 @@ def test_default_configuration(self): def test_default_configuration_classify(self): for i in range(2): - X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits', - make_sparse=False) - configuration_space = LibLinear_Preprocessor.get_hyperparameter_search_space() + X_train, Y_train, X_test, Y_test = get_dataset( + dataset="digits", make_sparse=False + ) + configuration_space = ( + LibLinear_Preprocessor.get_hyperparameter_search_space() + ) default = configuration_space.get_default_configuration() - preprocessor = LibLinear_Preprocessor(random_state=1, - **{hp_name: default[hp_name] - for hp_name in - default if default[ - hp_name] is not None}) + preprocessor = LibLinear_Preprocessor( + random_state=1, + **{ + hp_name: default[hp_name] + for hp_name in default + if default[hp_name] is not None + }, + ) with ignore_warnings(feature_preprocessing_warnings): preprocessor.fit(X_train, Y_train) diff --git a/test/test_pipeline/components/feature_preprocessing/test_nystroem_sampler.py b/test/test_pipeline/components/feature_preprocessing/test_nystroem_sampler.py index b3db49ebca..d6244c362f 100644 --- a/test/test_pipeline/components/feature_preprocessing/test_nystroem_sampler.py +++ b/test/test_pipeline/components/feature_preprocessing/test_nystroem_sampler.py @@ -3,8 +3,9 @@ import numpy as np import sklearn.preprocessing -from autosklearn.pipeline.components.feature_preprocessing.nystroem_sampler import \ - Nystroem +from autosklearn.pipeline.components.feature_preprocessing.nystroem_sampler import ( + Nystroem, +) from autosklearn.pipeline.util import _test_preprocessing, get_dataset @@ -16,7 +17,7 @@ def test_default_configuration(self): self.assertFalse((transformation == 0).all()) # Custon preprocessing test to check if clipping to zero works - X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits') + X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits") original_X_train = X_train.copy() ss = sklearn.preprocessing.StandardScaler() X_train = ss.fit_transform(X_train) @@ -25,12 +26,15 @@ def test_default_configuration(self): preprocessor = Nystroem( random_state=1, - **{hp_name: default[hp_name] for hp_name in default if default[hp_name] is not None}, - ) + **{ + hp_name: default[hp_name] + for hp_name in default + if default[hp_name] is not None + }, + ) transformer = preprocessor.fit(X_train, Y_train) - transformation, original = transformer.transform( - X_train), original_X_train + transformation, original = transformer.transform(X_train), original_X_train self.assertEqual(transformation.shape[0], original.shape[0]) self.assertEqual(transformation.shape[1], 100) @@ -46,7 +50,7 @@ def _test_preprocessing_dtype(self): preprocessor = Nystroem( random_state=1, **{hp.hyperparameter.name: hp.value for hp in default.values.values()}, - ) + ) preprocessor.fit(X_train) Xt = preprocessor.transform(X_train) 
self.assertEqual(Xt.dtype, np.float32) @@ -59,7 +63,7 @@ def _test_preprocessing_dtype(self): preprocessor = Nystroem( random_state=1, **{hp.hyperparameter.name: hp.value for hp in default.values.values()}, - ) + ) preprocessor.fit(X_train, Y_train) Xt = preprocessor.transform(X_train) self.assertEqual(Xt.dtype, np.float64) @@ -73,7 +77,7 @@ def _test_preprocessing_dtype(self): preprocessor = Nystroem( random_state=1, **{hp.hyperparameter.name: hp.value for hp in default.values.values()}, - ) + ) preprocessor.fit(X_train) Xt = preprocessor.transform(X_train) self.assertEqual(Xt.dtype, np.float32) @@ -86,7 +90,7 @@ def _test_preprocessing_dtype(self): preprocessor = Nystroem( random_state=1, **{hp.hyperparameter.name: hp.value for hp in default.values.values()}, - ) + ) preprocessor.fit(X_train) Xt = preprocessor.transform(X_train) self.assertEqual(Xt.dtype, np.float64) diff --git a/test/test_pipeline/components/feature_preprocessing/test_pca.py b/test/test_pipeline/components/feature_preprocessing/test_pca.py index 02ab8bdd0e..b73da8aa64 100644 --- a/test/test_pipeline/components/feature_preprocessing/test_pca.py +++ b/test/test_pipeline/components/feature_preprocessing/test_pca.py @@ -1,7 +1,7 @@ import numpy as np from autosklearn.pipeline.components.feature_preprocessing.pca import PCA -from autosklearn.pipeline.util import _test_preprocessing, PreprocessingTestCase +from autosklearn.pipeline.util import PreprocessingTestCase, _test_preprocessing class PCAComponentTest(PreprocessingTestCase): @@ -13,9 +13,9 @@ def test_default_configuration(self): self.assertFalse((transformation == original).all()) transformations.append(transformation) if len(transformations) > 1: - np.testing.assert_allclose(transformations[-1], - transformations[-2], rtol=1e-4) + np.testing.assert_allclose( + transformations[-1], transformations[-2], rtol=1e-4 + ) def test_preprocessing_dtype(self): - super(PCAComponentTest, self)._test_preprocessing_dtype(PCA, - test_sparse=False) + super(PCAComponentTest, self)._test_preprocessing_dtype(PCA, test_sparse=False) diff --git a/test/test_pipeline/components/feature_preprocessing/test_polynomial.py b/test/test_pipeline/components/feature_preprocessing/test_polynomial.py index 28f84bc595..3c9e93a49c 100644 --- a/test/test_pipeline/components/feature_preprocessing/test_polynomial.py +++ b/test/test_pipeline/components/feature_preprocessing/test_polynomial.py @@ -1,9 +1,14 @@ -from sklearn.tree import DecisionTreeClassifier -from autosklearn.pipeline.components.feature_preprocessing.polynomial import \ - PolynomialFeatures -from autosklearn.pipeline.util import _test_preprocessing, PreprocessingTestCase, \ - get_dataset import sklearn.metrics +from sklearn.tree import DecisionTreeClassifier + +from autosklearn.pipeline.components.feature_preprocessing.polynomial import ( + PolynomialFeatures, +) +from autosklearn.pipeline.util import ( + PreprocessingTestCase, + _test_preprocessing, + get_dataset, +) class PolynomialFeaturesComponentTest(PreprocessingTestCase): @@ -14,13 +19,14 @@ def test_default_configuration(self): def test_default_configuration_classify(self): for i in range(2): - X_train, Y_train, X_test, Y_test = get_dataset(dataset='breast_cancer', - make_sparse=False) + X_train, Y_train, X_test, Y_test = get_dataset( + dataset="breast_cancer", make_sparse=False + ) configuration_space = PolynomialFeatures.get_hyperparameter_search_space() default = configuration_space.get_default_configuration() - preprocessor = PolynomialFeatures(random_state=1, - **{hp_name: 
default[hp_name] for - hp_name in default}) + preprocessor = PolynomialFeatures( + random_state=1, **{hp_name: default[hp_name] for hp_name in default} + ) preprocessor.fit(X_train, Y_train) X_train_trans = preprocessor.transform(X_train) X_test_trans = preprocessor.transform(X_test) @@ -34,13 +40,14 @@ def test_default_configuration_classify(self): def test_default_configuration_classify_sparse(self): for i in range(2): - X_train, Y_train, X_test, Y_test = get_dataset(dataset='breast_cancer', - make_sparse=True) + X_train, Y_train, X_test, Y_test = get_dataset( + dataset="breast_cancer", make_sparse=True + ) configuration_space = PolynomialFeatures.get_hyperparameter_search_space() default = configuration_space.get_default_configuration() - preprocessor = PolynomialFeatures(random_state=1, - **{hp_name: default[hp_name] for - hp_name in default}) + preprocessor = PolynomialFeatures( + random_state=1, **{hp_name: default[hp_name] for hp_name in default} + ) preprocessor.fit(X_train, Y_train) X_train_trans = preprocessor.transform(X_train) X_test_trans = preprocessor.transform(X_test) @@ -53,6 +60,6 @@ def test_default_configuration_classify_sparse(self): self.assertAlmostEqual(accuracy, 0.8544152744630071, places=2) def test_preprocessing_dtype(self): - super(PolynomialFeaturesComponentTest, - self)._test_preprocessing_dtype(PolynomialFeatures, - test_sparse=False) + super(PolynomialFeaturesComponentTest, self)._test_preprocessing_dtype( + PolynomialFeatures, test_sparse=False + ) diff --git a/test/test_pipeline/components/feature_preprocessing/test_random_trees_embedding.py b/test/test_pipeline/components/feature_preprocessing/test_random_trees_embedding.py index 681c319830..f84675dc1a 100644 --- a/test/test_pipeline/components/feature_preprocessing/test_random_trees_embedding.py +++ b/test/test_pipeline/components/feature_preprocessing/test_random_trees_embedding.py @@ -3,8 +3,9 @@ import numpy as np import scipy.sparse -from autosklearn.pipeline.components.feature_preprocessing.random_trees_embedding import \ - RandomTreesEmbedding +from autosklearn.pipeline.components.feature_preprocessing.random_trees_embedding import ( # noqa: E501 + RandomTreesEmbedding, +) from autosklearn.pipeline.util import _test_preprocessing, get_dataset @@ -26,10 +27,9 @@ def test_preprocessing_dtype(self): configuration_space = RandomTreesEmbedding.get_hyperparameter_search_space() default = configuration_space.get_default_configuration() - preprocessor = RandomTreesEmbedding(random_state=1, - **{hp_name: default[hp_name] for - hp_name in - default}) + preprocessor = RandomTreesEmbedding( + random_state=1, **{hp_name: default[hp_name] for hp_name in default} + ) preprocessor.fit(X_train) Xt = preprocessor.transform(X_train) @@ -40,10 +40,9 @@ def test_preprocessing_dtype(self): X_train = X_train.astype(np.float64) configuration_space = RandomTreesEmbedding.get_hyperparameter_search_space() default = configuration_space.get_default_configuration() - preprocessor = RandomTreesEmbedding(random_state=1, - **{hp_name: default[hp_name] for - hp_name in - default}) + preprocessor = RandomTreesEmbedding( + random_state=1, **{hp_name: default[hp_name] for hp_name in default} + ) preprocessor.fit(X_train, Y_train) Xt = preprocessor.transform(X_train) self.assertEqual(Xt.dtype, np.float64) diff --git a/test/test_pipeline/components/feature_preprocessing/test_select_percentile_classification.py b/test/test_pipeline/components/feature_preprocessing/test_select_percentile_classification.py index d7cde925b0..b177e4f4ba 
100644 --- a/test/test_pipeline/components/feature_preprocessing/test_select_percentile_classification.py +++ b/test/test_pipeline/components/feature_preprocessing/test_select_percentile_classification.py @@ -4,8 +4,9 @@ import scipy.sparse import sklearn.preprocessing -from autosklearn.pipeline.components.feature_preprocessing.select_percentile_classification \ - import SelectPercentileClassification +from autosklearn.pipeline.components.feature_preprocessing.select_percentile_classification import ( # noqa: E501 + SelectPercentileClassification, +) from autosklearn.pipeline.util import _test_preprocessing, get_dataset @@ -13,29 +14,35 @@ class SelectPercentileClassificationTest(unittest.TestCase): def test_default_configuration(self): transformation, original = _test_preprocessing(SelectPercentileClassification) self.assertEqual(transformation.shape[0], original.shape[0]) - self.assertEqual(transformation.shape[1], int(original.shape[1]/2)) + self.assertEqual(transformation.shape[1], int(original.shape[1] / 2)) self.assertFalse((transformation == 0).all()) transformation, original = _test_preprocessing( - SelectPercentileClassification, - make_sparse=True, - ) + SelectPercentileClassification, + make_sparse=True, + ) self.assertTrue(scipy.sparse.issparse(transformation)) self.assertEqual(transformation.shape[0], original.shape[0]) - self.assertEqual(transformation.shape[1], int(original.shape[1]/2)) + self.assertEqual(transformation.shape[1], int(original.shape[1] / 2)) # Custon preprocessing test to check if clipping to zero works - X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits') + X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits") original_X_train = X_train.copy() ss = sklearn.preprocessing.StandardScaler() X_train = ss.fit_transform(X_train) - configuration_space = SelectPercentileClassification.get_hyperparameter_search_space() + configuration_space = ( + SelectPercentileClassification.get_hyperparameter_search_space() + ) default = configuration_space.get_default_configuration() preprocessor = SelectPercentileClassification( - random_state=1, - **{hp_name: default[hp_name] for hp_name in default if default[hp_name] is not None}, - ) + random_state=1, + **{ + hp_name: default[hp_name] + for hp_name in default + if default[hp_name] is not None + }, + ) transformer = preprocessor.fit(X_train, Y_train) transformation, original = transformer.transform(X_train), original_X_train @@ -48,11 +55,13 @@ def test_preprocessing_dtype(self): X_train, Y_train, X_test, Y_test = get_dataset("iris") self.assertEqual(X_train.dtype, np.float32) - configuration_space = SelectPercentileClassification.get_hyperparameter_search_space() + configuration_space = ( + SelectPercentileClassification.get_hyperparameter_search_space() + ) default = configuration_space.get_default_configuration() - preprocessor = SelectPercentileClassification(random_state=1, - **{hp_name: default[hp_name] - for hp_name in default}) + preprocessor = SelectPercentileClassification( + random_state=1, **{hp_name: default[hp_name] for hp_name in default} + ) preprocessor.fit(X_train, Y_train) Xt = preprocessor.transform(X_train) self.assertEqual(Xt.dtype, np.float32) @@ -60,11 +69,13 @@ def test_preprocessing_dtype(self): # np.float64 X_train, Y_train, X_test, Y_test = get_dataset("iris") X_train = X_train.astype(np.float64) - configuration_space = SelectPercentileClassification.get_hyperparameter_search_space() + configuration_space = ( + SelectPercentileClassification.get_hyperparameter_search_space() + 
) default = configuration_space.get_default_configuration() - preprocessor = SelectPercentileClassification(random_state=1, - **{hp_name: default[hp_name] - for hp_name in default}) + preprocessor = SelectPercentileClassification( + random_state=1, **{hp_name: default[hp_name] for hp_name in default} + ) preprocessor.fit(X_train, Y_train) Xt = preprocessor.transform(X_train) self.assertEqual(Xt.dtype, np.float64) @@ -73,11 +84,13 @@ def test_preprocessing_dtype(self): # np.float32 X_train, Y_train, X_test, Y_test = get_dataset("iris", make_sparse=True) self.assertEqual(X_train.dtype, np.float32) - configuration_space = SelectPercentileClassification.get_hyperparameter_search_space() + configuration_space = ( + SelectPercentileClassification.get_hyperparameter_search_space() + ) default = configuration_space.get_default_configuration() - preprocessor = SelectPercentileClassification(random_state=1, - **{hp_name: default[hp_name] - for hp_name in default}) + preprocessor = SelectPercentileClassification( + random_state=1, **{hp_name: default[hp_name] for hp_name in default} + ) preprocessor.fit(X_train, Y_train) Xt = preprocessor.transform(X_train) self.assertEqual(Xt.dtype, np.float32) @@ -85,11 +98,13 @@ def test_preprocessing_dtype(self): # np.float64 X_train, Y_train, X_test, Y_test = get_dataset("iris", make_sparse=True) X_train = X_train.astype(np.float64) - configuration_space = SelectPercentileClassification.get_hyperparameter_search_space() + configuration_space = ( + SelectPercentileClassification.get_hyperparameter_search_space() + ) default = configuration_space.get_default_configuration() - preprocessor = SelectPercentileClassification(random_state=1, - **{hp_name: default[hp_name] - for hp_name in default}) + preprocessor = SelectPercentileClassification( + random_state=1, **{hp_name: default[hp_name] for hp_name in default} + ) preprocessor.fit(X_train, Y_train) Xt = preprocessor.transform(X_train) self.assertEqual(Xt.dtype, np.float64) diff --git a/test/test_pipeline/components/feature_preprocessing/test_select_percentile_regression.py b/test/test_pipeline/components/feature_preprocessing/test_select_percentile_regression.py index a76a15c5a3..0fd335fd83 100644 --- a/test/test_pipeline/components/feature_preprocessing/test_select_percentile_regression.py +++ b/test/test_pipeline/components/feature_preprocessing/test_select_percentile_regression.py @@ -2,8 +2,9 @@ import numpy as np -from autosklearn.pipeline.components.feature_preprocessing.select_percentile_regression \ - import SelectPercentileRegression +from autosklearn.pipeline.components.feature_preprocessing.select_percentile_regression import ( # noqa: E501 + SelectPercentileRegression, +) from autosklearn.pipeline.util import _test_preprocessing, get_dataset @@ -12,9 +13,9 @@ def test_default_configuration(self): transformation, original = _test_preprocessing( dataset="boston", Preprocessor=SelectPercentileRegression, - ) + ) self.assertEqual(transformation.shape[0], original.shape[0]) - self.assertEqual(transformation.shape[1], int(original.shape[1]/2)) + self.assertEqual(transformation.shape[1], int(original.shape[1] / 2)) self.assertFalse((transformation == 0).all()) def test_preprocessing_dtype(self): @@ -23,11 +24,13 @@ def test_preprocessing_dtype(self): X_train, Y_train, X_test, Y_test = get_dataset("iris") self.assertEqual(X_train.dtype, np.float32) - configuration_space = SelectPercentileRegression.get_hyperparameter_search_space() + configuration_space = ( + 
SelectPercentileRegression.get_hyperparameter_search_space() + ) default = configuration_space.get_default_configuration() - preprocessor = SelectPercentileRegression(random_state=1, - **{hp_name: default[hp_name] - for hp_name in default}) + preprocessor = SelectPercentileRegression( + random_state=1, **{hp_name: default[hp_name] for hp_name in default} + ) preprocessor.fit(X_train, Y_train) Xt = preprocessor.transform(X_train) self.assertEqual(Xt.dtype, np.float32) @@ -35,11 +38,13 @@ def test_preprocessing_dtype(self): # np.float64 X_train, Y_train, X_test, Y_test = get_dataset("iris") X_train = X_train.astype(np.float64) - configuration_space = SelectPercentileRegression.get_hyperparameter_search_space() + configuration_space = ( + SelectPercentileRegression.get_hyperparameter_search_space() + ) default = configuration_space.get_default_configuration() - preprocessor = SelectPercentileRegression(random_state=1, - **{hp_name: default[hp_name] - for hp_name in default}) + preprocessor = SelectPercentileRegression( + random_state=1, **{hp_name: default[hp_name] for hp_name in default} + ) preprocessor.fit(X_train, Y_train) Xt = preprocessor.transform(X_train) self.assertEqual(Xt.dtype, np.float64) diff --git a/test/test_pipeline/components/feature_preprocessing/test_select_rates_classification.py b/test/test_pipeline/components/feature_preprocessing/test_select_rates_classification.py index 2497b5174a..2d1c2aaf78 100644 --- a/test/test_pipeline/components/feature_preprocessing/test_select_rates_classification.py +++ b/test/test_pipeline/components/feature_preprocessing/test_select_rates_classification.py @@ -4,8 +4,9 @@ import scipy.sparse import sklearn.preprocessing -from autosklearn.pipeline.components.feature_preprocessing.select_rates_classification import \ - SelectClassificationRates +from autosklearn.pipeline.components.feature_preprocessing.select_rates_classification import ( # noqa: E501 + SelectClassificationRates, +) from autosklearn.pipeline.util import _test_preprocessing, get_dataset @@ -17,27 +18,33 @@ def test_default_configuration(self): self.assertFalse((transformation == 0).all()) transformation, original = _test_preprocessing( - SelectClassificationRates, make_sparse=True) + SelectClassificationRates, make_sparse=True + ) self.assertTrue(scipy.sparse.issparse(transformation)) self.assertEqual(transformation.shape[0], original.shape[0]) self.assertEqual(transformation.shape[1], int(original.shape[1] / 2)) # Custom preprocessing test to check if clipping to zero works - X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits') + X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits") original_X_train = X_train.copy() ss = sklearn.preprocessing.StandardScaler() X_train = ss.fit_transform(X_train) - configuration_space = SelectClassificationRates.get_hyperparameter_search_space() + configuration_space = ( + SelectClassificationRates.get_hyperparameter_search_space() + ) default = configuration_space.get_default_configuration() - preprocessor = SelectClassificationRates(random_state=1, - **{hp_name: default[hp_name] - for hp_name in default - if default[hp_name] is not None}) + preprocessor = SelectClassificationRates( + random_state=1, + **{ + hp_name: default[hp_name] + for hp_name in default + if default[hp_name] is not None + }, + ) transformer = preprocessor.fit(X_train, Y_train) - transformation, original = transformer.transform( - X_train), original_X_train + transformation, original = transformer.transform(X_train), original_X_train 
self.assertEqual(transformation.shape[0], original.shape[0]) # I don't know why it's 52 here and not 32 which would be half of the # number of features. Seems to be related to a runtime warning raised @@ -50,11 +57,13 @@ def test_preprocessing_dtype(self): X_train, Y_train, X_test, Y_test = get_dataset("iris") self.assertEqual(X_train.dtype, np.float32) - configuration_space = SelectClassificationRates.get_hyperparameter_search_space() + configuration_space = ( + SelectClassificationRates.get_hyperparameter_search_space() + ) default = configuration_space.get_default_configuration() - preprocessor = SelectClassificationRates(random_state=1, - **{hp_name: default[hp_name] for hp_name in - default}) + preprocessor = SelectClassificationRates( + random_state=1, **{hp_name: default[hp_name] for hp_name in default} + ) preprocessor.fit(X_train, Y_train) Xt = preprocessor.transform(X_train) self.assertEqual(Xt.dtype, np.float32) @@ -62,11 +71,13 @@ def test_preprocessing_dtype(self): # np.float64 X_train, Y_train, X_test, Y_test = get_dataset("iris") X_train = X_train.astype(np.float64) - configuration_space = SelectClassificationRates.get_hyperparameter_search_space() + configuration_space = ( + SelectClassificationRates.get_hyperparameter_search_space() + ) default = configuration_space.get_default_configuration() - preprocessor = SelectClassificationRates(random_state=1, - **{hp_name: default[hp_name] for hp_name in - default}) + preprocessor = SelectClassificationRates( + random_state=1, **{hp_name: default[hp_name] for hp_name in default} + ) preprocessor.fit(X_train, Y_train) Xt = preprocessor.transform(X_train) self.assertEqual(Xt.dtype, np.float64) @@ -75,11 +86,13 @@ def test_preprocessing_dtype(self): # np.float32 X_train, Y_train, X_test, Y_test = get_dataset("iris", make_sparse=True) self.assertEqual(X_train.dtype, np.float32) - configuration_space = SelectClassificationRates.get_hyperparameter_search_space() + configuration_space = ( + SelectClassificationRates.get_hyperparameter_search_space() + ) default = configuration_space.get_default_configuration() - preprocessor = SelectClassificationRates(random_state=1, - **{hp_name: default[hp_name] for hp_name in - default}) + preprocessor = SelectClassificationRates( + random_state=1, **{hp_name: default[hp_name] for hp_name in default} + ) preprocessor.fit(X_train, Y_train) Xt = preprocessor.transform(X_train) self.assertEqual(Xt.dtype, np.float32) @@ -87,11 +100,13 @@ def test_preprocessing_dtype(self): # np.float64 X_train, Y_train, X_test, Y_test = get_dataset("iris", make_sparse=True) X_train = X_train.astype(np.float64) - configuration_space = SelectClassificationRates.get_hyperparameter_search_space() + configuration_space = ( + SelectClassificationRates.get_hyperparameter_search_space() + ) default = configuration_space.get_default_configuration() - preprocessor = SelectClassificationRates(random_state=1, - **{hp_name: default[hp_name] for hp_name in - default}) + preprocessor = SelectClassificationRates( + random_state=1, **{hp_name: default[hp_name] for hp_name in default} + ) preprocessor.fit(X_train, Y_train) Xt = preprocessor.transform(X_train) self.assertEqual(Xt.dtype, np.float64) diff --git a/test/test_pipeline/components/feature_preprocessing/test_select_rates_regression.py b/test/test_pipeline/components/feature_preprocessing/test_select_rates_regression.py index 573bab32ce..869d7fbee2 100644 --- a/test/test_pipeline/components/feature_preprocessing/test_select_rates_regression.py +++ 
b/test/test_pipeline/components/feature_preprocessing/test_select_rates_regression.py @@ -4,8 +4,9 @@ import scipy.sparse import sklearn.preprocessing -from autosklearn.pipeline.components.feature_preprocessing.select_rates_regression import \ - SelectRegressionRates +from autosklearn.pipeline.components.feature_preprocessing.select_rates_regression import ( # noqa: E501 + SelectRegressionRates, +) from autosklearn.pipeline.util import _test_preprocessing, get_dataset @@ -17,34 +18,38 @@ def test_default_configuration(self): self.assertFalse((transformation == 0).all()) transformation, original = _test_preprocessing( - SelectRegressionRates, make_sparse=True) + SelectRegressionRates, make_sparse=True + ) self.assertTrue(scipy.sparse.issparse(transformation)) self.assertEqual(transformation.shape[0], original.shape[0]) self.assertEqual(transformation.shape[1], int(original.shape[1] / 2)) # Makes sure that the features are reduced, not the number of samples - X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits') + X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits") original_X_train = X_train.copy() ss = sklearn.preprocessing.StandardScaler() X_train = ss.fit_transform(X_train) configuration_space = SelectRegressionRates.get_hyperparameter_search_space() default = configuration_space.get_default_configuration() - preprocessor = SelectRegressionRates(random_state=1, - **{hp_name: default[hp_name] - for hp_name in default - if default[hp_name] is not None}) + preprocessor = SelectRegressionRates( + random_state=1, + **{ + hp_name: default[hp_name] + for hp_name in default + if default[hp_name] is not None + }, + ) transformer = preprocessor.fit(X_train, Y_train) - transformation, original = transformer.transform( - X_train), original_X_train + transformation, original = transformer.transform(X_train), original_X_train self.assertEqual(transformation.shape[0], original.shape[0]) self.assertEqual(transformation.shape[1], 21) def test_default_configuration_regression(self): transformation, original = _test_preprocessing( SelectRegressionRates, - dataset='boston', + dataset="boston", ) self.assertEqual(transformation.shape[0], original.shape[0]) # From 13 to 12 features @@ -57,15 +62,15 @@ def test_preprocessing_dtype_regression(self): X_train, Y_train, X_test, Y_test = get_dataset("boston") self.assertEqual(X_train.dtype, np.float32) - dataset_properties = {'target_type': 'regression'} + dataset_properties = {"target_type": "regression"} configuration_space = SelectRegressionRates.get_hyperparameter_search_space( dataset_properties ) default = configuration_space.get_default_configuration() - preprocessor = SelectRegressionRates(random_state=1, - **{hp_name: default[hp_name] for hp_name in - default}) + preprocessor = SelectRegressionRates( + random_state=1, **{hp_name: default[hp_name] for hp_name in default} + ) preprocessor.fit(X_train, Y_train) Xt = preprocessor.transform(X_train) self.assertEqual(Xt.dtype, np.float32) @@ -77,9 +82,9 @@ def test_preprocessing_dtype_regression(self): dataset_properties ) default = configuration_space.get_default_configuration() - preprocessor = SelectRegressionRates(random_state=1, - **{hp_name: default[hp_name] for hp_name in - default}) + preprocessor = SelectRegressionRates( + random_state=1, **{hp_name: default[hp_name] for hp_name in default} + ) preprocessor.fit(X_train, Y_train) Xt = preprocessor.transform(X_train) self.assertEqual(Xt.dtype, np.float64) diff --git 
a/test/test_pipeline/components/feature_preprocessing/test_truncatedSVD.py b/test/test_pipeline/components/feature_preprocessing/test_truncatedSVD.py index df1f1d2fe6..7e16fa7fa5 100644 --- a/test/test_pipeline/components/feature_preprocessing/test_truncatedSVD.py +++ b/test/test_pipeline/components/feature_preprocessing/test_truncatedSVD.py @@ -1,11 +1,16 @@ import unittest -from sklearn.linear_model import RidgeClassifier -from autosklearn.pipeline.components.feature_preprocessing.truncatedSVD import \ - TruncatedSVD -from autosklearn.pipeline.util import _test_preprocessing, PreprocessingTestCase, \ - get_dataset import sklearn.metrics +from sklearn.linear_model import RidgeClassifier + +from autosklearn.pipeline.components.feature_preprocessing.truncatedSVD import ( + TruncatedSVD, +) +from autosklearn.pipeline.util import ( + PreprocessingTestCase, + _test_preprocessing, + get_dataset, +) class TruncatedSVDComponentTest(PreprocessingTestCase): @@ -16,14 +21,19 @@ def test_default_configuration(self): def test_default_configuration_classify(self): for i in range(2): - X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits', - make_sparse=True) + X_train, Y_train, X_test, Y_test = get_dataset( + dataset="digits", make_sparse=True + ) configuration_space = TruncatedSVD.get_hyperparameter_search_space() default = configuration_space.get_default_configuration() preprocessor = TruncatedSVD( random_state=1, - **{hp_name: default[hp_name] for hp_name in default if default[hp_name] is not None} - ) + **{ + hp_name: default[hp_name] + for hp_name in default + if default[hp_name] is not None + }, + ) preprocessor.fit(X_train, Y_train) X_train_trans = preprocessor.transform(X_train) X_test_trans = preprocessor.transform(X_test) @@ -37,6 +47,6 @@ def test_default_configuration_classify(self): @unittest.skip("Truncated SVD returns np.float64.") def test_preprocessing_dtype(self): - super(TruncatedSVDComponentTest, - self)._test_preprocessing_dtype(TruncatedSVD, - test_sparse=False) + super(TruncatedSVDComponentTest, self)._test_preprocessing_dtype( + TruncatedSVD, test_sparse=False + ) diff --git a/test/test_pipeline/components/regression/test_adaboost.py b/test/test_pipeline/components/regression/test_adaboost.py index c7f199d5ee..b62df4fd9b 100644 --- a/test/test_pipeline/components/regression/test_adaboost.py +++ b/test/test_pipeline/components/regression/test_adaboost.py @@ -1,7 +1,7 @@ import sklearn.ensemble -from autosklearn.pipeline.components.regression.adaboost import \ - AdaboostRegressor +from autosklearn.pipeline.components.regression.adaboost import AdaboostRegressor + from .test_base import BaseRegressionComponentTest diff --git a/test/test_pipeline/components/regression/test_ard_regression.py b/test/test_pipeline/components/regression/test_ard_regression.py index dac8d61349..829bf9b507 100644 --- a/test/test_pipeline/components/regression/test_ard_regression.py +++ b/test/test_pipeline/components/regression/test_ard_regression.py @@ -1,7 +1,7 @@ import sklearn.linear_model -from autosklearn.pipeline.components.regression.ard_regression import \ - ARDRegression +from autosklearn.pipeline.components.regression.ard_regression import ARDRegression + from .test_base import BaseRegressionComponentTest diff --git a/test/test_pipeline/components/regression/test_base.py b/test/test_pipeline/components/regression/test_base.py index 8ffc1d23fe..dcf7770332 100644 --- a/test/test_pipeline/components/regression/test_base.py +++ b/test/test_pipeline/components/regression/test_base.py @@ 
-1,19 +1,17 @@ -from typing import Type, Container +from typing import Container, Type import unittest -import pytest - import numpy as np +import pytest import sklearn.metrics -from autosklearn.pipeline.util import _test_regressor, _test_regressor_iterative_fit -from autosklearn.pipeline.constants import SPARSE +from autosklearn.pipeline.components.regression import RegressorChoice, _regressors from autosklearn.pipeline.components.regression.libsvm_svr import LibSVM_SVR +from autosklearn.pipeline.constants import SPARSE +from autosklearn.pipeline.util import _test_regressor, _test_regressor_iterative_fit -from autosklearn.pipeline.components.regression import _regressors, RegressorChoice - -from test.test_pipeline.ignored_warnings import regressor_warnings, ignore_warnings +from test.test_pipeline.ignored_warnings import ignore_warnings, regressor_warnings class BaseRegressionComponentTest(unittest.TestCase): @@ -37,8 +35,7 @@ def test_default_boston(self): with ignore_warnings(regressor_warnings): predictions, targets, n_calls = _test_regressor( - dataset="boston", - Regressor=self.module + dataset="boston", Regressor=self.module ) score = sklearn.metrics.r2_score(y_true=targets, y_pred=predictions) @@ -70,14 +67,13 @@ def test_default_boston_iterative_fit(self): if self.__class__ == BaseRegressionComponentTest: return - if not hasattr(self.module, 'iterative_fit'): + if not hasattr(self.module, "iterative_fit"): return for i in range(2): with ignore_warnings(regressor_warnings): predictions, targets, regressor = _test_regressor_iterative_fit( - dataset="boston", - Regressor=self.module + dataset="boston", Regressor=self.module ) score = sklearn.metrics.r2_score(targets, predictions) @@ -92,8 +88,8 @@ def test_default_boston_iterative_fit(self): self.assertAlmostEqual(fixture, score, places) if self.step_hyperparameter is not None: - param_name = self.step_hyperparameter['name'] - default = self.step_hyperparameter['value'] + param_name = self.step_hyperparameter["name"] + default = self.step_hyperparameter["value"] value = getattr(regressor.estimator, param_name) expected = self.res.get("boston_iterative_n_iter", default) @@ -110,7 +106,7 @@ def test_default_boston_iterative_sparse_fit(self): if self.__class__ == BaseRegressionComponentTest: return - if not hasattr(self.module, 'iterative_fit'): + if not hasattr(self.module, "iterative_fit"): return if SPARSE not in self.module.get_properties()["input"]: @@ -119,15 +115,13 @@ def test_default_boston_iterative_sparse_fit(self): for i in range(2): with ignore_warnings(regressor_warnings): predictions, targets, _ = _test_regressor_iterative_fit( - dataset="boston", - Regressor=self.module, - sparse=True + dataset="boston", Regressor=self.module, sparse=True ) - self.assertAlmostEqual(self.res["default_boston_iterative_sparse"], - sklearn.metrics.r2_score(targets, - predictions), - places=self.res.get( - "default_boston_iterative_sparse_places", 7)) + self.assertAlmostEqual( + self.res["default_boston_iterative_sparse"], + sklearn.metrics.r2_score(targets, predictions), + places=self.res.get("default_boston_iterative_sparse_places", 7), + ) def test_default_boston_sparse(self): @@ -140,16 +134,14 @@ def test_default_boston_sparse(self): for i in range(2): with ignore_warnings(regressor_warnings): predictions, targets, _ = _test_regressor( - dataset="boston", - Regressor=self.module, - sparse=True + dataset="boston", Regressor=self.module, sparse=True ) - self.assertAlmostEqual(self.res["default_boston_sparse"], - 
sklearn.metrics.r2_score(targets, - predictions), - places=self.res.get( - "default_boston_sparse_places", 7)) + self.assertAlmostEqual( + self.res["default_boston_sparse"], + sklearn.metrics.r2_score(targets, predictions), + places=self.res.get("default_boston_sparse_places", 7), + ) def test_default_diabetes(self): @@ -159,15 +151,14 @@ def test_default_diabetes(self): for i in range(2): with ignore_warnings(regressor_warnings): predictions, targets, n_calls = _test_regressor( - dataset="diabetes", - Regressor=self.module + dataset="diabetes", Regressor=self.module ) - self.assertAlmostEqual(self.res["default_diabetes"], - sklearn.metrics.r2_score(targets, - predictions), - places=self.res.get( - "default_diabetes_places", 7)) + self.assertAlmostEqual( + self.res["default_diabetes"], + sklearn.metrics.r2_score(targets, predictions), + places=self.res.get("default_diabetes_places", 7), + ) if self.res.get("diabetes_n_calls"): self.assertEqual(self.res["diabetes_n_calls"], n_calls) @@ -177,28 +168,27 @@ def test_default_diabetes_iterative_fit(self): if self.__class__ == BaseRegressionComponentTest: return - if not hasattr(self.module, 'iterative_fit'): + if not hasattr(self.module, "iterative_fit"): return for i in range(2): with ignore_warnings(regressor_warnings): predictions, targets, _ = _test_regressor_iterative_fit( - dataset="diabetes", - Regressor=self.module + dataset="diabetes", Regressor=self.module ) - self.assertAlmostEqual(self.res["default_diabetes_iterative"], - sklearn.metrics.r2_score(targets, - predictions), - places=self.res.get( - "default_diabetes_iterative_places", 7)) + self.assertAlmostEqual( + self.res["default_diabetes_iterative"], + sklearn.metrics.r2_score(targets, predictions), + places=self.res.get("default_diabetes_iterative_places", 7), + ) def test_default_diabetes_iterative_sparse_fit(self): if self.__class__ == BaseRegressionComponentTest: return - if not hasattr(self.module, 'iterative_fit'): + if not hasattr(self.module, "iterative_fit"): return if SPARSE not in self.module.get_properties()["input"]: @@ -207,21 +197,21 @@ def test_default_diabetes_iterative_sparse_fit(self): for i in range(2): with ignore_warnings(regressor_warnings): predictions, targets, regressor = _test_regressor_iterative_fit( - dataset="diabetes", - Regressor=self.module, - sparse=True + dataset="diabetes", Regressor=self.module, sparse=True ) - self.assertAlmostEqual(self.res["default_diabetes_iterative_sparse"], - sklearn.metrics.r2_score(targets, - predictions), - places=self.res.get( - "default_diabetes_iterative_sparse_places", 7)) + self.assertAlmostEqual( + self.res["default_diabetes_iterative_sparse"], + sklearn.metrics.r2_score(targets, predictions), + places=self.res.get("default_diabetes_iterative_sparse_places", 7), + ) if self.step_hyperparameter is not None: self.assertEqual( - getattr(regressor.estimator, self.step_hyperparameter['name']), - self.res.get("diabetes_iterative_n_iter", self.step_hyperparameter['value']) + getattr(regressor.estimator, self.step_hyperparameter["name"]), + self.res.get( + "diabetes_iterative_n_iter", self.step_hyperparameter["value"] + ), ) def test_default_diabetes_sparse(self): @@ -235,47 +225,73 @@ def test_default_diabetes_sparse(self): for i in range(2): with ignore_warnings(regressor_warnings): predictions, targets, _ = _test_regressor( - dataset="diabetes", - Regressor=self.module, - sparse=True + dataset="diabetes", Regressor=self.module, sparse=True ) - self.assertAlmostEqual(self.res["default_diabetes_sparse"], - 
sklearn.metrics.r2_score(targets, - predictions), - places=self.res.get( - "default_diabetes_sparse_places", 7)) + self.assertAlmostEqual( + self.res["default_diabetes_sparse"], + sklearn.metrics.r2_score(targets, predictions), + places=self.res.get("default_diabetes_sparse_places", 7), + ) def test_module_idempotent(self): - """ Fitting twice with the same config gives the same model params. + """Fitting twice with the same config gives the same model params. - This is only valid when the random_state passed is an int. If a - RandomState object is passed then repeated calls to fit will have - different results. See the section on "Controlling Randomness" in the - sklearn docs. + This is only valid when the random_state passed is an int. If a + RandomState object is passed then repeated calls to fit will have + different results. See the section on "Controlling Randomness" in the + sklearn docs. - https://scikit-learn.org/0.24/common_pitfalls.html#controlling-randomness + https://scikit-learn.org/0.24/common_pitfalls.html#controlling-randomness """ if self.__class__ == BaseRegressionComponentTest: return regressor_cls = self.module - X = np.array([ - [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], - [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], - [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], - [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], - ]) - y = np.array([ - 1, 1, 1, 1, - 1, 1, 1, 1, - 1, 1, 1, 1, - 1, 1, 1, 1, - ]) + X = np.array( + [ + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + ] + ) + y = np.array( + [ + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + ] + ) # We ignore certain keys when comparing - param_keys_ignored = ['base_estimator'] + param_keys_ignored = ["base_estimator"] # We use the default config + sampled ones configuration_space = regressor_cls.get_hyperparameter_search_space() @@ -292,14 +308,14 @@ def test_module_idempotent(self): with ignore_warnings(regressor_warnings): params_first = regressor.fit(X.copy(), y.copy()).estimator.get_params() - if hasattr(regressor.estimator, 'random_state'): + if hasattr(regressor.estimator, "random_state"): rs_1 = regressor.random_state rs_estimator_1 = regressor.estimator.random_state with ignore_warnings(regressor_warnings): params_second = regressor.fit(X.copy(), y.copy()).estimator.get_params() - if hasattr(regressor.estimator, 'random_state'): + if hasattr(regressor.estimator, "random_state"): rs_2 = regressor.random_state rs_estimator_2 = regressor.estimator.random_state @@ -310,27 +326,27 @@ def test_module_idempotent(self): del params[key] # They should have equal parameters - self.assertEqual(params_first, params_second, - f"Failed with model args {model_args}") - if ( - hasattr(regressor.estimator, 'random_state') - and not isinstance(regressor, LibSVM_SVR) + self.assertEqual( + params_first, params_second, f"Failed with model args {model_args}" + ) + if hasattr(regressor.estimator, "random_state") and not isinstance( + regressor, LibSVM_SVR ): # sklearn.svm.SVR has it as an attribute but does not use it and # defaults it to None, even if a value is passed in - assert all([ - seed == random_state - for random_state in [rs_1, rs_estimator_1, rs_2, rs_estimator_2] - ]) + assert all( + [ + seed == random_state + for random_state in [rs_1, rs_estimator_1, rs_2, rs_estimator_2] + ] + ) 
@pytest.mark.parametrize("regressor", _regressors.values()) @pytest.mark.parametrize("X", [np.array([[1, 2, 3]] * 20)]) @pytest.mark.parametrize("y", [np.array([1] * 20)]) def test_fit_and_predict_with_1d_targets_as_1d( - regressor: Type[RegressorChoice], - X: np.ndarray, - y: np.ndarray + regressor: Type[RegressorChoice], X: np.ndarray, y: np.ndarray ) -> None: """Test that all pipelines work with 1d target types @@ -371,9 +387,7 @@ def test_fit_and_predict_with_1d_targets_as_1d( @pytest.mark.parametrize("X", [np.array([[1, 2, 3]] * 20)]) @pytest.mark.parametrize("y", [np.array([[1]] * 20)]) def test_fit_and_predict_with_1d_targets_as_2d( - regressor: Type[RegressorChoice], - X: np.ndarray, - y: np.ndarray + regressor: Type[RegressorChoice], X: np.ndarray, y: np.ndarray ) -> None: """Test that all pipelines work with 1d target types when they are wrapped as 2d @@ -412,17 +426,18 @@ def test_fit_and_predict_with_1d_targets_as_2d( assert len(predictions) == len(y) -@pytest.mark.parametrize("regressor", [ - regressor - for regressor in _regressors.values() - if regressor.get_properties()['handles_multilabel'] -]) +@pytest.mark.parametrize( + "regressor", + [ + regressor + for regressor in _regressors.values() + if regressor.get_properties()["handles_multilabel"] + ], +) @pytest.mark.parametrize("X", [np.array([[1, 2, 3]] * 20)]) @pytest.mark.parametrize("y", [np.array([[1, 1, 1]] * 20)]) def test_fit_and_predict_with_2d_targets( - regressor: Type[RegressorChoice], - X: np.ndarray, - y: np.ndarray + regressor: Type[RegressorChoice], X: np.ndarray, y: np.ndarray ) -> None: """Test that all pipelines work with 2d target types diff --git a/test/test_pipeline/components/regression/test_decision_tree.py b/test/test_pipeline/components/regression/test_decision_tree.py index a5d2e53990..942b9db601 100644 --- a/test/test_pipeline/components/regression/test_decision_tree.py +++ b/test/test_pipeline/components/regression/test_decision_tree.py @@ -1,7 +1,7 @@ import sklearn.tree -from autosklearn.pipeline.components.regression.decision_tree import \ - DecisionTree +from autosklearn.pipeline.components.regression.decision_tree import DecisionTree + from .test_base import BaseRegressionComponentTest diff --git a/test/test_pipeline/components/regression/test_extra_trees.py b/test/test_pipeline/components/regression/test_extra_trees.py index 5d6f6d1acf..8d92fa30c8 100644 --- a/test/test_pipeline/components/regression/test_extra_trees.py +++ b/test/test_pipeline/components/regression/test_extra_trees.py @@ -1,7 +1,7 @@ import sklearn.ensemble -from autosklearn.pipeline.components.regression.extra_trees import \ - ExtraTreesRegressor +from autosklearn.pipeline.components.regression.extra_trees import ExtraTreesRegressor + from .test_base import BaseRegressionComponentTest @@ -12,18 +12,18 @@ class ExtraTreesComponentTest(BaseRegressionComponentTest): res = dict() res["default_boston"] = 0.8539264243687228 res["boston_n_calls"] = 9 - res["default_boston_iterative"] = res['default_boston'] + res["default_boston_iterative"] = res["default_boston"] res["default_boston_sparse"] = 0.411211701806908 - res["default_boston_iterative_sparse"] = res['default_boston_sparse'] + res["default_boston_iterative_sparse"] = res["default_boston_sparse"] res["default_diabetes"] = 0.3885150255877827 res["diabetes_n_calls"] = 9 - res["default_diabetes_iterative"] = res['default_diabetes'] + res["default_diabetes_iterative"] = res["default_diabetes"] res["default_diabetes_sparse"] = 0.2422804139169642 - 
res["default_diabetes_iterative_sparse"] = res['default_diabetes_sparse'] + res["default_diabetes_iterative_sparse"] = res["default_diabetes_sparse"] sk_mod = sklearn.ensemble.ExtraTreesRegressor module = ExtraTreesRegressor step_hyperparameter = { - 'name': 'n_estimators', - 'value': module.get_max_iter(), + "name": "n_estimators", + "value": module.get_max_iter(), } diff --git a/test/test_pipeline/components/regression/test_gaussian_process.py b/test/test_pipeline/components/regression/test_gaussian_process.py index d148d416df..0f766e22b1 100644 --- a/test/test_pipeline/components/regression/test_gaussian_process.py +++ b/test/test_pipeline/components/regression/test_gaussian_process.py @@ -1,7 +1,6 @@ import sklearn.gaussian_process -from autosklearn.pipeline.components.regression.gaussian_process import \ - GaussianProcess +from autosklearn.pipeline.components.regression.gaussian_process import GaussianProcess from .test_base import BaseRegressionComponentTest diff --git a/test/test_pipeline/components/regression/test_gradient_boosting.py b/test/test_pipeline/components/regression/test_gradient_boosting.py index 9fcb2cd623..6412fd0598 100644 --- a/test/test_pipeline/components/regression/test_gradient_boosting.py +++ b/test/test_pipeline/components/regression/test_gradient_boosting.py @@ -1,7 +1,8 @@ import sklearn.ensemble -from autosklearn.pipeline.components.regression.gradient_boosting import \ - GradientBoosting +from autosklearn.pipeline.components.regression.gradient_boosting import ( + GradientBoosting, +) from .test_base import BaseRegressionComponentTest diff --git a/test/test_pipeline/components/regression/test_k_nearest_neighbors.py b/test/test_pipeline/components/regression/test_k_nearest_neighbors.py index 40637c3ec8..19d0cf40f5 100644 --- a/test/test_pipeline/components/regression/test_k_nearest_neighbors.py +++ b/test/test_pipeline/components/regression/test_k_nearest_neighbors.py @@ -1,7 +1,9 @@ import sklearn.neighbors -from autosklearn.pipeline.components.regression.k_nearest_neighbors import \ - KNearestNeighborsRegressor +from autosklearn.pipeline.components.regression.k_nearest_neighbors import ( + KNearestNeighborsRegressor, +) + from .test_base import BaseRegressionComponentTest diff --git a/test/test_pipeline/components/regression/test_liblinear_svr.py b/test/test_pipeline/components/regression/test_liblinear_svr.py index 42b73bfba7..37b6552c9b 100644 --- a/test/test_pipeline/components/regression/test_liblinear_svr.py +++ b/test/test_pipeline/components/regression/test_liblinear_svr.py @@ -1,7 +1,7 @@ import sklearn.svm -from autosklearn.pipeline.components.regression.liblinear_svr import \ - LibLinear_SVR +from autosklearn.pipeline.components.regression.liblinear_svr import LibLinear_SVR + from .test_base import BaseRegressionComponentTest diff --git a/test/test_pipeline/components/regression/test_mlp.py b/test/test_pipeline/components/regression/test_mlp.py index c003037c76..9e2a92acac 100644 --- a/test/test_pipeline/components/regression/test_mlp.py +++ b/test/test_pipeline/components/regression/test_mlp.py @@ -64,6 +64,6 @@ class MLPComponentTest(BaseRegressionComponentTest): sk_mod = sklearn.neural_network.MLPRegressor module = MLPRegressor step_hyperparameter = { - 'name': 'n_iter_', - 'value': module.get_max_iter(), + "name": "n_iter_", + "value": module.get_max_iter(), } diff --git a/test/test_pipeline/components/regression/test_random_forests.py b/test/test_pipeline/components/regression/test_random_forests.py index ee6f342a8e..6e1634ff83 100644 --- 
a/test/test_pipeline/components/regression/test_random_forests.py +++ b/test/test_pipeline/components/regression/test_random_forests.py @@ -1,7 +1,7 @@ import sklearn.ensemble -from autosklearn.pipeline.components.regression.random_forest import \ - RandomForest +from autosklearn.pipeline.components.regression.random_forest import RandomForest + from .test_base import BaseRegressionComponentTest @@ -11,18 +11,18 @@ class RandomForestComponentTest(BaseRegressionComponentTest): res = dict() res["default_boston"] = 0.8410063895401654 res["boston_n_calls"] = 9 - res["default_boston_iterative"] = res['default_boston'] + res["default_boston_iterative"] = res["default_boston"] res["default_boston_sparse"] = 0.4194462097407078 - res["default_boston_iterative_sparse"] = res['default_boston_sparse'] + res["default_boston_iterative_sparse"] = res["default_boston_sparse"] res["default_diabetes"] = 0.3496051170409269 res["diabetes_n_calls"] = 9 - res["default_diabetes_iterative"] = res['default_diabetes'] + res["default_diabetes_iterative"] = res["default_diabetes"] res["default_diabetes_sparse"] = 0.2383300978781976 - res["default_diabetes_iterative_sparse"] = res['default_diabetes_sparse'] + res["default_diabetes_iterative_sparse"] = res["default_diabetes_sparse"] sk_mod = sklearn.ensemble.RandomForestRegressor module = RandomForest step_hyperparameter = { - 'name': 'n_estimators', - 'value': module.get_max_iter(), + "name": "n_estimators", + "value": module.get_max_iter(), } diff --git a/test/test_pipeline/components/regression/test_sgd.py b/test/test_pipeline/components/regression/test_sgd.py index df31b3e026..467f3519f1 100644 --- a/test/test_pipeline/components/regression/test_sgd.py +++ b/test/test_pipeline/components/regression/test_sgd.py @@ -1,6 +1,7 @@ import sklearn.linear_model from autosklearn.pipeline.components.regression.sgd import SGD + from .test_base import BaseRegressionComponentTest @@ -10,16 +11,16 @@ class SGDComponentTest(BaseRegressionComponentTest): # Values are extremely bad because the invscaling does not drop the # learning rate aggressively enough! 
res = dict() - res["default_boston"] = -1.1811672998629865e+28 + res["default_boston"] = -1.1811672998629865e28 res["boston_n_calls"] = 6 - res["default_boston_iterative"] = res['default_boston'] - res["default_boston_sparse"] = -1.1518512489347601e+28 - res["default_boston_iterative_sparse"] = res['default_boston_sparse'] + res["default_boston_iterative"] = res["default_boston"] + res["default_boston_sparse"] = -1.1518512489347601e28 + res["default_boston_iterative_sparse"] = res["default_boston_sparse"] res["default_diabetes"] = 0.27420813549185374 res["diabetes_n_calls"] = 10 - res["default_diabetes_iterative"] = res['default_diabetes'] + res["default_diabetes_iterative"] = res["default_diabetes"] res["default_diabetes_sparse"] = 0.034801785011824404 - res["default_diabetes_iterative_sparse"] = res['default_diabetes_sparse'] + res["default_diabetes_iterative_sparse"] = res["default_diabetes_sparse"] sk_mod = sklearn.linear_model.SGDRegressor module = SGD diff --git a/test/test_pipeline/components/regression/test_support_vector_regression.py b/test/test_pipeline/components/regression/test_support_vector_regression.py index 57cde050ed..84bea51da6 100644 --- a/test/test_pipeline/components/regression/test_support_vector_regression.py +++ b/test/test_pipeline/components/regression/test_support_vector_regression.py @@ -1,6 +1,7 @@ import sklearn.linear_model from autosklearn.pipeline.components.regression.libsvm_svr import LibSVM_SVR + from .test_base import BaseRegressionComponentTest diff --git a/test/test_pipeline/components/test_base.py b/test/test_pipeline/components/test_base.py index c53246cc77..1e6ddbbd14 100644 --- a/test/test_pipeline/components/test_base.py +++ b/test/test_pipeline/components/test_base.py @@ -2,20 +2,23 @@ import sys import unittest -from autosklearn.pipeline.components.base import find_components, \ - AutoSklearnClassificationAlgorithm +from autosklearn.pipeline.components.base import ( + AutoSklearnClassificationAlgorithm, + find_components, +) this_dir = os.path.dirname(os.path.abspath(__file__)) sys.path.append(this_dir) class TestBase(unittest.TestCase): - def test_find_components(self): - c = find_components('dummy_components', - os.path.join(this_dir, 'dummy_components'), - AutoSklearnClassificationAlgorithm) - print('COMPONENTS: %s' % repr(c)) + c = find_components( + "dummy_components", + os.path.join(this_dir, "dummy_components"), + AutoSklearnClassificationAlgorithm, + ) + print("COMPONENTS: %s" % repr(c)) self.assertEqual(len(c), 2) - self.assertEqual(c['dummy_component_1'].__name__, 'DummyComponent1') - self.assertEqual(c['dummy_component_2'].__name__, 'DummyComponent2') + self.assertEqual(c["dummy_component_1"].__name__, "DummyComponent1") + self.assertEqual(c["dummy_component_2"].__name__, "DummyComponent2") diff --git a/test/test_pipeline/ignored_warnings.py b/test/test_pipeline/ignored_warnings.py index 5b941281f9..715cacb6ba 100644 --- a/test/test_pipeline/ignored_warnings.py +++ b/test/test_pipeline/ignored_warnings.py @@ -1,106 +1,116 @@ -from contextlib import contextmanager -from typing import List, Iterator, Tuple +from typing import Iterator, List, Tuple, Type import warnings +from contextlib import contextmanager from sklearn.exceptions import ConvergenceWarning - regressor_warnings = [ ( - UserWarning, ( # From QuantileTransformer + UserWarning, + ( # From QuantileTransformer r"n_quantiles \(\d+\) is greater than the total number of samples \(\d+\)\." r" n_quantiles is set to n_samples\." 
- ) + ), ), ( - ConvergenceWarning, ( # From GaussianProcesses + ConvergenceWarning, + ( # From GaussianProcesses r"The optimal value found for dimension \d+ of parameter \w+ is close" r" to the specified (upper|lower) bound .*(Increasing|Decreasing) the bound" r" and calling fit again may find a better value." - ) + ), ), ( - UserWarning, ( # From FastICA - r"n_components is too large: it will be set to \d+" - ) + UserWarning, + (r"n_components is too large: it will be set to \d+"), # From FastICA ), ( - ConvergenceWarning, ( # From SGD - r"Maximum number of iteration reached before convergence\. Consider increasing" - r" max_iter to improve the fit\." - ) + ConvergenceWarning, + ( # From SGD + r"Maximum number of iteration reached before convergence\." + r" Consider increasing max_iter to improve the fit\." + ), ), ( - ConvergenceWarning, ( # From MLP + ConvergenceWarning, + ( # From MLP r"Stochastic Optimizer: Maximum iterations \(\d+\) reached and the" r" optimization hasn't converged yet\." - ) + ), ), ] classifier_warnings = [ ( - UserWarning, ( # From QuantileTransformer + UserWarning, + ( # From QuantileTransformer r"n_quantiles \(\d+\) is greater than the total number of samples \(\d+\)\." r" n_quantiles is set to n_samples\." - ) + ), ), ( - UserWarning, ( # From FastICA - r"n_components is too large: it will be set to \d+" - ) - + UserWarning, + (r"n_components is too large: it will be set to \d+"), # From FastICA ), ( - ConvergenceWarning, ( # From Liblinear + ConvergenceWarning, + ( # From Liblinear r"Liblinear failed to converge, increase the number of iterations\." - ) + ), ), ( - ConvergenceWarning, ( # From SGD - r"Maximum number of iteration reached before convergence\. Consider increasing" - r" max_iter to improve the fit\." - ) + ConvergenceWarning, + ( # From SGD + r"Maximum number of iteration reached before convergence\." + r" Consider increasing max_iter to improve the fit\." + ), ), ( - ConvergenceWarning, ( # From MLP + ConvergenceWarning, + ( # From MLP r"Stochastic Optimizer: Maximum iterations \(\d+\) reached and the" r" optimization hasn't converged yet\." - ) + ), ), ( - ConvergenceWarning, ( # From FastICA + ConvergenceWarning, + ( # From FastICA r"FastICA did not converge\." r" Consider increasing tolerance or the maximum number of iterations\." - ) + ), ), ( - UserWarning, ( # From LDA (Linear Discriminant Analysis) - r"Variables are collinear" - ) + UserWarning, + (r"Variables are collinear"), # From LDA (Linear Discriminant Analysis) ), ( - UserWarning, ( + UserWarning, + ( r"Clustering metrics expects discrete values but received continuous values" r" for label, and multiclass values for target" - ) - ) + ), + ), ] feature_preprocessing_warnings = [ ( - ConvergenceWarning, ( # From liblinear + ConvergenceWarning, + ( # From liblinear r"Liblinear failed to converge, increase the number of iterations."
- ) + ), ) ] -ignored_warnings = regressor_warnings + classifier_warnings + feature_preprocessing_warnings +ignored_warnings = ( + regressor_warnings + classifier_warnings + feature_preprocessing_warnings +) @contextmanager -def ignore_warnings(to_ignore: List[Tuple[Exception, str]] = ignored_warnings) -> Iterator[None]: +def ignore_warnings( + to_ignore: List[Tuple[Type[Warning], str]] = ignored_warnings +) -> Iterator[None]: """A context manager to ignore warnings >>> with ignore_warnings(classifier_warnings): @@ -113,5 +123,5 @@ def ignore_warnings(to_ignore: List[Tuple[Exception, str]] = ignored_warnings) - """ with warnings.catch_warnings(): for category, message in to_ignore: - warnings.filterwarnings('ignore', category=category, message=message) + warnings.filterwarnings("ignore", category=category, message=message) yield diff --git a/test/test_pipeline/implementations/__init__.py b/test/test_pipeline/implementations/__init__.py index 8f0ce6cb7c..92bf78f389 100644 --- a/test/test_pipeline/implementations/__init__.py +++ b/test/test_pipeline/implementations/__init__.py @@ -1 +1 @@ -__author__ = 'feurerm' +__author__ = "feurerm" diff --git a/test/test_pipeline/implementations/test_CategoryShift.py b/test/test_pipeline/implementations/test_CategoryShift.py index 621d9b47cb..1b5e1451e6 100644 --- a/test/test_pipeline/implementations/test_CategoryShift.py +++ b/test/test_pipeline/implementations/test_CategoryShift.py @@ -1,4 +1,5 @@ import unittest + import numpy as np import scipy.sparse @@ -6,7 +7,6 @@ class CategoryShiftTest(unittest.TestCase): - def test_dense(self): X = np.random.randint(0, 255, (3, 4)) Y = CategoryShift().fit_transform(X) @@ -14,7 +14,8 @@ def test_dense(self): def test_sparse(self): X = scipy.sparse.csc_matrix( - ([1, 2, 0, 4], ([0, 1, 2, 1], [3, 2, 1, 0])), shape=(3, 4)) + ([1, 2, 0, 4], ([0, 1, 2, 1], [3, 2, 1, 0])), shape=(3, 4) + ) Y = CategoryShift().fit_transform(X) X.data += 3 self.assertTrue((Y.todense() == X.todense()).all()) @@ -29,6 +30,6 @@ def test_negative(self): CategoryShift().fit_transform(X) def test_string(self): - X = np.array([['a', 'b'], ['c', 'd']]) + X = np.array([["a", "b"], ["c", "d"]]) with self.assertRaises(ValueError): CategoryShift().fit_transform(X) diff --git a/test/test_pipeline/implementations/test_MinorityCoalescer.py b/test/test_pipeline/implementations/test_MinorityCoalescer.py index 73cbf9049a..7bdca8f1aa 100644 --- a/test/test_pipeline/implementations/test_MinorityCoalescer.py +++ b/test/test_pipeline/implementations/test_MinorityCoalescer.py @@ -1,24 +1,25 @@ import unittest -import numpy as np +import numpy as np import scipy.sparse from autosklearn.pipeline.implementations.MinorityCoalescer import MinorityCoalescer class MinorityCoalescerTest(unittest.TestCase): - @property def X1(self): # Generates an array with categories 3, 4, 5, 6, 7 and occurences of 30%, # 30%, 30%, 5% and 5% respectively - X = np.vstack(( - np.ones((30, 10)) * 3, - np.ones((30, 10)) * 4, - np.ones((30, 10)) * 5, - np.ones((5, 10)) * 6, - np.ones((5, 10)) * 7, - )) + X = np.vstack( + ( + np.ones((30, 10)) * 3, + np.ones((30, 10)) * 4, + np.ones((30, 10)) * 5, + np.ones((5, 10)) * 6, + np.ones((5, 10)) * 7, + ) + ) for col in range(X.shape[1]): np.random.shuffle(X[:, col]) return X @@ -27,13 +28,15 @@ def X1(self): def X2(self): # Generates an array with categories 3, 4, 5, 6, 7 and occurences of 5%, # 5%, 5%, 35% and 50% respectively - X = np.vstack(( - np.ones((5, 10)) * 3, - np.ones((5, 10)) * 4, - np.ones((5, 10)) * 5, - np.ones((35, 10)) * 6, - 
np.ones((50, 10)) * 7, - )) + X = np.vstack( + ( + np.ones((5, 10)) * 3, + np.ones((5, 10)) * 4, + np.ones((5, 10)) * 5, + np.ones((35, 10)) * 6, + np.ones((50, 10)) * 7, + ) + ) for col in range(X.shape[1]): np.random.shuffle(X[:, col]) return X @@ -48,7 +51,7 @@ def test_default(self): def test_coalesce_10_percent(self): X = self.X1 - Y = MinorityCoalescer(minimum_fraction=.1).fit_transform(X) + Y = MinorityCoalescer(minimum_fraction=0.1).fit_transform(X) for col in range(Y.shape[1]): hist = np.histogram(Y[:, col], bins=np.arange(1, 7)) np.testing.assert_array_almost_equal(hist[0], [10, 0, 30, 30, 30]) @@ -57,7 +60,7 @@ def test_coalesce_10_percent(self): def test_coalesce_10_percent_sparse(self): X = scipy.sparse.csc_matrix(self.X1) - Y = MinorityCoalescer(minimum_fraction=.1).fit_transform(X) + Y = MinorityCoalescer(minimum_fraction=0.1).fit_transform(X) # Assert no copies were made self.assertEqual(id(X), id(Y)) Y = Y.todense() @@ -75,7 +78,7 @@ def test_transform_after_fit(self): X_fit = self.X1 # Here categories 3, 4, 5 have ocurrence above 10% X_transf = self.X2 # Here it is the opposite, just categs 6 and 7 are above 10% - mc = MinorityCoalescer(minimum_fraction=.1).fit(X_fit) + mc = MinorityCoalescer(minimum_fraction=0.1).fit(X_fit) # transform() should coalesce categories as learned during fit. # Category distribution in X_transf should be irrelevant. diff --git a/test/test_pipeline/implementations/test_SparseOneHotEncoder.py b/test/test_pipeline/implementations/test_SparseOneHotEncoder.py index 731533637b..91f1827c06 100644 --- a/test/test_pipeline/implementations/test_SparseOneHotEncoder.py +++ b/test/test_pipeline/implementations/test_SparseOneHotEncoder.py @@ -1,38 +1,37 @@ import unittest import numpy as np - import scipy.sparse -import sklearn.tree import sklearn.datasets import sklearn.model_selection import sklearn.pipeline +import sklearn.tree from sklearn.impute import SimpleImputer from sklearn.tree import DecisionTreeClassifier -from autosklearn.pipeline.implementations.SparseOneHotEncoder import SparseOneHotEncoder from autosklearn.pipeline.implementations.CategoryShift import CategoryShift +from autosklearn.pipeline.implementations.SparseOneHotEncoder import SparseOneHotEncoder -sparse1 = scipy.sparse.csc_matrix(([3, 2, 1, 1, 2, 3], - ((1, 4, 5, 2, 3, 5), - (0, 0, 0, 1, 1, 1))), shape=(6, 2)) -sparse1_1h = scipy.sparse.csc_matrix(([1, 1, 1, 1, 1, 1], - ((5, 4, 1, 2, 3, 5), - (0, 1, 2, 3, 4, 5))), shape=(6, 6)) +sparse1 = scipy.sparse.csc_matrix( + ([3, 2, 1, 1, 2, 3], ((1, 4, 5, 2, 3, 5), (0, 0, 0, 1, 1, 1))), shape=(6, 2) +) +sparse1_1h = scipy.sparse.csc_matrix( + ([1, 1, 1, 1, 1, 1], ((5, 4, 1, 2, 3, 5), (0, 1, 2, 3, 4, 5))), shape=(6, 6) +) -sparse2 = scipy.sparse.csc_matrix(([2, 1, 0, 0, 0, 0], - ((1, 4, 5, 2, 3, 5), - (0, 0, 0, 1, 1, 1))), shape=(6, 2)) -sparse2_1h = scipy.sparse.csc_matrix(([1, 1, 1, 1, 1, 1], - ((5, 4, 1, 2, 3, 5), - (0, 1, 2, 3, 3, 3))), shape=(6, 4)) +sparse2 = scipy.sparse.csc_matrix( + ([2, 1, 0, 0, 0, 0], ((1, 4, 5, 2, 3, 5), (0, 0, 0, 1, 1, 1))), shape=(6, 2) +) +sparse2_1h = scipy.sparse.csc_matrix( + ([1, 1, 1, 1, 1, 1], ((5, 4, 1, 2, 3, 5), (0, 1, 2, 3, 3, 3))), shape=(6, 4) +) -sparse2_csr = scipy.sparse.csr_matrix(([2, 1, 0, 0, 0, 0], - ((1, 4, 5, 2, 3, 5), - (0, 0, 0, 1, 1, 1))), shape=(6, 2)) -sparse2_csr_1h = scipy.sparse.csr_matrix(([1, 1, 1, 1, 1, 1], - ((5, 4, 1, 2, 3, 5), - (0, 1, 2, 3, 3, 3))), shape=(6, 4)) +sparse2_csr = scipy.sparse.csr_matrix( + ([2, 1, 0, 0, 0, 0], ((1, 4, 5, 2, 3, 5), (0, 0, 0, 1, 1, 1))), shape=(6, 
2) +) +sparse2_csr_1h = scipy.sparse.csr_matrix( + ([1, 1, 1, 1, 1, 1], ((5, 4, 1, 2, 3, 5), (0, 1, 2, 3, 3, 3))), shape=(6, 4) +) class TestSparseOneHotEncoder(unittest.TestCase): @@ -52,8 +51,7 @@ def _fit_then_transform(self, expected, input): transformation = ohe.fit_transform(input) self.assertIsInstance(transformation, scipy.sparse.csr_matrix) np.testing.assert_array_almost_equal( - expected.astype(float), - transformation.todense() + expected.astype(float), transformation.todense() ) self._check_arrays_equal(input, input_copy) @@ -90,23 +88,26 @@ def test_transform_with_unknown_value(self): self.assertEqual(3, np.sum(output)) def test_classification_workflow(self): - X, y = sklearn.datasets.fetch_openml(data_id=24, as_frame=False, return_X_y=True) + X, y = sklearn.datasets.fetch_openml( + data_id=24, as_frame=False, return_X_y=True + ) print(type(X)) - X_train, X_test, y_train, y_test = \ - sklearn.model_selection.train_test_split(X, y, random_state=3, - train_size=0.5, - test_size=0.5) + X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, y, random_state=3, train_size=0.5, test_size=0.5 + ) X_train = scipy.sparse.csc_matrix(X_train) X_test = scipy.sparse.csc_matrix(X_test) - pipeline = sklearn.pipeline.Pipeline(( - ('shift', CategoryShift()), - ('imput', SimpleImputer(strategy='constant', fill_value=2)), - ('ohe', SparseOneHotEncoder()), - ('tree', DecisionTreeClassifier(random_state=1)), - )) + pipeline = sklearn.pipeline.Pipeline( + ( + ("shift", CategoryShift()), + ("imput", SimpleImputer(strategy="constant", fill_value=2)), + ("ohe", SparseOneHotEncoder()), + ("tree", DecisionTreeClassifier(random_state=1)), + ) + ) pipeline.fit(X_train, y_train) pred_train = pipeline.predict(X_train) diff --git a/test/test_pipeline/implementations/test_util.py b/test/test_pipeline/implementations/test_util.py index 06f2a1eb2f..58412e0b0c 100644 --- a/test/test_pipeline/implementations/test_util.py +++ b/test/test_pipeline/implementations/test_util.py @@ -7,19 +7,44 @@ class UtilTest(unittest.TestCase): def test_softmax_binary(self): - df = np.array([-40.00643897, 34.69754581, 23.71181359, -29.89724287, - 27.06071791, -37.78334103, -40.15812461, 40.16139229, - -27.85887801, 42.67404756, -36.89753589, -36.45148009, - 54.68976306, 19.47886562, -49.99821027, -35.70205302, - -40.59639267, 32.96343916, -39.23777841, -37.86535019, - -33.10196906, 26.84144377, -36.8569686]) + df = np.array( + [ + -40.00643897, + 34.69754581, + 23.71181359, + -29.89724287, + 27.06071791, + -37.78334103, + -40.15812461, + 40.16139229, + -27.85887801, + 42.67404756, + -36.89753589, + -36.45148009, + 54.68976306, + 19.47886562, + -49.99821027, + -35.70205302, + -40.59639267, + 32.96343916, + -39.23777841, + -37.86535019, + -33.10196906, + 26.84144377, + -36.8569686, + ] + ) probas = softmax(df) - expected = [[1., 0.] if d < 0. else [0., 1.] for d in df] + expected = [[1.0, 0.0] if d < 0.0 else [0.0, 1.0] for d in df] np.testing.assert_array_almost_equal(expected, probas) def test_softmax(self): - df = np.array([[2.75021367e+10, -8.83772371e-01, -2.20516715e+27], - [-2.10848072e+11, 2.35024444e-01, 5.20106536e+25]]) + df = np.array( + [ + [2.75021367e10, -8.83772371e-01, -2.20516715e27], + [-2.10848072e11, 2.35024444e-01, 5.20106536e25], + ] + ) # With a numerically unstable softmax, the output would be something # like this: # [[ 0. 0. 
nan] @@ -30,6 +55,7 @@ def test_softmax(self): df = np.array([[0.1, 0.6, 0.3], [0.2, 0.3, 0.5]]) probas = softmax(df) - expected = np.array([[0.25838965, 0.42601251, 0.31559783], - [0.28943311, 0.31987306, 0.39069383]]) + expected = np.array( + [[0.25838965, 0.42601251, 0.31559783], [0.28943311, 0.31987306, 0.39069383]] + ) np.testing.assert_array_almost_equal(expected, probas) diff --git a/test/test_pipeline/test_base.py b/test/test_pipeline/test_base.py index 0d40bca0d1..f1efed23b4 100644 --- a/test/test_pipeline/test_base.py +++ b/test/test_pipeline/test_base.py @@ -5,8 +5,8 @@ import autosklearn.pipeline.base import autosklearn.pipeline.components.base -import autosklearn.pipeline.components.feature_preprocessing -import autosklearn.pipeline.components.classification +import autosklearn.pipeline.components.classification as classification +import autosklearn.pipeline.components.feature_preprocessing as feature_preprocessing class BasePipelineMock(autosklearn.pipeline.base.BasePipeline): @@ -17,42 +17,45 @@ def __init__(self): class BaseTest(unittest.TestCase): def test_get_hyperparameter_configuration_space_3choices(self): cs = ConfigSpace.configuration_space.ConfigurationSpace() - dataset_properties = {'target_type': 'classification'} + dataset_properties = {"target_type": "classification"} exclude = {} include = {} - pipeline = [('p0', - autosklearn.pipeline.components.feature_preprocessing - .FeaturePreprocessorChoice(dataset_properties)), - ('p1', - autosklearn.pipeline.components.feature_preprocessing - .FeaturePreprocessorChoice(dataset_properties)), - ('c', autosklearn.pipeline.components.classification - .ClassifierChoice(dataset_properties))] + pipeline = [ + ( + "p0", + feature_preprocessing.FeaturePreprocessorChoice(dataset_properties), + ), + ( + "p1", + feature_preprocessing.FeaturePreprocessorChoice(dataset_properties), + ), + ( + "c", + classification.ClassifierChoice(dataset_properties), + ), + ] base = BasePipelineMock() - cs = base._get_base_search_space(cs, dataset_properties, - exclude, include, pipeline) + cs = base._get_base_search_space( + cs, dataset_properties, exclude, include, pipeline + ) - self.assertEqual(len(cs.get_hyperparameter("p0:__choice__").choices), - 13) - self.assertEqual(len(cs.get_hyperparameter("p1:__choice__").choices), - 15) + self.assertEqual(len(cs.get_hyperparameter("p0:__choice__").choices), 13) + self.assertEqual(len(cs.get_hyperparameter("p1:__choice__").choices), 15) # for clause in sorted([str(clause) for clause in cs.forbidden_clauses]): # print(clause) self.assertEqual(148, len(cs.forbidden_clauses)) cs = ConfigSpace.configuration_space.ConfigurationSpace() - dataset_properties = {'target_type': 'classification', 'signed': True} - include = {'c': ['multinomial_nb']} - cs = base._get_base_search_space(cs, dataset_properties, - exclude, include, pipeline) - self.assertEqual(len(cs.get_hyperparameter("p0:__choice__").choices), - 13) - self.assertEqual(len(cs.get_hyperparameter("p1:__choice__").choices), - 10) - self.assertEqual(len(cs.get_hyperparameter("c:__choice__").choices), - 1) + dataset_properties = {"target_type": "classification", "signed": True} + include = {"c": ["multinomial_nb"]} + cs = base._get_base_search_space( + cs, dataset_properties, exclude, include, pipeline + ) + self.assertEqual(len(cs.get_hyperparameter("p0:__choice__").choices), 13) + self.assertEqual(len(cs.get_hyperparameter("p1:__choice__").choices), 10) + self.assertEqual(len(cs.get_hyperparameter("c:__choice__").choices), 1) # Mostly combinations of p0 
making the data unsigned and p1 not # changing the values of the data points # for clause in sorted([str(clause) for clause in cs.forbidden_clauses]): @@ -60,42 +63,41 @@ def test_get_hyperparameter_configuration_space_3choices(self): self.assertEqual(64, len(cs.forbidden_clauses)) cs = ConfigSpace.configuration_space.ConfigurationSpace() - dataset_properties = {'target_type': 'classification', 'signed': True} + dataset_properties = {"target_type": "classification", "signed": True} include = {} - cs = base._get_base_search_space(cs, dataset_properties, - exclude, include, pipeline) - self.assertEqual(len(cs.get_hyperparameter("p0:__choice__").choices), - 13) - self.assertEqual(len(cs.get_hyperparameter("p1:__choice__").choices), - 15) - self.assertEqual(len(cs.get_hyperparameter("c:__choice__").choices), - 16) + cs = base._get_base_search_space( + cs, dataset_properties, exclude, include, pipeline + ) + self.assertEqual(len(cs.get_hyperparameter("p0:__choice__").choices), 13) + self.assertEqual(len(cs.get_hyperparameter("p1:__choice__").choices), 15) + self.assertEqual(len(cs.get_hyperparameter("c:__choice__").choices), 16) # for clause in sorted([str(clause) for clause in cs.forbidden_clauses]): # print(clause) self.assertEqual(110, len(cs.forbidden_clauses)) cs = ConfigSpace.configuration_space.ConfigurationSpace() - dataset_properties = {'target_type': 'classification', 'sparse': True} - cs = base._get_base_search_space(cs, dataset_properties, - exclude, include, pipeline) - self.assertEqual(len(cs.get_hyperparameter("p0:__choice__").choices), - 12) - self.assertEqual(len(cs.get_hyperparameter("p1:__choice__").choices), - 15) + dataset_properties = {"target_type": "classification", "sparse": True} + cs = base._get_base_search_space( + cs, dataset_properties, exclude, include, pipeline + ) + self.assertEqual(len(cs.get_hyperparameter("p0:__choice__").choices), 12) + self.assertEqual(len(cs.get_hyperparameter("p1:__choice__").choices), 15) # for clause in sorted([str(clause) for clause in cs.forbidden_clauses]): # print(clause) self.assertEqual(419, len(cs.forbidden_clauses)) cs = ConfigSpace.configuration_space.ConfigurationSpace() - dataset_properties = {'target_type': 'classification', - 'sparse': True, 'signed': True} - cs = base._get_base_search_space(cs, dataset_properties, - exclude, include, pipeline) - - self.assertEqual(len(cs.get_hyperparameter("p0:__choice__").choices), - 12) - self.assertEqual(len(cs.get_hyperparameter("p1:__choice__").choices), - 15) + dataset_properties = { + "target_type": "classification", + "sparse": True, + "signed": True, + } + cs = base._get_base_search_space( + cs, dataset_properties, exclude, include, pipeline + ) + + self.assertEqual(len(cs.get_hyperparameter("p0:__choice__").choices), 12) + self.assertEqual(len(cs.get_hyperparameter("p1:__choice__").choices), 15) # Data is guaranteed to be positive in cases like densifier, # extra_trees_preproc, multinomial_nb -> less constraints # for clause in sorted([str(clause) for clause in cs.forbidden_clauses]): @@ -123,52 +125,71 @@ def test_init_params_handling(self): for init_params, expected_init_params in [ ({}, {}), (None, None), - ({'M:key': 'value'}, {'key': 'value'}), + ({"M:key": "value"}, {"key": "value"}), ]: node = unittest.mock.Mock( spec=autosklearn.pipeline.components.base.AutoSklearnComponent ) node.get_hyperparameter_search_space.return_value = cs - node.key = 'value' - base.steps = [('M', node)] - base.set_hyperparameters(cs.sample_configuration(), init_params=init_params) - 
self.assertEqual(node.set_hyperparameters.call_args[1]['init_params'], - expected_init_params) + node.key = "value" + base.steps = [("M", node)] + base.set_hyperparameters( + cs.sample_configuration(), init_params=init_params + ) + self.assertEqual( + node.set_hyperparameters.call_args[1]["init_params"], + expected_init_params, + ) # Check for proper exception raising node = unittest.mock.Mock( spec=autosklearn.pipeline.components.base.AutoSklearnComponent ) node.get_hyperparameter_search_space.return_value = cs - base.steps = [('M', node)] - with self.assertRaisesRegex(ValueError, "Unsupported argument to init_params"): - base.set_hyperparameters(cs.sample_configuration(), init_params={'key': 'value'}) + base.steps = [("M", node)] + with self.assertRaisesRegex( + ValueError, "Unsupported argument to init_params" + ): + base.set_hyperparameters( + cs.sample_configuration(), init_params={"key": "value"} + ) # An invalid node name is passed - with self.assertRaisesRegex(ValueError, "The current node name specified via key"): - base.set_hyperparameters(cs.sample_configuration(), init_params={'N:key': 'value'}) + with self.assertRaisesRegex( + ValueError, "The current node name specified via key" + ): + base.set_hyperparameters( + cs.sample_configuration(), init_params={"N:key": "value"} + ) # The value was not properly set -- Here it happens because the # object is a magic mock, calling the method doesn't set a new parameter with self.assertRaisesRegex(ValueError, "Cannot properly set the pair"): - base.set_hyperparameters(cs.sample_configuration(), init_params={'M:key': 'value'}) + base.set_hyperparameters( + cs.sample_configuration(), init_params={"M:key": "value"} + ) def test_include_exclude_validation(self): - """ - Makes sure include and exclude arguments are validated and raises expected exception - on error + """Makes sure include and exclude arguments are validated and raises + expected exception on error """ base = BasePipelineMock() - dataset_properties = {'target_type': 'classification'} + dataset_properties = {"target_type": "classification"} base.dataset_properties = dataset_properties - base.steps = [('p0', - autosklearn.pipeline.components.feature_preprocessing - .FeaturePreprocessorChoice(dataset_properties)), - ('p1', - autosklearn.pipeline.components.feature_preprocessing - .FeaturePreprocessorChoice(dataset_properties)), - ('c', autosklearn.pipeline.components.classification - .ClassifierChoice(dataset_properties))] + base.steps = [ + ( + "p0", + feature_preprocessing.FeaturePreprocessorChoice(dataset_properties), + ), + ( + "p1", + feature_preprocessing.FeaturePreprocessorChoice(dataset_properties), + ), + ( + "c", + classification.ClassifierChoice(dataset_properties), + ), + ] def assert_value_error(include=None, exclude=None): base.include = include @@ -177,21 +198,21 @@ def assert_value_error(include=None, exclude=None): base._validate_include_exclude_params() # Same key in include and exclude argument - assert_value_error(include={'c': ['adaboost']}, exclude={'c': ['sgd']}) + assert_value_error(include={"c": ["adaboost"]}, exclude={"c": ["sgd"]}) # Invalid key in the exclude argument - assert_value_error(exclude={'p2': ['pca']}) + assert_value_error(exclude={"p2": ["pca"]}) # Invalid value type for the key in the include argument - assert_value_error(include={'c': ('adaboost', 'sgd')}, exclude=None) + assert_value_error(include={"c": ("adaboost", "sgd")}, exclude=None) # Empty list of the key in the include argument - assert_value_error(include={'c': []}) + 
assert_value_error(include={"c": []}) # Invalid component in the list value for the key in the include argument - assert_value_error(include={'c': ['pca']}) + assert_value_error(include={"c": ["pca"]}) # Case when all conditions passed for include and exclude - base.include = {'c': ['adaboost', 'sgd']} - base.exclude = {'p1': ['pca']} + base.include = {"c": ["adaboost", "sgd"]} + base.exclude = {"p1": ["pca"]} self.assertIsNone(base._validate_include_exclude_params()) diff --git a/test/test_pipeline/test_classification.py b/test/test_pipeline/test_classification.py index 49267b0fee..c197dd30fc 100644 --- a/test/test_pipeline/test_classification.py +++ b/test/test_pipeline/test_classification.py @@ -8,29 +8,37 @@ import unittest import unittest.mock -from joblib import Memory import numpy as np - -from sklearn.base import clone import sklearn.datasets import sklearn.decomposition -import sklearn.model_selection import sklearn.ensemble +import sklearn.model_selection import sklearn.svm -from sklearn.utils.validation import check_is_fitted - from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import CategoricalHyperparameter +from joblib import Memory +from sklearn.base import clone +from sklearn.utils.validation import check_is_fitted -from autosklearn.pipeline.classification import SimpleClassificationPipeline -from autosklearn.pipeline.components.base import \ - AutoSklearnClassificationAlgorithm, AutoSklearnPreprocessingAlgorithm -from autosklearn.pipeline.components.base import AutoSklearnComponent, AutoSklearnChoice, _addons import autosklearn.pipeline.components.classification as classification_components import autosklearn.pipeline.components.feature_preprocessing as preprocessing_components +from autosklearn.pipeline.classification import SimpleClassificationPipeline +from autosklearn.pipeline.components.base import ( + AutoSklearnChoice, + AutoSklearnClassificationAlgorithm, + AutoSklearnComponent, + AutoSklearnPreprocessingAlgorithm, + _addons, +) +from autosklearn.pipeline.constants import ( + DENSE, + INPUT, + PREDICTIONS, + SIGNED_DATA, + SPARSE, + UNSIGNED_DATA, +) from autosklearn.pipeline.util import get_dataset -from autosklearn.pipeline.constants import \ - DENSE, SPARSE, UNSIGNED_DATA, PREDICTIONS, SIGNED_DATA, INPUT from test.test_pipeline.ignored_warnings import classifier_warnings, ignore_warnings @@ -38,16 +46,18 @@ class DummyClassifier(AutoSklearnClassificationAlgorithm): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'AB', - 'name': 'AdaBoost Classifier', - 'handles_regression': False, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': False, - 'is_deterministic': True, - 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,)} + return { + "shortname": "AB", + "name": "AdaBoost Classifier", + "handles_regression": False, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": False, + "is_deterministic": True, + "input": (DENSE, SPARSE, UNSIGNED_DATA), + "output": (PREDICTIONS,), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): @@ -58,16 +68,18 @@ def get_hyperparameter_search_space(dataset_properties=None): class DummyPreprocessor(AutoSklearnPreprocessingAlgorithm): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'AB', - 'name': 'AdaBoost Classifier', - 'handles_regression': False, 
- 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': False, - 'is_deterministic': True, - 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (INPUT,)} + return { + "shortname": "AB", + "name": "AdaBoost Classifier", + "handles_regression": False, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": False, + "is_deterministic": True, + "input": (DENSE, SPARSE, UNSIGNED_DATA), + "output": (INPUT,), + } @staticmethod def get_hyperparameter_search_space(dataset_properties=None): @@ -81,16 +93,18 @@ def __init__(*args, **kwargs): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'AB', - 'name': 'AdaBoost Classifier', - 'handles_regression': False, - 'handles_classification': True, - 'handles_multiclass': True, - 'handles_multilabel': True, - 'handles_multioutput': False, - 'is_deterministic': True, - 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (INPUT,)} + return { + "shortname": "AB", + "name": "AdaBoost Classifier", + "handles_regression": False, + "handles_classification": True, + "handles_multiclass": True, + "handles_multilabel": True, + "handles_multioutput": False, + "is_deterministic": True, + "input": (DENSE, SPARSE, UNSIGNED_DATA), + "output": (INPUT,), + } def fit(self, X, y): raise ValueError("Make sure fit is called") @@ -116,21 +130,21 @@ def test_io_dict(self): if classifiers[c] == classification_components.ClassifierChoice: continue props = classifiers[c].get_properties() - self.assertIn('input', props) - self.assertIn('output', props) - inp = props['input'] - output = props['output'] + self.assertIn("input", props) + self.assertIn("output", props) + inp = props["input"] + output = props["output"] self.assertIsInstance(inp, tuple) self.assertIsInstance(output, tuple) for i in inp: self.assertIn(i, (SPARSE, DENSE, SIGNED_DATA, UNSIGNED_DATA)) self.assertEqual(output, (PREDICTIONS,)) - self.assertIn('handles_regression', props) - self.assertFalse(props['handles_regression']) - self.assertIn('handles_classification', props) - self.assertIn('handles_multiclass', props) - self.assertIn('handles_multilabel', props) + self.assertIn("handles_regression", props) + self.assertFalse(props["handles_regression"]) + self.assertIn("handles_classification", props) + self.assertIn("handles_multiclass", props) + self.assertIn("handles_multilabel", props) def test_find_classifiers(self): """Test that the classifier components can be found @@ -143,9 +157,11 @@ def test_find_classifiers(self): classifiers = classification_components._classifiers self.assertGreaterEqual(len(classifiers), 2) for key in classifiers: - if hasattr(classifiers[key], 'get_components'): + if hasattr(classifiers[key], "get_components"): continue - self.assertIn(AutoSklearnClassificationAlgorithm, classifiers[key].__bases__) + self.assertIn( + AutoSklearnClassificationAlgorithm, classifiers[key].__bases__ + ) def test_find_preprocessors(self): """Test that preproccesor components can be found @@ -156,20 +172,23 @@ def test_find_preprocessors(self): * The inherit from AutoSklearnPreprocessingAlgorithm """ preprocessors = preprocessing_components._preprocessors - self.assertGreaterEqual(len(preprocessors), 1) + self.assertGreaterEqual(len(preprocessors), 1) for key in preprocessors: - if hasattr(preprocessors[key], 'get_components'): + if hasattr(preprocessors[key], "get_components"): continue - self.assertIn(AutoSklearnPreprocessingAlgorithm, 
preprocessors[key].__bases__) + self.assertIn( + AutoSklearnPreprocessingAlgorithm, preprocessors[key].__bases__ + ) def test_default_configuration(self): """Test that seeded SimpleClassificaitonPipeline returns good results on iris Expects ------- - * The performance of configuration with fixed seed gets above 96% accuracy on iris + * The performance of configuration with fixed seed gets above 96% accuracy + on iris """ - X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris') + X_train, Y_train, X_test, Y_test = get_dataset(dataset="iris") auto = SimpleClassificationPipeline(random_state=1) @@ -190,11 +209,12 @@ def test_default_configuration_multilabel(self): * The performance of a random configuratino gets above 96% on a multilabel version of iris """ - X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris', make_multilabel=True) + X_train, Y_train, X_test, Y_test = get_dataset( + dataset="iris", make_multilabel=True + ) classifier = SimpleClassificationPipeline( - dataset_properties={'multilabel': True}, - random_state=0 + dataset_properties={"multilabel": True}, random_state=0 ) cs = classifier.get_hyperparameter_search_space() @@ -218,14 +238,14 @@ def test_default_configuration_iterative_fit(self): * Random forest pipeline can be fit iteratively * Test that its number of estimators is equal to the iteration count """ - X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris') + X_train, Y_train, X_test, Y_test = get_dataset(dataset="iris") classifier = SimpleClassificationPipeline( include={ - 'classifier': ['random_forest'], - 'feature_preprocessor': ['no_preprocessing'] + "classifier": ["random_forest"], + "feature_preprocessor": ["no_preprocessing"], }, - random_state=0 + random_state=0, ) classifier.fit_transformer(X_train, Y_train) @@ -256,9 +276,7 @@ def test_multilabel(self): * All configurations should fit, predict and predict_proba successfully """ cache = Memory(location=tempfile.gettempdir()) - cached_func = cache.cache( - sklearn.datasets.make_multilabel_classification - ) + cached_func = cache.cache(sklearn.datasets.make_multilabel_classification) X, Y = cached_func( n_samples=150, n_features=20, @@ -269,14 +287,16 @@ def test_multilabel(self): sparse=False, return_indicator=True, return_distributions=False, - random_state=1 + random_state=1, ) data = { - 'X_train': X[:100, :], - 'Y_train': Y[:100, :], - 'X_test': X[101:, :], - 'Y_test': Y[101:, ] + "X_train": X[:100, :], + "Y_train": Y[:100, :], + "X_test": X[101:, :], + "Y_test": Y[ + 101:, + ], } pipeline = SimpleClassificationPipeline(dataset_properties={"multilabel": True}) @@ -301,12 +321,14 @@ def test_configurations_signed_data(self): ------- * All configurations should fit, predict and predict_proba successfully """ - dataset_properties = {'signed': True} + dataset_properties = {"signed": True} cls = SimpleClassificationPipeline(dataset_properties=dataset_properties) cs = cls.get_hyperparameter_search_space() - self._test_configurations(configurations_space=cs, dataset_properties=dataset_properties) + self._test_configurations( + configurations_space=cs, dataset_properties=dataset_properties + ) def test_configurations_sparse(self): """Tests a non-seeded random set of configurations with sparse data @@ -315,7 +337,7 @@ def test_configurations_sparse(self): ------- * All configurations should fit, predict and predict_proba successfully """ - pipeline = SimpleClassificationPipeline(dataset_properties={'sparse': True}) + pipeline = SimpleClassificationPipeline(dataset_properties={"sparse": 
True}) cs = pipeline.get_hyperparameter_search_space() self._test_configurations(configurations_space=cs, make_sparse=True) @@ -330,41 +352,89 @@ def test_configurations_categorical_data(self): * All configurations should fit, predict and predict_proba successfully """ pipeline = SimpleClassificationPipeline( - dataset_properties={'sparse': False}, + dataset_properties={"sparse": False}, include={ - 'feature_preprocessor': ['no_preprocessing'], - 'classifier': ['sgd', 'adaboost'] - } + "feature_preprocessor": ["no_preprocessing"], + "classifier": ["sgd", "adaboost"], + }, ) cs = pipeline.get_hyperparameter_search_space() categorical_columns = [ - True, True, True, False, False, True, True, True, False, True, True, True, True, - True, True, True, True, True, True, True, True, True, True, True, True, True, - True, True, True, True, True, True, False, False, False, True, True, True + True, + True, + True, + False, + False, + True, + True, + True, + False, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + False, + False, + False, + True, + True, + True, ] categorical = { - i: 'categorical' if is_categorical else 'numerical' + i: "categorical" if is_categorical else "numerical" for i, is_categorical in enumerate(categorical_columns) } here = os.path.dirname(__file__) - dataset_path = os.path.join(here, "components", "data_preprocessing", "dataset.pkl") + dataset_path = os.path.join( + here, "components", "data_preprocessing", "dataset.pkl" + ) X = np.loadtxt(dataset_path) y = X[:, -1].copy() X = X[:, :-1] - X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(X, y) + X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split( + X, y + ) - data = {'X_train': X_train, 'Y_train': Y_train, 'X_test': X_test, 'Y_test': Y_test} + data = { + "X_train": X_train, + "Y_train": Y_train, + "X_test": X_test, + "Y_test": Y_test, + } - init_params = {'data_preprocessor:feat_type': categorical} + init_params = {"data_preprocessor:feat_type": categorical} - self._test_configurations(configurations_space=cs, dataset=data, init_params=init_params) + self._test_configurations( + configurations_space=cs, dataset=data, init_params=init_params + ) - @unittest.mock.patch('autosklearn.pipeline.components.data_preprocessing' - '.DataPreprocessorChoice.set_hyperparameters') + @unittest.mock.patch( + "autosklearn.pipeline.components.data_preprocessing" + ".DataPreprocessorChoice.set_hyperparameters" + ) def test_categorical_passed_to_one_hot_encoder(self, ohe_mock): """Test that the feat_types arg is passed to the OneHotEncoder @@ -379,36 +449,38 @@ def test_categorical_passed_to_one_hot_encoder(self, ohe_mock): # Mock the _check_init_params_honored as there is no object created, # _check_init_params_honored will fail as a datapreprocessor was never created - with unittest.mock.patch('autosklearn.pipeline.classification.SimpleClassificationPipeline' - '._check_init_params_honored'): + with unittest.mock.patch( + "autosklearn.pipeline.classification.SimpleClassificationPipeline" + "._check_init_params_honored" + ): # Check through construction - feat_types = {0: 'categorical', 1: 'numerical'} + feat_types = {0: "categorical", 1: "numerical"} cls = SimpleClassificationPipeline( - init_params={'data_preprocessor:feat_type': feat_types} + init_params={"data_preprocessor:feat_type": feat_types} ) - init_args = ohe_mock.call_args[1]['init_params'] - 
self.assertEqual(init_args, {'feat_type': feat_types}) + init_args = ohe_mock.call_args[1]["init_params"] + self.assertEqual(init_args, {"feat_type": feat_types}) # Check through `set_hyperparameters` - feat_types = {0: 'categorical', 1: 'categorical', 2: 'numerical'} + feat_types = {0: "categorical", 1: "categorical", 2: "numerical"} default = cls.get_hyperparameter_search_space().get_default_configuration() cls.set_hyperparameters( configuration=default, - init_params={'data_preprocessor:feat_type': feat_types}, + init_params={"data_preprocessor:feat_type": feat_types}, ) - init_args = ohe_mock.call_args[1]['init_params'] - self.assertEqual(init_args, {'feat_type': feat_types}) + init_args = ohe_mock.call_args[1]["init_params"] + self.assertEqual(init_args, {"feat_type": feat_types}) def _test_configurations( self, configurations_space: ConfigurationSpace, make_sparse: bool = False, - dataset: Union[str, Dict[str, Any]] = 'digits', + dataset: Union[str, Dict[str, Any]] = "digits", init_params: Dict[str, Any] = None, dataset_properties: Dict[str, Any] = None, n_samples: int = 10, @@ -448,53 +520,55 @@ def _test_configurations( config._populate_values() # Restrict configurations which could take too long on travis-ci - restrictions = {'classifier:passive_aggressive:n_iter': 5, - 'classifier:sgd:n_iter': 5, - 'classifier:adaboost:n_estimators': 50, - 'classifier:adaboost:max_depth': 1, - 'feature_preprocessor:kernel_pca:n_components': 10, - 'feature_preprocessor:kitchen_sinks:n_components': 50, - 'classifier:proj_logit:max_epochs': 1, - 'classifier:libsvm_svc:degree': 2, - 'regressor:libsvm_svr:degree': 2, - 'feature_preprocessor:truncatedSVD:target_dim': 10, - 'feature_preprocessor:polynomial:degree': 2, - 'classifier:lda:n_components': 10, - 'feature_preprocessor:nystroem_sampler:n_components': 50, - 'feature_preprocessor:feature_agglomeration:n_clusters': 2, - 'classifier:gradient_boosting:max_leaf_nodes': 64} - - config._values.update({ - param: value - for param, value in restrictions.items() - if param in config and config[param] is not None - }) + restrictions = { + "classifier:passive_aggressive:n_iter": 5, + "classifier:sgd:n_iter": 5, + "classifier:adaboost:n_estimators": 50, + "classifier:adaboost:max_depth": 1, + "feature_preprocessor:kernel_pca:n_components": 10, + "feature_preprocessor:kitchen_sinks:n_components": 50, + "classifier:proj_logit:max_epochs": 1, + "classifier:libsvm_svc:degree": 2, + "regressor:libsvm_svr:degree": 2, + "feature_preprocessor:truncatedSVD:target_dim": 10, + "feature_preprocessor:polynomial:degree": 2, + "classifier:lda:n_components": 10, + "feature_preprocessor:nystroem_sampler:n_components": 50, + "feature_preprocessor:feature_agglomeration:n_clusters": 2, + "classifier:gradient_boosting:max_leaf_nodes": 64, + } + + config._values.update( + { + param: value + for param, value in restrictions.items() + if param in config and config[param] is not None + } + ) if isinstance(dataset, str): X_train, Y_train, X_test, Y_test = get_dataset( - dataset=dataset, - make_sparse=make_sparse, - add_NaNs=True + dataset=dataset, make_sparse=make_sparse, add_NaNs=True ) else: - X_train = dataset['X_train'].copy() - Y_train = dataset['Y_train'].copy() - X_test = dataset['X_test'].copy() - dataset['Y_test'].copy() + X_train = dataset["X_train"].copy() + Y_train = dataset["Y_train"].copy() + X_test = dataset["X_test"].copy() + dataset["Y_test"].copy() init_params_ = copy.deepcopy(init_params) cls = SimpleClassificationPipeline( - dataset_properties=dataset_properties, 
- init_params=init_params_ + dataset_properties=dataset_properties, init_params=init_params_ ) cls.set_hyperparameters(config, init_params=init_params_) # First make sure that for this configuration, setting the parameters # does not mistakenly set the estimator as fitted for name, step in cls.named_steps.items(): - with self.assertRaisesRegex(sklearn.exceptions.NotFittedError, - "instance is not fitted yet"): + with self.assertRaisesRegex( + sklearn.exceptions.NotFittedError, "instance is not fitted yet" + ): check_is_fitted(step) try: @@ -526,15 +600,17 @@ def _test_configurations( continue elif "Numerical problems in QDA" in e.args[0]: continue - elif 'Bug in scikit-learn' in e.args[0]: + elif "Bug in scikit-learn" in e.args[0]: continue - elif 'The condensed distance matrix must contain only finite ' \ - 'values.' in e.args[0]: + elif ( + "The condensed distance matrix must contain only finite " + "values." in e.args[0] + ): continue - elif 'Internal work array size computation failed' in e.args[0]: + elif "Internal work array size computation failed" in e.args[0]: continue # Assumed to be caused by knn with preprocessor fast_ica with whiten - elif 'Input contains NaN, infinity or a value too large' in e.args[0]: + elif "Input contains NaN, infinity or a value too large" in e.args[0]: continue else: e.args += (f"config={config}",) @@ -581,14 +657,18 @@ def test_get_hyperparameter_search_space(self): cs = pipeline.get_hyperparameter_search_space() self.assertIsInstance(cs, ConfigurationSpace) - rescale_param = 'data_preprocessor:feature_type:numerical_transformer:rescaling:__choice__' + rescale_param = ( + "data_preprocessor:feature_type:numerical_transformer:rescaling:__choice__" + ) n_choices = len(cs.get_hyperparameter(rescale_param).choices) self.assertEqual(n_choices, 7) - n_classifiers = len(cs.get_hyperparameter('classifier:__choice__').choices) + n_classifiers = len(cs.get_hyperparameter("classifier:__choice__").choices) self.assertEqual(n_classifiers, 16) - n_preprocessors = len(cs.get_hyperparameter('feature_preprocessor:__choice__').choices) + n_preprocessors = len( + cs.get_hyperparameter("feature_preprocessor:__choice__").choices + ) self.assertEqual(n_preprocessors, 13) hyperparameters = cs.get_hyperparameters() @@ -610,69 +690,71 @@ def test_get_hyperparameter_search_space_include_exclude_models(self): Expects ------- - * Including a classifier choice has pipeline give back matching choice - * Excluding a classifier choice means it won't show up in the hyperparameter space + * Including a classifier has pipeline give back matching choice + * Excluding a classifier means it won't show up in the hyperparameter space * Including a feature preprocessor has pipeline give back matching choice - * Excluding a feature preprocessor means it won't show up in the hyperparameter space + * Excluding a feature preprocessor means it won't show up in the + hyperparameter space """ # include a classifier choice - pipeline = SimpleClassificationPipeline(include={'classifier': ['libsvm_svc']}) + pipeline = SimpleClassificationPipeline(include={"classifier": ["libsvm_svc"]}) cs = pipeline.get_hyperparameter_search_space() - expected = CategoricalHyperparameter('classifier:__choice__', ['libsvm_svc']) - returned = cs.get_hyperparameter('classifier:__choice__') + expected = CategoricalHyperparameter("classifier:__choice__", ["libsvm_svc"]) + returned = cs.get_hyperparameter("classifier:__choice__") self.assertEqual(returned, expected) # exclude a classifier choice - pipeline = 
SimpleClassificationPipeline(exclude={'classifier': ['libsvm_svc']}) + pipeline = SimpleClassificationPipeline(exclude={"classifier": ["libsvm_svc"]}) cs = pipeline.get_hyperparameter_search_space() - self.assertNotIn('libsvm_svc', str(cs)) + self.assertNotIn("libsvm_svc", str(cs)) # include a feature preprocessor pipeline = SimpleClassificationPipeline( - include={'feature_preprocessor': ['select_percentile_classification']} + include={"feature_preprocessor": ["select_percentile_classification"]} ) cs = pipeline.get_hyperparameter_search_space() - returned = cs.get_hyperparameter('feature_preprocessor:__choice__') + returned = cs.get_hyperparameter("feature_preprocessor:__choice__") expected = CategoricalHyperparameter( - 'feature_preprocessor:__choice__', - ['select_percentile_classification'] + "feature_preprocessor:__choice__", ["select_percentile_classification"] ) self.assertEqual(returned, expected) # exclude a feature preprocessor pipeline = SimpleClassificationPipeline( - exclude={'feature_preprocessor': ['select_percentile_classification']} + exclude={"feature_preprocessor": ["select_percentile_classification"]} ) cs = pipeline.get_hyperparameter_search_space() - self.assertNotIn('select_percentile_classification', str(cs)) + self.assertNotIn("select_percentile_classification", str(cs)) - def test_get_hyperparameter_search_space_preprocessor_contradicts_default_classifier(self): + def test_get_hyperparameter_search_space_preprocessor_contradicts_default( + self, + ): """Test that the default classifier gets updated based on the legal feature preprocessors that come before. Expects ------- - * With 'densifier' as only legal feature_preprocessor, 'qda' is default classifier - * With 'nystroem_sampler' as only legal feature_preprocessor, 'sgd' is default classifier + * With 'densifier' as only legal feature_preprocessor, 'qda' is default + * With 'nystroem_sampler' as only legal feature_preprocessor, 'sgd' is default """ pipeline = SimpleClassificationPipeline( - include={'feature_preprocessor': ['densifier']}, - dataset_properties={'sparse': True} + include={"feature_preprocessor": ["densifier"]}, + dataset_properties={"sparse": True}, ) cs = pipeline.get_hyperparameter_search_space() - default_choice = cs.get_hyperparameter('classifier:__choice__').default_value - self.assertEqual(default_choice, 'qda') + default_choice = cs.get_hyperparameter("classifier:__choice__").default_value + self.assertEqual(default_choice, "qda") pipeline = SimpleClassificationPipeline( - include={'feature_preprocessor': ['nystroem_sampler']} + include={"feature_preprocessor": ["nystroem_sampler"]} ) cs = pipeline.get_hyperparameter_search_space() - default_choice = cs.get_hyperparameter('classifier:__choice__').default_value - self.assertEqual(default_choice, 'sgd') + default_choice = cs.get_hyperparameter("classifier:__choice__").default_value + self.assertEqual(default_choice, "sgd") def test_get_hyperparameter_search_space_only_forbidden_combinations(self): """Test that invalid pipeline configurations raise errors @@ -686,43 +768,48 @@ def test_get_hyperparameter_search_space_only_forbidden_combinations(self): with self.assertRaisesRegex(AssertionError, "No valid pipeline found."): SimpleClassificationPipeline( include={ - 'classifier': ['multinomial_nb'], - 'feature_preprocessor': ['pca'] + "classifier": ["multinomial_nb"], + "feature_preprocessor": ["pca"], }, - dataset_properties={'sparse': True} + dataset_properties={"sparse": True}, ) - with self.assertRaisesRegex(ValueError, "Cannot find a 
legal default configuration."): + with self.assertRaisesRegex( + ValueError, "Cannot find a legal default configuration." + ): SimpleClassificationPipeline( include={ - 'classifier': ['liblinear_svc'], - 'feature_preprocessor': ['densifier'] + "classifier": ["liblinear_svc"], + "feature_preprocessor": ["densifier"], }, - dataset_properties={'sparse': True} + dataset_properties={"sparse": True}, ) @unittest.skip("Wait until ConfigSpace is fixed.") def test_get_hyperparameter_search_space_dataset_properties(self): cs_mc = SimpleClassificationPipeline.get_hyperparameter_search_space( - dataset_properties={'multiclass': True} + dataset_properties={"multiclass": True} ) - self.assertNotIn('bernoulli_nb', str(cs_mc)) + self.assertNotIn("bernoulli_nb", str(cs_mc)) cs_ml = SimpleClassificationPipeline.get_hyperparameter_search_space( - dataset_properties={'multilabel': True}) - self.assertNotIn('k_nearest_neighbors', str(cs_ml)) - self.assertNotIn('liblinear', str(cs_ml)) - self.assertNotIn('libsvm_svc', str(cs_ml)) - self.assertNotIn('sgd', str(cs_ml)) + dataset_properties={"multilabel": True} + ) + self.assertNotIn("k_nearest_neighbors", str(cs_ml)) + self.assertNotIn("liblinear", str(cs_ml)) + self.assertNotIn("libsvm_svc", str(cs_ml)) + self.assertNotIn("sgd", str(cs_ml)) cs_sp = SimpleClassificationPipeline.get_hyperparameter_search_space( - dataset_properties={'sparse': True}) - self.assertIn('extra_trees', str(cs_sp)) - self.assertIn('gradient_boosting', str(cs_sp)) - self.assertIn('random_forest', str(cs_sp)) + dataset_properties={"sparse": True} + ) + self.assertIn("extra_trees", str(cs_sp)) + self.assertIn("gradient_boosting", str(cs_sp)) + self.assertIn("random_forest", str(cs_sp)) cs_mc_ml = SimpleClassificationPipeline.get_hyperparameter_search_space( - dataset_properties={'multilabel': True, 'multiclass': True}) + dataset_properties={"multilabel": True, "multiclass": True} + ) self.assertEqual(cs_ml, cs_mc_ml) def test_predict_batched(self): @@ -733,12 +820,13 @@ def test_predict_batched(self): ------- * Should expect the output shape to match that of the digits dataset * Should expect a fixed call count each test run - * Should expect predict_proba with `batches` and predict_proba perform near identically + * Should expect predict_proba with `batches` and predict_proba + perform near identically """ - cls = SimpleClassificationPipeline(include={'classifier': ['sgd']}) + cls = SimpleClassificationPipeline(include={"classifier": ["sgd"]}) # Multiclass - X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits') + X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits") with ignore_warnings(classifier_warnings): cls.fit(X_train, Y_train) @@ -764,15 +852,17 @@ def test_predict_batched_sparse(self): ------- * Should expect the output shape to match that of the digits dataset * Should expect a fixed call count each test run - * Should expect predict_proba with `batches` and predict_proba perform near identically + * Should expect predict_proba with `batches` and predict_proba + perform near identically """ cls = SimpleClassificationPipeline( - dataset_properties={'sparse': True}, - include={'classifier': ['sgd']} + dataset_properties={"sparse": True}, include={"classifier": ["sgd"]} ) # Multiclass - X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits', make_sparse=True) + X_train, Y_train, X_test, Y_test = get_dataset( + dataset="digits", make_sparse=True + ) with ignore_warnings(classifier_warnings): cls.fit(X_train, Y_train) @@ -797,11 +887,12 @@ def 
test_predict_proba_batched(self): ------- * Should expect the output shape to match that of the digits dataset * Should expect a fixed call count each test run - * Should expect predict_proba with `batches` and predict_proba perform near identically + * Should expect predict_proba with `batches` and predict_proba + perform near identically """ # Multiclass - cls = SimpleClassificationPipeline(include={'classifier': ['sgd']}) - X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits') + cls = SimpleClassificationPipeline(include={"classifier": ["sgd"]}) + X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits") with ignore_warnings(classifier_warnings): cls.fit(X_train, Y_train) @@ -820,10 +911,11 @@ def test_predict_proba_batched(self): np.testing.assert_array_almost_equal(prediction_, prediction) # Multilabel - cls = SimpleClassificationPipeline(include={'classifier': ['lda']}) - X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits') - Y_train = np.array(list([(list([1 if i != y else 0 for i in range(10)])) - for y in Y_train])) + cls = SimpleClassificationPipeline(include={"classifier": ["lda"]}) + X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits") + Y_train = np.array( + list([(list([1 if i != y else 0 for i in range(10)])) for y in Y_train]) + ) with ignore_warnings(classifier_warnings): cls.fit(X_train, Y_train) @@ -849,15 +941,18 @@ def test_predict_proba_batched_sparse(self): ------- * Should expect the output shape to match that of the digits dataset * Should expect a fixed call count each test run - * Should expect predict_proba with `batches` and predict_proba perform near identically + * Should expect predict_proba with `batches` and predict_proba + perform near identically """ cls = SimpleClassificationPipeline( - dataset_properties={'sparse': True, 'multiclass': True}, - include={'classifier': ['sgd']} + dataset_properties={"sparse": True, "multiclass": True}, + include={"classifier": ["sgd"]}, ) # Multiclass - X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits', make_sparse=True) + X_train, Y_train, X_test, Y_test = get_dataset( + dataset="digits", make_sparse=True + ) X_test_ = X_test.copy() with ignore_warnings(classifier_warnings): @@ -877,10 +972,12 @@ def test_predict_proba_batched_sparse(self): # Multilabel cls = SimpleClassificationPipeline( - dataset_properties={'sparse': True, 'multilabel': True}, - include={'classifier': ['lda']} + dataset_properties={"sparse": True, "multilabel": True}, + include={"classifier": ["lda"]}, + ) + X_train, Y_train, X_test, Y_test = get_dataset( + dataset="digits", make_sparse=True ) - X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits', make_sparse=True) X_test_ = X_test.copy() Y_train = np.array([[1 if i != y else 0 for i in range(10)] for y in Y_train]) @@ -909,7 +1006,7 @@ def test_pipeline_clonability(self): * The cloned object can be constructed from these params * The reconstructed clone and the original have the same param values """ - X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris') + X_train, Y_train, X_test, Y_test = get_dataset(dataset="iris") auto = SimpleClassificationPipeline() @@ -952,18 +1049,24 @@ def test_add_classifier(self): * There should be 1 component after adding a classifier * The classifier should be in the search space of the Pipeline after being added """ - self.assertEqual(len(classification_components.additional_components.components), 0) - self.assertEqual(len(_addons['classification'].components), 0) + self.assertEqual( +
len(classification_components.additional_components.components), 0 + ) + self.assertEqual(len(_addons["classification"].components), 0) classification_components.add_classifier(DummyClassifier) - self.assertEqual(len(classification_components.additional_components.components), 1) - self.assertEqual(len(_addons['classification'].components), 1) + self.assertEqual( + len(classification_components.additional_components.components), 1 + ) + self.assertEqual(len(_addons["classification"].components), 1) cs = SimpleClassificationPipeline().get_hyperparameter_search_space() - self.assertIn('DummyClassifier', str(cs)) + self.assertIn("DummyClassifier", str(cs)) - del classification_components.additional_components.components['DummyClassifier'] + del classification_components.additional_components.components[ + "DummyClassifier" + ] def test_add_preprocessor(self): """Test that preprocessors can be added @@ -972,22 +1075,30 @@ def test_add_preprocessor(self): ------- * There should be 0 components initially * There should be 1 component after adding a preprocessor - * The preprocessor should be in the search space of the Pipeline after being added + * The preprocessor is in the search space of the Pipeline after being added """ - self.assertEqual(len(preprocessing_components.additional_components.components), 0) - self.assertEqual(len(_addons['feature_preprocessing'].components), 0) + self.assertEqual( + len(preprocessing_components.additional_components.components), 0 + ) + self.assertEqual(len(_addons["feature_preprocessing"].components), 0) + preprocessing_components.add_preprocessor(DummyPreprocessor) - self.assertEqual(len(preprocessing_components.additional_components.components), 1) - self.assertEqual(len(_addons['feature_preprocessing'].components), 1) + self.assertEqual( + len(preprocessing_components.additional_components.components), 1 + ) + self.assertEqual(len(_addons["feature_preprocessing"].components), 1) cs = SimpleClassificationPipeline().get_hyperparameter_search_space() - self.assertIn('DummyPreprocessor', str(cs)) + self.assertIn("DummyPreprocessor", str(cs)) - del preprocessing_components.additional_components.components['DummyPreprocessor'] + del preprocessing_components.additional_components.components[ + "DummyPreprocessor" + ] - def _test_set_hyperparameter_choice(self, expected_key, implementation, config_dict): + def _test_set_hyperparameter_choice( + self, expected_key, implementation, config_dict + ): """Given a configuration in config, this procedure makes sure that the given implementation, which should be a Choice component, honors the type of the object, and any hyperparameter associated to it @@ -1001,14 +1112,16 @@ def _test_set_hyperparameter_choice(self, expected_key, implementation, config_d # Are there further hyperparams?
# A choice component might have attribute requirements that we need to check - expected_sub_key = expected_key.replace(':__choice__', ':') + implementation_type + expected_sub_key = ( + expected_key.replace(":__choice__", ":") + implementation_type + ) expected_attributes = {} - if 'data_preprocessor:__choice__' in expected_key: + if "data_preprocessor:__choice__" in expected_key: # We have to check both the numerical and categorical to_check = { - 'numerical_transformer': implementation.choice.numer_ppl.named_steps, - 'categorical_transformer': implementation.choice.categ_ppl.named_steps, - 'text_transformer': implementation.choice.txt_ppl.named_steps, + "numerical_transformer": implementation.choice.numer_ppl.named_steps, + "categorical_transformer": implementation.choice.categ_ppl.named_steps, + "text_transformer": implementation.choice.txt_ppl.named_steps, } for data_type, pipeline in to_check.items(): @@ -1016,8 +1129,7 @@ def _test_set_hyperparameter_choice(self, expected_key, implementation, config_d # If it is a Choice, make sure it is the correct one! if isinstance(sub_step, AutoSklearnChoice): key = "data_preprocessor:feature_type:{}:{}:__choice__".format( - data_type, - sub_name + data_type, sub_name ) keys_checked.extend( self._test_set_hyperparameter_choice( @@ -1029,10 +1141,10 @@ def _test_set_hyperparameter_choice(self, expected_key, implementation, config_d keys_checked.extend( self._test_set_hyperparameter_component( "data_preprocessor:feature_type:{}:{}".format( - data_type, - sub_name + data_type, sub_name ), - sub_step, config_dict + sub_step, + config_dict, ) ) else: @@ -1041,7 +1153,7 @@ def _test_set_hyperparameter_choice(self, expected_key, implementation, config_d for key, value in config_dict.items(): if key != expected_key and expected_sub_key in key: - expected_attributes[key.split(':')[-1]] = value + expected_attributes[key.split(":")[-1]] = value keys_checked.append(key) if expected_attributes: @@ -1053,7 +1165,9 @@ def _test_set_hyperparameter_choice(self, expected_key, implementation, config_d return keys_checked - def _test_set_hyperparameter_component(self, expected_key, implementation, config_dict): + def _test_set_hyperparameter_component( + self, expected_key, implementation, config_dict + ): """ Given a configuration in config, this procedure makes sure that the given implementation, which should be a autosklearn component, honors @@ -1067,15 +1181,14 @@ def _test_set_hyperparameter_component(self, expected_key, implementation, confi for key, value in config_dict.items(): if expected_key in key: keys_checked.append(key) - key = key.replace(expected_key + ':', '') - if ':' in key: - raise ValueError("This utility should only be called with a " - "matching string that produces leaf configurations, " - "that is no further colons are expected, yet key={}" - "".format( - key - ) - ) + key = key.replace(expected_key + ":", "") + if ":" in key: + raise ValueError( + "This utility should only be called with a " + "matching string that produces leaf configurations, " + "that is no further colons are expected, yet key={}" + "".format(key) + ) expected_attributes[key] = value # self.assertDictContainsSubset(expected_attributes, attributes) # Cannot check the whole dictionary, just names, as some @@ -1097,12 +1210,17 @@ def test_set_hyperparameters_honors_configuration(self): """ random_state = 1 all_combinations = list(itertools.product([True, False], repeat=4)) - for sparse, multilabel, signed, multiclass, in all_combinations: + for ( + sparse, + multilabel, + 
signed, + multiclass, + ) in all_combinations: dataset_properties = { - 'sparse': sparse, - 'multilabel': multilabel, - 'multiclass': multiclass, - 'signed': signed, + "sparse": sparse, + "multilabel": multilabel, + "multiclass": multiclass, + "signed": signed, } cls = SimpleClassificationPipeline( random_state=random_state, @@ -1121,36 +1239,37 @@ def test_set_hyperparameters_honors_configuration(self): keys_checked = [] for name, step in cls.named_steps.items(): - if name == 'data_preprocessor': + if name == "data_preprocessor": keys_checked.extend( self._test_set_hyperparameter_choice( - 'data_preprocessor:__choice__', step, config_dict + "data_preprocessor:__choice__", step, config_dict ) ) self.assertEqual(step.random_state, random_state) - elif name == 'balancing': + elif name == "balancing": keys_checked.extend( self._test_set_hyperparameter_component( - 'balancing', - step, config_dict + "balancing", step, config_dict ) ) - elif name == 'feature_preprocessor': + elif name == "feature_preprocessor": keys_checked.extend( self._test_set_hyperparameter_choice( - 'feature_preprocessor:__choice__', step, config_dict + "feature_preprocessor:__choice__", step, config_dict ) ) self.assertEqual(step.random_state, random_state) - elif name == 'classifier': + elif name == "classifier": keys_checked.extend( self._test_set_hyperparameter_choice( - 'classifier:__choice__', step, config_dict + "classifier:__choice__", step, config_dict ) ) self.assertEqual(step.random_state, random_state) else: - raise ValueError("Found another type of step! Need to update this check") + raise ValueError( + "Found another type of step! Need to update this check" + ) # Make sure we checked the whole configuration self.assertSetEqual(set(config_dict.keys()), set(keys_checked)) @@ -1162,18 +1281,18 @@ def test_fit_instantiates_component(self): # We reduce the search space as forbidden clauses prevent to instantiate # the user defined preprocessor manually - cls = SimpleClassificationPipeline( - include={'classifier': ['random_forest']} - ) + cls = SimpleClassificationPipeline(include={"classifier": ["random_forest"]}) cs = cls.get_hyperparameter_search_space() - self.assertIn('CrashPreprocessor', str(cs)) + self.assertIn("CrashPreprocessor", str(cs)) config = cs.sample_configuration() try: - config['feature_preprocessor:__choice__'] = 'CrashPreprocessor' + config["feature_preprocessor:__choice__"] = "CrashPreprocessor" except Exception as e: # In case of failure clean up the components and print enough information # to clean up with check in the future - del preprocessing_components.additional_components.components['CrashPreprocessor'] + del preprocessing_components.additional_components.components[ + "CrashPreprocessor" + ] self.fail("cs={} config={} Exception={}".format(cs, config, e)) cls.set_hyperparameters(config) @@ -1182,7 +1301,9 @@ def test_fit_instantiates_component(self): with ignore_warnings(classifier_warnings): cls.fit( X=np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]), - y=np.array([1, 0, 1, 1]) + y=np.array([1, 0, 1, 1]), ) - del preprocessing_components.additional_components.components['CrashPreprocessor'] + del preprocessing_components.additional_components.components[ + "CrashPreprocessor" + ] diff --git a/test/test_pipeline/test_create_searchspace_util_classification.py b/test/test_pipeline/test_create_searchspace_util_classification.py index 7bf1450979..a830430097 100644 --- a/test/test_pipeline/test_create_searchspace_util_classification.py +++ 
b/test/test_pipeline/test_create_searchspace_util_classification.py @@ -1,20 +1,23 @@ +import unittest from collections import OrderedDict -import unittest import numpy - from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import CategoricalHyperparameter -from autosklearn.pipeline.components.classification.liblinear_svc import LibLinear_SVC +import autosklearn.pipeline.create_searchspace_util from autosklearn.pipeline.components.classification.lda import LDA - +from autosklearn.pipeline.components.classification.liblinear_svc import LibLinear_SVC +from autosklearn.pipeline.components.feature_preprocessing.no_preprocessing import ( + NoPreprocessing, +) from autosklearn.pipeline.components.feature_preprocessing.pca import PCA -from autosklearn.pipeline.components.feature_preprocessing.truncatedSVD import TruncatedSVD -from autosklearn.pipeline.components.feature_preprocessing.no_preprocessing import NoPreprocessing -from autosklearn.pipeline.components.feature_preprocessing.random_trees_embedding \ - import RandomTreesEmbedding -import autosklearn.pipeline.create_searchspace_util +from autosklearn.pipeline.components.feature_preprocessing.random_trees_embedding import ( # noqa: E501 + RandomTreesEmbedding, +) +from autosklearn.pipeline.components.feature_preprocessing.truncatedSVD import ( + TruncatedSVD, +) class TestCreateClassificationSearchspace(unittest.TestCase): @@ -23,9 +26,9 @@ class TestCreateClassificationSearchspace(unittest.TestCase): def test_get_match_array_sparse_and_dense(self): # preproc is empty preprocessors = OrderedDict() - preprocessors['pca'] = PCA + preprocessors["pca"] = PCA classifiers = OrderedDict() - classifiers['lda'] = LDA + classifiers["lda"] = LDA # Sparse + dense class Preprocessors(object): @@ -40,62 +43,69 @@ def get_available_components(self, *args, **kwargs): # Dense m = autosklearn.pipeline.create_searchspace_util.get_match_array( - pipeline=((0, PCA), (1, LDA)), dataset_properties={'sparse': True}) + pipeline=((0, PCA), (1, LDA)), dataset_properties={"sparse": True} + ) self.assertEqual(numpy.sum(m), 0) m = autosklearn.pipeline.create_searchspace_util.get_match_array( - pipeline=((0, PCA), (1, LDA)), dataset_properties={'sparse': False}) + pipeline=((0, PCA), (1, LDA)), dataset_properties={"sparse": False} + ) self.assertEqual(m, [[1]]) # Sparse - preprocessors['tSVD'] = TruncatedSVD + preprocessors["tSVD"] = TruncatedSVD m = autosklearn.pipeline.create_searchspace_util.get_match_array( - pipeline=((0, Preprocessors), (1, LDA)), - dataset_properties={'sparse': True}) + pipeline=((0, Preprocessors), (1, LDA)), dataset_properties={"sparse": True} + ) self.assertEqual(m[0], [0]) # pca self.assertEqual(m[1], [1]) # svd m = autosklearn.pipeline.create_searchspace_util.get_match_array( pipeline=((0, Preprocessors), (1, LDA)), - dataset_properties={'sparse': False}) + dataset_properties={"sparse": False}, + ) self.assertEqual(m[0], [1]) # pca self.assertEqual(m[1], [0]) # svd - preprocessors['none'] = NoPreprocessing + preprocessors["none"] = NoPreprocessing m = autosklearn.pipeline.create_searchspace_util.get_match_array( - pipeline=((0, Preprocessors), (1, LDA)), - dataset_properties={'sparse': True}) + pipeline=((0, Preprocessors), (1, LDA)), dataset_properties={"sparse": True} + ) self.assertEqual(m[0, :], [0]) # pca self.assertEqual(m[1, :], [1]) # tsvd self.assertEqual(m[2, :], [0]) # none m = autosklearn.pipeline.create_searchspace_util.get_match_array( pipeline=((0, Preprocessors), (1, LDA)), - 
dataset_properties={'sparse': False}) + dataset_properties={"sparse": False}, + ) self.assertEqual(m[0, :], [1]) # pca self.assertEqual(m[1, :], [0]) # tsvd self.assertEqual(m[2, :], [1]) # none - classifiers['libsvm'] = LibLinear_SVC + classifiers["libsvm"] = LibLinear_SVC m = autosklearn.pipeline.create_searchspace_util.get_match_array( pipeline=((0, Preprocessors), (1, Classifiers)), - dataset_properties={'sparse': False}) + dataset_properties={"sparse": False}, + ) self.assertListEqual(list(m[0, :]), [1, 1]) # pca self.assertListEqual(list(m[1, :]), [0, 0]) # tsvd self.assertListEqual(list(m[2, :]), [1, 1]) # none m = autosklearn.pipeline.create_searchspace_util.get_match_array( pipeline=((0, Preprocessors), (1, Classifiers)), - dataset_properties={'sparse': True}) + dataset_properties={"sparse": True}, + ) self.assertListEqual(list(m[0, :]), [0, 0]) # pca self.assertListEqual(list(m[1, :]), [1, 1]) # tsvd self.assertListEqual(list(m[2, :]), [0, 1]) # none # Do fancy 3d stuff - preprocessors['random_trees'] = RandomTreesEmbedding + preprocessors["random_trees"] = RandomTreesEmbedding m = autosklearn.pipeline.create_searchspace_util.get_match_array( pipeline=((0, Preprocessors), (1, Preprocessors), (2, Classifiers)), - dataset_properties={'sparse': False}) + dataset_properties={"sparse": False}, + ) # PCA followed by truncated SVD is forbidden self.assertEqual(list(m[0].flatten()), [1, 1, 0, 0, 1, 1, 0, 1]) # Truncated SVD is forbidden @@ -112,28 +122,38 @@ def test_get_match_array_signed_unsigned_and_binary(self): @unittest.skip("Not currently working.") def test_add_forbidden(self): m = numpy.ones([2, 3]) - preprocessors_list = ['pa', 'pb'] - classifier_list = ['ca', 'cb', 'cc'] + preprocessors_list = ["pa", "pb"] + classifier_list = ["ca", "cb", "cc"] cs = ConfigurationSpace() - preprocessor = CategoricalHyperparameter(name='feature_preprocessor', - choices=preprocessors_list) - classifier = CategoricalHyperparameter(name='classifier', - choices=classifier_list) + preprocessor = CategoricalHyperparameter( + name="feature_preprocessor", choices=preprocessors_list + ) + classifier = CategoricalHyperparameter( + name="classifier", choices=classifier_list + ) cs.add_hyperparameter(preprocessor) cs.add_hyperparameter(classifier) new_cs = autosklearn.pipeline.create_searchspace_util.add_forbidden( - conf_space=cs, node_0_list=preprocessors_list, - node_1_list=classifier_list, matches=m, - node_0_name='feature_preprocessor', node_1_name="classifier") + conf_space=cs, + node_0_list=preprocessors_list, + node_1_list=classifier_list, + matches=m, + node_0_name="feature_preprocessor", + node_1_name="classifier", + ) self.assertEqual(len(new_cs.forbidden_clauses), 0) self.assertIsInstance(new_cs, ConfigurationSpace) m[1, 1] = 0 new_cs = autosklearn.pipeline.create_searchspace_util.add_forbidden( - conf_space=cs, node_0_list=preprocessors_list, - node_1_list=classifier_list, matches=m, - node_0_name='feature_preprocessor', node_1_name="classifier") + conf_space=cs, + node_0_list=preprocessors_list, + node_1_list=classifier_list, + matches=m, + node_0_name="feature_preprocessor", + node_1_name="classifier", + ) self.assertEqual(len(new_cs.forbidden_clauses), 1) - self.assertEqual(new_cs.forbidden_clauses[0].components[0].value, 'cb') - self.assertEqual(new_cs.forbidden_clauses[0].components[1].value, 'pb') + self.assertEqual(new_cs.forbidden_clauses[0].components[0].value, "cb") + self.assertEqual(new_cs.forbidden_clauses[0].components[1].value, "pb") self.assertIsInstance(new_cs, 
ConfigurationSpace) diff --git a/test/test_pipeline/test_regression.py b/test/test_pipeline/test_regression.py index fccf59af67..501b73ec5d 100644 --- a/test/test_pipeline/test_regression.py +++ b/test/test_pipeline/test_regression.py @@ -5,28 +5,36 @@ import unittest import unittest.mock -from joblib import Memory import numpy as np import sklearn.datasets import sklearn.decomposition -from sklearn.base import clone import sklearn.ensemble import sklearn.svm -from sklearn.utils.validation import check_is_fitted - from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import CategoricalHyperparameter +from joblib import Memory +from sklearn.base import clone +from sklearn.utils.validation import check_is_fitted -from autosklearn.pipeline.regression import SimpleRegressionPipeline -from autosklearn.pipeline.components.base import \ - AutoSklearnPreprocessingAlgorithm, AutoSklearnRegressionAlgorithm -import autosklearn.pipeline.components.regression as regression_components -from autosklearn.pipeline.components.base import AutoSklearnComponent, AutoSklearnChoice import autosklearn.pipeline.components.feature_preprocessing as preprocessing_components +import autosklearn.pipeline.components.regression as regression_components +from autosklearn.pipeline.components.base import ( + AutoSklearnChoice, + AutoSklearnComponent, + AutoSklearnPreprocessingAlgorithm, + AutoSklearnRegressionAlgorithm, +) +from autosklearn.pipeline.constants import ( + DENSE, + PREDICTIONS, + SIGNED_DATA, + SPARSE, + UNSIGNED_DATA, +) +from autosklearn.pipeline.regression import SimpleRegressionPipeline from autosklearn.pipeline.util import get_dataset -from autosklearn.pipeline.constants import SPARSE, DENSE, SIGNED_DATA, UNSIGNED_DATA, PREDICTIONS -from test.test_pipeline.ignored_warnings import regressor_warnings, ignore_warnings +from test.test_pipeline.ignored_warnings import ignore_warnings, regressor_warnings class SimpleRegressionPipelineTest(unittest.TestCase): @@ -38,41 +46,43 @@ def test_io_dict(self): if regressors[r] == regression_components.RegressorChoice: continue props = regressors[r].get_properties() - self.assertIn('input', props) - self.assertIn('output', props) - inp = props['input'] - output = props['output'] + self.assertIn("input", props) + self.assertIn("output", props) + inp = props["input"] + output = props["output"] self.assertIsInstance(inp, tuple) self.assertIsInstance(output, tuple) for i in inp: self.assertIn(i, (SPARSE, DENSE, SIGNED_DATA, UNSIGNED_DATA)) self.assertEqual(output, (PREDICTIONS,)) - self.assertIn('handles_regression', props) - self.assertTrue(props['handles_regression']) - self.assertIn('handles_classification', props) - self.assertIn('handles_multiclass', props) - self.assertIn('handles_multilabel', props) - self.assertIn('handles_multioutput', props) - self.assertFalse(props['handles_classification']) - self.assertFalse(props['handles_multiclass']) - self.assertFalse(props['handles_multilabel']) + self.assertIn("handles_regression", props) + self.assertTrue(props["handles_regression"]) + self.assertIn("handles_classification", props) + self.assertIn("handles_multiclass", props) + self.assertIn("handles_multilabel", props) + self.assertIn("handles_multioutput", props) + self.assertFalse(props["handles_classification"]) + self.assertFalse(props["handles_multiclass"]) + self.assertFalse(props["handles_multilabel"]) def test_find_regressors(self): regressors = regression_components._regressors self.assertGreaterEqual(len(regressors), 1) 
for key in regressors: - if hasattr(regressors[key], 'get_components'): + if hasattr(regressors[key], "get_components"): continue self.assertIn(AutoSklearnRegressionAlgorithm, regressors[key].__bases__) def test_find_preprocessors(self): preprocessors = preprocessing_components._preprocessors - self.assertGreaterEqual(len(preprocessors), 1) + self.assertGreaterEqual(len(preprocessors), 1) for key in preprocessors: - if hasattr(preprocessors[key], 'get_components'): + if hasattr(preprocessors[key], "get_components"): continue - self.assertIn(AutoSklearnPreprocessingAlgorithm, preprocessors[key].__bases__) + self.assertIn( + AutoSklearnPreprocessingAlgorithm, preprocessors[key].__bases__ + ) def test_configurations(self): cs = SimpleRegressionPipeline().get_hyperparameter_search_space() @@ -80,27 +90,28 @@ def test_configurations(self): self._test_configurations(cs) def test_configurations_signed_data(self): - dataset_properties = {'signed': True} - cs = SimpleRegressionPipeline(dataset_properties=dataset_properties).\ - get_hyperparameter_search_space() + dataset_properties = {"signed": True} + cs = SimpleRegressionPipeline( + dataset_properties=dataset_properties + ).get_hyperparameter_search_space() - self._test_configurations(configurations_space=cs, - dataset_properties=dataset_properties) + self._test_configurations( + configurations_space=cs, dataset_properties=dataset_properties + ) def test_configurations_sparse(self): - dataset_properties = {'sparse': True} + dataset_properties = {"sparse": True} cs = SimpleRegressionPipeline( dataset_properties=dataset_properties ).get_hyperparameter_search_space() - self._test_configurations(cs, make_sparse=True, - dataset_properties=dataset_properties) + self._test_configurations( + cs, make_sparse=True, dataset_properties=dataset_properties + ) def test_multioutput(self): cache = Memory(location=tempfile.gettempdir()) - cached_func = cache.cache( - sklearn.datasets.make_regression - ) + cached_func = cache.cache(sklearn.datasets.make_regression) X, Y = cached_func( n_samples=250, n_features=20, @@ -112,24 +123,33 @@ def test_multioutput(self): noise=0.3, shuffle=True, coef=False, - random_state=1 + random_state=1, ) X_train = X[:200, :] Y_train = Y[:200, :] X_test = X[200:, :] Y_test = Y[200:, :] - data = {'X_train': X_train, 'Y_train': Y_train, - 'X_test': X_test, 'Y_test': Y_test} + data = { + "X_train": X_train, + "Y_train": Y_train, + "X_test": X_test, + "Y_test": Y_test, + } - dataset_properties = {'multioutput': True} + dataset_properties = {"multioutput": True} pipeline = SimpleRegressionPipeline(dataset_properties=dataset_properties) cs = pipeline.get_hyperparameter_search_space() self._test_configurations(cs, data=data, dataset_properties=dataset_properties) - def _test_configurations(self, configurations_space, make_sparse=False, - data=None, dataset_properties=None): + def _test_configurations( + self, + configurations_space, + make_sparse=False, + data=None, + dataset_properties=None, + ): # Use a limit of ~4GiB limit = 3072 * 1024 * 1024 resource.setrlimit(resource.RLIMIT_AS, (limit, limit)) @@ -141,42 +161,48 @@ def _test_configurations(self, configurations_space, make_sparse=False, config._populate_values() # Restrict configurations which could take too long on travis-ci - restrictions = {'regressor:adaboost:n_estimators': 50, - 'regressor:adaboost:max_depth': 1, - 'feature_preprocessor:kernel_pca:n_components': 10, - 'feature_preprocessor:kitchen_sinks:n_components': 50, - 'regressor:libsvm_svc:degree': 2, - 
'regressor:libsvm_svr:degree': 2, - 'regressor:libsvm_svr:C': 1., - 'feature_preprocessor:truncatedSVD:target_dim': 10, - 'feature_preprocessor:polynomial:degree': 2, - 'regressor:lda:n_components': 10} + restrictions = { + "regressor:adaboost:n_estimators": 50, + "regressor:adaboost:max_depth": 1, + "feature_preprocessor:kernel_pca:n_components": 10, + "feature_preprocessor:kitchen_sinks:n_components": 50, + "regressor:libsvm_svc:degree": 2, + "regressor:libsvm_svr:degree": 2, + "regressor:libsvm_svr:C": 1.0, + "feature_preprocessor:truncatedSVD:target_dim": 10, + "feature_preprocessor:polynomial:degree": 2, + "regressor:lda:n_components": 10, + } for restrict_parameter in restrictions: restrict_to = restrictions[restrict_parameter] - if restrict_parameter in config and config[restrict_parameter] is not None: + if ( + restrict_parameter in config + and config[restrict_parameter] is not None + ): config._values[restrict_parameter] = restrict_to if data is None: X_train, Y_train, X_test, Y_test = get_dataset( - dataset='boston', make_sparse=make_sparse, add_NaNs=True) + dataset="boston", make_sparse=make_sparse, add_NaNs=True + ) else: - X_train = data['X_train'].copy() - Y_train = data['Y_train'].copy() - X_test = data['X_test'].copy() - data['Y_test'].copy() + X_train = data["X_train"].copy() + Y_train = data["Y_train"].copy() + X_test = data["X_test"].copy() + data["Y_test"].copy() cls = SimpleRegressionPipeline( - random_state=1, - dataset_properties=dataset_properties + random_state=1, dataset_properties=dataset_properties ) cls.set_hyperparameters(config) # First make sure that for this configuration, setting the parameters # does not mistakenly set the estimator as fitted for name, step in cls.named_steps.items(): - with self.assertRaisesRegex(sklearn.exceptions.NotFittedError, - "instance is not fitted yet"): + with self.assertRaisesRegex( + sklearn.exceptions.NotFittedError, "instance is not fitted yet" + ): check_is_fitted(step) try: @@ -190,9 +216,9 @@ def _test_configurations(self, configurations_space, make_sparse=False, for name, step in cls.named_steps.items(): check_is_fitted(step) except sklearn.exceptions.NotFittedError: - self.fail("config={} raised NotFittedError unexpectedly!".format( - config - )) + self.fail( + "config={} raised NotFittedError unexpectedly!".format(config) + ) cls.predict(X_test) except MemoryError: @@ -200,8 +226,7 @@ def _test_configurations(self, configurations_space, make_sparse=False, except np.linalg.LinAlgError: continue except ValueError as e: - if "Floating-point under-/overflow occurred at epoch" in \ - e.args[0]: + if "Floating-point under-/overflow occurred at epoch" in e.args[0]: continue elif "removed all features" in e.args[0]: continue @@ -209,13 +234,17 @@ def _test_configurations(self, configurations_space, make_sparse=False, continue elif "Numerical problems in QDA" in e.args[0]: continue - elif 'Bug in scikit-learn' in e.args[0]: + elif "Bug in scikit-learn" in e.args[0]: continue - elif 'The condensed distance matrix must contain only finite ' \ - 'values.' in e.args[0]: + elif ( + "The condensed distance matrix must contain only finite " + "values." 
in e.args[0] + ): continue - elif "zero-size array to reduction operation maximum which has no " \ - "identity" in e.args[0]: + elif ( + "zero-size array to reduction operation maximum which has no " + "identity" in e.args[0] + ): continue else: e.args += (f"config={config}",) @@ -244,7 +273,10 @@ def _test_configurations(self, configurations_space, make_sparse=False, raise e except Exception as e: - if "Multiple input features cannot have the same target value" in e.args[0]: + if ( + "Multiple input features cannot have the same target value" + in e.args[0] + ): continue else: e.args += (f"config={config}",) @@ -252,7 +284,7 @@ def _test_configurations(self, configurations_space, make_sparse=False, def test_default_configuration(self): for i in range(2): - X_train, Y_train, X_test, Y_test = get_dataset(dataset='diabetes') + X_train, Y_train, X_test, Y_test = get_dataset(dataset="diabetes") auto = SimpleRegressionPipeline(random_state=1) auto = auto.fit(X_train, Y_train) predictions = auto.predict(copy.deepcopy(X_test)) @@ -266,16 +298,15 @@ def test_default_configuration_iterative_fit(self): regressor = SimpleRegressionPipeline( random_state=1, include={ - 'regressor': ['random_forest'], - 'feature_preprocessor': ['no_preprocessing'] - } + "regressor": ["random_forest"], + "feature_preprocessor": ["no_preprocessing"], + }, ) - X_train, Y_train, X_test, Y_test = get_dataset(dataset='boston') + X_train, Y_train, X_test, Y_test = get_dataset(dataset="boston") regressor.fit_transformer(X_train, Y_train) for i in range(1, 11): regressor.iterative_fit(X_train, Y_train) - self.assertEqual(regressor.steps[-1][-1].choice.estimator.n_estimators, - i) + self.assertEqual(regressor.steps[-1][-1].choice.estimator.n_estimators, i) def test_repr(self): representation = repr(SimpleRegressionPipeline()) @@ -293,56 +324,50 @@ def test_get_hyperparameter_search_space(self): self.assertEqual(len(forbiddens), 35) def test_get_hyperparameter_search_space_include_exclude_models(self): - regressor = SimpleRegressionPipeline( - include={'regressor': ['random_forest']} - ) + regressor = SimpleRegressionPipeline(include={"regressor": ["random_forest"]}) cs = regressor.get_hyperparameter_search_space() self.assertEqual( - cs.get_hyperparameter('regressor:__choice__'), - CategoricalHyperparameter('regressor:__choice__', ['random_forest']), + cs.get_hyperparameter("regressor:__choice__"), + CategoricalHyperparameter("regressor:__choice__", ["random_forest"]), ) # TODO add this test when more than one regressor is present - regressor = SimpleRegressionPipeline( - exclude={'regressor': ['random_forest']} - ) + regressor = SimpleRegressionPipeline(exclude={"regressor": ["random_forest"]}) cs = regressor.get_hyperparameter_search_space() - self.assertNotIn('random_forest', str(cs)) + self.assertNotIn("random_forest", str(cs)) - regressor = SimpleRegressionPipeline( - include={'feature_preprocessor': ['pca']} - ) + regressor = SimpleRegressionPipeline(include={"feature_preprocessor": ["pca"]}) cs = regressor.get_hyperparameter_search_space() - self.assertEqual(cs.get_hyperparameter( - 'feature_preprocessor:__choice__'), - CategoricalHyperparameter('feature_preprocessor:__choice__', ['pca'])) + self.assertEqual( + cs.get_hyperparameter("feature_preprocessor:__choice__"), + CategoricalHyperparameter("feature_preprocessor:__choice__", ["pca"]), + ) regressor = SimpleRegressionPipeline( - exclude={'feature_preprocessor': ['no_preprocessing']} + exclude={"feature_preprocessor": ["no_preprocessing"]} ) cs = 
regressor.get_hyperparameter_search_space() - self.assertNotIn('no_preprocessing', str(cs)) + self.assertNotIn("no_preprocessing", str(cs)) - def test_get_hyperparameter_search_space_preprocessor_contradicts_default_classifier( - self + def test_get_hyperparameter_search_space_preprocessor_contradicts_default( + self, ): regressor = SimpleRegressionPipeline( - include={'feature_preprocessor': ['densifier']}, - dataset_properties={'sparse': True} + include={"feature_preprocessor": ["densifier"]}, + dataset_properties={"sparse": True}, ) cs = regressor.get_hyperparameter_search_space() self.assertEqual( - cs.get_hyperparameter('regressor:__choice__').default_value, - 'gradient_boosting' + cs.get_hyperparameter("regressor:__choice__").default_value, + "gradient_boosting", ) regressor = SimpleRegressionPipeline( - include={'feature_preprocessor': ['nystroem_sampler']} + include={"feature_preprocessor": ["nystroem_sampler"]} ) cs = regressor.get_hyperparameter_search_space() self.assertEqual( - cs.get_hyperparameter('regressor:__choice__').default_value, - 'sgd' + cs.get_hyperparameter("regressor:__choice__").default_value, "sgd" ) def test_get_hyperparameter_search_space_only_forbidden_combinations(self): @@ -351,9 +376,9 @@ def test_get_hyperparameter_search_space_only_forbidden_combinations(self): "Cannot find a legal default configuration.", SimpleRegressionPipeline, include={ - 'regressor': ['random_forest'], - 'feature_preprocessor': ['kitchen_sinks'] - } + "regressor": ["random_forest"], + "feature_preprocessor": ["kitchen_sinks"], + }, ) # It must also be caught that no classifiers which can handle sparse @@ -363,14 +388,16 @@ def test_get_hyperparameter_search_space_only_forbidden_combinations(self): "Cannot find a legal default configuration", SimpleRegressionPipeline, include={ - 'regressor': ['extra_trees'], - 'feature_preprocessor': ['densifier'] + "regressor": ["extra_trees"], + "feature_preprocessor": ["densifier"], }, - dataset_properties={'sparse': True} + dataset_properties={"sparse": True}, ) - @unittest.skip("test_get_hyperparameter_search_space_dataset_properties" + - " Not yet Implemented") + @unittest.skip( + "test_get_hyperparameter_search_space_dataset_properties" + + " Not yet Implemented" + ) def test_get_hyperparameter_search_space_dataset_properties(self): # TODO: We do not have any dataset properties for regression, so this # test is somewhat stupid @@ -403,16 +430,14 @@ def test_get_hyperparameter_search_space_dataset_properties(self): """ def test_predict_batched(self): - include = {'regressor': ['decision_tree']} + include = {"regressor": ["decision_tree"]} cs = SimpleRegressionPipeline(include=include).get_hyperparameter_search_space() default = cs.get_default_configuration() regressor = SimpleRegressionPipeline( - config=default, - random_state=1, - include=include + config=default, random_state=1, include=include ) - X_train, Y_train, X_test, Y_test = get_dataset(dataset='boston') + X_train, Y_train, X_test, Y_test = get_dataset(dataset="boston") regressor.fit(X_train, Y_train) X_test_ = X_test.copy() prediction_ = regressor.predict(X_test_) @@ -424,12 +449,11 @@ def test_predict_batched(self): np.testing.assert_array_almost_equal(prediction_, prediction) def test_predict_batched_sparse(self): - dataset_properties = {'sparse': True} - include = {'regressor': ['decision_tree']} + dataset_properties = {"sparse": True} + include = {"regressor": ["decision_tree"]} cs = SimpleRegressionPipeline( - dataset_properties=dataset_properties, - include=include +
dataset_properties=dataset_properties, include=include ).get_hyperparameter_search_space() default = cs.get_default_configuration() @@ -437,11 +461,12 @@ def test_predict_batched_sparse(self): config=default, random_state=1, dataset_properties=dataset_properties, - include=include + include=include, ) - X_train, Y_train, X_test, Y_test = get_dataset(dataset='boston', - make_sparse=True) + X_train, Y_train, X_test, Y_test = get_dataset( + dataset="boston", make_sparse=True + ) regressor.fit(X_train, Y_train) X_test_ = X_test.copy() prediction_ = regressor.predict(X_test_) @@ -465,7 +490,7 @@ def test_validate_input_Y(self): raise NotImplementedError() def test_pipeline_clonability(self): - X_train, Y_train, X_test, Y_test = get_dataset(dataset='boston') + X_train, Y_train, X_test, Y_test = get_dataset(dataset="boston") auto = SimpleRegressionPipeline(random_state=1) auto = auto.fit(X_train, Y_train) auto_clone = clone(auto) @@ -494,7 +519,9 @@ def test_set_params(self): def test_get_params(self): pass - def _test_set_hyperparameter_choice(self, expected_key, implementation, config_dict): + def _test_set_hyperparameter_choice( + self, expected_key, implementation, config_dict + ): """ Given a configuration in config, this procedure makes sure that the given implementation, which should be a Choice component, honors @@ -507,14 +534,16 @@ def _test_set_hyperparameter_choice(self, expected_key, implementation, config_d # Are there further hyperparams? # A choice component might have attribute requirements that we need to check - expected_sub_key = expected_key.replace(':__choice__', ':') + implementation_type + expected_sub_key = ( + expected_key.replace(":__choice__", ":") + implementation_type + ) expected_attributes = {} - if 'data_preprocessor:__choice__' in expected_key: + if "data_preprocessor:__choice__" in expected_key: # We have to check both the numerical and categorical to_check = { - 'numerical_transformer': implementation.choice.numer_ppl.named_steps, - 'categorical_transformer': implementation.choice.categ_ppl.named_steps, - 'text_transformer': implementation.choice.txt_ppl.named_steps, + "numerical_transformer": implementation.choice.numer_ppl.named_steps, + "categorical_transformer": implementation.choice.categ_ppl.named_steps, + "text_transformer": implementation.choice.txt_ppl.named_steps, } for data_type, pipeline in to_check.items(): @@ -522,8 +551,7 @@ def _test_set_hyperparameter_choice(self, expected_key, implementation, config_d # If it is a Choice, make sure it is the correct one! 
if isinstance(sub_step, AutoSklearnChoice): key = "data_preprocessor:feature_type:{}:{}:__choice__".format( - data_type, - sub_name + data_type, sub_name ) keys_checked.extend( self._test_set_hyperparameter_choice( @@ -535,10 +563,10 @@ def _test_set_hyperparameter_choice(self, expected_key, implementation, config_d keys_checked.extend( self._test_set_hyperparameter_component( "data_preprocessor:feature_type:{}:{}".format( - data_type, - sub_name + data_type, sub_name ), - sub_step, config_dict + sub_step, + config_dict, ) ) else: @@ -547,7 +575,7 @@ def _test_set_hyperparameter_choice(self, expected_key, implementation, config_d else: for key, value in config_dict.items(): if key != expected_key and expected_sub_key in key: - expected_attributes[key.split(':')[-1]] = value + expected_attributes[key.split(":")[-1]] = value keys_checked.append(key) if expected_attributes: attributes = vars(implementation.choice) @@ -557,7 +585,9 @@ def _test_set_hyperparameter_choice(self, expected_key, implementation, config_d self.assertIn(expected_attribute, attributes.keys()) return keys_checked - def _test_set_hyperparameter_component(self, expected_key, implementation, config_dict): + def _test_set_hyperparameter_component( + self, expected_key, implementation, config_dict + ): """ Given a configuration in config, this procedure makes sure that the given implementation, which should be a autosklearn component, honors @@ -569,15 +599,14 @@ def _test_set_hyperparameter_component(self, expected_key, implementation, confi for key, value in config_dict.items(): if expected_key in key: keys_checked.append(key) - key = key.replace(expected_key + ':', '') - if ':' in key: - raise ValueError("This utility should only be called with a " - "matching string that produces leaf configurations, " - "that is no further colons are expected, yet key={}" - "".format( - key - ) - ) + key = key.replace(expected_key + ":", "") + if ":" in key: + raise ValueError( + "This utility should only be called with a " + "matching string that produces leaf configurations, " + "that is no further colons are expected, yet key={}" + "".format(key) + ) expected_attributes[key] = value # Cannot check the whole dictionary, just names, as some # classes map the text hyperparameter directly to a function! 
@@ -598,12 +627,17 @@ def test_set_hyperparameters_honors_configuration(self): """ all_combinations = list(itertools.product([True, False], repeat=4)) - for sparse, multilabel, signed, multiclass, in all_combinations: + for ( + sparse, + multilabel, + signed, + multiclass, + ) in all_combinations: dataset_properties = { - 'sparse': sparse, - 'multilabel': multilabel, - 'multiclass': multiclass, - 'signed': signed, + "sparse": sparse, + "multilabel": multilabel, + "multiclass": multiclass, + "signed": signed, } random_state = 1 auto = SimpleRegressionPipeline( @@ -623,31 +657,32 @@ def test_set_hyperparameters_honors_configuration(self): keys_checked = [] for name, step in auto.named_steps.items(): - if name == 'data_preprocessor': + if name == "data_preprocessor": keys_checked.extend( self._test_set_hyperparameter_choice( - 'data_preprocessor:__choice__', step, config_dict + "data_preprocessor:__choice__", step, config_dict ) ) self.assertEqual(step.random_state, random_state) - elif name == 'feature_preprocessor': + elif name == "feature_preprocessor": keys_checked.extend( self._test_set_hyperparameter_choice( - 'feature_preprocessor:__choice__', step, config_dict + "feature_preprocessor:__choice__", step, config_dict ) ) self.assertEqual(step.random_state, random_state) - elif name == 'regressor': + elif name == "regressor": keys_checked.extend( self._test_set_hyperparameter_choice( - 'regressor:__choice__', step, config_dict + "regressor:__choice__", step, config_dict ) ) self.assertEqual(step.random_state, random_state) else: - raise ValueError("Found another type of step! Need to update this check" - " {}. ".format(name) - ) + raise ValueError( + "Found another type of step! Need to update this check" + " {}. ".format(name) + ) # Make sure we checked the whole configuration self.assertSetEqual(set(config_dict.keys()), set(keys_checked)) diff --git a/test/test_scripts/test_metadata_generation.py b/test/test_scripts/test_metadata_generation.py index 6cc4fad38d..6c6ba70ef5 100644 --- a/test/test_scripts/test_metadata_generation.py +++ b/test/test_scripts/test_metadata_generation.py @@ -13,27 +13,29 @@ class TestMetadataGeneration(unittest.TestCase): - def setUp(self): - self.working_directory = '/tmp/autosklearn-unittest-tmp-dir-%s-%d-%d' % ( - socket.gethostname(), os.getpid(), random.randint(0, 1000000)) + self.working_directory = "/tmp/autosklearn-unittest-tmp-dir-%s-%d-%d" % ( + socket.gethostname(), + os.getpid(), + random.randint(0, 1000000), + ) def print_files(self): - print('Existing files:') + print("Existing files:") for dirpath, dirnames, filenames in os.walk(self.working_directory): print(dirpath, dirnames, filenames) def test_metadata_generation(self): regression_task_id = 360029 - regression_dataset_name = 'SWD'.lower() + regression_dataset_name = "SWD".lower() classification_task_id = 245 - classification_dataset_name = 'breast-w'.lower() + classification_dataset_name = "breast-w".lower() current_directory = __file__ - scripts_directory = os.path.abspath(os.path.join(current_directory, - '..', '..', '..', - 'scripts')) + scripts_directory = os.path.abspath( + os.path.join(current_directory, "..", "..", "..", "scripts") + ) # 1. create working directory try: @@ -44,214 +46,293 @@ def test_metadata_generation(self): # 2. should be done by the person running the unit tests! # 3. 
create configuration commands - script_filename = os.path.join(scripts_directory, '01_create_commands.py') - cmd = 'python3 %s --working-directory %s --test' % (script_filename, self.working_directory) - rval = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + script_filename = os.path.join(scripts_directory, "01_create_commands.py") + cmd = "python3 %s --working-directory %s --test" % ( + script_filename, + self.working_directory, + ) + rval = subprocess.run( + cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) self.assertEqual(rval.returncode, 0, msg=str(rval)) # 4. run one of the commands to get some data - commands_output_file = os.path.join(self.working_directory, 'metadata_commands.txt') + commands_output_file = os.path.join( + self.working_directory, "metadata_commands.txt" + ) self.assertTrue(os.path.exists(commands_output_file)) with open(commands_output_file) as fh: - cmds = fh.read().split('\n') - # 6 regression, 7 classification (roc_auc + task 258 is illegal), 1 empty line - self.assertEqual(len(cmds), 18, msg='\n'.join(cmds)) + cmds = fh.read().split("\n") + # 6 regression, 7 classification (roc_auc + task 258 is illegal), + # 1 empty line + self.assertEqual(len(cmds), 18, msg="\n".join(cmds)) for task_id, dataset_name, task_type, metric in ( ( classification_task_id, classification_dataset_name, - 'classification', - 'balanced_accuracy', + "classification", + "balanced_accuracy", ), - (regression_task_id, regression_dataset_name, 'regression', 'r2') + (regression_task_id, regression_dataset_name, "regression", "r2"), ): cmd = None with open(commands_output_file) as fh: while True: cmd = fh.readline() - if 'task-id %d' % task_id in cmd and metric in cmd: + if "task-id %d" % task_id in cmd and metric in cmd: break if cmd is None: - self.fail('Did not find a command for task_id %s and metric %s in %s' - % (task_id, metric, cmds)) + self.fail( + "Did not find a command for task_id %s and metric %s in %s" + % (task_id, metric, cmds) + ) - self.assertIn('time-limit 86400', cmd) - self.assertIn('per-run-time-limit 1800', cmd) - cmd = cmd.replace('time-limit 86400', 'time-limit 60').replace( - 'per-run-time-limit 1800', 'per-run-time-limit 5') + self.assertIn("time-limit 86400", cmd) + self.assertIn("per-run-time-limit 1800", cmd) + cmd = cmd.replace("time-limit 86400", "time-limit 60").replace( + "per-run-time-limit 1800", "per-run-time-limit 5" + ) # This tells the script to use the same memory limit for testing as # for training. In production, it would use twice as much! 
- cmd = cmd.replace('-s 1', '-s 1 --unittest') - print('COMMAND: %s' % cmd) - rval = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - print('STDOUT: %s' % repr(rval.stdout), flush=True) - print('STDERR: %s' % repr(rval.stderr), flush=True) + cmd = cmd.replace("-s 1", "-s 1 --unittest") + print("COMMAND: %s" % cmd) + rval = subprocess.run( + cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + print("STDOUT: %s" % repr(rval.stdout), flush=True) + print("STDERR: %s" % repr(rval.stderr), flush=True) self.print_files() - expected_output_directory = os.path.join(self.working_directory, - 'configuration', - task_type, - str(task_id), metric, - 'auto-sklearn-output') - self.assertTrue(os.path.exists(expected_output_directory), - msg=expected_output_directory) - smac_log = os.path.join(expected_output_directory, 'AutoML(1):%s.log' % dataset_name) + expected_output_directory = os.path.join( + self.working_directory, + "configuration", + task_type, + str(task_id), + metric, + "auto-sklearn-output", + ) + self.assertTrue( + os.path.exists(expected_output_directory), msg=expected_output_directory + ) + smac_log = os.path.join( + expected_output_directory, "AutoML(1):%s.log" % dataset_name + ) with open(smac_log) as fh: smac_output = fh.read() - self.assertEqual(rval.returncode, 0, msg=str(rval) + '\n' + smac_output) - expected_validation_output = os.path.join(expected_output_directory, '..', - 'validation_trajectory_1.json') + self.assertEqual(rval.returncode, 0, msg=str(rval) + "\n" + smac_output) + expected_validation_output = os.path.join( + expected_output_directory, "..", "validation_trajectory_1.json" + ) self.assertTrue(os.path.exists(expected_validation_output)) - trajectory = os.path.join(expected_output_directory, - 'smac3-output', 'run_1', 'trajectory.json') + trajectory = os.path.join( + expected_output_directory, "smac3-output", "run_1", "trajectory.json" + ) with open(expected_validation_output) as fh_validation: with open(trajectory) as fh_trajectory: traj = json.load(fh_trajectory) valid_traj = json.load(fh_validation) - print('Validation trajectory:') + print("Validation trajectory:") print(valid_traj) self.assertGreater(len(traj), 2, msg=str(valid_traj)) self.assertEqual(len(traj), len(valid_traj), msg=str(valid_traj)) for entry in valid_traj: - if task_type == 'classification': + if task_type == "classification": for metric in CLASSIFICATION_METRICS: # This is a multilabel metric - if metric in ('precision_samples', 'recall_samples', 'f1_samples'): + if metric in ( + "precision_samples", + "recall_samples", + "f1_samples", + ): continue self.assertIn(metric, entry[-1]) self.assertIsInstance(entry[-1][metric], float) - self.assertTrue(np.isfinite(entry[-1][metric]), - (metric, str(entry[-1][metric]))) + self.assertTrue( + np.isfinite(entry[-1][metric]), + (metric, str(entry[-1][metric])), + ) else: for metric in REGRESSION_METRICS: self.assertIn(metric, entry[-1]) self.assertIsInstance(entry[-1][metric], float) - self.assertTrue(np.isfinite(entry[-1][metric]), - (metric, str(entry[-1][metric]))) + self.assertTrue( + np.isfinite(entry[-1][metric]), + (metric, str(entry[-1][metric])), + ) # 5. 
Get the test performance of these configurations - script_filename = os.path.join(scripts_directory, '02_retrieve_metadata.py') - cmd = 'python3 %s --working-directory %s ' % (script_filename, self.working_directory) - print('COMMAND: %s' % cmd) - rval = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - print('STDOUT: %s' % repr(rval.stdout), flush=True) - print('STDERR: %s' % repr(rval.stderr), flush=True) + script_filename = os.path.join(scripts_directory, "02_retrieve_metadata.py") + cmd = "python3 %s --working-directory %s " % ( + script_filename, + self.working_directory, + ) + print("COMMAND: %s" % cmd) + rval = subprocess.run( + cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + print("STDOUT: %s" % repr(rval.stdout), flush=True) + print("STDERR: %s" % repr(rval.stderr), flush=True) self.assertEqual(rval.returncode, 0, msg=str(rval)) - for file in ['algorithm_runs.arff', 'configurations.csv', 'description.results.txt']: - for metric in ['accuracy', 'balanced_accuracy', 'log_loss', 'roc_auc']: + for file in [ + "algorithm_runs.arff", + "configurations.csv", + "description.results.txt", + ]: + for metric in ["accuracy", "balanced_accuracy", "log_loss", "roc_auc"]: path = os.path.join( self.working_directory, - 'configuration_results', - '%s_binary.classification_dense' % metric, + "configuration_results", + "%s_binary.classification_dense" % metric, file, ) self.assertTrue(os.path.exists(path), msg=path) - for file in ['algorithm_runs.arff', 'configurations.csv', 'description.results.txt']: - for metric in ['r2', 'mean_squared_error']: + for file in [ + "algorithm_runs.arff", + "configurations.csv", + "description.results.txt", + ]: + for metric in ["r2", "mean_squared_error"]: path = os.path.join( self.working_directory, - 'configuration_results', - '%s_regression_dense' % metric, + "configuration_results", + "%s_regression_dense" % metric, file, ) self.assertTrue(os.path.exists(path), msg=path) # 6. 
Calculate metafeatures - script_filename = os.path.join(scripts_directory, '03_calculate_metafeatures.py') - cmd = ( - 'python3 %s --working-directory %s --test-mode ' - % (script_filename, self.working_directory) + script_filename = os.path.join( + scripts_directory, "03_calculate_metafeatures.py" + ) + cmd = "python3 %s --working-directory %s --test-mode " % ( + script_filename, + self.working_directory, + ) + rval = subprocess.run( + cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE ) - rval = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, - stderr=subprocess.PIPE) self.assertEqual(rval.returncode, 0, msg=str(rval)) - for task_type in ('classification', 'regression'): - for file in ['calculation_times.csv', 'description.features.txt', - 'feature_costs.arff', 'feature_runstatus.arff', - 'feature_values.arff']: + for task_type in ("classification", "regression"): + for file in [ + "calculation_times.csv", + "description.features.txt", + "feature_costs.arff", + "feature_runstatus.arff", + "feature_values.arff", + ]: self.assertTrue( - os.path.exists(os.path.join( - self.working_directory, - 'metafeatures', - task_type, - file) + os.path.exists( + os.path.join( + self.working_directory, "metafeatures", task_type, file + ) ) ) with open( os.path.join( - self.working_directory, 'metafeatures', 'regression', 'feature_values.arff' + self.working_directory, + "metafeatures", + "regression", + "feature_values.arff", ) ) as fh: - metafeatures_arff = fh.read().split('\n') + metafeatures_arff = fh.read().split("\n") contains_regression_id = False for line in metafeatures_arff: - if line.startswith('fri_c4_500_25,'): + if line.startswith("fri_c4_500_25,"): contains_regression_id = True self.assertTrue(contains_regression_id, msg=metafeatures_arff) with open( - os.path.join( - self.working_directory, 'metafeatures', 'classification', 'feature_values.arff' - ) + os.path.join( + self.working_directory, + "metafeatures", + "classification", + "feature_values.arff", + ) ) as fh: - metafeatures_arff = fh.read().split('\n') + metafeatures_arff = fh.read().split("\n") contains_classification_id = False for line in metafeatures_arff: - if line.startswith('anneal,'): + if line.startswith("anneal,"): contains_classification_id = True self.assertTrue(contains_classification_id, msg=metafeatures_arff) # 7. 
Create aslib files - script_filename = os.path.join(scripts_directory, '04_create_aslib_files.py') - cmd = 'python3 %s --working-directory %s ' % ( - script_filename, self.working_directory) - rval = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + script_filename = os.path.join(scripts_directory, "04_create_aslib_files.py") + cmd = "python3 %s --working-directory %s " % ( + script_filename, + self.working_directory, + ) + rval = subprocess.run( + cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) self.assertEqual(rval.returncode, 0, msg=str(rval)) for metric_, combination in ( - (metric, '%s_binary.classification_dense' % metric), - (metric, '%s_regression_dense' % metric), + (metric, "%s_binary.classification_dense" % metric), + (metric, "%s_regression_dense" % metric), ): if task_type not in combination: continue - for file in ['algorithm_runs.arff', 'configurations.csv', - 'description.txt', 'feature_costs.arff', - 'feature_runstatus.arff', 'feature_values.arff', - 'readme.txt']: + for file in [ + "algorithm_runs.arff", + "configurations.csv", + "description.txt", + "feature_costs.arff", + "feature_runstatus.arff", + "feature_values.arff", + "readme.txt", + ]: expected_path = os.path.join( - self.working_directory, 'metadata', combination, file, + self.working_directory, + "metadata", + combination, + file, ) self.assertTrue(os.path.exists(expected_path), msg=expected_path) - with open(os.path.join(self.working_directory, - 'metadata', - combination, - 'algorithm_runs.arff')) as fh: + with open( + os.path.join( + self.working_directory, + "metadata", + combination, + "algorithm_runs.arff", + ) + ) as fh: algorithm_runs = arff.load(fh) - self.assertEqual(algorithm_runs['attributes'], - [('instance_id', 'STRING'), - ('repetition', 'NUMERIC'), - ('algorithm', 'STRING'), - (metric_, 'NUMERIC'), - ('runstatus', - ['ok', 'timeout', 'memout', 'not_applicable', - 'crash', 'other'])]) - self.assertEqual(len(algorithm_runs['data']), 1) - self.assertEqual(len(algorithm_runs['data'][0]), 5) - self.assertLess(algorithm_runs['data'][0][3], 0.9) - self.assertEqual(algorithm_runs['data'][0][4], 'ok') + self.assertEqual( + algorithm_runs["attributes"], + [ + ("instance_id", "STRING"), + ("repetition", "NUMERIC"), + ("algorithm", "STRING"), + (metric_, "NUMERIC"), + ( + "runstatus", + [ + "ok", + "timeout", + "memout", + "not_applicable", + "crash", + "other", + ], + ), + ], + ) + self.assertEqual(len(algorithm_runs["data"]), 1) + self.assertEqual(len(algorithm_runs["data"][0]), 5) + self.assertLess(algorithm_runs["data"][0][3], 0.9) + self.assertEqual(algorithm_runs["data"][0][4], "ok") def tearDown(self): for i in range(5): diff --git a/test/test_util/__init__.py b/test/test_util/__init__.py index cc3cd7becd..e298f0f075 100644 --- a/test/test_util/__init__.py +++ b/test/test_util/__init__.py @@ -1,2 +1,2 @@ # -*- encoding: utf-8 -*- -__author__ = 'feurerm' +__author__ = "feurerm" diff --git a/test/test_util/test_StopWatch.py b/test/test_util/test_StopWatch.py index 14038c6820..d45ecbf55d 100644 --- a/test/test_util/test_StopWatch.py +++ b/test/test_util/test_StopWatch.py @@ -22,8 +22,8 @@ def test_stopwatch_overhead(self): cpu_start = time.process_time() watch = StopWatch() for i in range(1, 1000): - watch.start_task('task_%d' % i) - watch.stop_task('task_%d' % i) + watch.start_task("task_%d" % i) + watch.stop_task("task_%d" % i) cpu_stop = time.process_time() stop = time.time() dur = stop - start @@ -36,6 +36,6 @@ def test_stopwatch_overhead(self): 
self.assertLess(watch.cpu_sum(), 2 * watch.wall_sum()) -if __name__ == '__main__': +if __name__ == "__main__": # import sys;sys.argv = ['', 'Test.testName'] unittest.main() diff --git a/test/test_util/test_backend.py b/test/test_util/test_backend.py index a029aef4bb..0673370b97 100644 --- a/test/test_util/test_backend.py +++ b/test/test_util/test_backend.py @@ -7,48 +7,48 @@ class BackendModelsTest(unittest.TestCase): - class BackendStub(Backend): - def __init__(self): self.__class__ = Backend def setUp(self): self.backend = self.BackendStub() - self.backend.internals_directory = '/' + self.backend.internals_directory = "/" - @unittest.mock.patch('pickle.load') - @unittest.mock.patch('os.path.exists') + @unittest.mock.patch("pickle.load") + @unittest.mock.patch("os.path.exists") def test_load_model_by_seed_and_id(self, exists_mock, pickleLoadMock): exists_mock.return_value = False - open_mock = unittest.mock.mock_open(read_data='Data') + open_mock = unittest.mock.mock_open(read_data="Data") with unittest.mock.patch( - 'autosklearn.automl_common.common.utils.backend.open', + "autosklearn.automl_common.common.utils.backend.open", open_mock, create=True, ): seed = 13 idx = 17 budget = 50.0 - expected_model = self._setup_load_model_mocks(open_mock, - pickleLoadMock, - seed, idx, budget) + expected_model = self._setup_load_model_mocks( + open_mock, pickleLoadMock, seed, idx, budget + ) actual_model = self.backend.load_model_by_seed_and_id_and_budget( - seed, idx, budget) + seed, idx, budget + ) self.assertEqual(expected_model, actual_model) - @unittest.mock.patch('pickle.load') - @unittest.mock.patch.object(builtins, 'open') - @unittest.mock.patch('os.path.exists') + @unittest.mock.patch("pickle.load") + @unittest.mock.patch.object(builtins, "open") + @unittest.mock.patch("os.path.exists") def test_loads_models_by_identifiers(self, exists_mock, openMock, pickleLoadMock): exists_mock.return_value = True seed = 13 idx = 17 budget = 50.0 expected_model = self._setup_load_model_mocks( - openMock, pickleLoadMock, seed, idx, budget) + openMock, pickleLoadMock, seed, idx, budget + ) expected_dict = {(seed, idx, budget): expected_model} actual_dict = self.backend.load_models_by_identifiers([(seed, idx, budget)]) @@ -57,15 +57,25 @@ def test_loads_models_by_identifiers(self, exists_mock, openMock, pickleLoadMock self.assertDictEqual(expected_dict, actual_dict) def _setup_load_model_mocks(self, openMock, pickleLoadMock, seed, idx, budget): - model_path = '/runs/%s_%s_%s/%s.%s.%s.model' % (seed, idx, budget, seed, idx, budget) - file_handler = 'file_handler' - expected_model = 'model' + model_path = "/runs/%s_%s_%s/%s.%s.%s.model" % ( + seed, + idx, + budget, + seed, + idx, + budget, + ) + file_handler = "file_handler" + expected_model = "model" fileMock = unittest.mock.MagicMock() fileMock.__enter__.return_value = file_handler - openMock.side_effect = \ - lambda path, flag: fileMock if path == model_path and flag == 'rb' else None - pickleLoadMock.side_effect = lambda fh: expected_model if fh == file_handler else None + openMock.side_effect = ( + lambda path, flag: fileMock if path == model_path and flag == "rb" else None + ) + pickleLoadMock.side_effect = ( + lambda fh: expected_model if fh == file_handler else None + ) return expected_model diff --git a/test/test_util/test_common.py b/test/test_util/test_common.py index 740608969d..33fa4cee31 100644 --- a/test/test_util/test_common.py +++ b/test/test_util/test_common.py @@ -18,5 +18,5 @@ def test_check_pid(self): self.assertFalse(exists) -if __name__ 
== '__main__': +if __name__ == "__main__": unittest.main() diff --git a/test/test_util/test_data.py b/test/test_util/test_data.py index 87548b413f..2bceac804a 100644 --- a/test/test_util/test_data.py +++ b/test/test_util/test_data.py @@ -1,29 +1,33 @@ -from typing import Any, List, Dict, Union -from itertools import chain -import warnings +from typing import Any, Dict, List, Union -import pytest +import warnings +from itertools import chain import numpy as np import pandas as pd +import pytest import sklearn.datasets from scipy.sparse import csr_matrix, spmatrix from autosklearn.constants import ( - BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION, MULTILABEL_CLASSIFICATION, - REGRESSION, MULTIOUTPUT_REGRESSION, CLASSIFICATION_TASKS, REGRESSION_TASKS + BINARY_CLASSIFICATION, + CLASSIFICATION_TASKS, + MULTICLASS_CLASSIFICATION, + MULTILABEL_CLASSIFICATION, + MULTIOUTPUT_REGRESSION, + REGRESSION, + REGRESSION_TASKS, ) from autosklearn.util.data import ( - subsample, + default_dataset_compression_arg, reduce_dataset_size_if_too_large, reduce_precision, reduction_mapping, + subsample, supported_precision_reductions, validate_dataset_compression_arg, - default_dataset_compression_arg ) - parametrize = pytest.mark.parametrize @@ -68,11 +72,14 @@ def test_validate_dataset_compression_arg_returns_with_memory_allocation( assert validate_arg["methods"] == expected_methods -@parametrize("methods", [ - ["precision"], - ["precision", "subsample"], - ["precision", "precision", "subsample"] -]) +@parametrize( + "methods", + [ + ["precision"], + ["precision", "subsample"], + ["precision", "precision", "subsample"], + ], +) def test_validate_dataset_compression_arg_returns_with_same_methods( methods: List[str], ): @@ -125,17 +132,14 @@ def test_validate_dataset_compression_arg_raises_error_with_bad_key(key: str): ------- * Should raise a ValueError """ - bad_arg = { - **default_dataset_compression_arg, - key: 1337 - } + bad_arg = {**default_dataset_compression_arg, key: 1337} with pytest.raises(ValueError, match=r"Unknown key"): validate_dataset_compression_arg(bad_arg, memory_limit=10) @parametrize("memory_allocation", ["hello", {}, [1, 2, 3]]) def test_validate_dataset_compression_arg_raises_error_with_bad_memory_allocation_type( - memory_allocation: Any + memory_allocation: Any, ): """ Parameters @@ -148,13 +152,15 @@ def test_validate_dataset_compression_arg_raises_error_with_bad_memory_allocatio * Should raise a ValueError """ bad_arg = {"memory_allocation": memory_allocation} - with pytest.raises(ValueError, match=r"key 'memory_allocation' must be an `int` or `float`"): + with pytest.raises( + ValueError, match=r"key 'memory_allocation' must be an `int` or `float`" + ): validate_dataset_compression_arg(bad_arg, memory_limit=10) @parametrize("memory_allocation", [-0.5, 0.0, 1.0, 1.5]) def test_validate_dataset_compression_arg_raises_error_with_bad_memory_allocation_float( - memory_allocation: float + memory_allocation: float, ): """ Parameters @@ -168,16 +174,17 @@ def test_validate_dataset_compression_arg_raises_error_with_bad_memory_allocatio """ bad_arg = {"memory_allocation": memory_allocation} - with pytest.raises(ValueError, match=r"key 'memory_allocation' if float must be in \(0, 1\)"): + with pytest.raises( + ValueError, match=r"key 'memory_allocation' if float must be in \(0, 1\)" + ): validate_dataset_compression_arg(bad_arg, memory_limit=10) -@parametrize("memory_allocation, memory_limit", [ - (0, 10), (10, 10), (-20, 10), (20, 10) -]) +@parametrize( + "memory_allocation, memory_limit", 
[(0, 10), (10, 10), (-20, 10), (20, 10)] +) def test_validate_dataset_compression_arg_raises_error_with_bad_memory_allocation_int( - memory_allocation: int, - memory_limit: int + memory_allocation: int, memory_limit: int ): """ Parameters @@ -193,12 +200,16 @@ def test_validate_dataset_compression_arg_raises_error_with_bad_memory_allocatio * Should raise a ValueError """ bad_arg = {"memory_allocation": memory_allocation} - with pytest.raises(ValueError, match=r"key 'memory_allocation' if int must be in \(0,"): + with pytest.raises( + ValueError, match=r"key 'memory_allocation' if int must be in \(0," + ): validate_dataset_compression_arg(bad_arg, memory_limit=memory_limit) @parametrize("methods", [10, {"hello", "world"}, []]) -def test_validate_dataset_compression_arg_raises_error_with_bad_methods_type(methods: Any): +def test_validate_dataset_compression_arg_raises_error_with_bad_methods_type( + methods: Any, +): """ Parameters ---------- @@ -214,12 +225,17 @@ def test_validate_dataset_compression_arg_raises_error_with_bad_methods_type(met validate_dataset_compression_arg(bad_arg, memory_limit=10) -@parametrize("methods", [ - ["bad", "worse"], - ["precision", "kind_of_bad"], - ["still_bad", "precision", "subsample"] -]) -def test_validate_dataset_compression_arg_raises_error_with_bad_methods_entries(methods: Any): +@parametrize( + "methods", + [ + ["bad", "worse"], + ["precision", "kind_of_bad"], + ["still_bad", "precision", "subsample"], + ], +) +def test_validate_dataset_compression_arg_raises_error_with_bad_methods_entries( + methods: Any, +): """ Parameters ---------- @@ -235,11 +251,16 @@ def test_validate_dataset_compression_arg_raises_error_with_bad_methods_entries( validate_dataset_compression_arg(bad_arg, memory_limit=10) -@parametrize("y", [ - np.asarray(9999 * [0] + 1 * [1]), - np.asarray(4999 * [1] + 4999 * [2] + 1 * [3] + 1 * [4]), - np.asarray(4999 * [[0, 1, 1]] + 4999 * [[1, 1, 0]] + 1 * [[1, 0, 1]] + 1 * [[0, 0, 0]]) -]) +@parametrize( + "y", + [ + np.asarray(9999 * [0] + 1 * [1]), + np.asarray(4999 * [1] + 4999 * [2] + 1 * [3] + 1 * [4]), + np.asarray( + 4999 * [[0, 1, 1]] + 4999 * [[1, 1, 0]] + 1 * [[1, 0, 1]] + 1 * [[0, 0, 0]] + ), + ], +) @parametrize("random_state", list(range(5))) def test_subsample_classification_unique_labels_stay_in_training_set(y, random_state): n_samples = len(y) @@ -253,32 +274,37 @@ def test_subsample_classification_unique_labels_stay_in_training_set(y, random_s with warnings.catch_warnings(): warnings.simplefilter("ignore") X_sampled, y_sampled = subsample( - X, y, + X, + y, random_state=random_state, sample_size=sample_size, - is_classification=True + is_classification=True, ) assert X_sampled.dtype == X.dtype and y_sampled.dtype == y.dtype assert len(y_sampled) == sample_size - assert all(label in y_sampled for label in unique_labels), \ - f"sampled unique = {np.unique(y_sampled)}, original unique = {unique_labels}" + assert all( + label in y_sampled for label in unique_labels + ), f"sampled unique = {np.unique(y_sampled)}, original unique = {unique_labels}" @parametrize("X", [np.asarray([[1, 1, 1]] * 30)]) @parametrize("x_type", [list, np.ndarray, csr_matrix, pd.DataFrame]) -@parametrize("y, task", [ - (np.asarray([0] * 15 + [1] * 15), BINARY_CLASSIFICATION), - (np.asarray([0] * 10 + [1] * 10 + [2] * 10), MULTICLASS_CLASSIFICATION), - (np.asarray([[1, 0, 1]] * 30), MULTILABEL_CLASSIFICATION), - (np.asarray([1.0] * 30), REGRESSION), - (np.asarray([[1.0, 1.0, 1.0]] * 30), MULTIOUTPUT_REGRESSION), -]) +@parametrize( + "y, task", + [ + 
(np.asarray([0] * 15 + [1] * 15), BINARY_CLASSIFICATION), + (np.asarray([0] * 10 + [1] * 10 + [2] * 10), MULTICLASS_CLASSIFICATION), + (np.asarray([[1, 0, 1]] * 30), MULTILABEL_CLASSIFICATION), + (np.asarray([1.0] * 30), REGRESSION), + (np.asarray([[1.0, 1.0, 1.0]] * 30), MULTIOUTPUT_REGRESSION), + ], +) @parametrize("y_type", [list, np.ndarray, pd.DataFrame, pd.Series]) @parametrize("random_state", [0]) @parametrize("sample_size", [0.25, 0.5, 5, 10]) def test_subsample_validity(X, x_type, y, y_type, random_state, sample_size, task): - """ Asserts the validity of the function with all valid types + """Asserts the validity of the function with all valid types We want to make sure that `subsample` works correctly with all the types listed as x_type and y_type. @@ -289,10 +315,10 @@ def test_subsample_validity(X, x_type, y, y_type, random_state, sample_size, tas """ assert len(X) == len(y) # Make sure our test data is correct - if ( - y_type == pd.Series - and task in [MULTILABEL_CLASSIFICATION, MULTIOUTPUT_REGRESSION] - ): + if y_type == pd.Series and task in [ + MULTILABEL_CLASSIFICATION, + MULTIOUTPUT_REGRESSION, + ]: # We can't have a pd.Series with multiple values as it's 1 dimensional pytest.skip("Can't have pd.Series as y when task is n-dimensional") @@ -312,10 +338,11 @@ def convert(arr, objtype): with warnings.catch_warnings(): warnings.simplefilter("ignore") X_sampled, y_sampled = subsample( - X, y, + X, + y, random_state=random_state, sample_size=sample_size, - is_classification=task in CLASSIFICATION_TASKS + is_classification=task in CLASSIFICATION_TASKS, ) # Function to get the type of an obj @@ -359,9 +386,11 @@ def size(obj): assert size(X_sampled) == sample_size -@parametrize('X', [np.asarray([[0, 0, 1]] * 10)]) -@parametrize('dtype', supported_precision_reductions + [np.dtype('float32'), np.dtype('float64')]) -@parametrize('x_type', [np.ndarray, csr_matrix]) +@parametrize("X", [np.asarray([[0, 0, 1]] * 10)]) +@parametrize( + "dtype", supported_precision_reductions + [np.dtype("float32"), np.dtype("float64")] +) +@parametrize("x_type", [np.ndarray, csr_matrix]) def test_reduce_precision_correctly_reduces_precision(X, dtype, x_type): X = X.astype(dtype) if x_type == csr_matrix: @@ -376,13 +405,13 @@ def test_reduce_precision_correctly_reduces_precision(X, dtype, x_type): expected: Dict[type, type] = { np.float32: np.float32, np.float64: np.float32, - np.dtype('float32'): np.float32, - np.dtype('float64'): np.float32 + np.dtype("float32"): np.float32, + np.dtype("float64"): np.float32, } - if hasattr(np, 'float96'): + if hasattr(np, "float96"): expected[np.float96] = np.float64 - if hasattr(np, 'float128'): + if hasattr(np, "float128"): expected[np.float128] = np.float64 assert precision == expected[dtype] @@ -394,28 +423,40 @@ def test_reduce_precision_correctly_reduces_precision(X, dtype, x_type): assert type(X) == type(X_reduced) -@parametrize('X', [np.asarray([0, 0, 1]) * 10]) -@parametrize('dtype', [np.int32, np.int64, np.complex128]) +@parametrize("X", [np.asarray([0, 0, 1]) * 10]) +@parametrize("dtype", [np.int32, np.int64, np.complex128]) def test_reduce_precision_with_unsupported_dtypes(X, dtype): X = X.astype(dtype) with pytest.raises(ValueError) as err: reduce_precision(X) - expected = f"X.dtype = {X.dtype} not equal to any supported {supported_precision_reductions}" + expected = ( + f"X.dtype = {X.dtype} not equal to any supported " + f"{supported_precision_reductions}" + ) + assert err.value.args[0] == expected -@parametrize("X", [ - np.ones((100000, 10), 
dtype=np.float64) # Make it big for reductions to take place -]) +@parametrize( + "X", + [ + np.ones( + (100000, 10), dtype=np.float64 + ) # Make it big for reductions to take place + ], +) @parametrize("x_type", [csr_matrix, np.ndarray]) @parametrize("dtype", supported_precision_reductions) -@parametrize('y, is_classification', [ - (np.ones((100000,)), True), - (np.ones((100000,)), False), -]) -@parametrize('memory_allocation', [0.1, 1/5.2, 1/8, 1]) -@parametrize('operations', [['precision'], ['subsample'], ['precision', 'subsample']]) +@parametrize( + "y, is_classification", + [ + (np.ones((100000,)), True), + (np.ones((100000,)), False), + ], +) +@parametrize("memory_allocation", [0.1, 1 / 5.2, 1 / 8, 1]) +@parametrize("operations", [["precision"], ["subsample"], ["precision", "subsample"]]) def test_reduce_dataset_reduces_size_and_precision( X, x_type, dtype, y, is_classification, memory_allocation, operations ): @@ -444,13 +485,13 @@ def bytes(arr): return arr.nbytes if isinstance(arr, np.ndarray) else arr.data.nbytes # If we expect some precision reduction unless at float32 already - if 'precision' in operations and dtype != np.float32: + if "precision" in operations and dtype != np.float32: expected = reduction_mapping[X.dtype] assert X_out.dtype == expected assert bytes(X_out) < bytes(X) # If we expect some subsampling - if 'subsample' in operations: + if "subsample" in operations: assert X_out.shape[0] < X.shape[0] assert y_out.shape[0] < y.shape[0] assert bytes(X_out) < bytes(X) @@ -464,10 +505,10 @@ def test_reduce_dataset_invalid_dtype_for_precision_reduction(): reduce_dataset_size_if_too_large( X=X, y=X, - operations=['precision'], + operations=["precision"], memory_limit=1, memory_allocation=0.1, - is_classification=False + is_classification=False, ) expected_err = f"Unsupported type `{X.dtype}` for precision reduction" @@ -485,7 +526,7 @@ def test_reduce_dataset_invalid_operations(): operations=[invalid_op], memory_limit=1, memory_allocation=0.1, - is_classification=False + is_classification=False, ) expected_err = f"Unknown operation `{invalid_op}`" @@ -504,13 +545,15 @@ def test_reduce_dataset_invalid_memory_allocation_float(memory_allocation: float ------- * Should raise a ValueError """ - with pytest.raises(ValueError, match=r"memory_allocation if float must be in \(0, 1\)"): + with pytest.raises( + ValueError, match=r"memory_allocation if float must be in \(0, 1\)" + ): reduce_dataset_size_if_too_large( X=np.empty(1), y=np.empty(1), memory_limit=100, is_classification=True, - memory_allocation=memory_allocation + memory_allocation=memory_allocation, ) @@ -526,17 +569,19 @@ def test_reduce_dataset_invalid_memory_allocation_int(memory_allocation: int): ------- * Should raise a ValueError """ - with pytest.raises(ValueError, match=r"memory_allocation if int must be in \(0, memory_limit"): + with pytest.raises( + ValueError, match=r"memory_allocation if int must be in \(0, memory_limit" + ): reduce_dataset_size_if_too_large( X=np.empty(1), y=np.empty(1), is_classification=True, memory_limit=100, - memory_allocation=memory_allocation + memory_allocation=memory_allocation, ) -@parametrize("memory_allocation", ["100", {'a': 1}, [100]]) +@parametrize("memory_allocation", ["100", {"a": 1}, [100]]) def test_reduce_dataset_invalid_memory_allocation_type(memory_allocation: Any): """ Parameters @@ -554,25 +599,30 @@ def test_reduce_dataset_invalid_memory_allocation_type(memory_allocation: Any): y=np.empty(1), memory_limit=100, is_classification=True, - 
memory_allocation=memory_allocation + memory_allocation=memory_allocation, ) @pytest.mark.parametrize( - 'memory_limit,precision,task', + "memory_limit,precision,task", [ (memory_limit, precision, task) for task in chain(CLASSIFICATION_TASKS, REGRESSION_TASKS) for precision in (float, np.float32, np.float64, np.float128) for memory_limit in (1, 100) - ] + ], ) def test_reduce_dataset_subsampling_explicit_values(memory_limit, precision, task): random_state = 0 fixture = { BINARY_CLASSIFICATION: { 1: {float: 2621, np.float32: 2621, np.float64: 2621, np.float128: 1310}, - 100: {float: 12000, np.float32: 12000, np.float64: 12000, np.float128: 12000}, + 100: { + float: 12000, + np.float32: 12000, + np.float64: 12000, + np.float128: 12000, + }, }, MULTICLASS_CLASSIFICATION: { 1: {float: 409, np.float32: 409, np.float64: 409, np.float128: 204}, @@ -589,7 +639,7 @@ def test_reduce_dataset_subsampling_explicit_values(memory_limit, precision, tas MULTIOUTPUT_REGRESSION: { 1: {float: 1310, np.float32: 1310, np.float64: 1310, np.float128: 655}, 100: {float: 5000, np.float32: 5000, np.float64: 5000, np.float128: 5000}, - } + }, } # Create the task and data @@ -620,12 +670,13 @@ def test_reduce_dataset_subsampling_explicit_values(memory_limit, precision, tas with warnings.catch_warnings(): warnings.simplefilter("ignore") X_new, y_new = reduce_dataset_size_if_too_large( - X=X, y=y, + X=X, + y=y, random_state=random_state, memory_limit=memory_limit, is_classification=task in CLASSIFICATION_TASKS, - operations=['precision', 'subsample'], - memory_allocation=0.1 + operations=["precision", "subsample"], + memory_allocation=0.1, ) # Assert the new number of samples diff --git a/test/test_util/test_dependencies.py b/test/test_util/test_dependencies.py index 53b2285750..1c59dad51b 100644 --- a/test/test_util/test_dependencies.py +++ b/test/test_util/test_dependencies.py @@ -1,30 +1,31 @@ -import unittest -import pkg_resources import re - -from unittest.mock import patch, Mock +import unittest +from unittest.mock import Mock, patch import numpy as np +import pkg_resources -from autosklearn.util.dependencies import verify_packages, MissingPackageError, \ - IncorrectPackageVersionError +from autosklearn.util.dependencies import ( + IncorrectPackageVersionError, + MissingPackageError, + verify_packages, +) -@patch('pkg_resources.get_distribution') +@patch("pkg_resources.get_distribution") class VerifyPackagesTests(unittest.TestCase): - def test_existing_package(self, getDistributionMock): - requirement = 'package' + requirement = "package" distribution_mock = unittest.mock.Mock() getDistributionMock.return_value = distribution_mock - distribution_mock.version = '1.0.0' + distribution_mock.version = "1.0.0" verify_packages(requirement) - getDistributionMock.assert_called_once_with('package') + getDistributionMock.assert_called_once_with("package") def test_missing_package(self, getDistributionMock): - requirement = 'package' + requirement = "package" getDistributionMock.side_effect = pkg_resources.DistributionNotFound() @@ -35,7 +36,7 @@ def test_missing_package(self, getDistributionMock): requirement, ) - @patch('importlib.import_module') + @patch("importlib.import_module") def test_package_can_only_be_imported(self, import_mock, getDistributionMock): getDistributionMock.side_effect = pkg_resources.DistributionNotFound() @@ -43,60 +44,64 @@ def test_package_can_only_be_imported(self, import_mock, getDistributionMock): package.__version__ = np.__version__ import_mock.return_value = package - verify_packages('numpy') 
+ verify_packages("numpy") def test_correct_package_versions(self, getDistributionMock): - requirement = 'package==0.1.2\n' \ - 'package>0.1\n' \ - 'package>=0.1' + requirement = "package==0.1.2\n" "package>0.1\n" "package>=0.1" moduleMock = Mock() - moduleMock.version = '0.1.2' + moduleMock.version = "0.1.2" getDistributionMock.return_value = moduleMock verify_packages(requirement) - getDistributionMock.assert_called_with('package') + getDistributionMock.assert_called_with("package") self.assertEqual(3, len(getDistributionMock.call_args_list)) def test_wrong_package_version(self, getDistributionMock): - requirement = 'package>0.1.2' + requirement = "package>0.1.2" moduleMock = Mock() - moduleMock.version = '0.1.2' + moduleMock.version = "0.1.2" getDistributionMock.return_value = moduleMock self.assertRaisesRegex( IncorrectPackageVersionError, - re.escape("found 'package' version 0.1.2 but requires package version >0.1.2"), + re.escape( + "found 'package' version 0.1.2 but requires package version >0.1.2" + ), verify_packages, requirement, - ) + ) def test_outdated_requirement(self, getDistributionMock): - requirement = 'package>=0.1' + requirement = "package>=0.1" moduleMock = Mock() - moduleMock.version = '0.0.9' + moduleMock.version = "0.0.9" getDistributionMock.return_value = moduleMock self.assertRaisesRegex( IncorrectPackageVersionError, - re.escape("found 'package' version 0.0.9 but requires package version >=0.1"), + re.escape( + "found 'package' version 0.0.9 but requires package version >=0.1" + ), verify_packages, requirement, - ) + ) def test_too_fresh_requirement(self, getDistributionMock): - requirement = 'package==0.1.2' + requirement = "package==0.1.2" moduleMock = Mock() - moduleMock.version = '0.1.3' + moduleMock.version = "0.1.3" getDistributionMock.return_value = moduleMock self.assertRaisesRegex( IncorrectPackageVersionError, - re.escape("found 'package' version 0.1.3 but requires package version ==0.1.2"), + re.escape( + "found 'package' version 0.1.3 but requires package version ==0.1.2" + ), verify_packages, requirement, - ) + ) diff --git a/test/test_util/test_logging.py b/test/test_util/test_logging.py index 568593c7c8..d824aecc02 100644 --- a/test/test_util/test_logging.py +++ b/test/test_util/test_logging.py @@ -1,47 +1,46 @@ -import os -import unittest import logging import logging.config +import os import tempfile -import yaml +import unittest +import yaml from autosklearn.util import logging_ class LoggingTest(unittest.TestCase): - def test_setup_logger(self): # Test that setup_logger function correctly configures the logger # according to the given dictionary, and uses the default # logging.yaml file if logging_config is not specified. - with open(os.path.join(os.path.dirname(__file__), 'example_config.yaml'), 'r') as fh: + with open( + os.path.join(os.path.dirname(__file__), "example_config.yaml"), "r" + ) as fh: example_config = yaml.safe_load(fh) # Configure logger with example_config.yaml. - logging_.setup_logger(logging_config=example_config, - output_dir=tempfile.gettempdir()) + logging_.setup_logger( + logging_config=example_config, output_dir=tempfile.gettempdir() + ) # example_config sets the root logger's level to CRITICAL, # which corresponds to 50. self.assertEqual(logging.getLogger().getEffectiveLevel(), 50) # This time use the default configuration. 
- logging_.setup_logger(logging_config=None, - output_dir=tempfile.gettempdir()) + logging_.setup_logger(logging_config=None, output_dir=tempfile.gettempdir()) # default config sets the root logger's level to DEBUG, # which corresponds to 10. self.assertEqual(logging.getLogger().getEffectiveLevel(), 10) # Make sure we log to the desired directory - logging_.setup_logger(output_dir=os.path.dirname(__file__), - filename='test.log' - ) + logging_.setup_logger(output_dir=os.path.dirname(__file__), filename="test.log") logger = logging.getLogger() - logger.info('test_setup_logger') + logger.info("test_setup_logger") - with open(os.path.join(os.path.dirname(__file__), 'test.log')) as fh: - self.assertIn('test_setup_logger', ''.join(fh.readlines())) - os.remove(os.path.join(os.path.dirname(__file__), 'test.log')) + with open(os.path.join(os.path.dirname(__file__), "test.log")) as fh: + self.assertIn("test_setup_logger", "".join(fh.readlines())) + os.remove(os.path.join(os.path.dirname(__file__), "test.log")) diff --git a/test/test_util/test_single_thread_client.py b/test/test_util/test_single_thread_client.py index 34fe7736fe..770ff9f04a 100644 --- a/test/test_util/test_single_thread_client.py +++ b/test/test_util/test_single_thread_client.py @@ -1,8 +1,6 @@ import dask.distributed - -from distributed.utils_test import inc - import pytest +from distributed.utils_test import inc from autosklearn.util.single_thread_client import SingleThreadedClient diff --git a/test/test_util/test_trials_callback.py b/test/test_util/test_trials_callback.py index 3cda8ea204..d1bfe6b748 100644 --- a/test/test_util/test_trials_callback.py +++ b/test/test_util/test_trials_callback.py @@ -13,56 +13,62 @@ class AutoMLTrialsCallBack(IncorporateRunResultCallback): - def __init__(self, fname): self.trials_num = 1 self.fname = fname with open(fname, "w") as fp: - fp.write("TrialNo, " - "StartTime, " - "EndTime, " - "Status, " - "TrainLoss, " - "ValidLoss, " - "TestLoss, " - "Classifier") + fp.write( + "TrialNo, " + "StartTime, " + "EndTime, " + "Status, " + "TrainLoss, " + "ValidLoss, " + "TestLoss, " + "Classifier" + ) def __call__( - self, smbo: 'SMBO', - run_info: RunInfo, - result: RunValue, - time_left: float, + self, + smbo: "SMBO", + run_info: RunInfo, + result: RunValue, + time_left: float, ) -> None: train_loss, valid_loss, test_loss = None, None, None trial_start_time = result.starttime trial_end_time = result.endtime trial_status = result.status.name if trial_status == StatusType.SUCCESS.name: - train_loss = result.additional_info.get('train_loss') + train_loss = result.additional_info.get("train_loss") valid_loss = result.cost - test_loss = result.additional_info.get('test_loss') - trial_classifier = run_info.config.get_dictionary()['classifier:__choice__'] + test_loss = result.additional_info.get("test_loss") + trial_classifier = run_info.config.get_dictionary()["classifier:__choice__"] with open(self.fname, "a+") as fp: - fp.write(f"\n {self.trials_num}, {trial_start_time}, {trial_end_time}, {trial_status}, " - f"{train_loss}, {valid_loss}, {test_loss}, {trial_classifier}") + fp.write( + f"\n {self.trials_num}, {trial_start_time}, {trial_end_time}," + f" {trial_status}, {train_loss}, {valid_loss}, {test_loss}," + f" {trial_classifier}" + ) self.trials_num += 1 class VerifyTrialsCallBack(unittest.TestCase): - def test_trials_callback_execution(self): trials_summary_fname = os.path.join(tempfile.gettempdir(), "trials.csv") - X_train, Y_train, X_test, Y_test = putil.get_dataset('breast_cancer') - cls = 
AutoSklearnClassifier(time_left_for_this_task=30, - initial_configurations_via_metalearning=0, - per_run_time_limit=10, - memory_limit=1024, - delete_tmp_folder_after_terminate=False, - n_jobs=1, - include={'feature_preprocessor': ['pca'], - 'classifier': ['sgd']}, - get_trials_callback=AutoMLTrialsCallBack(trials_summary_fname) - ) + X_train, Y_train, X_test, Y_test = putil.get_dataset("breast_cancer") + cls = AutoSklearnClassifier( + time_left_for_this_task=30, + initial_configurations_via_metalearning=0, + per_run_time_limit=10, + memory_limit=1024, + delete_tmp_folder_after_terminate=False, + n_jobs=1, + include={"feature_preprocessor": ["pca"], "classifier": ["sgd"]}, + get_trials_callback=AutoMLTrialsCallBack(trials_summary_fname), + ) cls.fit(X_train, Y_train, X_test, Y_test) trials = pd.read_csv(trials_summary_fname) - assert trials.shape[0] > 0, f"Auto-Sklearn explored {trials.shape[0] - 1} trials" + assert ( + trials.shape[0] > 0 + ), f"Auto-Sklearn explored {trials.shape[0] - 1} trials" diff --git a/testcommand.sh b/testcommand.sh deleted file mode 100644 index 00c8fe8321..0000000000 --- a/testcommand.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/usr/bin/env bash -pytest -n 3 --durations=20 --timeout=300 --dist load --timeout-method=thread --fulltrace -v $1