autosklearn/metalearning/metafeatures/metafeatures.py (27 changes: 9 additions & 18 deletions)
@@ -1061,7 +1061,7 @@ def _calculate(self, X, y, logger, categorical):


def calculate_all_metafeatures_encoded_labels(
- X, y, categorical, dataset_name, logger, calculate=None, dont_calculate=None
+ X, y, feat_type, dataset_name, logger, calculate=None, dont_calculate=None
):
"""
Calculate only metafeatures for which a 1HotEncoded feature matrix is necessery.
@@ -1073,7 +1073,7 @@ def calculate_all_metafeatures_encoded_labels(
return calculate_all_metafeatures(
X,
y,
- categorical,
+ feat_type,
dataset_name,
calculate=calculate,
dont_calculate=dont_calculate,
@@ -1082,7 +1082,7 @@


def calculate_all_metafeatures_with_labels(
- X, y, categorical, dataset_name, logger, calculate=None, dont_calculate=None
+ X, y, feat_type, dataset_name, logger, calculate=None, dont_calculate=None
):
if dont_calculate is None:
dont_calculate = set()
@@ -1092,7 +1092,7 @@ def calculate_all_metafeatures_with_labels(
return calculate_all_metafeatures(
X,
y,
- categorical,
+ feat_type,
dataset_name,
calculate=calculate,
dont_calculate=dont_calculate,
@@ -1103,7 +1103,7 @@
def calculate_all_metafeatures(
X,
y,
- categorical,
+ feat_type,
dataset_name,
logger,
calculate=None,
@@ -1138,19 +1138,7 @@ def calculate_all_metafeatures(
# sparse matrices because of wrong sparse format)
sparse = scipy.sparse.issparse(X)

- feat_type = {
- key: "categorical" if value else "numerical"
- for key, value in categorical.items()
- }
-
- # TODO make this more cohesive to the overall structure (quick bug fix)
- if isinstance(X, pd.DataFrame):
- for key in X.select_dtypes(include="string").columns:
- feat_type[key] = "string"
-
DPP = FeatTypeSplit(
- # The difference between feat_type and categorical, is that
- # categorical has True/False instead of categorical/numerical
feat_type=feat_type,
force_sparse_output=True,
)
@@ -1189,7 +1177,10 @@ def calculate_all_metafeatures(
else:
X_ = X
y_ = y
- categorical_ = categorical
+ categorical_ = {
+ col: True if feat_type.lower() == "categorical" else False
+ for col, feat_type in feat_type.items()
+ }

dependency = metafeatures.get_dependency(name)
if dependency is not None:
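The change above switches the metafeature entry points from a bool-valued categorical mapping to a string-valued feat_type mapping, and derives the boolean view internally only where helper functions still need it. A minimal sketch of the two dictionary conventions, with hypothetical column names:

    # Old style: bool-valued "categorical" dict (column names are hypothetical).
    categorical = {"age": False, "gender": True, "comment": True}

    # New style: string-valued "feat_type" dict, as now expected by
    # calculate_all_metafeatures and its wrappers.
    feat_type = {"age": "numerical", "gender": "categorical", "comment": "string"}

    # Where a bool view is still needed internally, it is derived from feat_type,
    # mirroring the categorical_ comprehension in the diff above.
    categorical_ = {col: value.lower() == "categorical" for col, value in feat_type.items()}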
autosklearn/smbo.py (13 changes: 2 additions & 11 deletions)
@@ -101,11 +101,6 @@ def _calculate_metafeatures(
# == Calculate metafeatures
with stopwatch.time("Calculate meta-features") as task_timer:

- categorical = {
- col: True if feat_type.lower() in {"categorical", "string"} else False
- for col, feat_type in data_feat_type.items()
- }
-
EXCLUDE_META_FEATURES = (
EXCLUDE_META_FEATURES_CLASSIFICATION
if data_info_task in CLASSIFICATION_TASKS
@@ -123,7 +118,7 @@
result = calculate_all_metafeatures_with_labels(
x_train,
y_train,
- categorical=categorical,
+ feat_type=data_feat_type,
dataset_name=basename,
dont_calculate=EXCLUDE_META_FEATURES,
logger=logger_,
@@ -159,15 +154,11 @@ def _calculate_metafeatures_encoded(
)

with stopwatch.time("Calculate meta-features encoded") as task_timer:
- categorical = {
- col: True if feat_type.lower() in {"categorical", "string"} else False
- for col, feat_type in data_feat_type.items()
- }

result = calculate_all_metafeatures_encoded_labels(
x_train,
y_train,
- categorical=categorical,
+ feat_type=data_feat_type,
dataset_name=basename,
dont_calculate=EXCLUDE_META_FEATURES,
logger=logger_,
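With the entry points accepting feat_type directly, smbo.py no longer has to collapse the data manager's feature types into booleans before calling the metafeature functions. A sketch of the removed conversion versus the value now forwarded, using a hypothetical data_feat_type:

    # data_feat_type and its column names are hypothetical stand-ins for the
    # values provided by the data manager.
    data_feat_type = {"f0": "numerical", "f1": "categorical", "f2": "string"}

    # Removed in this diff: both "categorical" and "string" collapsed to True.
    categorical = {
        col: ft.lower() in {"categorical", "string"} for col, ft in data_feat_type.items()
    }
    # -> {"f0": False, "f1": True, "f2": True}

    # Now the string-valued dict is forwarded unchanged, e.g.
    # calculate_all_metafeatures_with_labels(..., feat_type=data_feat_type, ...),
    # so the "categorical"/"string" distinction survives into the metafeature code.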
test/test_metalearning/pyMetaLearn/test_meta_features.py (58 changes: 30 additions & 28 deletions)
@@ -53,8 +53,8 @@ def meta_train_data(request):
for name, type_ in dataset["attributes"][:-1]
]

- categorical = {
- i: True if attribute == "nominal" else False
+ feat_type = {
+ i: "categorical" if attribute == "nominal" else "numerical"
for i, attribute in enumerate(attribute_types)
}

@@ -65,20 +65,20 @@
logger = logging.getLogger("Meta")
meta_features.helper_functions.set_value(
"MissingValues",
meta_features.helper_functions["MissingValues"](X, y, logger, categorical),
meta_features.helper_functions["MissingValues"](X, y, logger, feat_type),
)
meta_features.helper_functions.set_value(
"NumSymbols",
meta_features.helper_functions["NumSymbols"](X, y, logger, categorical),
meta_features.helper_functions["NumSymbols"](X, y, logger, feat_type),
)
meta_features.helper_functions.set_value(
"ClassOccurences",
meta_features.helper_functions["ClassOccurences"](X, y, logger),
)
if request.param == "numpy":
- return X, y, categorical
+ return X, y, feat_type
elif request.param == "pandas":
- return pd.DataFrame(X), y, categorical
+ return pd.DataFrame(X), y, feat_type
else:
raise ValueError(request.param)

@@ -97,8 +97,8 @@ def meta_train_data_transformed(request):
"numeric" if type(type_) != list else "nominal"
for name, type_ in dataset["attributes"][:-1]
]
- categorical = {
- i: True if attribute == "nominal" else False
+ feat_type = {
+ i: "categorical" if attribute == "nominal" else "numerical"
for i, attribute in enumerate(attribute_types)
}

@@ -109,28 +109,30 @@
logger = logging.getLogger("Meta")
meta_features.helper_functions.set_value(
"MissingValues",
meta_features.helper_functions["MissingValues"](X, y, logger, categorical),
meta_features.helper_functions["MissingValues"](X, y, logger, feat_type),
)
meta_features.helper_functions.set_value(
"NumSymbols",
meta_features.helper_functions["NumSymbols"](X, y, logger, categorical),
meta_features.helper_functions["NumSymbols"](X, y, logger, feat_type),
)
meta_features.helper_functions.set_value(
"ClassOccurences",
meta_features.helper_functions["ClassOccurences"](X, y, logger),
)

- DPP = FeatTypeSplit(
- feat_type={
- col: "categorical" if category else "numerical"
- for col, category in categorical.items()
- }
- )
+ DPP = FeatTypeSplit(feat_type=feat_type)
X_transformed = DPP.fit_transform(X)

- number_numerical = np.sum(~np.array(list(categorical.values())))
+ number_numerical = np.sum(
+ [True if feat_type[i] == "numerical" else False for i in feat_type.keys()]
+ )
+ number_string = np.sum(
+ [True if feat_type[i] == "string" else False for i in feat_type.keys()]
+ )
categorical_transformed = {
- i: True if i < (X_transformed.shape[1] - number_numerical) else False
+ i: True
+ if i < (X_transformed.shape[1] - number_numerical - number_string)
+ else False
for i in range(X_transformed.shape[1])
}

@@ -846,9 +848,9 @@ def test_1NN_multilabel(multilabel_train_data):
def test_calculate_all_metafeatures_multilabel(multilabel_train_data):
meta_features.helper_functions.clear()
X, y = multilabel_train_data
- categorical = {i: False for i in range(10)}
+ feat_type = {i: "numerical" for i in range(10)}
mf = meta_features.calculate_all_metafeatures(
X, y, categorical, "Generated", logger=logging.getLogger("TestMeta")
X, y, feat_type, "Generated", logger=logging.getLogger("TestMeta")
)
assert 52 == len(mf.metafeature_values)

@@ -860,11 +862,12 @@ def test_calculate_all_metafeatures_same_results_across_datatypes():
all metafeatures work in this complex dataset
"""
X, y = fetch_openml(data_id=2, return_X_y=True, as_frame=True)
- categorical = {
- col: True if X[col].dtype.name == "category" else False for col in X.columns
+ feat_type = {
+ col: "categorical" if X[col].dtype.name == "category" else "numerical"
+ for col in X.columns
}
mf = meta_features.calculate_all_metafeatures(
X, y, categorical, "2", logger=logging.getLogger("Meta")
X, y, feat_type, "2", logger=logging.getLogger("Meta")
)
assert 52 == len(mf.metafeature_values)
expected = {
@@ -925,13 +928,12 @@ def test_calculate_all_metafeatures_same_results_across_datatypes():

# Then do numpy!
X, y = fetch_openml(data_id=2, return_X_y=True, as_frame=False)
- categorical = {
- i: True if category else False
- for i, category in enumerate(categorical.values())
- }
+ feat_type = {i: feat_type[key] for i, key in enumerate(feat_type.keys())}

mf = meta_features.calculate_all_metafeatures(
X, y, categorical, "2", logger=logging.getLogger("Meta")
X, y, feat_type, "2", logger=logging.getLogger("Meta")
)

assert {k: mf[k].value for k in expected.keys()} == pytest.approx(expected)

# The column-reorder of pandas and numpy array are different after
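The test fixtures now build the string-valued feat_type dict directly from the ARFF attribute types (or from pandas dtypes) instead of a True/False categorical dict. A short sketch of that construction; the attribute list is a hypothetical example, the comprehensions mirror the diff above:

    # ARFF-based fixture: "nominal" attributes map to "categorical",
    # everything else to "numerical".
    attribute_types = ["numeric", "nominal", "numeric"]
    feat_type = {
        i: "categorical" if attribute == "nominal" else "numerical"
        for i, attribute in enumerate(attribute_types)
    }
    # -> {0: "numerical", 1: "categorical", 2: "numerical"}

    # DataFrame variant used further down, keyed by column name instead of index:
    # feat_type = {
    #     col: "categorical" if X[col].dtype.name == "category" else "numerical"
    #     for col in X.columns
    # }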
@@ -420,7 +420,11 @@ def test_pca_skewness_first_pc(sparse_data_transformed):

def test_calculate_all_metafeatures(sparse_data):
X, y, categorical = sparse_data
+ feat_type = {
+ key: "categorical" if categorical[key] else "numerical"
+ for key in categorical.keys()
+ }
mf = meta_features.calculate_all_metafeatures(
- X, y, categorical, "2", logger=logging.getLogger("Meta")
+ X, y, feat_type, "2", logger=logging.getLogger("Meta")
)
assert 52 == len(mf.metafeature_values)
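For reference, a hedged usage sketch of the updated calculate_all_metafeatures signature on synthetic data; the dataset, the dataset name, and the module alias are assumptions, while the argument order and the expected 52 metafeature values follow the tests above.

    import logging

    import numpy as np

    import autosklearn.metalearning.metafeatures.metafeatures as meta_features

    # Hypothetical dataset: three numerical columns plus one integer-coded
    # categorical column, with a binary target.
    rng = np.random.RandomState(1)
    X = np.hstack([rng.randn(100, 3), rng.randint(0, 3, size=(100, 1))])
    y = rng.randint(0, 2, size=100)
    feat_type = {0: "numerical", 1: "numerical", 2: "numerical", 3: "categorical"}

    mf = meta_features.calculate_all_metafeatures(
        X, y, feat_type, "synthetic", logger=logging.getLogger("Meta")
    )
    print(len(mf.metafeature_values))  # the tests above expect 52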