automl
diff --git a/‎autosklearn/pipeline/components/data_preprocessing/text_encoding/tfidf_encoding.py‎
Lines changed: 17 additions & 16 deletions b/‎autosklearn/pipeline/components/data_preprocessing/text_encoding/tfidf_encoding.py‎
Lines changed: 17 additions & 16 deletions
diff --git a/‎autosklearn/pipeline/components/data_preprocessing/text_feature_reduction/truncated_svd.py‎
Lines changed: 8 additions & 2 deletions b/‎autosklearn/pipeline/components/data_preprocessing/text_feature_reduction/truncated_svd.py‎
Lines changed: 8 additions & 2 deletions
diff --git a/‎test/test_metalearning/pyMetaLearn/test_metalearning_configuration.py‎
Lines changed: 15 additions & 9 deletions b/‎test/test_metalearning/pyMetaLearn/test_metalearning_configuration.py‎
Lines changed: 15 additions & 9 deletions
diff --git a/‎test/test_pipeline/components/data_preprocessing/test_data_preprocessing_feat_type.py‎
Lines changed: 14 additions & 7 deletions b/‎test/test_pipeline/components/data_preprocessing/test_data_preprocessing_feat_type.py‎
Lines changed: 14 additions & 7 deletions
@@ -14,6 +14,7 @@
 from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE
 from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm
 from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA
+from autosklearn.util.common import check_for_bool
 
 
 class TfidfEncoder(AutoSklearnPreprocessingAlgorithm):
@@ -51,6 +52,9 @@ def fit(
         else:
             raise KeyError(f"Analyzer is not defined for {self.analyzer}")
 
+        self.sublinear_tf = check_for_bool(self.sublinear_tf)
+        self.binary = check_for_bool(self.binary)
+
         if isinstance(X, pd.DataFrame):
             X.fillna("", inplace=True)
             if self.per_column:
@@ -120,8 +124,8 @@ def get_properties(
         dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None,
     ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]:
         return {
-            "shortname": "RBOW",
-            "name": "Relative Bag Of Word Encoder",
+            "shortname": "TF/IDF",
+            "name": "TF/IDF Encoder",
             "handles_regression": True,
             "handles_classification": True,
             "handles_multiclass": True,
@@ -153,15 +157,14 @@ def get_hyperparameter_search_space(
             default_value=4,
         )
 
-        hp_ngram_range_word = CSH.UniformFloatHyperparameter(
+        hp_ngram_range_word = CSH.UniformIntegerHyperparameter(
             name="ngram_range_word",
             lower=1,
             upper=3,
             default_value=1,
         )
 
         hp_min_df = CSH.UniformFloatHyperparameter(
-            # Todo this can still result in building no vectorizer
             name="min_df",
             lower=0.0,
             upper=0.3,
@@ -172,17 +175,15 @@ def get_hyperparameter_search_space(
             name="max_df", lower=0.7, upper=1.0, default_value=1.0
         )
 
-        # hp_binary = CSH.CategoricalHyperparameter(
-        #     name="binary", choices=[True, False], default_value=False
-        # )
+        hp_binary = CSH.UnParametrizedHyperparameter(name="binary", value="False")
 
-        # hp_norm = CSH.CategoricalHyperparameter(
-        #     name="norm", choices=["l2", "l1"], default_value="l2"
-        # )
+        hp_norm = CSH.CategoricalHyperparameter(
+            name="norm", choices=["l2", "l1"], default_value="l2"
+        )
 
-        # hp_sublinear_tf = CSH.CategoricalHyperparameter(
-        #     name="sublinear_tf", choices=[True, False], default_value=False
-        # )
+        hp_sublinear_tf = CSH.UnParametrizedHyperparameter(
+            name="sublinear_tf", value="False"
+        )
 
         hp_per_column = CSH.CategoricalHyperparameter(
             name="per_column", choices=[True, False], default_value=False
@@ -195,9 +196,9 @@ def get_hyperparameter_search_space(
                 hp_ngram_range_word,
                 hp_max_df,
                 hp_min_df,
-                # hp_binary,
-                # hp_norm,
-                # hp_sublinear_tf,
+                hp_binary,
+                hp_norm,
+                hp_sublinear_tf,
                 hp_per_column,
             ]
         )
 
@@ -31,22 +31,28 @@ def fit(
             self.preprocessor = TruncatedSVD(
                 n_components=self.n_components, random_state=self.random_state
             )
+            self.preprocessor.fit(X)
         elif X.shape[1] <= self.n_components and X.shape[1] != 1:
             self.preprocessor = TruncatedSVD(
                 n_components=X.shape[1] - 1, random_state=self.random_state
             )
+            self.preprocessor.fit(X)
+        elif X.shape[1] == 1:
+            self.preprocessor = "passthrough"
         else:
             raise ValueError(
                 "The text embedding consists only of a single dimension.\n"
                 "Are you sure that your text data is necessary?"
             )
-        self.preprocessor.fit(X)
         return self
 
     def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE:
         if self.preprocessor is None:
             raise NotImplementedError()
-        return self.preprocessor.transform(X)
+        elif self.preprocessor == "passthrough":
+            return X
+        else:
+            return self.preprocessor.transform(X)
 
     @staticmethod
     def get_properties(
 
@@ -17,24 +17,30 @@ def test_metalearning_cs_size(self):
         data_dir = os.path.join(data_dir, "test_meta_base_data")
         os.chdir(data_dir)
 
+        # Total: 179, categorical: 3, numerical: 6, string: 11
+        total = 179
+        num_numerical = 6
+        num_string = 11
+        num_categorical = 3
         for feat_type, cs_size in [
-            ({"A": "numerical"}, 165),
-            ({"A": "categorical"}, 162),
-            ({"A": "string"}, 174 - 7),
-            ({"A": "numerical", "B": "categorical"}, 168),
-            ({"A": "numerical", "B": "string"}, 180 - 7),
-            ({"A": "categorical", "B": "string"}, 177 - 7),
-            ({"A": "categorical", "B": "string", "C": "numerical"}, 183 - 7),
+            ({"A": "numerical"}, total - num_string - num_categorical),
+            ({"A": "categorical"}, total - num_string - num_numerical),
+            ({"A": "string"}, total - num_categorical - num_numerical),
+            ({"A": "numerical", "B": "categorical"}, total - num_string),
+            ({"A": "numerical", "B": "string"}, total - num_categorical),
+            ({"A": "categorical", "B": "string"}, total - num_numerical),
+            ({"A": "categorical", "B": "string", "C": "numerical"}, total),
         ]:
             pipeline = autosklearn.pipeline.classification.SimpleClassificationPipeline(
                 feat_type=feat_type
             )
             self.cs = pipeline.get_hyperparameter_search_space(feat_type=feat_type)
-            # print(self.cs.get_default_configuration())
 
             self.logger = logging.getLogger()
             meta_base = MetaBase(self.cs, data_dir, logger=self.logger)
             self.meta_optimizer = metalearner.MetaLearningOptimizer(
                 "233", self.cs, meta_base, logger=self.logger
             )
-            self.assertEqual(len(self.meta_optimizer.configuration_space), cs_size)
+            self.assertEqual(
+                len(self.meta_optimizer.configuration_space), cs_size, feat_type
+            )
@@ -6,6 +6,11 @@
 
 
 class PreprocessingPipelineFeatTypeTest(unittest.TestCase):
+
+    num_numerical = 6
+    num_categorical = 3
+    num_text = 11
+
     def test_single_type(self):
         DPP = FeatTypeSplit(feat_type={"A": "numerical"})
         cs = DPP.get_hyperparameter_search_space(
@@ -22,7 +27,7 @@ def test_single_type(self):
         for key in cs.get_hyperparameters_dict().keys():
             self.assertNotIn("text", key.split(":")[0])
             self.assertNotIn("categorical", key.split(":")[0])
-        self.assertEqual(len(cs), 6)
+        self.assertEqual(len(cs), self.num_numerical)
 
         DPP = FeatTypeSplit(feat_type={"A": "categorical"})
         cs = DPP.get_hyperparameter_search_space(
@@ -39,7 +44,7 @@ def test_single_type(self):
         for key in cs.get_hyperparameters_dict().keys():
             self.assertNotIn("text", key.split(":")[0])
             self.assertNotIn("numerical", key.split(":")[0])
-        self.assertEqual(len(cs), 3)
+        self.assertEqual(len(cs), self.num_categorical)
 
         DPP = FeatTypeSplit(feat_type={"A": "string"})
         cs = DPP.get_hyperparameter_search_space(
@@ -56,7 +61,7 @@ def test_single_type(self):
         for key in cs.get_hyperparameters_dict().keys():
             self.assertNotIn("numerical", key.split(":")[0])
             self.assertNotIn("categorical", key.split(":")[0])
-        self.assertEqual(len(cs), 15 - 7)
+        self.assertEqual(len(cs), self.num_text)
 
     def test_dual_type(self):
         DPP = FeatTypeSplit(feat_type={"A": "numerical", "B": "categorical"})
@@ -73,7 +78,7 @@ def test_dual_type(self):
         )
         for key in cs.get_hyperparameters_dict().keys():
             self.assertNotIn("text", key.split(":")[0])
-        self.assertEqual(len(cs), 9)
+        self.assertEqual(len(cs), self.num_numerical + self.num_categorical)
 
         DPP = FeatTypeSplit(feat_type={"A": "categorical", "B": "string"})
         cs = DPP.get_hyperparameter_search_space(
@@ -89,7 +94,7 @@ def test_dual_type(self):
         )
         for key in cs.get_hyperparameters_dict().keys():
             self.assertNotIn("numerical", key.split(":")[0])
-        self.assertEqual(len(cs), 18 - 7)
+        self.assertEqual(len(cs), self.num_categorical + self.num_text)
 
         DPP = FeatTypeSplit(feat_type={"A": "string", "B": "numerical"})
         cs = DPP.get_hyperparameter_search_space(
@@ -105,7 +110,7 @@ def test_dual_type(self):
         )
         for key in cs.get_hyperparameters_dict().keys():
             self.assertNotIn("categorical", key.split(":")[0])
-        self.assertEqual(len(cs), 21 - 7)
+        self.assertEqual(len(cs), self.num_text + self.num_numerical)
 
     def test_triple_type(self):
         DPP = FeatTypeSplit(
@@ -132,4 +137,6 @@ def test_triple_type(self):
                 truth_table[2] = True
 
         self.assertEqual(sum(truth_table), 3)
-        self.assertEqual(len(cs), 24 - 7)
+        self.assertEqual(
+            len(cs), self.num_numerical + self.num_categorical + self.num_text
+        )