Skip to content

Commit a29b63e

Browse files
committed
Fix bug, rework tests
1 parent 5dccf51 commit a29b63e

7 files changed

Lines changed: 313 additions & 213 deletions

File tree

autosklearn/pipeline/components/data_preprocessing/text_encoding/tfidf_encoding.py

Lines changed: 17 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE
1515
from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm
1616
from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA
17+
from autosklearn.util.common import check_for_bool
1718

1819

1920
class TfidfEncoder(AutoSklearnPreprocessingAlgorithm):
@@ -51,6 +52,9 @@ def fit(
5152
else:
5253
raise KeyError(f"Analyzer is not defined for {self.analyzer}")
5354

55+
self.sublinear_tf = check_for_bool(self.sublinear_tf)
56+
self.binary = check_for_bool(self.binary)
57+
5458
if isinstance(X, pd.DataFrame):
5559
X.fillna("", inplace=True)
5660
if self.per_column:
@@ -120,8 +124,8 @@ def get_properties(
120124
dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None,
121125
) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]:
122126
return {
123-
"shortname": "RBOW",
124-
"name": "Relative Bag Of Word Encoder",
127+
"shortname": "TF/IDF",
128+
"name": "TF/IDF Encoder",
125129
"handles_regression": True,
126130
"handles_classification": True,
127131
"handles_multiclass": True,
@@ -153,15 +157,14 @@ def get_hyperparameter_search_space(
153157
default_value=4,
154158
)
155159

156-
hp_ngram_range_word = CSH.UniformFloatHyperparameter(
160+
hp_ngram_range_word = CSH.UniformIntegerHyperparameter(
157161
name="ngram_range_word",
158162
lower=1,
159163
upper=3,
160164
default_value=1,
161165
)
162166

163167
hp_min_df = CSH.UniformFloatHyperparameter(
164-
# Todo this can still result in building no vectorizer
165168
name="min_df",
166169
lower=0.0,
167170
upper=0.3,
@@ -172,17 +175,15 @@ def get_hyperparameter_search_space(
172175
name="max_df", lower=0.7, upper=1.0, default_value=1.0
173176
)
174177

175-
# hp_binary = CSH.CategoricalHyperparameter(
176-
# name="binary", choices=[True, False], default_value=False
177-
# )
178+
hp_binary = CSH.UnParametrizedHyperparameter(name="binary", value="False")
178179

179-
# hp_norm = CSH.CategoricalHyperparameter(
180-
# name="norm", choices=["l2", "l1"], default_value="l2"
181-
# )
180+
hp_norm = CSH.CategoricalHyperparameter(
181+
name="norm", choices=["l2", "l1"], default_value="l2"
182+
)
182183

183-
# hp_sublinear_tf = CSH.CategoricalHyperparameter(
184-
# name="sublinear_tf", choices=[True, False], default_value=False
185-
# )
184+
hp_sublinear_tf = CSH.UnParametrizedHyperparameter(
185+
name="sublinear_tf", value="False"
186+
)
186187

187188
hp_per_column = CSH.CategoricalHyperparameter(
188189
name="per_column", choices=[True, False], default_value=False
@@ -195,9 +196,9 @@ def get_hyperparameter_search_space(
195196
hp_ngram_range_word,
196197
hp_max_df,
197198
hp_min_df,
198-
# hp_binary,
199-
# hp_norm,
200-
# hp_sublinear_tf,
199+
hp_binary,
200+
hp_norm,
201+
hp_sublinear_tf,
201202
hp_per_column,
202203
]
203204
)

autosklearn/pipeline/components/data_preprocessing/text_feature_reduction/truncated_svd.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,22 +31,28 @@ def fit(
3131
self.preprocessor = TruncatedSVD(
3232
n_components=self.n_components, random_state=self.random_state
3333
)
34+
self.preprocessor.fit(X)
3435
elif X.shape[1] <= self.n_components and X.shape[1] != 1:
3536
self.preprocessor = TruncatedSVD(
3637
n_components=X.shape[1] - 1, random_state=self.random_state
3738
)
39+
self.preprocessor.fit(X)
40+
elif X.shape[1] == 1:
41+
self.preprocessor = "passthrough"
3842
else:
3943
raise ValueError(
4044
"The text embedding consists only of a single dimension.\n"
4145
"Are you sure that your text data is necessary?"
4246
)
43-
self.preprocessor.fit(X)
4447
return self
4548

4649
def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE:
4750
if self.preprocessor is None:
4851
raise NotImplementedError()
49-
return self.preprocessor.transform(X)
52+
elif self.preprocessor == "passthrough":
53+
return X
54+
else:
55+
return self.preprocessor.transform(X)
5056

5157
@staticmethod
5258
def get_properties(

test/test_metalearning/pyMetaLearn/test_metalearning_configuration.py

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -17,24 +17,30 @@ def test_metalearning_cs_size(self):
1717
data_dir = os.path.join(data_dir, "test_meta_base_data")
1818
os.chdir(data_dir)
1919

20+
# Total: 179, categorical: 3, numerical: 6, string: 11
21+
total = 179
22+
num_numerical = 6
23+
num_string = 11
24+
num_categorical = 3
2025
for feat_type, cs_size in [
21-
({"A": "numerical"}, 165),
22-
({"A": "categorical"}, 162),
23-
({"A": "string"}, 174 - 7),
24-
({"A": "numerical", "B": "categorical"}, 168),
25-
({"A": "numerical", "B": "string"}, 180 - 7),
26-
({"A": "categorical", "B": "string"}, 177 - 7),
27-
({"A": "categorical", "B": "string", "C": "numerical"}, 183 - 7),
26+
({"A": "numerical"}, total - num_string - num_categorical),
27+
({"A": "categorical"}, total - num_string - num_numerical),
28+
({"A": "string"}, total - num_categorical - num_numerical),
29+
({"A": "numerical", "B": "categorical"}, total - num_string),
30+
({"A": "numerical", "B": "string"}, total - num_categorical),
31+
({"A": "categorical", "B": "string"}, total - num_numerical),
32+
({"A": "categorical", "B": "string", "C": "numerical"}, total),
2833
]:
2934
pipeline = autosklearn.pipeline.classification.SimpleClassificationPipeline(
3035
feat_type=feat_type
3136
)
3237
self.cs = pipeline.get_hyperparameter_search_space(feat_type=feat_type)
33-
# print(self.cs.get_default_configuration())
3438

3539
self.logger = logging.getLogger()
3640
meta_base = MetaBase(self.cs, data_dir, logger=self.logger)
3741
self.meta_optimizer = metalearner.MetaLearningOptimizer(
3842
"233", self.cs, meta_base, logger=self.logger
3943
)
40-
self.assertEqual(len(self.meta_optimizer.configuration_space), cs_size)
44+
self.assertEqual(
45+
len(self.meta_optimizer.configuration_space), cs_size, feat_type
46+
)

test/test_pipeline/components/data_preprocessing/test_data_preprocessing_feat_type.py

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,11 @@
66

77

88
class PreprocessingPipelineFeatTypeTest(unittest.TestCase):
9+
10+
num_numerical = 6
11+
num_categorical = 3
12+
num_text = 11
13+
914
def test_single_type(self):
1015
DPP = FeatTypeSplit(feat_type={"A": "numerical"})
1116
cs = DPP.get_hyperparameter_search_space(
@@ -22,7 +27,7 @@ def test_single_type(self):
2227
for key in cs.get_hyperparameters_dict().keys():
2328
self.assertNotIn("text", key.split(":")[0])
2429
self.assertNotIn("categorical", key.split(":")[0])
25-
self.assertEqual(len(cs), 6)
30+
self.assertEqual(len(cs), self.num_numerical)
2631

2732
DPP = FeatTypeSplit(feat_type={"A": "categorical"})
2833
cs = DPP.get_hyperparameter_search_space(
@@ -39,7 +44,7 @@ def test_single_type(self):
3944
for key in cs.get_hyperparameters_dict().keys():
4045
self.assertNotIn("text", key.split(":")[0])
4146
self.assertNotIn("numerical", key.split(":")[0])
42-
self.assertEqual(len(cs), 3)
47+
self.assertEqual(len(cs), self.num_categorical)
4348

4449
DPP = FeatTypeSplit(feat_type={"A": "string"})
4550
cs = DPP.get_hyperparameter_search_space(
@@ -56,7 +61,7 @@ def test_single_type(self):
5661
for key in cs.get_hyperparameters_dict().keys():
5762
self.assertNotIn("numerical", key.split(":")[0])
5863
self.assertNotIn("categorical", key.split(":")[0])
59-
self.assertEqual(len(cs), 15 - 7)
64+
self.assertEqual(len(cs), self.num_text)
6065

6166
def test_dual_type(self):
6267
DPP = FeatTypeSplit(feat_type={"A": "numerical", "B": "categorical"})
@@ -73,7 +78,7 @@ def test_dual_type(self):
7378
)
7479
for key in cs.get_hyperparameters_dict().keys():
7580
self.assertNotIn("text", key.split(":")[0])
76-
self.assertEqual(len(cs), 9)
81+
self.assertEqual(len(cs), self.num_numerical + self.num_categorical)
7782

7883
DPP = FeatTypeSplit(feat_type={"A": "categorical", "B": "string"})
7984
cs = DPP.get_hyperparameter_search_space(
@@ -89,7 +94,7 @@ def test_dual_type(self):
8994
)
9095
for key in cs.get_hyperparameters_dict().keys():
9196
self.assertNotIn("numerical", key.split(":")[0])
92-
self.assertEqual(len(cs), 18 - 7)
97+
self.assertEqual(len(cs), self.num_categorical + self.num_text)
9398

9499
DPP = FeatTypeSplit(feat_type={"A": "string", "B": "numerical"})
95100
cs = DPP.get_hyperparameter_search_space(
@@ -105,7 +110,7 @@ def test_dual_type(self):
105110
)
106111
for key in cs.get_hyperparameters_dict().keys():
107112
self.assertNotIn("categorical", key.split(":")[0])
108-
self.assertEqual(len(cs), 21 - 7)
113+
self.assertEqual(len(cs), self.num_text + self.num_numerical)
109114

110115
def test_triple_type(self):
111116
DPP = FeatTypeSplit(
@@ -132,4 +137,6 @@ def test_triple_type(self):
132137
truth_table[2] = True
133138

134139
self.assertEqual(sum(truth_table), 3)
135-
self.assertEqual(len(cs), 24 - 7)
140+
self.assertEqual(
141+
len(cs), self.num_numerical + self.num_categorical + self.num_text
142+
)

0 commit comments

Comments
 (0)