diff --git a/autosklearn/metalearning/metafeatures/metafeature.py b/autosklearn/metalearning/metafeatures/metafeature.py index 033b76116b..ec97eafb22 100644 --- a/autosklearn/metalearning/metafeatures/metafeature.py +++ b/autosklearn/metalearning/metafeatures/metafeature.py @@ -15,19 +15,18 @@ def __init__(self): pass @abstractmethod - def _calculate(cls, X, y, logger, categorical): + def _calculate(cls, X, y, logger, feat_type): pass - def __call__(self, X, y, logger, categorical=None): - if categorical is None: - categorical = [False for i in range(X.shape[1])] + def __call__(self, X, y, logger, feat_type=None): + if feat_type is None: + feat_type = {i: "numerical" for i in range(X.shape[1])} starttime = time.time() - try: if scipy.sparse.issparse(X) and hasattr(self, "_calculate_sparse"): - value = self._calculate_sparse(X, y, logger, categorical) + value = self._calculate_sparse(X, y, logger, feat_type) else: - value = self._calculate(X, y, logger, categorical) + value = self._calculate(X, y, logger, feat_type) comment = "" except MemoryError: value = None diff --git a/autosklearn/metalearning/metafeatures/metafeatures.py b/autosklearn/metalearning/metafeatures/metafeatures.py index 3c95fbf22f..2c4d9cf4bd 100644 --- a/autosklearn/metalearning/metafeatures/metafeatures.py +++ b/autosklearn/metalearning/metafeatures/metafeatures.py @@ -135,13 +135,13 @@ def wrapper(metafeature_class): ################################################################################ @metafeatures.define("NumberOfInstances") class NumberOfInstances(MetaFeature): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): return float(X.shape[0]) @metafeatures.define("LogNumberOfInstances", dependency="NumberOfInstances") class LogNumberOfInstances(MetaFeature): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): return np.log(metafeatures.get_value("NumberOfInstances")) @@ -154,7 +154,7 @@ class NumberOfClasses(MetaFeature): does this for each label seperately and returns the mean. """ - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): if type_of_target(y) == "multilabel-indicator": # We have a label binary indicator array: # each sample is one row of a 2d array of shape (n_samples, n_classes) @@ -167,23 +167,23 @@ def _calculate(self, X, y, logger, categorical): @metafeatures.define("NumberOfFeatures") class NumberOfFeatures(MetaFeature): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): return float(X.shape[1]) @metafeatures.define("LogNumberOfFeatures", dependency="NumberOfFeatures") class LogNumberOfFeatures(MetaFeature): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): return np.log(metafeatures.get_value("NumberOfFeatures")) @helper_functions.define("MissingValues") class MissingValues(HelperFunction): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): missing = pd.isna(X) return missing - def _calculate_sparse(self, X, y, logger, categorical): + def _calculate_sparse(self, X, y, logger, feat_type): data = [True if not np.isfinite(x) else False for x in X.data] missing = X.__class__((data, X.indices, X.indptr), shape=X.shape, dtype=bool) return missing @@ -191,12 +191,12 @@ def _calculate_sparse(self, X, y, logger, categorical): @metafeatures.define("NumberOfInstancesWithMissingValues", dependency="MissingValues") class NumberOfInstancesWithMissingValues(MetaFeature): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): missing = helper_functions.get_value("MissingValues") num_missing = missing.sum(axis=1) return float(np.sum([1 if num > 0 else 0 for num in num_missing])) - def _calculate_sparse(self, X, y, logger, categorical): + def _calculate_sparse(self, X, y, logger, feat_type): missing = helper_functions.get_value("MissingValues") new_missing = missing.tocsr() num_missing = [ @@ -212,7 +212,7 @@ def _calculate_sparse(self, X, y, logger, categorical): dependency="NumberOfInstancesWithMissingValues", ) class PercentageOfInstancesWithMissingValues(MetaFeature): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): n_missing = metafeatures.get_value("NumberOfInstancesWithMissingValues") n_total = float(metafeatures["NumberOfInstances"](X, y, logger).value) return float(n_missing / n_total) @@ -220,12 +220,12 @@ def _calculate(self, X, y, logger, categorical): @metafeatures.define("NumberOfFeaturesWithMissingValues", dependency="MissingValues") class NumberOfFeaturesWithMissingValues(MetaFeature): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): missing = helper_functions.get_value("MissingValues") num_missing = missing.sum(axis=0) return float(np.sum([1 if num > 0 else 0 for num in num_missing])) - def _calculate_sparse(self, X, y, logger, categorical): + def _calculate_sparse(self, X, y, logger, feat_type): missing = helper_functions.get_value("MissingValues") new_missing = missing.tocsc() num_missing = [ @@ -241,7 +241,7 @@ def _calculate_sparse(self, X, y, logger, categorical): dependency="NumberOfFeaturesWithMissingValues", ) class PercentageOfFeaturesWithMissingValues(MetaFeature): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): n_missing = metafeatures.get_value("NumberOfFeaturesWithMissingValues") n_total = float(metafeatures["NumberOfFeatures"](X, y, logger).value) return float(n_missing / n_total) @@ -249,7 +249,7 @@ def _calculate(self, X, y, logger, categorical): @metafeatures.define("NumberOfMissingValues", dependency="MissingValues") class NumberOfMissingValues(MetaFeature): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): if scipy.sparse.issparse(X): return float(helper_functions.get_value("MissingValues").sum()) else: @@ -258,7 +258,7 @@ def _calculate(self, X, y, logger, categorical): @metafeatures.define("PercentageOfMissingValues", dependency="NumberOfMissingValues") class PercentageOfMissingValues(MetaFeature): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): return float(metafeatures.get_value("NumberOfMissingValues")) / float( X.shape[0] * X.shape[1] ) @@ -267,24 +267,24 @@ def _calculate(self, X, y, logger, categorical): # TODO: generalize this! @metafeatures.define("NumberOfNumericFeatures") class NumberOfNumericFeatures(MetaFeature): - def _calculate(self, X, y, logger, categorical): - return len(categorical) - np.sum(list(categorical.values())) + def _calculate(self, X, y, logger, feat_type): + return np.sum([value == "numerical" for value in feat_type.values()]) @metafeatures.define("NumberOfCategoricalFeatures") class NumberOfCategoricalFeatures(MetaFeature): - def _calculate(self, X, y, logger, categorical): - return np.sum(list(categorical.values())) + def _calculate(self, X, y, logger, feat_type): + return np.sum([value == "categorical" for value in feat_type.values()]) @metafeatures.define("RatioNumericalToNominal") class RatioNumericalToNominal(MetaFeature): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): num_categorical = float( - metafeatures["NumberOfCategoricalFeatures"](X, y, logger, categorical).value + metafeatures["NumberOfCategoricalFeatures"](X, y, logger, feat_type).value ) num_numerical = float( - metafeatures["NumberOfNumericFeatures"](X, y, logger, categorical).value + metafeatures["NumberOfNumericFeatures"](X, y, logger, feat_type).value ) if num_categorical == 0.0: return 0.0 @@ -293,12 +293,12 @@ def _calculate(self, X, y, logger, categorical): @metafeatures.define("RatioNominalToNumerical") class RatioNominalToNumerical(MetaFeature): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): num_categorical = float( - metafeatures["NumberOfCategoricalFeatures"](X, y, logger, categorical).value + metafeatures["NumberOfCategoricalFeatures"](X, y, logger, feat_type).value ) num_numerical = float( - metafeatures["NumberOfNumericFeatures"](X, y, logger, categorical).value + metafeatures["NumberOfNumericFeatures"](X, y, logger, feat_type).value ) if num_numerical == 0.0: return 0.0 @@ -309,7 +309,7 @@ def _calculate(self, X, y, logger, categorical): # Number of attributes divided by number of samples @metafeatures.define("DatasetRatio") class DatasetRatio(MetaFeature): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): return float(metafeatures["NumberOfFeatures"](X, y, logger).value) / float( metafeatures["NumberOfInstances"](X, y, logger).value ) @@ -317,13 +317,13 @@ def _calculate(self, X, y, logger, categorical): @metafeatures.define("LogDatasetRatio", dependency="DatasetRatio") class LogDatasetRatio(MetaFeature): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): return np.log(metafeatures.get_value("DatasetRatio")) @metafeatures.define("InverseDatasetRatio") class InverseDatasetRatio(MetaFeature): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): return float(metafeatures["NumberOfInstances"](X, y, logger).value) / float( metafeatures["NumberOfFeatures"](X, y, logger).value ) @@ -331,17 +331,17 @@ def _calculate(self, X, y, logger, categorical): @metafeatures.define("LogInverseDatasetRatio", dependency="InverseDatasetRatio") class LogInverseDatasetRatio(MetaFeature): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): return np.log(metafeatures.get_value("InverseDatasetRatio")) @helper_functions.define("ClassOccurences") class ClassOccurences(HelperFunction): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): if len(y.shape) == 2: occurences = [] for i in range(y.shape[1]): - occurences.append(self._calculate(X, y[:, i], logger, categorical)) + occurences.append(self._calculate(X, y[:, i], logger, feat_type)) return occurences else: occurence_dict = defaultdict(float) @@ -352,7 +352,7 @@ def _calculate(self, X, y, logger, categorical): @metafeatures.define("ClassProbabilityMin", dependency="ClassOccurences") class ClassProbabilityMin(MetaFeature): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): occurences = helper_functions.get_value("ClassOccurences") min_value = np.iinfo(np.int64).max @@ -371,7 +371,7 @@ def _calculate(self, X, y, logger, categorical): # aka default accuracy @metafeatures.define("ClassProbabilityMax", dependency="ClassOccurences") class ClassProbabilityMax(MetaFeature): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): occurences = helper_functions.get_value("ClassOccurences") max_value = -1 @@ -389,7 +389,7 @@ def _calculate(self, X, y, logger, categorical): @metafeatures.define("ClassProbabilityMean", dependency="ClassOccurences") class ClassProbabilityMean(MetaFeature): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): occurence_dict = helper_functions.get_value("ClassOccurences") if len(y.shape) == 2: @@ -408,7 +408,7 @@ def _calculate(self, X, y, logger, categorical): @metafeatures.define("ClassProbabilitySTD", dependency="ClassOccurences") class ClassProbabilitySTD(MetaFeature): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): occurence_dict = helper_functions.get_value("ClassOccurences") if len(y.shape) == 2: @@ -434,7 +434,11 @@ def _calculate(self, X, y, logger, categorical): # be the counterpart for the skewness and kurtosis of the numerical features @helper_functions.define("NumSymbols") class NumSymbols(HelperFunction): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): + categorical = { + key: True if value.lower() == "categorical" else False + for key, value in feat_type.items() + } symbols_per_column = [] for i in range(X.shape[1]): if categorical[X.columns[i] if hasattr(X, "columns") else i]: @@ -446,7 +450,11 @@ def _calculate(self, X, y, logger, categorical): symbols_per_column.append(num_unique) return symbols_per_column - def _calculate_sparse(self, X, y, logger, categorical): + def _calculate_sparse(self, X, y, logger, feat_type): + categorical = { + key: True if value.lower() == "categorical" else False + for key, value in feat_type.items() + } symbols_per_column = [] new_X = X.tocsc() for i in range(new_X.shape[1]): @@ -459,7 +467,7 @@ def _calculate_sparse(self, X, y, logger, categorical): @metafeatures.define("SymbolsMin", dependency="NumSymbols") class SymbolsMin(MetaFeature): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): # The minimum can only be zero if there are no nominal features, # otherwise it is at least one # TODO: shouldn't this rather be two? @@ -472,7 +480,7 @@ def _calculate(self, X, y, logger, categorical): @metafeatures.define("SymbolsMax", dependency="NumSymbols") class SymbolsMax(MetaFeature): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): values = helper_functions.get_value("NumSymbols") if len(values) == 0: return 0 @@ -481,7 +489,7 @@ def _calculate(self, X, y, logger, categorical): @metafeatures.define("SymbolsMean", dependency="NumSymbols") class SymbolsMean(MetaFeature): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): # TODO: categorical attributes without a symbol don't count towards this # measure values = [val for val in helper_functions.get_value("NumSymbols") if val > 0] @@ -491,7 +499,7 @@ def _calculate(self, X, y, logger, categorical): @metafeatures.define("SymbolsSTD", dependency="NumSymbols") class SymbolsSTD(MetaFeature): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): values = [val for val in helper_functions.get_value("NumSymbols") if val > 0] std = np.nanstd(values) return std if np.isfinite(std) else 0 @@ -499,7 +507,7 @@ def _calculate(self, X, y, logger, categorical): @metafeatures.define("SymbolsSum", dependency="NumSymbols") class SymbolsSum(MetaFeature): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): sum = np.nansum(helper_functions.get_value("NumSymbols")) return sum if np.isfinite(sum) else 0 @@ -514,10 +522,14 @@ def _calculate(self, X, y, logger, categorical): @helper_functions.define("Kurtosisses") class Kurtosisses(HelperFunction): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): + numerical = { + key: True if value.lower() == "numerical" else False + for key, value in feat_type.items() + } kurts = [] for i in range(X.shape[1]): - if not categorical[X.columns[i] if hasattr(X, "columns") else i]: + if numerical[X.columns[i] if hasattr(X, "columns") else i]: kurts.append( scipy.stats.kurtosis( X.iloc[:, i] if hasattr(X, "iloc") else X[:, i] @@ -525,11 +537,15 @@ def _calculate(self, X, y, logger, categorical): ) return kurts - def _calculate_sparse(self, X, y, logger, categorical): + def _calculate_sparse(self, X, y, logger, feat_type): + numerical = { + key: True if value.lower() == "numerical" else False + for key, value in feat_type.items() + } kurts = [] X_new = X.tocsc() for i in range(X_new.shape[1]): - if not categorical[X.columns[i] if hasattr(X, "columns") else i]: + if numerical[X.columns[i] if hasattr(X, "columns") else i]: start = X_new.indptr[i] stop = X_new.indptr[i + 1] kurts.append(scipy.stats.kurtosis(X_new.data[start:stop])) @@ -538,7 +554,7 @@ def _calculate_sparse(self, X, y, logger, categorical): @metafeatures.define("KurtosisMin", dependency="Kurtosisses") class KurtosisMin(MetaFeature): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): kurts = helper_functions.get_value("Kurtosisses") minimum = np.nanmin(kurts) if len(kurts) > 0 else 0 return minimum if np.isfinite(minimum) else 0 @@ -546,7 +562,7 @@ def _calculate(self, X, y, logger, categorical): @metafeatures.define("KurtosisMax", dependency="Kurtosisses") class KurtosisMax(MetaFeature): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): kurts = helper_functions.get_value("Kurtosisses") maximum = np.nanmax(kurts) if len(kurts) > 0 else 0 return maximum if np.isfinite(maximum) else 0 @@ -554,7 +570,7 @@ def _calculate(self, X, y, logger, categorical): @metafeatures.define("KurtosisMean", dependency="Kurtosisses") class KurtosisMean(MetaFeature): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): kurts = helper_functions.get_value("Kurtosisses") mean = np.nanmean(kurts) if len(kurts) > 0 else 0 return mean if np.isfinite(mean) else 0 @@ -562,7 +578,7 @@ def _calculate(self, X, y, logger, categorical): @metafeatures.define("KurtosisSTD", dependency="Kurtosisses") class KurtosisSTD(MetaFeature): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): kurts = helper_functions.get_value("Kurtosisses") std = np.nanstd(kurts) if len(kurts) > 0 else 0 return std if np.isfinite(std) else 0 @@ -570,20 +586,28 @@ def _calculate(self, X, y, logger, categorical): @helper_functions.define("Skewnesses") class Skewnesses(HelperFunction): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): + numerical = { + key: True if value.lower() == "numerical" else False + for key, value in feat_type.items() + } skews = [] for i in range(X.shape[1]): - if not categorical[X.columns[i] if hasattr(X, "columns") else i]: + if numerical[X.columns[i] if hasattr(X, "columns") else i]: skews.append( scipy.stats.skew(X.iloc[:, i] if hasattr(X, "iloc") else X[:, i]) ) return skews - def _calculate_sparse(self, X, y, logger, categorical): + def _calculate_sparse(self, X, y, logger, feat_type): + numerical = { + key: True if value.lower() == "numerical" else False + for key, value in feat_type.items() + } skews = [] X_new = X.tocsc() for i in range(X_new.shape[1]): - if not categorical[X.columns[i] if hasattr(X, "columns") else i]: + if numerical[X.columns[i] if hasattr(X, "columns") else i]: start = X_new.indptr[i] stop = X_new.indptr[i + 1] skews.append(scipy.stats.skew(X_new.data[start:stop])) @@ -592,7 +616,7 @@ def _calculate_sparse(self, X, y, logger, categorical): @metafeatures.define("SkewnessMin", dependency="Skewnesses") class SkewnessMin(MetaFeature): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): skews = helper_functions.get_value("Skewnesses") minimum = np.nanmin(skews) if len(skews) > 0 else 0 return minimum if np.isfinite(minimum) else 0 @@ -600,7 +624,7 @@ def _calculate(self, X, y, logger, categorical): @metafeatures.define("SkewnessMax", dependency="Skewnesses") class SkewnessMax(MetaFeature): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): skews = helper_functions.get_value("Skewnesses") maximum = np.nanmax(skews) if len(skews) > 0 else 0 return maximum if np.isfinite(maximum) else 0 @@ -608,7 +632,7 @@ def _calculate(self, X, y, logger, categorical): @metafeatures.define("SkewnessMean", dependency="Skewnesses") class SkewnessMean(MetaFeature): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): skews = helper_functions.get_value("Skewnesses") mean = np.nanmean(skews) if len(skews) > 0 else 0 return mean if np.isfinite(mean) else 0 @@ -616,7 +640,7 @@ def _calculate(self, X, y, logger, categorical): @metafeatures.define("SkewnessSTD", dependency="Skewnesses") class SkewnessSTD(MetaFeature): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): skews = helper_functions.get_value("Skewnesses") std = np.nanstd(skews) if len(skews) > 0 else 0 return std if np.isfinite(std) else 0 @@ -637,7 +661,7 @@ def cancor2(X, y): # Information-theoretic metafeatures @metafeatures.define("ClassEntropy") class ClassEntropy(MetaFeature): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): labels = 1 if len(y.shape) == 1 else y.shape[1] entropies = [] @@ -687,7 +711,7 @@ def _calculate(self, X, y, logger, categorical): # Linear discriminant learner @metafeatures.define("LandmarkLDA") class LandmarkLDA(MetaFeature): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): import sklearn.discriminant_analysis if type(y) in ("binary", "multiclass"): @@ -727,14 +751,14 @@ def _calculate(self, X, y, logger, categorical): self.logger.warning("LDA failed: %s Returned 0 instead!" % e) return np.NaN - def _calculate_sparse(self, X, y, logger, categorical): + def _calculate_sparse(self, X, y, logger, feat_type): return np.NaN # Naive Bayes @metafeatures.define("LandmarkNaiveBayes") class LandmarkNaiveBayes(MetaFeature): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): import sklearn.naive_bayes if type(y) in ("binary", "multiclass"): @@ -767,14 +791,14 @@ def _calculate(self, X, y, logger, categorical): ) return accuracy / 5 - def _calculate_sparse(self, X, y, logger, categorical): + def _calculate_sparse(self, X, y, logger, feat_type): return np.NaN # Cart learner instead of C5.0 @metafeatures.define("LandmarkDecisionTree") class LandmarkDecisionTree(MetaFeature): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): import sklearn.tree if type(y) in ("binary", "multiclass"): @@ -808,7 +832,7 @@ def _calculate(self, X, y, logger, categorical): ) return accuracy / 5 - def _calculate_sparse(self, X, y, logger, categorical): + def _calculate_sparse(self, X, y, logger, feat_type): return np.NaN @@ -821,7 +845,7 @@ def _calculate_sparse(self, X, y, logger, categorical): # saves a lot of time... @metafeatures.define("LandmarkDecisionNodeLearner") class LandmarkDecisionNodeLearner(MetaFeature): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): import sklearn.tree if type(y) in ("binary", "multiclass"): @@ -860,13 +884,13 @@ def _calculate(self, X, y, logger, categorical): ) return accuracy / 5 - def _calculate_sparse(self, X, y, logger, categorical): + def _calculate_sparse(self, X, y, logger, feat_type): return np.NaN @metafeatures.define("LandmarkRandomNodeLearner") class LandmarkRandomNodeLearner(MetaFeature): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): import sklearn.tree if type(y) in ("binary", "multiclass"): @@ -898,7 +922,7 @@ def _calculate(self, X, y, logger, categorical): ) return accuracy / 5 - def _calculate_sparse(self, X, y, logger, categorical): + def _calculate_sparse(self, X, y, logger, feat_type): return np.NaN @@ -931,7 +955,7 @@ def landmark_worst_node_learner(X, y): # intuition behind this landmark, but Elite 1NN is used nowhere else... @metafeatures.define("Landmark1NN") class Landmark1NN(MetaFeature): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): import sklearn.neighbors if type(y) in ("binary", "multiclass"): @@ -974,7 +998,7 @@ def _calculate(self, X, y, logger, categorical): # kurtosis of a dataset projected onto one principal component @helper_functions.define("PCA") class PCA(HelperFunction): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): import sklearn.decomposition pca = sklearn.decomposition.PCA(copy=True) @@ -992,7 +1016,7 @@ def _calculate(self, X, y, logger, categorical): self.logger.warning("Failed to compute a Principle Component Analysis") return None - def _calculate_sparse(self, X, y, logger, categorical): + def _calculate_sparse(self, X, y, logger, feat_type): import sklearn.decomposition rs = np.random.RandomState(42) @@ -1016,7 +1040,7 @@ def _calculate_sparse(self, X, y, logger, categorical): # Maybe define some more... @metafeatures.define("PCAFractionOfComponentsFor95PercentVariance", dependency="PCA") class PCAFractionOfComponentsFor95PercentVariance(MetaFeature): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): pca_ = helper_functions.get_value("PCA") if pca_ is None: return np.NaN @@ -1031,7 +1055,7 @@ def _calculate(self, X, y, logger, categorical): # Kurtosis of first PC @metafeatures.define("PCAKurtosisFirstPC", dependency="PCA") class PCAKurtosisFirstPC(MetaFeature): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): pca_ = helper_functions.get_value("PCA") if pca_ is None: return np.NaN @@ -1047,7 +1071,7 @@ def _calculate(self, X, y, logger, categorical): # Skewness of first PC @metafeatures.define("PCASkewnessFirstPC", dependency="PCA") class PCASkewnessFirstPC(MetaFeature): - def _calculate(self, X, y, logger, categorical): + def _calculate(self, X, y, logger, feat_type): pca_ = helper_functions.get_value("PCA") if pca_ is None: return np.NaN @@ -1061,7 +1085,7 @@ def _calculate(self, X, y, logger, categorical): def calculate_all_metafeatures_encoded_labels( - X, y, categorical, dataset_name, logger, calculate=None, dont_calculate=None + X, y, feat_type, dataset_name, logger, calculate=None, dont_calculate=None ): """ Calculate only metafeatures for which a 1HotEncoded feature matrix is necessery. @@ -1073,7 +1097,7 @@ def calculate_all_metafeatures_encoded_labels( return calculate_all_metafeatures( X, y, - categorical, + feat_type, dataset_name, calculate=calculate, dont_calculate=dont_calculate, @@ -1082,7 +1106,7 @@ def calculate_all_metafeatures_encoded_labels( def calculate_all_metafeatures_with_labels( - X, y, categorical, dataset_name, logger, calculate=None, dont_calculate=None + X, y, feat_type, dataset_name, logger, calculate=None, dont_calculate=None ): if dont_calculate is None: dont_calculate = set() @@ -1092,7 +1116,7 @@ def calculate_all_metafeatures_with_labels( return calculate_all_metafeatures( X, y, - categorical, + feat_type, dataset_name, calculate=calculate, dont_calculate=dont_calculate, @@ -1103,7 +1127,7 @@ def calculate_all_metafeatures_with_labels( def calculate_all_metafeatures( X, y, - categorical, + feat_type, dataset_name, logger, calculate=None, @@ -1138,11 +1162,6 @@ def calculate_all_metafeatures( # sparse matrices because of wrong sparse format) sparse = scipy.sparse.issparse(X) - feat_type = { - key: "categorical" if value else "numerical" - for key, value in categorical.items() - } - # TODO make this more cohesive to the overall structure (quick bug fix) if isinstance(X, pd.DataFrame): for key in X.select_dtypes(include="string").columns: @@ -1155,8 +1174,8 @@ def calculate_all_metafeatures( force_sparse_output=True, ) X_transformed = DPP.fit_transform(X) - categorical_transformed = { - i: False for i in range(X_transformed.shape[1]) + feat_type_transformed = { + i: "numerical" for i in range(X_transformed.shape[1]) } # Densify the transformed matrix @@ -1185,11 +1204,11 @@ def calculate_all_metafeatures( X_ = X_transformed y_ = y_transformed - categorical_ = categorical_transformed + feat_type_ = feat_type_transformed else: X_ = X y_ = y - categorical_ = categorical + feat_type_ = feat_type dependency = metafeatures.get_dependency(name) if dependency is not None: @@ -1206,14 +1225,14 @@ def calculate_all_metafeatures( elif is_helper_function and not helper_functions.is_calculated(dependency): logger.info("%s: Going to calculate: %s", dataset_name, dependency) value = helper_functions[dependency]( - X_, y_, categorical=categorical_, logger=logger + X_, y_, feat_type=feat_type_, logger=logger ) helper_functions.set_value(dependency, value) mf_[dependency] = value logger.info("%s: Going to calculate: %s", dataset_name, name) - value = metafeatures[name](X_, y_, logger, categorical_) + value = metafeatures[name](X_, y_, logger, feat_type_) metafeatures.set_value(name, value) mf_[name] = value visited.add(name) diff --git a/autosklearn/pipeline/components/data_preprocessing/feature_type.py b/autosklearn/pipeline/components/data_preprocessing/feature_type.py index 5c37e4cb98..bd42d8a67a 100644 --- a/autosklearn/pipeline/components/data_preprocessing/feature_type.py +++ b/autosklearn/pipeline/components/data_preprocessing/feature_type.py @@ -162,9 +162,9 @@ def fit( sklearn_transf_spec = [ (name, transformer, feature_columns) for name, transformer, feature_columns in [ - ("text_transformer", self.txt_ppl, text_features), ("categorical_transformer", self.categ_ppl, categorical_features), ("numerical_transformer", self.numer_ppl, numerical_features), + ("text_transformer", self.txt_ppl, text_features), ] if len(feature_columns) > 0 ] diff --git a/autosklearn/smbo.py b/autosklearn/smbo.py index 608c58921d..c463badb1d 100644 --- a/autosklearn/smbo.py +++ b/autosklearn/smbo.py @@ -101,11 +101,6 @@ def _calculate_metafeatures( # == Calculate metafeatures with stopwatch.time("Calculate meta-features") as task_timer: - categorical = { - col: True if feat_type.lower() in {"categorical", "string"} else False - for col, feat_type in data_feat_type.items() - } - EXCLUDE_META_FEATURES = ( EXCLUDE_META_FEATURES_CLASSIFICATION if data_info_task in CLASSIFICATION_TASKS @@ -123,7 +118,7 @@ def _calculate_metafeatures( result = calculate_all_metafeatures_with_labels( x_train, y_train, - categorical=categorical, + feat_type=data_feat_type, dataset_name=basename, dont_calculate=EXCLUDE_META_FEATURES, logger=logger_, @@ -159,15 +154,11 @@ def _calculate_metafeatures_encoded( ) with stopwatch.time("Calculate meta-features encoded") as task_timer: - categorical = { - col: True if feat_type.lower() in {"categorical", "string"} else False - for col, feat_type in data_feat_type.items() - } result = calculate_all_metafeatures_encoded_labels( x_train, y_train, - categorical=categorical, + feat_type=data_feat_type, dataset_name=basename, dont_calculate=EXCLUDE_META_FEATURES, logger=logger_, diff --git a/test/test_metalearning/pyMetaLearn/test_meta_features.py b/test/test_metalearning/pyMetaLearn/test_meta_features.py index 40048c708a..3dea2c762b 100644 --- a/test/test_metalearning/pyMetaLearn/test_meta_features.py +++ b/test/test_metalearning/pyMetaLearn/test_meta_features.py @@ -48,14 +48,9 @@ def meta_train_data(request): dataset = decoder.decode(fh, encode_nominal=True) # -1 because the last attribute is the class - attribute_types = [ - "numeric" if type(type_) != list else "nominal" - for name, type_ in dataset["attributes"][:-1] - ] - - categorical = { - i: True if attribute == "nominal" else False - for i, attribute in enumerate(attribute_types) + feat_type = { + idx: "numerical" if type(type_) != list else "categorical" + for idx, (name, type_) in enumerate(dataset["attributes"][:-1]) } data = np.array(dataset["data"], dtype=np.float64) @@ -65,20 +60,32 @@ def meta_train_data(request): logger = logging.getLogger("Meta") meta_features.helper_functions.set_value( "MissingValues", - meta_features.helper_functions["MissingValues"](X, y, logger, categorical), + meta_features.helper_functions["MissingValues"](X, y, logger, feat_type), ) meta_features.helper_functions.set_value( "NumSymbols", - meta_features.helper_functions["NumSymbols"](X, y, logger, categorical), + meta_features.helper_functions["NumSymbols"](X, y, logger, feat_type), ) meta_features.helper_functions.set_value( "ClassOccurences", meta_features.helper_functions["ClassOccurences"](X, y, logger), ) if request.param == "numpy": - return X, y, categorical + return X, y, feat_type elif request.param == "pandas": - return pd.DataFrame(X), y, categorical + dtypes = {} + for key, value in feat_type.items(): + if value == "categorical": + dtypes[key] = "category" + elif value == "numerical": + dtypes[key] = "float64" + elif value == "string": + dtypes[key] = "string" + else: + raise KeyError + + X = pd.DataFrame(X).astype(dtypes) + return X, y, feat_type else: raise ValueError(request.param) @@ -93,13 +100,9 @@ def meta_train_data_transformed(request): dataset = decoder.decode(fh, encode_nominal=True) # -1 because the last attribute is the class - attribute_types = [ - "numeric" if type(type_) != list else "nominal" - for name, type_ in dataset["attributes"][:-1] - ] - categorical = { - i: True if attribute == "nominal" else False - for i, attribute in enumerate(attribute_types) + feat_type = { + idx: "numerical" if type(type_) != list else "categorical" + for idx, (name, type_) in enumerate(dataset["attributes"][:-1]) } data = np.array(dataset["data"], dtype=np.float64) @@ -109,30 +112,20 @@ def meta_train_data_transformed(request): logger = logging.getLogger("Meta") meta_features.helper_functions.set_value( "MissingValues", - meta_features.helper_functions["MissingValues"](X, y, logger, categorical), + meta_features.helper_functions["MissingValues"](X, y, logger, feat_type), ) meta_features.helper_functions.set_value( "NumSymbols", - meta_features.helper_functions["NumSymbols"](X, y, logger, categorical), + meta_features.helper_functions["NumSymbols"](X, y, logger, feat_type), ) meta_features.helper_functions.set_value( "ClassOccurences", meta_features.helper_functions["ClassOccurences"](X, y, logger), ) - DPP = FeatTypeSplit( - feat_type={ - col: "categorical" if category else "numerical" - for col, category in categorical.items() - } - ) + DPP = FeatTypeSplit(feat_type=feat_type) X_transformed = DPP.fit_transform(X) - - number_numerical = np.sum(~np.array(list(categorical.values()))) - categorical_transformed = { - i: True if i < (X_transformed.shape[1] - number_numerical) else False - for i in range(X_transformed.shape[1]) - } + feat_type_transformed = {i: "numerical" for i in range(X_transformed.shape[1])} # pre-compute values for transformed inputs meta_features.helper_functions.set_value( @@ -142,55 +135,66 @@ def meta_train_data_transformed(request): meta_features.helper_functions.set_value( "Skewnesses", meta_features.helper_functions["Skewnesses"]( - X_transformed, y, logger, categorical_transformed + X_transformed, y, logger, feat_type_transformed ), ) meta_features.helper_functions.set_value( "Kurtosisses", meta_features.helper_functions["Kurtosisses"]( - X_transformed, y, logger, categorical_transformed + X_transformed, y, logger, feat_type_transformed ), ) if request.param == "numpy": - return X_transformed, y, categorical_transformed + return X_transformed, y, feat_type_transformed elif request.param == "pandas": - return pd.DataFrame(X_transformed), y, categorical_transformed + dtypes = {} + for key, value in feat_type.items(): + if value == "categorical": + dtypes[key] = "category" + elif value == "numerical": + dtypes[key] = "float64" + elif value == "string": + dtypes[key] = "string" + else: + raise KeyError + X_transformed = pd.DataFrame(X_transformed).astype(dtypes) + return X_transformed, y, feat_type_transformed else: raise ValueError(request.param) def test_number_of_instance(meta_train_data): - X, y, categorical = meta_train_data + X, y, feat_type = meta_train_data mf = meta_features.metafeatures["NumberOfInstances"]( - X, y, logging.getLogger("Meta"), categorical + X, y, logging.getLogger("Meta"), feat_type ) assert mf.value == 898 assert isinstance(mf, MetaFeatureValue) def test_number_of_classes(meta_train_data): - X, y, categorical = meta_train_data + X, y, feat_type = meta_train_data mf = meta_features.metafeatures["NumberOfClasses"]( - X, y, logging.getLogger("Meta"), categorical + X, y, logging.getLogger("Meta"), feat_type ) assert mf.value == 5 assert isinstance(mf, MetaFeatureValue) def test_number_of_features(meta_train_data): - X, y, categorical = meta_train_data + X, y, feat_type = meta_train_data mf = meta_features.metafeatures["NumberOfFeatures"]( - X, y, logging.getLogger("Meta"), categorical + X, y, logging.getLogger("Meta"), feat_type ) assert mf.value == 38 assert isinstance(mf, MetaFeatureValue) def test_missing_values(meta_train_data): - X, y, categorical = meta_train_data + X, y, feat_type = meta_train_data mf = meta_features.helper_functions["MissingValues"]( - X, y, logging.getLogger("Meta"), categorical + X, y, logging.getLogger("Meta"), feat_type ) assert isinstance(mf.value, pd.DataFrame if hasattr(X, "iloc") else np.ndarray) assert mf.value.shape == X.shape @@ -198,162 +202,162 @@ def test_missing_values(meta_train_data): def test_number_of_Instances_with_missing_values(meta_train_data): - X, y, categorical = meta_train_data + X, y, feat_type = meta_train_data mf = meta_features.metafeatures["NumberOfInstancesWithMissingValues"]( - X, y, logging.getLogger("Meta"), categorical + X, y, logging.getLogger("Meta"), feat_type ) assert mf.value == 898 assert isinstance(mf, MetaFeatureValue) def test_percentage_of_Instances_with_missing_values(meta_train_data): - X, y, categorical = meta_train_data + X, y, feat_type = meta_train_data meta_features.metafeatures.set_value( "NumberOfInstancesWithMissingValues", meta_features.metafeatures["NumberOfInstancesWithMissingValues"]( - X, y, logging.getLogger("Meta"), categorical + X, y, logging.getLogger("Meta"), feat_type ), ) mf = meta_features.metafeatures["PercentageOfInstancesWithMissingValues"]( - X, y, logging.getLogger("Meta"), categorical + X, y, logging.getLogger("Meta"), feat_type ) assert pytest.approx(mf.value) == 1.0 assert isinstance(mf, MetaFeatureValue) def test_number_of_features_with_missing_values(meta_train_data): - X, y, categorical = meta_train_data + X, y, feat_type = meta_train_data mf = meta_features.metafeatures["NumberOfFeaturesWithMissingValues"]( - X, y, logging.getLogger("Meta"), categorical + X, y, logging.getLogger("Meta"), feat_type ) assert mf.value == 29 assert isinstance(mf, MetaFeatureValue) def test_percentage_of_features_with_missing_values(meta_train_data): - X, y, categorical = meta_train_data + X, y, feat_type = meta_train_data meta_features.metafeatures.set_value( "NumberOfFeaturesWithMissingValues", meta_features.metafeatures["NumberOfFeaturesWithMissingValues"]( - X, y, logging.getLogger("Meta"), categorical + X, y, logging.getLogger("Meta"), feat_type ), ) mf = meta_features.metafeatures["PercentageOfFeaturesWithMissingValues"]( - X, y, logging.getLogger("Meta"), categorical + X, y, logging.getLogger("Meta"), feat_type ) assert pytest.approx(mf.value) == float(29) / float(38) assert isinstance(mf, MetaFeatureValue) def test_number_of_missing_values(meta_train_data): - X, y, categorical = meta_train_data + X, y, feat_type = meta_train_data np.save("/tmp/debug", X) mf = meta_features.metafeatures["NumberOfMissingValues"]( - X, y, logging.getLogger("Meta"), categorical + X, y, logging.getLogger("Meta"), feat_type ) assert mf.value == 22175 assert isinstance(mf, MetaFeatureValue) def test_percentage_missing_values(meta_train_data): - X, y, categorical = meta_train_data + X, y, feat_type = meta_train_data meta_features.metafeatures.set_value( "NumberOfMissingValues", meta_features.metafeatures["NumberOfMissingValues"]( - X, y, logging.getLogger("Meta"), categorical + X, y, logging.getLogger("Meta"), feat_type ), ) mf = meta_features.metafeatures["PercentageOfMissingValues"]( - X, y, logging.getLogger("Meta"), categorical + X, y, logging.getLogger("Meta"), feat_type ) assert pytest.approx(mf.value) == (float(22175) / float(38 * 898)) assert isinstance(mf, MetaFeatureValue) def test_number_of_numeric_features(meta_train_data): - X, y, categorical = meta_train_data + X, y, feat_type = meta_train_data mf = meta_features.metafeatures["NumberOfNumericFeatures"]( - X, y, logging.getLogger("Meta"), categorical + X, y, logging.getLogger("Meta"), feat_type ) assert mf.value == 6 assert isinstance(mf, MetaFeatureValue) def test_number_of_categorical_features(meta_train_data): - X, y, categorical = meta_train_data + X, y, feat_type = meta_train_data mf = meta_features.metafeatures["NumberOfCategoricalFeatures"]( - X, y, logging.getLogger("Meta"), categorical + X, y, logging.getLogger("Meta"), feat_type ) assert mf.value == 32 assert isinstance(mf, MetaFeatureValue) def test_ratio_numerical_to_categorical(meta_train_data): - X, y, categorical = meta_train_data + X, y, feat_type = meta_train_data mf = meta_features.metafeatures["RatioNumericalToNominal"]( - X, y, logging.getLogger("Meta"), categorical + X, y, logging.getLogger("Meta"), feat_type ) assert pytest.approx(mf.value) == (float(6) / float(32)) assert isinstance(mf, MetaFeatureValue) def test_ratio_categorical_to_numerical(meta_train_data): - X, y, categorical = meta_train_data + X, y, feat_type = meta_train_data mf = meta_features.metafeatures["RatioNominalToNumerical"]( - X, y, logging.getLogger("Meta"), categorical + X, y, logging.getLogger("Meta"), feat_type ) assert pytest.approx(mf.value) == (float(32) / float(6)) assert isinstance(mf, MetaFeatureValue) def test_dataset_ratio(meta_train_data): - X, y, categorical = meta_train_data + X, y, feat_type = meta_train_data mf = meta_features.metafeatures["DatasetRatio"]( - X, y, logging.getLogger("Meta"), categorical + X, y, logging.getLogger("Meta"), feat_type ) assert pytest.approx(mf.value) == (float(38) / float(898)) assert isinstance(mf, MetaFeatureValue) def test_inverse_dataset_ratio(meta_train_data): - X, y, categorical = meta_train_data + X, y, feat_type = meta_train_data mf = meta_features.metafeatures["InverseDatasetRatio"]( - X, y, logging.getLogger("Meta"), categorical + X, y, logging.getLogger("Meta"), feat_type ) assert pytest.approx(mf.value) == (float(898) / float(38)) assert isinstance(mf, MetaFeatureValue) def test_class_occurences(meta_train_data): - X, y, categorical = meta_train_data + X, y, feat_type = meta_train_data mf = meta_features.helper_functions["ClassOccurences"]( - X, y, logging.getLogger("Meta"), categorical + X, y, logging.getLogger("Meta"), feat_type ) assert mf.value == {0.0: 8.0, 1.0: 99.0, 2.0: 684.0, 4.0: 67.0, 5.0: 40.0} def test_class_probability_min(meta_train_data): - X, y, categorical = meta_train_data + X, y, feat_type = meta_train_data mf = meta_features.metafeatures["ClassProbabilityMin"]( - X, y, logging.getLogger("Meta"), categorical + X, y, logging.getLogger("Meta"), feat_type ) assert pytest.approx(mf.value) == (float(8) / float(898)) assert isinstance(mf, MetaFeatureValue) def test_class_probability_max(meta_train_data): - X, y, categorical = meta_train_data + X, y, feat_type = meta_train_data mf = meta_features.metafeatures["ClassProbabilityMax"]( - X, y, logging.getLogger("Meta"), categorical + X, y, logging.getLogger("Meta"), feat_type ) assert pytest.approx(mf.value) == (float(684) / float(898)) assert isinstance(mf, MetaFeatureValue) def test_class_probability_mean(meta_train_data): - X, y, categorical = meta_train_data + X, y, feat_type = meta_train_data mf = meta_features.metafeatures["ClassProbabilityMean"]( - X, y, logging.getLogger("Meta"), categorical + X, y, logging.getLogger("Meta"), feat_type ) classes = np.array((8, 99, 684, 67, 40), dtype=np.float64) prob_mean = (classes / float(898)).mean() @@ -362,9 +366,9 @@ def test_class_probability_mean(meta_train_data): def test_class_probability_std(meta_train_data): - X, y, categorical = meta_train_data + X, y, feat_type = meta_train_data mf = meta_features.metafeatures["ClassProbabilitySTD"]( - X, y, logging.getLogger("Meta"), categorical + X, y, logging.getLogger("Meta"), feat_type ) classes = np.array((8, 99, 684, 67, 40), dtype=np.float64) prob_std = (classes / float(898)).std() @@ -373,9 +377,9 @@ def test_class_probability_std(meta_train_data): def test_num_symbols(meta_train_data): - X, y, categorical = meta_train_data + X, y, feat_type = meta_train_data mf = meta_features.helper_functions["NumSymbols"]( - X, y, logging.getLogger("Meta"), categorical + X, y, logging.getLogger("Meta"), feat_type ) symbol_frequency = [ 2, @@ -415,26 +419,26 @@ def test_num_symbols(meta_train_data): def test_symbols_min(meta_train_data): - X, y, categorical = meta_train_data + X, y, feat_type = meta_train_data mf = meta_features.metafeatures["SymbolsMin"]( - X, y, logging.getLogger("Meta"), categorical + X, y, logging.getLogger("Meta"), feat_type ) assert mf.value == 1 def test_symbols_max(meta_train_data): - X, y, categorical = meta_train_data + X, y, feat_type = meta_train_data # this is attribute steel mf = meta_features.metafeatures["SymbolsMax"]( - X, y, logging.getLogger("Meta"), categorical + X, y, logging.getLogger("Meta"), feat_type ) assert mf.value == 7 def test_symbols_mean(meta_train_data): - X, y, categorical = meta_train_data + X, y, feat_type = meta_train_data mf = meta_features.metafeatures["SymbolsMean"]( - X, y, logging.getLogger("Meta"), categorical + X, y, logging.getLogger("Meta"), feat_type ) # Empty looking spaces denote empty attributes symbol_frequency = [ @@ -469,9 +473,9 @@ def test_symbols_mean(meta_train_data): def test_symbols_std(meta_train_data): - X, y, categorical = meta_train_data + X, y, feat_type = meta_train_data mf = meta_features.metafeatures["SymbolsSTD"]( - X, y, logging.getLogger("Meta"), categorical + X, y, logging.getLogger("Meta"), feat_type ) symbol_frequency = [ 2, @@ -505,17 +509,17 @@ def test_symbols_std(meta_train_data): def test_symbols_sum(meta_train_data): - X, y, categorical = meta_train_data + X, y, feat_type = meta_train_data mf = meta_features.metafeatures["SymbolsSum"]( - X, y, logging.getLogger("Meta"), categorical + X, y, logging.getLogger("Meta"), feat_type ) assert mf.value == 49 def test_class_entropy(meta_train_data): - X, y, categorical = meta_train_data + X, y, feat_type = meta_train_data mf = meta_features.metafeatures["ClassEntropy"]( - X, y, logging.getLogger("Meta"), categorical + X, y, logging.getLogger("Meta"), feat_type ) classes = np.array((8, 99, 684, 67, 40), dtype=np.float64) classes = classes / sum(classes) @@ -525,96 +529,96 @@ def test_class_entropy(meta_train_data): def test_calculate_all_metafeatures(meta_train_data): - X, y, categorical = meta_train_data + X, y, feat_type = meta_train_data mf = meta_features.calculate_all_metafeatures( - X, y, categorical, "2", logger=logging.getLogger("Meta") + X, y, feat_type, "2", logger=logging.getLogger("Meta") ) assert 52 == len(mf.metafeature_values) assert mf.metafeature_values["NumberOfCategoricalFeatures"].value == 32 def test_kurtosisses(meta_train_data_transformed): - X_transformed, y, categorical_transformed = meta_train_data_transformed + X_transformed, y, feat_type_transformed = meta_train_data_transformed mf = meta_features.helper_functions["Kurtosisses"]( - X_transformed, y, logging.getLogger("Meta"), categorical_transformed + X_transformed, y, logging.getLogger("Meta"), feat_type_transformed ) - assert 6 == len(mf.value) + assert 81 == len(mf.value) def test_kurtosis_min(meta_train_data_transformed): - X_transformed, y, categorical_transformed = meta_train_data_transformed + X_transformed, y, feat_type_transformed = meta_train_data_transformed # TODO: somehow compute the expected output? meta_features.metafeatures["KurtosisMin"]( - X_transformed, y, logging.getLogger("Meta"), categorical_transformed + X_transformed, y, logging.getLogger("Meta"), feat_type_transformed ) def test_kurtosis_max(meta_train_data_transformed): - X_transformed, y, categorical_transformed = meta_train_data_transformed + X_transformed, y, feat_type_transformed = meta_train_data_transformed # TODO: somehow compute the expected output? meta_features.metafeatures["KurtosisMax"]( - X_transformed, y, logging.getLogger("Meta"), categorical_transformed + X_transformed, y, logging.getLogger("Meta"), feat_type_transformed ) def test_kurtosis_mean(meta_train_data_transformed): - X_transformed, y, categorical_transformed = meta_train_data_transformed + X_transformed, y, feat_type_transformed = meta_train_data_transformed # TODO: somehow compute the expected output? meta_features.metafeatures["KurtosisMean"]( - X_transformed, y, logging.getLogger("Meta"), categorical_transformed + X_transformed, y, logging.getLogger("Meta"), feat_type_transformed ) def test_kurtosis_std(meta_train_data_transformed): - X_transformed, y, categorical_transformed = meta_train_data_transformed + X_transformed, y, feat_type_transformed = meta_train_data_transformed # TODO: somehow compute the expected output? meta_features.metafeatures["KurtosisSTD"]( - X_transformed, y, logging.getLogger("Meta"), categorical_transformed + X_transformed, y, logging.getLogger("Meta"), feat_type_transformed ) def test_skewnesses(meta_train_data_transformed): - X_transformed, y, categorical_transformed = meta_train_data_transformed + X_transformed, y, feat_type_transformed = meta_train_data_transformed mf = meta_features.helper_functions["Skewnesses"]( - X_transformed, y, logging.getLogger("Meta"), categorical_transformed + X_transformed, y, logging.getLogger("Meta"), feat_type_transformed ) - assert 6 == len(mf.value) + assert 81 == len(mf.value) def test_skewness_min(meta_train_data_transformed): - X_transformed, y, categorical_transformed = meta_train_data_transformed + X_transformed, y, feat_type_transformed = meta_train_data_transformed # TODO: somehow compute the expected output? meta_features.metafeatures["SkewnessMin"]( - X_transformed, y, logging.getLogger("Meta"), categorical_transformed + X_transformed, y, logging.getLogger("Meta"), feat_type_transformed ) def test_skewness_max(meta_train_data_transformed): - X_transformed, y, categorical_transformed = meta_train_data_transformed + X_transformed, y, feat_type_transformed = meta_train_data_transformed # TODO: somehow compute the expected output? meta_features.metafeatures["SkewnessMax"]( - X_transformed, y, logging.getLogger("Meta"), categorical_transformed + X_transformed, y, logging.getLogger("Meta"), feat_type_transformed ) def test_skewness_mean(meta_train_data_transformed): - X_transformed, y, categorical_transformed = meta_train_data_transformed + X_transformed, y, feat_type_transformed = meta_train_data_transformed # TODO: somehow compute the expected output? meta_features.metafeatures["SkewnessMean"]( - X_transformed, y, logging.getLogger("Meta"), categorical_transformed + X_transformed, y, logging.getLogger("Meta"), feat_type_transformed ) def test_skewness_std(meta_train_data_transformed): - X_transformed, y, categorical_transformed = meta_train_data_transformed + X_transformed, y, feat_type_transformed = meta_train_data_transformed # TODO: somehow compute the expected output? meta_features.metafeatures["SkewnessSTD"]( - X_transformed, y, logging.getLogger("Meta"), categorical_transformed + X_transformed, y, logging.getLogger("Meta"), feat_type_transformed ) def test_landmark_lda(meta_train_data_transformed): - X_transformed, y, categorical_transformed = meta_train_data_transformed + X_transformed, y, feat_type_transformed = meta_train_data_transformed # TODO: somehow compute the expected output? meta_features.metafeatures["LandmarkLDA"]( X_transformed, y, logging.getLogger("Meta") @@ -622,7 +626,7 @@ def test_landmark_lda(meta_train_data_transformed): def test_landmark_naive_bayes(meta_train_data_transformed): - X_transformed, y, categorical_transformed = meta_train_data_transformed + X_transformed, y, feat_type_transformed = meta_train_data_transformed # TODO: somehow compute the expected output? meta_features.metafeatures["LandmarkNaiveBayes"]( X_transformed, y, logging.getLogger("Meta") @@ -630,7 +634,7 @@ def test_landmark_naive_bayes(meta_train_data_transformed): def test_landmark_decision_tree(meta_train_data_transformed): - X_transformed, y, categorical_transformed = meta_train_data_transformed + X_transformed, y, feat_type_transformed = meta_train_data_transformed # TODO: somehow compute the expected output? meta_features.metafeatures["LandmarkDecisionTree"]( X_transformed, y, logging.getLogger("Meta") @@ -638,7 +642,7 @@ def test_landmark_decision_tree(meta_train_data_transformed): def test_decision_node(meta_train_data_transformed): - X_transformed, y, categorical_transformed = meta_train_data_transformed + X_transformed, y, feat_type_transformed = meta_train_data_transformed # TODO: somehow compute the expected output? meta_features.metafeatures["LandmarkDecisionNodeLearner"]( X_transformed, y, logging.getLogger("Meta") @@ -646,7 +650,7 @@ def test_decision_node(meta_train_data_transformed): def test_random_node(meta_train_data_transformed): - X_transformed, y, categorical_transformed = meta_train_data_transformed + X_transformed, y, feat_type_transformed = meta_train_data_transformed # TODO: somehow compute the expected output? meta_features.metafeatures["LandmarkRandomNodeLearner"]( X_transformed, y, logging.getLogger("Meta") @@ -655,7 +659,7 @@ def test_random_node(meta_train_data_transformed): @unittest.skip("Currently not implemented!") def test_worst_node(meta_train_data_transformed): - X_transformed, y, categorical_transformed = meta_train_data_transformed + X_transformed, y, feat_type_transformed = meta_train_data_transformed # TODO: somehow compute the expected output? meta_features.metafeatures["LandmarkWorstNodeLearner"]( X_transformed, y, logging.getLogger("Meta") @@ -663,7 +667,7 @@ def test_worst_node(meta_train_data_transformed): def test_1NN(meta_train_data_transformed): - X_transformed, y, categorical_transformed = meta_train_data_transformed + X_transformed, y, feat_type_transformed = meta_train_data_transformed # TODO: somehow compute the expected output? meta_features.metafeatures["Landmark1NN"]( X_transformed, y, logging.getLogger("Meta") @@ -671,12 +675,12 @@ def test_1NN(meta_train_data_transformed): def test_pca(meta_train_data_transformed): - X_transformed, y, categorical_transformed = meta_train_data_transformed + X_transformed, y, feat_type_transformed = meta_train_data_transformed meta_features.helper_functions["PCA"](X_transformed, y, logging.getLogger("Meta")) def test_pca_95percent(meta_train_data_transformed): - X_transformed, y, categorical_transformed = meta_train_data_transformed + X_transformed, y, feat_type_transformed = meta_train_data_transformed mf = meta_features.metafeatures["PCAFractionOfComponentsFor95PercentVariance"]( X_transformed, y, logging.getLogger("Meta") ) @@ -684,7 +688,7 @@ def test_pca_95percent(meta_train_data_transformed): def test_pca_kurtosis_first_pc(meta_train_data_transformed): - X_transformed, y, categorical_transformed = meta_train_data_transformed + X_transformed, y, feat_type_transformed = meta_train_data_transformed mf = meta_features.metafeatures["PCAKurtosisFirstPC"]( X_transformed, y, logging.getLogger("Meta") ) @@ -692,7 +696,7 @@ def test_pca_kurtosis_first_pc(meta_train_data_transformed): def test_pca_skewness_first_pc(meta_train_data_transformed): - X_transformed, y, categorical_transformed = meta_train_data_transformed + X_transformed, y, feat_type_transformed = meta_train_data_transformed mf = meta_features.metafeatures["PCASkewnessFirstPC"]( X_transformed, y, logging.getLogger("Meta") ) @@ -846,9 +850,9 @@ def test_1NN_multilabel(multilabel_train_data): def test_calculate_all_metafeatures_multilabel(multilabel_train_data): meta_features.helper_functions.clear() X, y = multilabel_train_data - categorical = {i: False for i in range(10)} + feat_type = {i: "numerical" for i in range(10)} mf = meta_features.calculate_all_metafeatures( - X, y, categorical, "Generated", logger=logging.getLogger("TestMeta") + X, y, feat_type, "Generated", logger=logging.getLogger("TestMeta") ) assert 52 == len(mf.metafeature_values) @@ -860,11 +864,12 @@ def test_calculate_all_metafeatures_same_results_across_datatypes(): all metafeatures work in this complex dataset """ X, y = fetch_openml(data_id=2, return_X_y=True, as_frame=True) - categorical = { - col: True if X[col].dtype.name == "category" else False for col in X.columns + feat_type = { + col: "categorical" if X[col].dtype.name == "category" else "numerical" + for col in X.columns } mf = meta_features.calculate_all_metafeatures( - X, y, categorical, "2", logger=logging.getLogger("Meta") + X, y, feat_type, "2", logger=logging.getLogger("Meta") ) assert 52 == len(mf.metafeature_values) expected = { @@ -925,12 +930,9 @@ def test_calculate_all_metafeatures_same_results_across_datatypes(): # Then do numpy! X, y = fetch_openml(data_id=2, return_X_y=True, as_frame=False) - categorical = { - i: True if category else False - for i, category in enumerate(categorical.values()) - } + feat_type = {i: value for i, value in enumerate(feat_type.values())} mf = meta_features.calculate_all_metafeatures( - X, y, categorical, "2", logger=logging.getLogger("Meta") + X, y, feat_type, "2", logger=logging.getLogger("Meta") ) assert {k: mf[k].value for k in expected.keys()} == pytest.approx(expected) diff --git a/test/test_metalearning/pyMetaLearn/test_meta_features_sparse.py b/test/test_metalearning/pyMetaLearn/test_meta_features_sparse.py index 992032a349..66eb2072e1 100644 --- a/test/test_metalearning/pyMetaLearn/test_meta_features_sparse.py +++ b/test/test_metalearning/pyMetaLearn/test_meta_features_sparse.py @@ -29,8 +29,8 @@ def sparse_data(): "numeric" if type(type_) != list else "nominal" for name, type_ in dataset["attributes"][:-1] ] - categorical = { - i: True if attribute == "nominal" else False + feat_type = { + i: "categorical" if attribute == "nominal" else "numerical" for i, attribute in enumerate(attribute_types) } @@ -53,21 +53,21 @@ def sparse_data(): # Precompute some helper functions helpers.set_value( "MissingValues", - helpers["MissingValues"](X, y, logger, categorical), + helpers["MissingValues"](X, y, logger, feat_type), ) mf.set_value( "NumberOfMissingValues", - mf["NumberOfMissingValues"](X, y, logger, categorical), + mf["NumberOfMissingValues"](X, y, logger, feat_type), ) helpers.set_value( "NumSymbols", - helpers["NumSymbols"](X, y, logger, categorical), + helpers["NumSymbols"](X, y, logger, feat_type), ) helpers.set_value( "ClassOccurences", helpers["ClassOccurences"](X, y, logger), ) - return X, y, categorical + return X, y, feat_type @pytest.fixture @@ -84,8 +84,8 @@ def sparse_data_transformed(): "numeric" if type(type_) != list else "nominal" for name, type_ in dataset["attributes"][:-1] ] - categorical = { - i: True if attribute == "nominal" else False + feat_type = { + i: "categorical" if attribute == "nominal" else "numerical" for i, attribute in enumerate(attribute_types) } @@ -100,12 +100,7 @@ def sparse_data_transformed(): X_sparse[NaNs] = 0 X_sparse = sparse.csr_matrix(X_sparse) - ohe = FeatTypeSplit( - feat_type={ - col: "categorical" if category else "numerical" - for col, category in categorical.items() - } - ) + ohe = FeatTypeSplit(feat_type=feat_type) X_transformed = X_sparse.copy() X_transformed = ohe.fit_transform(X_transformed) imp = SimpleImputer(copy=False) @@ -113,12 +108,8 @@ def sparse_data_transformed(): standard_scaler = StandardScaler(with_mean=False) X_transformed = standard_scaler.fit_transform(X_transformed) - # Transform the array which indicates the categorical metafeatures - number_numerical = np.sum(~np.array(list(categorical.values()))) - categorical_transformed = { - i: True if i < (X_transformed.shape[1] - number_numerical) else False - for i in range(X_transformed.shape[1]) - } + # Transform the array which indicates the numerical metafeatures + feat_type_transformed = {i: "numerical" for i in range(X_transformed.shape[1])} X = X_sparse X_transformed = X_transformed @@ -134,15 +125,15 @@ def sparse_data_transformed(): ) helpers.set_value( "MissingValues", - helpers["MissingValues"](X, y, logger, categorical), + helpers["MissingValues"](X, y, logger, feat_type), ) mf.set_value( "NumberOfMissingValues", - mf["NumberOfMissingValues"](X, y, logger, categorical), + mf["NumberOfMissingValues"](X, y, logger, feat_type), ) helpers.set_value( "NumSymbols", - helpers["NumSymbols"](X, y, logger, categorical), + helpers["NumSymbols"](X, y, logger, feat_type), ) helpers.set_value( "ClassOccurences", @@ -150,13 +141,13 @@ def sparse_data_transformed(): ) helpers.set_value( "Skewnesses", - helpers["Skewnesses"](X_transformed, y, logger, categorical_transformed), + helpers["Skewnesses"](X_transformed, y, logger, feat_type_transformed), ) helpers.set_value( "Kurtosisses", - helpers["Kurtosisses"](X_transformed, y, logger, categorical_transformed), + helpers["Kurtosisses"](X_transformed, y, logger, feat_type_transformed), ) - return X_transformed, y, categorical_transformed + return X_transformed, y, feat_type_transformed def test_missing_values(sparse_data): @@ -419,8 +410,8 @@ def test_pca_skewness_first_pc(sparse_data_transformed): def test_calculate_all_metafeatures(sparse_data): - X, y, categorical = sparse_data + X, y, feat_type = sparse_data mf = meta_features.calculate_all_metafeatures( - X, y, categorical, "2", logger=logging.getLogger("Meta") + X, y, feat_type, "2", logger=logging.getLogger("Meta") ) assert 52 == len(mf.metafeature_values)