# automated_featransform.py
"""Advanced configuration example with full control over pipeline components."""
# Standard library
import warnings

# Third-party
from sklearn.model_selection import train_test_split

# Featransform (relative order of these imports is kept as in the original script)
from featransform.pipeline import Featransform
from featransform.core.models import (
    PipelineConfig,
    ProcessingConfig,
    OptimizationConfig,
    ModelConfig,
)
from featransform.core.enums import (
    ImputationStrategy,
    EncodingStrategy,
    SelectionStrategy,
    ModelFamily,
)
from featransform.utils.data_generator import DatasetGenerator

# Ignore every warning category (Warning is the base class of all of them) so
# the example's console output stays readable. The filter is installed after
# the imports, so it only affects warnings raised from this point on.
warnings.filterwarnings("ignore", category=Warning)
###################################################### Main Example
print("\n" + "=" * 60)
print("Advanced Featransform Configuration")
print("=" * 60)

# Generate a synthetic multiclass dataset with datetime and categorical
# columns and a 10% missing rate; random_state pins the generator for
# repeatable runs.
X, y = DatasetGenerator.generate(
    task='multiclass_classification',
    n_samples=5000,
    n_features=20,
    n_informative=15,
    add_datetime=True,
    n_datetime_cols=2,
    add_categorical=True,
    n_categorical=3,
    add_missing=True,
    missing_rate=0.1,
    random_state=42
)

# Hold out 20% of the rows for testing; fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"\nDataset shape: {X.shape}")
# isnull().any() flags each column holding at least one null, so its sum is
# the number of affected features (the label previously sat next to the total
# count of null cells, which is now reported separately).
print(f"Features with nulls: {X.isnull().any().sum()}")
print(f"Total missing values: {X.isnull().sum().sum()}")
print(f"Datetime columns: {X.select_dtypes(include=['datetime64']).shape[1]}")
# NOTE(review): only columns with pandas 'category' dtype are counted here;
# confirm the generator does not emit its categoricals as object dtype.
print(f"Categorical columns: {X.select_dtypes(include=['category']).shape[1]}")
###################################################### Configure Pipeline from Scratch
print(f"\n{'=' * 60}")
print("Custom Pipeline Configuration")
print("=" * 60)

# Preprocessing options handed to the pipeline.
processing_config = ProcessingConfig(
    imputation_strategy=ImputationStrategy.ITERATIVE,
    encoding_strategy=EncodingStrategy.LABEL,
    handle_datetime=True,
    drop_constant=True,
    drop_duplicates=True,
)

# Each table below lists (model family, parameters) pairs; every pair becomes
# one ModelConfig, in the order written.
anomaly_specs = [
    (ModelFamily.ISOLATION_FOREST, {'n_estimators': 300, 'contamination': 0.002}),
    (ModelFamily.LOCAL_OUTLIER_FACTOR, {'n_neighbors': 20, 'contamination': 0.002}),
    (ModelFamily.ONE_CLASS_SVM, {'nu': 0.05, 'kernel': 'rbf'}),
    (ModelFamily.ELLIPTIC_ENVELOPE, {'contamination': 0.002}),
]

clustering_specs = [
    (ModelFamily.KMEANS, {'n_clusters': 3}),
    (ModelFamily.BIRCH, {'threshold': 0.5, 'branching_factor': 50}),
    (ModelFamily.MINI_BATCH_KMEANS, {'n_clusters': 3}),
    (ModelFamily.GAUSSIAN_MIXTURE, {'n_components': 3}),
]

dimensionality_specs = [
    (ModelFamily.PCA, {'n_components': 0.95}),
    (ModelFamily.TRUNCATED_SVD, {'n_components': 6}),
    (ModelFamily.FAST_ICA, {'n_components': 6}),
]

# Feature-selection / optimization settings.
optimization_config = OptimizationConfig(
    selection_strategy=SelectionStrategy.IMPORTANCE,
    n_iterations=10,
    validation_split=0.3,
    min_features=10,
)

# Assemble the fully customized pipeline configuration.
config = PipelineConfig(
    task_type="multiclass_classification",
    processing=processing_config,
    anomaly_models=[
        ModelConfig(model_family=family, parameters=params)
        for family, params in anomaly_specs
    ],
    clustering_models=[
        ModelConfig(model_family=family, parameters=params)
        for family, params in clustering_specs
    ],
    dimensionality_models=[
        ModelConfig(model_family=family, parameters=params)
        for family, params in dimensionality_specs
    ],
    optimization=optimization_config,
    verbose=True,
    n_jobs=-1,
    random_state=42,
)
###################################################### Fit & Transform
print(f"\n{'=' * 60}")
print("Fitting Pipeline")
print("=" * 60)

# Build the pipeline from the custom configuration and fit it on the
# training split only.
ft = Featransform(config)
ft.fit(X_train, y_train)

# Run the fitted pipeline over the training split first, then the test split.
X_train_transformed, X_test_transformed = (
    ft.transform(split) for split in (X_train, X_test)
)

# Column counts before and after the transformation.
print("\nOriginal features:", X_train.shape[1])
print("Transformed features:", X_train_transformed.shape[1])
###################################################### Get Results
print(f"\n{'=' * 60}")
print("Pipeline Results")
print("=" * 60)

# Ask the fitted pipeline for its optimization report; any return value is
# discarded here.
ft.report_optimization()