Merged
41 commits
0e2e45f - Wip 1 (Jan 28, 2024)
14ff603 - new ideas (Jan 29, 2024)
4d8e4ae - new ideas (Jan 29, 2024)
e324389 - commit (fabioscantamburlo, Jan 30, 2024)
2a78dc1 - commit (fabioscantamburlo, Jan 30, 2024)
55fcd63 - exclude venv (fabioscantamburlo, Jan 30, 2024)
82d9719 - Wip2 (fabioscantamburlo, Jan 31, 2024)
bfe8ba3 - Wip3 (fabioscantamburlo, Feb 1, 2024)
5397d34 - Wip 3.5 (fabioscantamburlo, Feb 1, 2024)
7ae7d10 - Pushing some optim (fabioscantamburlo, Feb 5, 2024)
365a78d - Doc string and examples (fabioscantamburlo, Feb 6, 2024)
acf356a - Docstring WIP (fabioscantamburlo, Feb 6, 2024)
b5160af - Docstring WIP2 (fabioscantamburlo, Feb 6, 2024)
5d12ada - Adding something (fabioscantamburlo, Feb 7, 2024)
899f315 - Bugfix (fabioscantamburlo, Feb 12, 2024)
a0f0470 - Mkdocs and small fixes (fabioscantamburlo, Feb 12, 2024)
7591f85 - Added tests (fabioscantamburlo, Feb 13, 2024)
0053735 - Added scripts (fabioscantamburlo, Feb 13, 2024)
8fecf4a - Wip4 (fabioscantamburlo, Feb 19, 2024)
b08b04b - removing tests (fabioscantamburlo, Feb 19, 2024)
9d8c20b - Added tests and some bugifx (fabioscantamburlo, Feb 20, 2024)
077b0d1 - revert pandastransformer (fabioscantamburlo, Feb 20, 2024)
5d55a2e - Update sklego/feature_selection/mrmr.py (fabioscantamburlo, Feb 22, 2024)
2e107ef - Resolving comments on PR (fabioscantamburlo, Feb 22, 2024)
1d4340c - features (fabioscantamburlo, Feb 28, 2024)
8f4481e - venv (fabioscantamburlo, Feb 28, 2024)
a6713d6 - Add missing file (fabioscantamburlo, Feb 28, 2024)
25f5613 - Wip userguide (fabioscantamburlo, Feb 28, 2024)
774f170 - Merge branch 'FEATURE-MRMR-UserGuide' into FEATURE-MRMR (fabioscantamburlo, Mar 1, 2024)
4454266 - Merge branch 'main' into FEATURE-MRMR (fabioscantamburlo, Mar 1, 2024)
3f21b0a - typing (fabioscantamburlo, Mar 1, 2024)
249d17f - Update sklego/feature_selection/mrmr.py (fabioscantamburlo, Mar 3, 2024)
6c772d8 - Update sklego/feature_selection/mrmr.py (fabioscantamburlo, Mar 3, 2024)
b9b5bfc - Update docs/user-guide/feature-selection.md (fabioscantamburlo, Mar 3, 2024)
86e0dc6 - Update sklego/feature_selection/mrmr.py (fabioscantamburlo, Mar 3, 2024)
f788026 - resolve comments (fabioscantamburlo, Mar 3, 2024)
87da50c - clean (fabioscantamburlo, Mar 3, 2024)
287977e - suggestions + general rephrase (fabioscantamburlo, Mar 3, 2024)
8b4c40a - Typo (fabioscantamburlo, Mar 4, 2024)
a1df5fe - Update docs/user-guide/feature-selection.md (fabioscantamburlo, Mar 9, 2024)
3b2bc7a - Merge branch 'main' into FEATURE-MRMR (fabioscantamburlo, Mar 9, 2024)
3 changes: 2 additions & 1 deletion .gitignore
@@ -98,6 +98,7 @@ venv/
ENV/
env.bak/
venv.bak/
venv*/

# Spyder project settings
.spyderproject
@@ -120,4 +121,4 @@ dmypy.json
.DS_Store

# Local Netlify folder
.netlify
.netlify
122 changes: 122 additions & 0 deletions docs/_scripts/feature-selection.py
@@ -0,0 +1,122 @@
from pathlib import Path

_file = Path(__file__)
print(f"Executing {_file}")


_static_path = Path("_static") / _file.stem
_static_path.mkdir(parents=True, exist_ok=True)

# --8<-- [start:mrmr-commonimports]
from sklearn.datasets import fetch_openml
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.feature_selection import f_classif, mutual_info_classif
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklego.feature_selection import MaximumRelevanceMinimumRedundancy
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
# --8<-- [end:mrmr-commonimports]

# --8<-- [start:mrmr-intro]

# Download the MNIST dataset using scikit-learn
mnist = fetch_openml("mnist_784", cache=True)

# Assign features and labels
X_pd, y_pd = mnist["data"], mnist["target"].astype(int)

X, y = X_pd.to_numpy(), y_pd.to_numpy()
t_t_s_params = {"test_size": 10000, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **t_t_s_params)
X_train = X_train.reshape(60000, 28 * 28)
X_test = X_test.reshape(10000, 28 * 28)
# --8<-- [end:mrmr-intro]

# --8<-- [start:mrmr-smile]
def smile_relevance(X, y):
    """Data-independent relevance: mark the pixels forming a smiling face
    (plus the bottom row) as the relevant features."""
    rows = 28
    cols = 28
    smiling_face = np.zeros((rows, cols), dtype=int)

    # Set the values for the eyes, nose,
    # and mouth with adjusted positions and sizes
    # Left eye
    smiling_face[10:13, 8:10] = 1
    # Right eye
    smiling_face[10:13, 18:20] = 1
    # Upper part of the mouth
    smiling_face[18:20, 10:18] = 1
    # Left edge of the open mouth
    smiling_face[16:18, 8:10] = 1
    # Right edge of the open mouth
    smiling_face[16:18, 18:20] = 1

    # Add the nose as two pixels, one pixel higher
    smiling_face[14, 13:15] = 1
    # Mark the bottom row as well
    smiling_face[27, :] = 1
    return smiling_face.reshape(rows * cols,)


def smile_redundancy(X, selected, left):
    """Constant redundancy: every remaining feature is equally redundant."""
    return np.ones(len(left))
# --8<-- [end:mrmr-smile]

# --8<-- [start:mrmr-core]
K = 38
mrmr = MaximumRelevanceMinimumRedundancy(
    k=K, kind="auto", redundancy_func="p", relevance_func="f"
)
mrmr_s = MaximumRelevanceMinimumRedundancy(
    k=K, redundancy_func=smile_redundancy, relevance_func=smile_relevance
)

f = f_classif(X_train, y_train.reshape(60000,))[0]
f_features = np.argsort(np.nan_to_num(f, nan=np.finfo(float).eps))[-K:]
mi = mutual_info_classif(X_train, y_train.reshape(60000,))
mi_features = np.argsort(np.nan_to_num(mi, nan=np.finfo(float).eps))[-K:]
mrmr_features = mrmr.fit(X_train, y_train).selected_features_
mrmr_smile_features = mrmr_s.fit(X_train, y_train).selected_features_

# --8<-- [end:mrmr-core]
# --8<-- [start:mrmr-selected-features]
# Define features dictionary
features = {
    "f_classif": f_features,
    "mutual_info": mi_features,
    "mrmr": mrmr_features,
    "mrmr_smile": mrmr_smile_features,
}
for name, s_f in features.items():
    model = HistGradientBoostingClassifier(random_state=42)
    model.fit(X_train[:, s_f], y_train.squeeze())
    y_pred = model.predict(X_test[:, s_f])
    print(f"Feature selection method: {name}")
    print(f"F1 score: {round(f1_score(y_test, y_pred, average='weighted'), 3)}")

# --8<-- [end:mrmr-selected-features]

# --8<-- [start:mrmr-plots]
# Create figure and axes for the plots
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

# Iterate through the features dictionary and plot the images
for idx, (name, s_f) in enumerate(features.items()):
    row = idx // 2
    col = idx % 2

    # Mark the selected pixels on a blank 28x28 canvas
    a = np.zeros(28 * 28)
    a[s_f] = 1
    ax = axes[row, col]
    sns.heatmap(a.reshape(28, 28), cmap="binary", ax=ax, cbar=False)
    ax.set_title(name)
# --8<-- [end:mrmr-plots]
plt.tight_layout()
plt.savefig(_static_path / "mrmr-feature-selection-mnist.png")
plt.clf()
6 changes: 6 additions & 0 deletions docs/api/feature-selection.md
@@ -0,0 +1,6 @@
# Feature Selection

:::sklego.feature_selection.mrmr.MaximumRelevanceMinimumRedundancy
    options:
        show_root_full_path: true
        show_root_heading: true
6 changes: 6 additions & 0 deletions docs/api/features-selection.md
@@ -0,0 +1,6 @@
# Feature Selection

:::sklego.feature_selection.mrmr.MaximumRelevanceMinimumRedundancy
    options:
        show_root_full_path: true
        show_root_heading: true
71 changes: 71 additions & 0 deletions docs/user-guide/feature-selection.md
@@ -0,0 +1,71 @@
# Feature Selection

## Maximum Relevance Minimum Redundancy

[`Maximum Relevance Minimum Redundancy`][MaximumRelevanceMinimumRedundancy-api] (MRMR) is an iterative feature selection method commonly used in data science to select a subset of features from a larger feature set. The goal of MRMR is to choose features that have high *relevance* to the target variable while keeping *redundancy* among the already selected features low.
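
For intuition, a minimal sketch of that greedy loop is shown below. This is illustrative only, not the exact scikit-lego implementation: it assumes a relevance function returning one score per feature, a redundancy function returning one strictly positive score per remaining candidate, and the relevance-to-redundancy quotient as the selection criterion.

```py
import numpy as np

def mrmr_sketch(X, y, k, relevance, redundancy):
    """Illustrative greedy MRMR: grow the selected set one feature at a time."""
    rel = np.asarray(relevance(X, y))   # one relevance score per feature
    left = list(range(X.shape[1]))
    selected = [int(np.argmax(rel))]    # seed with the most relevant feature
    left.remove(selected[0])
    while len(selected) < k:
        # Score candidates by relevance / redundancy (assumes redundancy > 0)
        red = np.asarray(redundancy(X, selected, left))
        best = left[int(np.argmax(rel[np.array(left)] / red))]
        selected.append(best)
        left.remove(best)
    return selected
```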

MRMR depends heavily on the two functions used to measure relevance and redundancy. However, the paper [Maximum Relevance and Minimum Redundancy Feature Selection Methods for a Marketing Machine Learning Platform](https://arxiv.org/pdf/1908.05376.pdf) shows that using [f_classif](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_classif.html) or [f_regression](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_regression.html) as the relevance function and Pearson correlation as the redundancy function is a good default choice across a wide variety of problems.
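
In scikit-lego, that recommended pairing corresponds to the string shortcuts used in the "MRMR core" snippet further down, where `"f"` selects the F-test relevance and `"p"` the Pearson correlation redundancy. A minimal construction might look like:

```py
from sklego.feature_selection import MaximumRelevanceMinimumRedundancy

# k is the number of features to keep; the value 38 mirrors the example below.
mrmr = MaximumRelevanceMinimumRedundancy(k=38, relevance_func="f", redundancy_func="p")
```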

Inspired by the Medium article [Feature Selection: How To Throw Away 95% of Your Data and Get 95% Accuracy](https://towardsdatascience.com/feature-selection-how-to-throw-away-95-of-your-data-and-get-95-accuracy-ad41ca016877), we showcase a practical application on the well-known MNIST dataset.

Note that although the default scikit-lego MRMR implementation uses relevance and redundancy as defined in [Maximum Relevance and Minimum Redundancy Feature Selection Methods for a Marketing Machine Learning Platform](https://arxiv.org/pdf/1908.05376.pdf), our implementation also allows custom functions, which may be necessary in different scenarios depending on the data.
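
The expected signatures can be read off the smile functions shown later: a relevance function maps `(X, y)` to one score per feature, while a redundancy function maps `(X, selected, left)` to one score per remaining candidate in `left`. A hypothetical custom pair, sketched here purely for illustration, might look like:

```py
import numpy as np

def my_relevance(X, y):
    # One score per column: absolute correlation with the target
    # (assumes non-constant columns; guard against NaNs otherwise).
    return np.array([abs(np.corrcoef(X[:, j], y)[0, 1]) for j in range(X.shape[1])])

def my_redundancy(X, selected, left):
    # One score per candidate in `left`: mean absolute correlation
    # with the already selected features.
    return np.array([
        np.mean([abs(np.corrcoef(X[:, j], X[:, s])[0, 1]) for s in selected])
        for j in left
    ])
```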

We will compare the following list of well-known filter methods:

- F statistical test ([ANOVA F-test](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_classif.html)).
- Mutual information approximation based on the scikit-learn implementation.

against the default scikit-lego MRMR implementation and a custom MRMR implementation designed to select features that draw a smiling face on the plot of the MNIST digits.



??? example "MRMR imports"
    ```py
    --8<-- "docs/_scripts/feature-selection.py:mrmr-commonimports"
    ```

```py title="MRMR mnist"
--8<-- "docs/_scripts/feature-selection.py:mrmr-intro"
```

As custom functions, we implemented smile relevance and smile redundancy.

```py title="MRMR smile functions"
--8<-- "docs/_scripts/feature-selection.py:mrmr-smile"
```

Then we execute the core selection code.

```py title="MRMR core"
--8<-- "docs/_scripts/feature-selection.py:mrmr-core"
```

After execution, we can inspect the F1 score obtained with each set of selected features:

```py title="MRMR mnist selected features"
--8<-- "docs/_scripts/feature-selection.py:mrmr-selected-features"
```

```console hl_lines="5-6"
Feature selection method: f_classif
F1 score: 0.854
Feature selection method: mutual_info
F1 score: 0.879
Feature selection method: mrmr
F1 score: 0.925
Feature selection method: mrmr_smile
F1 score: 0.849
```

The MRMR feature selection method provides the best results among the compared approaches, although the smile technique performs rather well too.

Finally, we can take a look at the selected features.

??? example "MRMR generate plots"
    ```py
    --8<-- "docs/_scripts/feature-selection.py:mrmr-plots"
    ```

![selected-features-mrmr](../_static/feature-selection/mrmr-feature-selection-mnist.png)

[MaximumRelevanceMinimumRedundancy-api]: ../../api/feature-selection#sklego.feature_selection.mrmr.MaximumRelevanceMinimumRedundancy
2 changes: 2 additions & 0 deletions mkdocs.yaml
@@ -127,6 +127,7 @@ nav:
- Datasets: user-guide/datasets.md
- Linear Models: user-guide/linear-models.md
- Mixture Methods: user-guide/mixture-methods.md
- Feature Selection: user-guide/feature-selection.md
- Naive Bayes: user-guide/naive-bayes.md
- Meta Models: user-guide/meta-models.md
- Fairness: user-guide/fairness.md
@@ -147,6 +148,7 @@
- Meta: api/meta.md
- Metrics: api/metrics.md
- Mixture: api/mixture.md
- Feature Selection: api/feature-selection.md
- Model Selection: api/model-selection.md
- Naive Bayes: api/naive-bayes.md
- Neighbors: api/neighbors.md
5 changes: 5 additions & 0 deletions sklego/feature_selection/__init__.py
@@ -0,0 +1,5 @@
__all__ = [
    "MaximumRelevanceMinimumRedundancy",
]

from sklego.feature_selection.mrmr import MaximumRelevanceMinimumRedundancy