From f626eaf1792cd09ae39b94dafa142579ec29a13e Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sun, 5 May 2024 14:31:37 +0100 Subject: [PATCH 1/5] feat: make ColumnDropper dataframe-agnostic --- pyproject.toml | 3 +- sklego/preprocessing/pandastransformers.py | 40 ++++++++++--------- .../test_preprocessing/test_columndropper.py | 38 +++++++++++++++--- 3 files changed, 57 insertions(+), 24 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 81408fe48..c5a61fcc2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,6 +20,7 @@ maintainers = [ ] dependencies = [ + "narwhals>=0.7.16", "pandas>=1.1.5", "scikit-learn>=1.0", "importlib-metadata >= 1.0; python_version < '3.8'", @@ -61,6 +62,7 @@ docs = [ ] test-dep = [ + "polars", "pytest>=6.2.5", "pytest-xdist>=1.34.0", "pytest-cov>=2.6.1", @@ -111,4 +113,3 @@ markers = [ "formulaic: tests that require formulaic (deselect with '-m \"not formulaic\"')", "umap: tests that require umap (deselect with '-m \"not umap\"')" ] - diff --git a/sklego/preprocessing/pandastransformers.py b/sklego/preprocessing/pandastransformers.py index 92160df8e..cbceb3a9e 100644 --- a/sklego/preprocessing/pandastransformers.py +++ b/sklego/preprocessing/pandastransformers.py @@ -1,3 +1,4 @@ +import narwhals as nw import pandas as pd from sklearn.base import BaseEstimator, TransformerMixin from sklearn.utils.validation import check_is_fitted @@ -6,7 +7,7 @@ class ColumnDropper(BaseEstimator, TransformerMixin): - """The `ColumnDropper` transformer allows dropping specific columns from a pandas DataFrame by name. + """The `ColumnDropper` transformer allows dropping specific columns from a DataFrame by name. Can be useful in a sklearn Pipeline. Parameters @@ -19,6 +20,15 @@ class ColumnDropper(BaseEstimator, TransformerMixin): feature_names_ : list[str] The names of the features to keep during transform. 
+ Notes + ----- + Supported dataframes are: + + - pandas + - Polars (eager or lazy) + - Modin + - cuDF + Examples -------- ```py @@ -81,14 +91,14 @@ def fit(self, X, y=None): Checks: - 1. If input is a `pd.DataFrame` object + 1. If input is a supported DataFrame 2. If column names are in such DataFrame Parameters ---------- - X : pd.DataFrame + X : DataFrame The data on which we apply the column selection. - y : pd.Series, default=None + y : Series, default=None Ignored, present for compatibility. Returns @@ -99,25 +109,25 @@ def fit(self, X, y=None): Raises ------ TypeError - If `X` is not a `pd.DataFrame` object. + If `X` is not a supported DataFrame. KeyError If one or more of the columns provided doesn't exist in the input DataFrame. ValueError If dropping the specified columns would result in an empty output DataFrame. """ self.columns_ = as_list(self.columns) - self._check_X_for_type(X) + X = nw.from_native(X) self._check_column_names(X) - self.feature_names_ = X.columns.drop(self.columns_).tolist() + self.feature_names_ = [x for x in X.columns if x not in self.columns_] self._check_column_length() return self def transform(self, X): - """Returns a pandas DataFrame with only the specified columns. + """Returns a DataFrame with only the specified columns. Parameters ---------- - X : pd.DataFrame + X : DataFrame The data on which we apply the column selection. Returns @@ -131,10 +141,10 @@ def transform(self, X): If `X` is not a `pd.DataFrame` object. 
""" check_is_fitted(self, ["feature_names_"]) - self._check_X_for_type(X) + X = nw.from_native(X) if self.columns_: - return X.drop(columns=self.columns_) - return X + return nw.to_native(X.drop(self.columns_)) + return nw.to_native(X) def get_feature_names(self): """Alias for `.feature_names_` attribute""" @@ -151,12 +161,6 @@ def _check_column_names(self, X): if len(non_existent_columns) > 0: raise KeyError(f"{list(non_existent_columns)} column(s) not in DataFrame") - @staticmethod - def _check_X_for_type(X): - """Checks if input of the Selector is of the required dtype""" - if not isinstance(X, pd.DataFrame): - raise TypeError("Provided variable X is not of type pandas.DataFrame") - class PandasTypeSelector(BaseEstimator, TransformerMixin): """The `PandasTypeSelector` transformer allows to select columns in a pandas DataFrame based on their type. diff --git a/tests/test_preprocessing/test_columndropper.py b/tests/test_preprocessing/test_columndropper.py index 9738913fe..9234c4a24 100644 --- a/tests/test_preprocessing/test_columndropper.py +++ b/tests/test_preprocessing/test_columndropper.py @@ -1,6 +1,8 @@ import pandas as pd +import polars as pl import pytest -from pandas.testing import assert_frame_equal +from pandas.testing import assert_frame_equal as pandas_assert_frame_equal +from polars.testing import assert_frame_equal as polars_assert_frame_equal from sklearn.pipeline import make_pipeline from sklego.preprocessing import ColumnDropper @@ -19,6 +21,19 @@ def df(): ) +@pytest.fixture() +def df_polars(): + return pl.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6], + "b": [10, 9, 8, 7, 6, 5], + "c": ["a", "b", "a", "b", "c", "c"], + "d": ["b", "a", "a", "b", "a", "b"], + "e": [0, 1, 0, 1, 0, 1], + } + ) + + def test_drop_two(df): result_df = ColumnDropper(["a", "b"]).fit_transform(df) expected_df = pd.DataFrame( @@ -29,7 +44,7 @@ def test_drop_two(df): } ) - assert_frame_equal(result_df, expected_df) + pandas_assert_frame_equal(result_df, expected_df) def 
test_drop_one(df): @@ -43,7 +58,7 @@ def test_drop_one(df): } ) - assert_frame_equal(result_df, expected_df) + pandas_assert_frame_equal(result_df, expected_df) def test_drop_all(df): @@ -53,7 +68,7 @@ def test_drop_all(df): def test_drop_none(df): result_df = ColumnDropper([]).fit_transform(df) - assert_frame_equal(result_df, df) + pandas_assert_frame_equal(result_df, df) def test_drop_not_in_frame(df): @@ -73,10 +88,23 @@ def test_drop_one_in_pipeline(df): } ) - assert_frame_equal(result_df, expected_df) + pandas_assert_frame_equal(result_df, expected_df) def test_get_feature_names(): df = pd.DataFrame({"a": [4, 5, 6], "b": ["4", "5", "6"]}) transformer = ColumnDropper("a").fit(df) assert transformer.get_feature_names() == ["b"] + + +def test_drop_two_polars(df_polars): + result_df = ColumnDropper(["a", "b"]).fit_transform(df_polars) + expected_df = pl.DataFrame( + { + "c": ["a", "b", "a", "b", "c", "c"], + "d": ["b", "a", "a", "b", "a", "b"], + "e": [0, 1, 0, 1, 0, 1], + } + ) + + polars_assert_frame_equal(result_df, expected_df) From 36d9524ffa7436630d2912f98ea86831d85e0d08 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Mon, 6 May 2024 15:55:51 +0100 Subject: [PATCH 2/5] use narwhals[polars] in pyproject.toml, link to list of supported libraries --- pyproject.toml | 2 +- sklego/preprocessing/pandastransformers.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c5a61fcc2..aadc3613e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,7 +62,7 @@ docs = [ ] test-dep = [ - "polars", + "narwhals[polars]", "pytest>=6.2.5", "pytest-xdist>=1.34.0", "pytest-cov>=2.6.1", diff --git a/sklego/preprocessing/pandastransformers.py b/sklego/preprocessing/pandastransformers.py index cbceb3a9e..90cdd0994 100644 --- a/sklego/preprocessing/pandastransformers.py +++ b/sklego/preprocessing/pandastransformers.py @@ -29,6 +29,9 @@ class ColumnDropper(BaseEstimator, 
TransformerMixin): - Modin - cuDF + See [Narwhals docs](https://narwhals-dev.github.io/narwhals/extending/) for an + up-to-date list (and to learn how to add your dataframe library to it!). + Examples -------- ```py @@ -132,13 +135,13 @@ def transform(self, X): Returns ------- - pd.DataFrame + DataFrame The data with the specified columns dropped. Raises ------ TypeError - If `X` is not a `pd.DataFrame` object. + If `X` is not a supported DataFrame object. """ check_is_fitted(self, ["feature_names_"]) X = nw.from_native(X) From a3116969b5993e59dfe23766d7e65db0d97647fb Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Mon, 6 May 2024 20:03:13 +0100 Subject: [PATCH 3/5] note that narwhals is used for cross-dataframe support --- sklego/preprocessing/pandastransformers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklego/preprocessing/pandastransformers.py b/sklego/preprocessing/pandastransformers.py index 90cdd0994..118d4bf13 100644 --- a/sklego/preprocessing/pandastransformers.py +++ b/sklego/preprocessing/pandastransformers.py @@ -22,6 +22,7 @@ class ColumnDropper(BaseEstimator, TransformerMixin): Notes ----- + Native cross-dataframe support is achieved using [Narwhals](https://narwhals-dev.github.io). Supported dataframes are: - pandas @@ -30,7 +31,7 @@ class ColumnDropper(BaseEstimator, TransformerMixin): - cuDF See [Narwhals docs](https://narwhals-dev.github.io/narwhals/extending/) for an - up-to-date list (and to learn how to add your dataframe library to it!). + up-to-date list (and to learn how you can add your dataframe library to it!). 
Examples -------- From ea17b3c098cab09e23353ebaab118ba6f83a8706 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Tue, 7 May 2024 06:45:39 +0200 Subject: [PATCH 4/5] test refactor --- .../test_preprocessing/test_columndropper.py | 143 ++++++------------ 1 file changed, 43 insertions(+), 100 deletions(-) diff --git a/tests/test_preprocessing/test_columndropper.py b/tests/test_preprocessing/test_columndropper.py index 9234c4a24..8976e4c4e 100644 --- a/tests/test_preprocessing/test_columndropper.py +++ b/tests/test_preprocessing/test_columndropper.py @@ -1,110 +1,53 @@ +from contextlib import nullcontext as does_not_raise + import pandas as pd import polars as pl import pytest from pandas.testing import assert_frame_equal as pandas_assert_frame_equal from polars.testing import assert_frame_equal as polars_assert_frame_equal -from sklearn.pipeline import make_pipeline +from sklearn.pipeline import Pipeline, make_pipeline from sklego.preprocessing import ColumnDropper @pytest.fixture() -def df(): - return pd.DataFrame( - { - "a": [1, 2, 3, 4, 5, 6], - "b": [10, 9, 8, 7, 6, 5], - "c": ["a", "b", "a", "b", "c", "c"], - "d": ["b", "a", "a", "b", "a", "b"], - "e": [0, 1, 0, 1, 0, 1], - } - ) - - -@pytest.fixture() -def df_polars(): - return pl.DataFrame( - { - "a": [1, 2, 3, 4, 5, 6], - "b": [10, 9, 8, 7, 6, 5], - "c": ["a", "b", "a", "b", "c", "c"], - "d": ["b", "a", "a", "b", "a", "b"], - "e": [0, 1, 0, 1, 0, 1], - } - ) - - -def test_drop_two(df): - result_df = ColumnDropper(["a", "b"]).fit_transform(df) - expected_df = pd.DataFrame( - { - "c": ["a", "b", "a", "b", "c", "c"], - "d": ["b", "a", "a", "b", "a", "b"], - "e": [0, 1, 0, 1, 0, 1], - } - ) - - pandas_assert_frame_equal(result_df, expected_df) - - -def test_drop_one(df): - result_df = ColumnDropper(["e"]).fit_transform(df) - expected_df = pd.DataFrame( - { - "a": [1, 2, 3, 4, 5, 6], - "b": [10, 9, 8, 7, 6, 5], - "c": ["a", "b", "a", "b", "c", "c"], - "d": ["b", "a", "a", "b", "a", "b"], - } - ) - - 
pandas_assert_frame_equal(result_df, expected_df) - - -def test_drop_all(df): - with pytest.raises(ValueError): - ColumnDropper(["a", "b", "c", "d", "e"]).fit_transform(df) - - -def test_drop_none(df): - result_df = ColumnDropper([]).fit_transform(df) - pandas_assert_frame_equal(result_df, df) - - -def test_drop_not_in_frame(df): - with pytest.raises(KeyError): - ColumnDropper(["f"]).fit_transform(df) - - -def test_drop_one_in_pipeline(df): - pipe = make_pipeline(ColumnDropper(["e"])) - result_df = pipe.fit_transform(df) - expected_df = pd.DataFrame( - { - "a": [1, 2, 3, 4, 5, 6], - "b": [10, 9, 8, 7, 6, 5], - "c": ["a", "b", "a", "b", "c", "c"], - "d": ["b", "a", "a", "b", "a", "b"], - } - ) - - pandas_assert_frame_equal(result_df, expected_df) - - -def test_get_feature_names(): - df = pd.DataFrame({"a": [4, 5, 6], "b": ["4", "5", "6"]}) - transformer = ColumnDropper("a").fit(df) - assert transformer.get_feature_names() == ["b"] - - -def test_drop_two_polars(df_polars): - result_df = ColumnDropper(["a", "b"]).fit_transform(df_polars) - expected_df = pl.DataFrame( - { - "c": ["a", "b", "a", "b", "c", "c"], - "d": ["b", "a", "a", "b", "a", "b"], - "e": [0, 1, 0, 1, 0, 1], - } - ) - - polars_assert_frame_equal(result_df, expected_df) +def data(): + return { + "a": [1, 2, 3, 4, 5, 6], + "b": [10, 9, 8, 7, 6, 5], + "c": ["a", "b", "a", "b", "c", "c"], + "d": ["b", "a", "a", "b", "a", "b"], + "e": [0, 1, 0, 1, 0, 1], + } + + +@pytest.mark.parametrize( + "frame_func, assert_func", + [ + (pd.DataFrame, pandas_assert_frame_equal), + (pl.DataFrame, polars_assert_frame_equal), + ], +) +@pytest.mark.parametrize( + "to_drop, context", + [ + (["e"], does_not_raise()), # one + (["a", "b"], does_not_raise()), # two + ([], does_not_raise()), # none + (["a", "b", "c", "d", "e"], pytest.raises(ValueError)), # all + (["f"], pytest.raises(KeyError)), # not in data + ], +) +@pytest.mark.parametrize("wrapper", [lambda x: x, make_pipeline]) +def test_drop(data, frame_func, assert_func, 
to_drop, context, wrapper): + sub_data = {k: v for k, v in data.items() if k not in to_drop} + + with context: + transformer = wrapper(ColumnDropper(to_drop)) + result_df = transformer.fit_transform(frame_func(data)) + expected_df = frame_func(sub_data) + + assert_func(result_df, expected_df) + + if not isinstance(transformer, Pipeline): + assert transformer.get_feature_names() == list(sub_data.keys()) From cad5fbf058820feccbfdf338c8ceec5f9e776984 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Tue, 7 May 2024 10:26:41 +0200 Subject: [PATCH 5/5] docstrings --- sklego/preprocessing/pandastransformers.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/sklego/preprocessing/pandastransformers.py b/sklego/preprocessing/pandastransformers.py index 118d4bf13..23a2783b1 100644 --- a/sklego/preprocessing/pandastransformers.py +++ b/sklego/preprocessing/pandastransformers.py @@ -22,7 +22,9 @@ class ColumnDropper(BaseEstimator, TransformerMixin): Notes ----- - Native cross-dataframe support is achieved using [Narwhals](https://narwhals-dev.github.io). + Native cross-dataframe support is achieved using + [Narwhals](https://narwhals-dev.github.io/narwhals/){:target="_blank"}. + Supported dataframes are: - pandas @@ -30,8 +32,8 @@ class ColumnDropper(BaseEstimator, TransformerMixin): - Modin - cuDF - See [Narwhals docs](https://narwhals-dev.github.io/narwhals/extending/) for an - up-to-date list (and to learn how you can add your dataframe library to it!). + See [Narwhals docs](https://narwhals-dev.github.io/narwhals/extending/){:target="_blank"} for an up-to-date list + (and to learn how you can add your dataframe library to it!). 
Examples -------- @@ -53,7 +55,7 @@ class ColumnDropper(BaseEstimator, TransformerMixin): 2 1.80 45 ''' - # Selecting multiple columns from a pandas DataFrame + # Dropping multiple columns from a pandas DataFrame ColumnDropper(["length", "shoesize"]).fit_transform(df) ''' name @@ -62,7 +64,7 @@ class ColumnDropper(BaseEstimator, TransformerMixin): 2 Alex ''' - # Selecting non-existent columns returns in a KeyError + # Dropping non-existent columns results in a KeyError ColumnDropper(["weight"]).fit_transform(df) # Traceback (most recent call last): # ... @@ -81,10 +83,12 @@ class ColumnDropper(BaseEstimator, TransformerMixin): # [-1.13554995]]) ``` - !!! warning - - - Raises a `TypeError` if input provided is not a DataFrame. - - Raises a `ValueError` if columns provided are not in the input DataFrame. + Raises + ------ + TypeError + If input provided is not a DataFrame. + KeyError + If columns provided are not in the input DataFrame. """ def __init__(self, columns: list):