diff --git a/pyproject.toml b/pyproject.toml index 81408fe48..aadc3613e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,6 +20,7 @@ maintainers = [ ] dependencies = [ + "narwhals>=0.7.16", "pandas>=1.1.5", "scikit-learn>=1.0", "importlib-metadata >= 1.0; python_version < '3.8'", @@ -61,6 +62,7 @@ docs = [ ] test-dep = [ + "narwhals[polars]", "pytest>=6.2.5", "pytest-xdist>=1.34.0", "pytest-cov>=2.6.1", @@ -111,4 +113,3 @@ markers = [ "formulaic: tests that require formulaic (deselect with '-m \"not formulaic\"')", "umap: tests that require umap (deselect with '-m \"not umap\"')" ] - diff --git a/sklego/preprocessing/pandastransformers.py b/sklego/preprocessing/pandastransformers.py index 92160df8e..23a2783b1 100644 --- a/sklego/preprocessing/pandastransformers.py +++ b/sklego/preprocessing/pandastransformers.py @@ -1,3 +1,4 @@ +import narwhals as nw import pandas as pd from sklearn.base import BaseEstimator, TransformerMixin from sklearn.utils.validation import check_is_fitted @@ -6,7 +7,7 @@ class ColumnDropper(BaseEstimator, TransformerMixin): - """The `ColumnDropper` transformer allows dropping specific columns from a pandas DataFrame by name. + """The `ColumnDropper` transformer allows dropping specific columns from a DataFrame by name. Can be useful in a sklearn Pipeline. Parameters @@ -19,6 +20,21 @@ class ColumnDropper(BaseEstimator, TransformerMixin): feature_names_ : list[str] The names of the features to keep during transform. + Notes + ----- + Native cross-dataframe support is achieved using + [Narwhals](https://narwhals-dev.github.io/narwhals/){:target="_blank"}. + + Supported dataframes are: + + - pandas + - Polars (eager or lazy) + - Modin + - cuDF + + See [Narwhals docs](https://narwhals-dev.github.io/narwhals/extending/){:target="_blank"} for an up-to-date list + (and to learn how you can add your dataframe library to it!). 
+ Examples -------- ```py @@ -39,7 +55,7 @@ class ColumnDropper(BaseEstimator, TransformerMixin): 2 1.80 45 ''' - # Selecting multiple columns from a pandas DataFrame + # Dropping multiple columns from a pandas DataFrame ColumnDropper(["length", "shoesize"]).fit_transform(df) ''' name @@ -48,7 +64,7 @@ class ColumnDropper(BaseEstimator, TransformerMixin): 2 Alex ''' - # Selecting non-existent columns returns in a KeyError + # Dropping non-existent columns results in a KeyError ColumnDropper(["weight"]).fit_transform(df) # Traceback (most recent call last): # ... @@ -67,10 +83,12 @@ class ColumnDropper(BaseEstimator, TransformerMixin): # [-1.13554995]]) ``` - !!! warning - - - Raises a `TypeError` if input provided is not a DataFrame. - - Raises a `ValueError` if columns provided are not in the input DataFrame. + Raises + ------ + TypeError + If input provided is not a DataFrame. + KeyError + If columns provided are not in the input DataFrame. """ def __init__(self, columns: list): @@ -81,14 +99,14 @@ def fit(self, X, y=None): Checks: - 1. If input is a `pd.DataFrame` object + 1. If input is a supported DataFrame 2. If column names are in such DataFrame Parameters ---------- - X : pd.DataFrame + X : DataFrame The data on which we apply the column selection. - y : pd.Series, default=None + y : Series, default=None Ignored, present for compatibility. Returns @@ -99,42 +117,42 @@ def fit(self, X, y=None): Raises ------ TypeError - If `X` is not a `pd.DataFrame` object. + If `X` is not a supported DataFrame. KeyError If one or more of the columns provided doesn't exist in the input DataFrame. ValueError If dropping the specified columns would result in an empty output DataFrame. 
""" self.columns_ = as_list(self.columns) - self._check_X_for_type(X) + X = nw.from_native(X) self._check_column_names(X) - self.feature_names_ = X.columns.drop(self.columns_).tolist() + self.feature_names_ = [x for x in X.columns if x not in self.columns_] self._check_column_length() return self def transform(self, X): - """Returns a pandas DataFrame with only the specified columns. + """Returns a DataFrame without the specified columns. Parameters ---------- - X : pd.DataFrame + X : DataFrame The data on which we apply the column selection. Returns ------- - pd.DataFrame + DataFrame The data with the specified columns dropped. Raises ------ TypeError - If `X` is not a `pd.DataFrame` object. + If `X` is not a supported DataFrame object. """ check_is_fitted(self, ["feature_names_"]) - self._check_X_for_type(X) + X = nw.from_native(X) if self.columns_: - return X.drop(columns=self.columns_) - return X + return nw.to_native(X.drop(self.columns_)) + return nw.to_native(X) def get_feature_names(self): """Alias for `.feature_names_` attribute""" @@ -151,12 +169,6 @@ def _check_column_names(self, X): if len(non_existent_columns) > 0: raise KeyError(f"{list(non_existent_columns)} column(s) not in DataFrame") - @staticmethod - def _check_X_for_type(X): - """Checks if input of the Selector is of the required dtype""" - if not isinstance(X, pd.DataFrame): - raise TypeError("Provided variable X is not of type pandas.DataFrame") - class PandasTypeSelector(BaseEstimator, TransformerMixin): """The `PandasTypeSelector` transformer allows to select columns in a pandas DataFrame based on their type. 
diff --git a/tests/test_preprocessing/test_columndropper.py b/tests/test_preprocessing/test_columndropper.py index 9738913fe..8976e4c4e 100644 --- a/tests/test_preprocessing/test_columndropper.py +++ b/tests/test_preprocessing/test_columndropper.py @@ -1,82 +1,53 @@ +from contextlib import nullcontext as does_not_raise + import pandas as pd +import polars as pl import pytest -from pandas.testing import assert_frame_equal -from sklearn.pipeline import make_pipeline +from pandas.testing import assert_frame_equal as pandas_assert_frame_equal +from polars.testing import assert_frame_equal as polars_assert_frame_equal +from sklearn.pipeline import Pipeline, make_pipeline from sklego.preprocessing import ColumnDropper @pytest.fixture() -def df(): - return pd.DataFrame( - { - "a": [1, 2, 3, 4, 5, 6], - "b": [10, 9, 8, 7, 6, 5], - "c": ["a", "b", "a", "b", "c", "c"], - "d": ["b", "a", "a", "b", "a", "b"], - "e": [0, 1, 0, 1, 0, 1], - } - ) - - -def test_drop_two(df): - result_df = ColumnDropper(["a", "b"]).fit_transform(df) - expected_df = pd.DataFrame( - { - "c": ["a", "b", "a", "b", "c", "c"], - "d": ["b", "a", "a", "b", "a", "b"], - "e": [0, 1, 0, 1, 0, 1], - } - ) - - assert_frame_equal(result_df, expected_df) - - -def test_drop_one(df): - result_df = ColumnDropper(["e"]).fit_transform(df) - expected_df = pd.DataFrame( - { - "a": [1, 2, 3, 4, 5, 6], - "b": [10, 9, 8, 7, 6, 5], - "c": ["a", "b", "a", "b", "c", "c"], - "d": ["b", "a", "a", "b", "a", "b"], - } - ) - - assert_frame_equal(result_df, expected_df) - - -def test_drop_all(df): - with pytest.raises(ValueError): - ColumnDropper(["a", "b", "c", "d", "e"]).fit_transform(df) - - -def test_drop_none(df): - result_df = ColumnDropper([]).fit_transform(df) - assert_frame_equal(result_df, df) - - -def test_drop_not_in_frame(df): - with pytest.raises(KeyError): - ColumnDropper(["f"]).fit_transform(df) - - -def test_drop_one_in_pipeline(df): - pipe = make_pipeline(ColumnDropper(["e"])) - result_df = pipe.fit_transform(df) 
- expected_df = pd.DataFrame( - { - "a": [1, 2, 3, 4, 5, 6], - "b": [10, 9, 8, 7, 6, 5], - "c": ["a", "b", "a", "b", "c", "c"], - "d": ["b", "a", "a", "b", "a", "b"], - } - ) - - assert_frame_equal(result_df, expected_df) - - -def test_get_feature_names(): - df = pd.DataFrame({"a": [4, 5, 6], "b": ["4", "5", "6"]}) - transformer = ColumnDropper("a").fit(df) - assert transformer.get_feature_names() == ["b"] +def data(): + return { + "a": [1, 2, 3, 4, 5, 6], + "b": [10, 9, 8, 7, 6, 5], + "c": ["a", "b", "a", "b", "c", "c"], + "d": ["b", "a", "a", "b", "a", "b"], + "e": [0, 1, 0, 1, 0, 1], + } + + +@pytest.mark.parametrize( + "frame_func, assert_func", + [ + (pd.DataFrame, pandas_assert_frame_equal), + (pl.DataFrame, polars_assert_frame_equal), + ], +) +@pytest.mark.parametrize( + "to_drop, context", + [ + (["e"], does_not_raise()), # one + (["a", "b"], does_not_raise()), # two + ([], does_not_raise()), # none + (["a", "b", "c", "d", "e"], pytest.raises(ValueError)), # all + (["f"], pytest.raises(KeyError)), # not in data + ], +) +@pytest.mark.parametrize("wrapper", [lambda x: x, make_pipeline]) +def test_drop(data, frame_func, assert_func, to_drop, context, wrapper): + sub_data = {k: v for k, v in data.items() if k not in to_drop} + + with context: + transformer = wrapper(ColumnDropper(to_drop)) + result_df = transformer.fit_transform(frame_func(data)) + expected_df = frame_func(sub_data) + + assert_func(result_df, expected_df) + + if not isinstance(transformer, Pipeline): + assert transformer.get_feature_names() == list(sub_data.keys())