koaning · FBruzzesi · May 7, 2024 · May 5, 2024 · May 6, 2024 · May 6, 2024
diff --git a/pyproject.toml b/pyproject.toml
@@ -20,6 +20,7 @@ maintainers = [
 ]
 
 dependencies = [
+    "narwhals>=0.7.16",
     "pandas>=1.1.5",
     "scikit-learn>=1.0",
     "importlib-metadata >= 1.0; python_version < '3.8'",
@@ -61,6 +62,7 @@ docs = [
 ]
 
 test-dep = [
+    "narwhals[polars]",
     "pytest>=6.2.5",
     "pytest-xdist>=1.34.0",
     "pytest-cov>=2.6.1",
@@ -111,4 +113,3 @@ markers = [
     "formulaic: tests that require formulaic (deselect with '-m \"not formulaic\"')",
     "umap: tests that require umap (deselect with '-m \"not umap\"')"
 ]
-
diff --git a/sklego/preprocessing/pandastransformers.py b/sklego/preprocessing/pandastransformers.py
@@ -1,3 +1,4 @@
+import narwhals as nw
 import pandas as pd
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.utils.validation import check_is_fitted
@@ -6,7 +7,7 @@
 
 
 class ColumnDropper(BaseEstimator, TransformerMixin):
-    """The `ColumnDropper` transformer allows dropping specific columns from a pandas DataFrame by name.
+    """The `ColumnDropper` transformer allows dropping specific columns from a DataFrame by name.
     Can be useful in a sklearn Pipeline.
 
     Parameters
@@ -19,6 +20,21 @@ class ColumnDropper(BaseEstimator, TransformerMixin):
     feature_names_ : list[str]
         The names of the features to keep during transform.
 
+    Notes
+    -----
+    Native cross-dataframe support is achieved using
+    [Narwhals](https://narwhals-dev.github.io/narwhals/){:target="_blank"}.
+
+    Supported dataframes are:
+
+    - pandas
+    - Polars (eager or lazy)
+    - Modin
+    - cuDF
+
+    See [Narwhals docs](https://narwhals-dev.github.io/narwhals/extending/){:target="_blank"} for an up-to-date list
+    (and to learn how you can add your dataframe library to it!).
+
     Examples
     --------
     ```py
@@ -39,7 +55,7 @@ class ColumnDropper(BaseEstimator, TransformerMixin):
     2    1.80        45
     '''
 
-    # Selecting multiple columns from a pandas DataFrame
+    # Dropping multiple columns from a pandas DataFrame
     ColumnDropper(["length", "shoesize"]).fit_transform(df)
     '''
          name
@@ -48,7 +64,7 @@ class ColumnDropper(BaseEstimator, TransformerMixin):
     2    Alex
     '''
 
-    # Selecting non-existent columns returns in a KeyError
+    # Dropping non-existent columns results in a KeyError
     ColumnDropper(["weight"]).fit_transform(df)
     # Traceback (most recent call last):
     #     ...
@@ -67,10 +83,12 @@ class ColumnDropper(BaseEstimator, TransformerMixin):
     #        [-1.13554995]])
     ```
 
-    !!! warning
-
-        - Raises a `TypeError` if input provided is not a DataFrame.
-        - Raises a `ValueError` if columns provided are not in the input DataFrame.
+    Raises
+    ------
+    TypeError
+        If input provided is not a DataFrame.
+    KeyError
+        If columns provided are not in the input DataFrame.
     """
 
     def __init__(self, columns: list):
@@ -81,14 +99,14 @@ def fit(self, X, y=None):
 
         Checks:
 
-        1. If input is a `pd.DataFrame` object
+        1. If input is a supported DataFrame
         2. If column names are in such DataFrame
 
         Parameters
         ----------
-        X : pd.DataFrame
+        X : DataFrame
             The data on which we apply the column selection.
-        y : pd.Series, default=None
+        y : Series, default=None
             Ignored, present for compatibility.
 
         Returns
@@ -99,42 +117,42 @@ def fit(self, X, y=None):
         Raises
         ------
         TypeError
-            If `X` is not a `pd.DataFrame` object.
+            If `X` is not a supported DataFrame.
         KeyError
             If one or more of the columns provided doesn't exist in the input DataFrame.
         ValueError
             If dropping the specified columns would result in an empty output DataFrame.
         """
         self.columns_ = as_list(self.columns)
-        self._check_X_for_type(X)
+        X = nw.from_native(X)
         self._check_column_names(X)
-        self.feature_names_ = X.columns.drop(self.columns_).tolist()
+        self.feature_names_ = [x for x in X.columns if x not in self.columns_]
         self._check_column_length()
         return self
 
     def transform(self, X):
-        """Returns a pandas DataFrame with only the specified columns.
+        """Returns a DataFrame without the specified columns.
 
         Parameters
         ----------
-        X : pd.DataFrame
+        X : DataFrame
             The data on which we apply the column selection.
 
         Returns
         -------
-        pd.DataFrame
+        DataFrame
             The data with the specified columns dropped.
 
         Raises
         ------
         TypeError
-            If `X` is not a `pd.DataFrame` object.
+            If `X` is not a supported DataFrame object.
         """
         check_is_fitted(self, ["feature_names_"])
-        self._check_X_for_type(X)
+        X = nw.from_native(X)
         if self.columns_:
-            return X.drop(columns=self.columns_)
-        return X
+            return nw.to_native(X.drop(self.columns_))
+        return nw.to_native(X)
 
     def get_feature_names(self):
         """Alias for `.feature_names_` attribute"""
@@ -151,12 +169,6 @@ def _check_column_names(self, X):
         if len(non_existent_columns) > 0:
             raise KeyError(f"{list(non_existent_columns)} column(s) not in DataFrame")
 
-    @staticmethod
-    def _check_X_for_type(X):
-        """Checks if input of the Selector is of the required dtype"""
-        if not isinstance(X, pd.DataFrame):
-            raise TypeError("Provided variable X is not of type pandas.DataFrame")
-
 
 class PandasTypeSelector(BaseEstimator, TransformerMixin):
     """The `PandasTypeSelector` transformer allows to select columns in a pandas DataFrame based on their type.

diff --git a/tests/test_preprocessing/test_columndropper.py b/tests/test_preprocessing/test_columndropper.py
@@ -1,82 +1,53 @@
+from contextlib import nullcontext as does_not_raise
+
 import pandas as pd
+import polars as pl
 import pytest
-from pandas.testing import assert_frame_equal
-from sklearn.pipeline import make_pipeline
+from pandas.testing import assert_frame_equal as pandas_assert_frame_equal
+from polars.testing import assert_frame_equal as polars_assert_frame_equal
+from sklearn.pipeline import Pipeline, make_pipeline
 
 from sklego.preprocessing import ColumnDropper
 
 
 @pytest.fixture()
-def df():
-    return pd.DataFrame(
-        {
-            "a": [1, 2, 3, 4, 5, 6],
-            "b": [10, 9, 8, 7, 6, 5],
-            "c": ["a", "b", "a", "b", "c", "c"],
-            "d": ["b", "a", "a", "b", "a", "b"],
-            "e": [0, 1, 0, 1, 0, 1],
-        }
-    )
-
-
-def test_drop_two(df):
-    result_df = ColumnDropper(["a", "b"]).fit_transform(df)
-    expected_df = pd.DataFrame(
-        {
-            "c": ["a", "b", "a", "b", "c", "c"],
-            "d": ["b", "a", "a", "b", "a", "b"],
-            "e": [0, 1, 0, 1, 0, 1],
-        }
-    )
-
-    assert_frame_equal(result_df, expected_df)
-
-
-def test_drop_one(df):
-    result_df = ColumnDropper(["e"]).fit_transform(df)
-    expected_df = pd.DataFrame(
-        {
-            "a": [1, 2, 3, 4, 5, 6],
-            "b": [10, 9, 8, 7, 6, 5],
-            "c": ["a", "b", "a", "b", "c", "c"],
-            "d": ["b", "a", "a", "b", "a", "b"],
-        }
-    )
-
-    assert_frame_equal(result_df, expected_df)
-
-
-def test_drop_all(df):
-    with pytest.raises(ValueError):
-        ColumnDropper(["a", "b", "c", "d", "e"]).fit_transform(df)
-
-
-def test_drop_none(df):
-    result_df = ColumnDropper([]).fit_transform(df)
-    assert_frame_equal(result_df, df)
-
-
-def test_drop_not_in_frame(df):
-    with pytest.raises(KeyError):
-        ColumnDropper(["f"]).fit_transform(df)
-
-
-def test_drop_one_in_pipeline(df):
-    pipe = make_pipeline(ColumnDropper(["e"]))
-    result_df = pipe.fit_transform(df)
-    expected_df = pd.DataFrame(
-        {
-            "a": [1, 2, 3, 4, 5, 6],
-            "b": [10, 9, 8, 7, 6, 5],
-            "c": ["a", "b", "a", "b", "c", "c"],
-            "d": ["b", "a", "a", "b", "a", "b"],
-        }
-    )
-
-    assert_frame_equal(result_df, expected_df)
-
-
-def test_get_feature_names():
-    df = pd.DataFrame({"a": [4, 5, 6], "b": ["4", "5", "6"]})
-    transformer = ColumnDropper("a").fit(df)
-    assert transformer.get_feature_names() == ["b"]
+def data():
+    return {
+        "a": [1, 2, 3, 4, 5, 6],
+        "b": [10, 9, 8, 7, 6, 5],
+        "c": ["a", "b", "a", "b", "c", "c"],
+        "d": ["b", "a", "a", "b", "a", "b"],
+        "e": [0, 1, 0, 1, 0, 1],
+    }
+
+
+@pytest.mark.parametrize(
+    "frame_func, assert_func",
+    [
+        (pd.DataFrame, pandas_assert_frame_equal),
+        (pl.DataFrame, polars_assert_frame_equal),
+    ],
+)
+@pytest.mark.parametrize(
+    "to_drop, context",
+    [
+        (["e"], does_not_raise()),  # one
+        (["a", "b"], does_not_raise()),  # two
+        ([], does_not_raise()),  # none
+        (["a", "b", "c", "d", "e"], pytest.raises(ValueError)),  # all
+        (["f"], pytest.raises(KeyError)),  # not in data
+    ],
+)
+@pytest.mark.parametrize("wrapper", [lambda x: x, make_pipeline])
+def test_drop(data, frame_func, assert_func, to_drop, context, wrapper):
+    sub_data = {k: v for k, v in data.items() if k not in to_drop}
+
+    with context:
+        transformer = wrapper(ColumnDropper(to_drop))
+        result_df = transformer.fit_transform(frame_func(data))
+        expected_df = frame_func(sub_data)
+
+        assert_func(result_df, expected_df)
+
+        if not isinstance(transformer, Pipeline):
+            assert transformer.get_feature_names() == list(sub_data.keys())