koaning · FBruzzesi · May 7, 2024 · May 5, 2024 · May 6, 2024 · May 6, 2024
diff --git a/pyproject.toml b/pyproject.toml
@@ -20,6 +20,7 @@ maintainers = [
 ]
 
 dependencies = [
+    "narwhals>=0.7.16",
     "pandas>=1.1.5",
     "scikit-learn>=1.0",
     "importlib-metadata >= 1.0; python_version < '3.8'",
@@ -61,6 +62,7 @@ docs = [
 ]
 
 test-dep = [
+    "narwhals[polars]",
     "pytest>=6.2.5",
     "pytest-xdist>=1.34.0",
     "pytest-cov>=2.6.1",
@@ -111,4 +113,3 @@ markers = [
     "formulaic: tests that require formulaic (deselect with '-m \"not formulaic\"')",
     "umap: tests that require umap (deselect with '-m \"not umap\"')"
 ]
-
diff --git a/sklego/preprocessing/pandastransformers.py b/sklego/preprocessing/pandastransformers.py
@@ -1,3 +1,4 @@
+import narwhals as nw
 import pandas as pd
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.utils.validation import check_is_fitted
@@ -6,7 +7,7 @@
 
 
 class ColumnDropper(BaseEstimator, TransformerMixin):
-    """The `ColumnDropper` transformer allows dropping specific columns from a pandas DataFrame by name.
+    """The `ColumnDropper` transformer allows dropping specific columns from a DataFrame by name.
     Can be useful in a sklearn Pipeline.
 
     Parameters
@@ -19,6 +20,18 @@ class ColumnDropper(BaseEstimator, TransformerMixin):
     feature_names_ : list[str]
         The names of the features to keep during transform.
 
+    Notes
+    -----
+    Supported dataframes are:
+
+    - pandas
+    - Polars (eager or lazy)
+    - Modin
+    - cuDF
+
+    See [Narwhals docs](https://narwhals-dev.github.io/narwhals/extending/) for an
+    up-to-date list (and to learn how to add your dataframe library to it!).
+
     Examples
     --------
     ```py
@@ -81,14 +94,14 @@ def fit(self, X, y=None):
 
         Checks:
 
-        1. If input is a `pd.DataFrame` object
+        1. If input is a supported DataFrame
         2. If column names are in such DataFrame
 
         Parameters
         ----------
-        X : pd.DataFrame
+        X : DataFrame
             The data on which we apply the column selection.
-        y : pd.Series, default=None
+        y : Series, default=None
             Ignored, present for compatibility.
 
         Returns
@@ -99,42 +112,42 @@ def fit(self, X, y=None):
         Raises
         ------
         TypeError
-            If `X` is not a `pd.DataFrame` object.
+            If `X` is not a supported DataFrame.
         KeyError
             If one or more of the columns provided doesn't exist in the input DataFrame.
         ValueError
             If dropping the specified columns would result in an empty output DataFrame.
         """
         self.columns_ = as_list(self.columns)
-        self._check_X_for_type(X)
+        X = nw.from_native(X)
         self._check_column_names(X)
-        self.feature_names_ = X.columns.drop(self.columns_).tolist()
+        self.feature_names_ = [x for x in X.columns if x not in self.columns_]
         self._check_column_length()
         return self
 
     def transform(self, X):
-        """Returns a pandas DataFrame with only the specified columns.
+        """Returns a DataFrame with the specified columns dropped.
 
         Parameters
         ----------
-        X : pd.DataFrame
+        X : DataFrame
             The data on which we apply the column selection.
 
         Returns
         -------
-        pd.DataFrame
+        DataFrame
             The data with the specified columns dropped.
 
         Raises
         ------
         TypeError
-            If `X` is not a `pd.DataFrame` object.
+            If `X` is not a supported DataFrame object.
         """
         check_is_fitted(self, ["feature_names_"])
-        self._check_X_for_type(X)
+        X = nw.from_native(X)
         if self.columns_:
-            return X.drop(columns=self.columns_)
-        return X
+            return nw.to_native(X.drop(self.columns_))
+        return nw.to_native(X)
 
     def get_feature_names(self):
         """Alias for `.feature_names_` attribute"""
@@ -151,12 +164,6 @@ def _check_column_names(self, X):
         if len(non_existent_columns) > 0:
             raise KeyError(f"{list(non_existent_columns)} column(s) not in DataFrame")
 
-    @staticmethod
-    def _check_X_for_type(X):
-        """Checks if input of the Selector is of the required dtype"""
-        if not isinstance(X, pd.DataFrame):
-            raise TypeError("Provided variable X is not of type pandas.DataFrame")
-
 
 class PandasTypeSelector(BaseEstimator, TransformerMixin):
     """The `PandasTypeSelector` transformer allows to select columns in a pandas DataFrame based on their type.

diff --git a/tests/test_preprocessing/test_columndropper.py b/tests/test_preprocessing/test_columndropper.py
@@ -1,6 +1,8 @@
 import pandas as pd
+import polars as pl
 import pytest
-from pandas.testing import assert_frame_equal
+from pandas.testing import assert_frame_equal as pandas_assert_frame_equal
+from polars.testing import assert_frame_equal as polars_assert_frame_equal
 from sklearn.pipeline import make_pipeline
 
 from sklego.preprocessing import ColumnDropper
@@ -19,6 +21,19 @@ def df():
     )
 
 
+@pytest.fixture()
+def df_polars():
+    return pl.DataFrame(
+        {
+            "a": [1, 2, 3, 4, 5, 6],
+            "b": [10, 9, 8, 7, 6, 5],
+            "c": ["a", "b", "a", "b", "c", "c"],
+            "d": ["b", "a", "a", "b", "a", "b"],
+            "e": [0, 1, 0, 1, 0, 1],
+        }
+    )
+
+
 def test_drop_two(df):
     result_df = ColumnDropper(["a", "b"]).fit_transform(df)
     expected_df = pd.DataFrame(
@@ -29,7 +44,7 @@ def test_drop_two(df):
         }
     )
 
-    assert_frame_equal(result_df, expected_df)
+    pandas_assert_frame_equal(result_df, expected_df)
 
 
 def test_drop_one(df):
@@ -43,7 +58,7 @@ def test_drop_one(df):
         }
     )
 
-    assert_frame_equal(result_df, expected_df)
+    pandas_assert_frame_equal(result_df, expected_df)
 
 
 def test_drop_all(df):
@@ -53,7 +68,7 @@ def test_drop_all(df):
 
 def test_drop_none(df):
     result_df = ColumnDropper([]).fit_transform(df)
-    assert_frame_equal(result_df, df)
+    pandas_assert_frame_equal(result_df, df)
 
 
 def test_drop_not_in_frame(df):
@@ -73,10 +88,23 @@ def test_drop_one_in_pipeline(df):
         }
     )
 
-    assert_frame_equal(result_df, expected_df)
+    pandas_assert_frame_equal(result_df, expected_df)
 
 
 def test_get_feature_names():
     df = pd.DataFrame({"a": [4, 5, 6], "b": ["4", "5", "6"]})
     transformer = ColumnDropper("a").fit(df)
     assert transformer.get_feature_names() == ["b"]
+
+
+def test_drop_two_polars(df_polars):
+    result_df = ColumnDropper(["a", "b"]).fit_transform(df_polars)
+    expected_df = pl.DataFrame(
+        {
+            "c": ["a", "b", "a", "b", "c", "c"],
+            "d": ["b", "a", "a", "b", "a", "b"],
+            "e": [0, 1, 0, 1, 0, 1],
+        }
+    )
+
+    polars_assert_frame_equal(result_df, expected_df)