feat: make ColumnDropper dataframe-agnostic

MarcoGorelli · MarcoGorelli · commit 8f221548ad9f · 2024-05-05T14:50:19.000+01:00
diff --git a/pyproject.toml b/pyproject.toml
@@ -20,6 +20,7 @@ maintainers = [
 ]
 
 dependencies = [
+    "narwhals>=0.7.16",
     "pandas>=1.1.5",
     "scikit-learn>=1.0",
     "importlib-metadata >= 1.0; python_version < '3.8'",
@@ -61,6 +62,7 @@ docs = [
 ]
 
 test-dep = [
+    "polars",
     "pytest>=6.2.5",
     "pytest-xdist>=1.34.0",
     "pytest-cov>=2.6.1",
@@ -111,4 +113,3 @@ markers = [
     "formulaic: tests that require formulaic (deselect with '-m \"not formulaic\"')",
     "umap: tests that require umap (deselect with '-m \"not umap\"')"
 ]
-
diff --git a/sklego/preprocessing/pandastransformers.py b/sklego/preprocessing/pandastransformers.py
@@ -1,3 +1,4 @@
+import narwhals as nw
 import pandas as pd
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.utils.validation import check_is_fitted
@@ -106,9 +107,9 @@ def fit(self, X, y=None):
             If dropping the specified columns would result in an empty output DataFrame.
         """
         self.columns_ = as_list(self.columns)
-        self._check_X_for_type(X)
+        X = nw.from_native(X)
         self._check_column_names(X)
-        self.feature_names_ = X.columns.drop(self.columns_).tolist()
+        self.feature_names_ = [x for x in X.columns if x not in self.columns_]
         self._check_column_length()
         return self
 
@@ -131,10 +132,10 @@ def transform(self, X):
             If `X` is not a `pd.DataFrame` object.
         """
         check_is_fitted(self, ["feature_names_"])
-        self._check_X_for_type(X)
+        X = nw.from_native(X)
         if self.columns_:
-            return X.drop(columns=self.columns_)
-        return X
+            return nw.to_native(X.drop(self.columns_))
+        return nw.to_native(X)
 
     def get_feature_names(self):
         """Alias for `.feature_names_` attribute"""
@@ -151,12 +152,6 @@ def _check_column_names(self, X):
         if len(non_existent_columns) > 0:
             raise KeyError(f"{list(non_existent_columns)} column(s) not in DataFrame")
 
-    @staticmethod
-    def _check_X_for_type(X):
-        """Checks if input of the Selector is of the required dtype"""
-        if not isinstance(X, pd.DataFrame):
-            raise TypeError("Provided variable X is not of type pandas.DataFrame")
-
 
 class PandasTypeSelector(BaseEstimator, TransformerMixin):
     """The `PandasTypeSelector` transformer allows to select columns in a pandas DataFrame based on their type.
diff --git a/tests/test_preprocessing/test_columndropper.py b/tests/test_preprocessing/test_columndropper.py
@@ -1,6 +1,8 @@
 import pandas as pd
+import polars as pl
 import pytest
-from pandas.testing import assert_frame_equal
+from pandas.testing import assert_frame_equal as pandas_assert_frame_equal
+from polars.testing import assert_frame_equal as polars_assert_frame_equal
 from sklearn.pipeline import make_pipeline
 
 from sklego.preprocessing import ColumnDropper
@@ -19,6 +21,19 @@ def df():
     )
 
 
+@pytest.fixture()
+def df_polars():
+    return pl.DataFrame(
+        {
+            "a": [1, 2, 3, 4, 5, 6],
+            "b": [10, 9, 8, 7, 6, 5],
+            "c": ["a", "b", "a", "b", "c", "c"],
+            "d": ["b", "a", "a", "b", "a", "b"],
+            "e": [0, 1, 0, 1, 0, 1],
+        }
+    )
+
+
 def test_drop_two(df):
     result_df = ColumnDropper(["a", "b"]).fit_transform(df)
     expected_df = pd.DataFrame(
@@ -29,7 +44,7 @@ def test_drop_two(df):
         }
     )
 
-    assert_frame_equal(result_df, expected_df)
+    pandas_assert_frame_equal(result_df, expected_df)
 
 
 def test_drop_one(df):
@@ -43,7 +58,7 @@ def test_drop_one(df):
         }
     )
 
-    assert_frame_equal(result_df, expected_df)
+    pandas_assert_frame_equal(result_df, expected_df)
 
 
 def test_drop_all(df):
@@ -53,7 +68,7 @@ def test_drop_all(df):
 
 def test_drop_none(df):
     result_df = ColumnDropper([]).fit_transform(df)
-    assert_frame_equal(result_df, df)
+    pandas_assert_frame_equal(result_df, df)
 
 
 def test_drop_not_in_frame(df):
@@ -73,10 +88,23 @@ def test_drop_one_in_pipeline(df):
         }
     )
 
-    assert_frame_equal(result_df, expected_df)
+    pandas_assert_frame_equal(result_df, expected_df)
 
 
 def test_get_feature_names():
     df = pd.DataFrame({"a": [4, 5, 6], "b": ["4", "5", "6"]})
     transformer = ColumnDropper("a").fit(df)
     assert transformer.get_feature_names() == ["b"]
+
+
+def test_drop_two_polars(df_polars):
+    result_df = ColumnDropper(["a", "b"]).fit_transform(df_polars)
+    expected_df = pl.DataFrame(
+        {
+            "c": ["a", "b", "a", "b", "c", "c"],
+            "d": ["b", "a", "a", "b", "a", "b"],
+            "e": [0, 1, 0, 1, 0, 1],
+        }
+    )
+
+    polars_assert_frame_equal(result_df, expected_df)

Original file line number	Diff line number	Diff line change
`@@ -20,6 +20,7 @@ maintainers = [`
`20`	`20`	`]`
`21`	`21`
`22`	`22`	`dependencies = [`
	`23`	`+ "narwhals>=0.7.16",`
`23`	`24`	`"pandas>=1.1.5",`
`24`	`25`	`"scikit-learn>=1.0",`
`25`	`26`	`"importlib-metadata >= 1.0; python_version < '3.8'",`
`@@ -61,6 +62,7 @@ docs = [`
`61`	`62`	`]`
`62`	`63`
`63`	`64`	`test-dep = [`
	`65`	`+ "polars",`
`64`	`66`	`"pytest>=6.2.5",`
`65`	`67`	`"pytest-xdist>=1.34.0",`
`66`	`68`	`"pytest-cov>=2.6.1",`
`@@ -111,4 +113,3 @@ markers = [`
`111`	`113`	`"formulaic: tests that require formulaic (deselect with '-m \"not formulaic\"')",`
`112`	`114`	`"umap: tests that require umap (deselect with '-m \"not umap\"')"`
`113`	`115`	`]`
`114`		`-`