Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ maintainers = [
]

dependencies = [
"narwhals>=0.7.16",
"pandas>=1.1.5",
"scikit-learn>=1.0",
"importlib-metadata >= 1.0; python_version < '3.8'",
Expand Down Expand Up @@ -61,6 +62,7 @@ docs = [
]

test-dep = [
"narwhals[polars]",
"pytest>=6.2.5",
"pytest-xdist>=1.34.0",
"pytest-cov>=2.6.1",
Expand Down Expand Up @@ -111,4 +113,3 @@ markers = [
"formulaic: tests that require formulaic (deselect with '-m \"not formulaic\"')",
"umap: tests that require umap (deselect with '-m \"not umap\"')"
]

47 changes: 27 additions & 20 deletions sklego/preprocessing/pandastransformers.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import narwhals as nw
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
Expand All @@ -6,7 +7,7 @@


class ColumnDropper(BaseEstimator, TransformerMixin):
"""The `ColumnDropper` transformer allows dropping specific columns from a pandas DataFrame by name.
"""The `ColumnDropper` transformer allows dropping specific columns from a DataFrame by name.
Can be useful in a sklearn Pipeline.

Parameters
Expand All @@ -19,6 +20,18 @@ class ColumnDropper(BaseEstimator, TransformerMixin):
feature_names_ : list[str]
The names of the features to keep during transform.

Notes
-----
Supported dataframes are:

- pandas
- Polars (eager or lazy)
- Modin
- cuDF

See [Narwhals docs](https://narwhals-dev.github.io/narwhals/extending/) for an
up-to-date list (and to learn how to add your dataframe library to it!).

Examples
--------
```py
Expand Down Expand Up @@ -81,14 +94,14 @@ def fit(self, X, y=None):

Checks:

1. If input is a `pd.DataFrame` object
1. If input is a supported DataFrame
2. If column names are in such DataFrame

Parameters
----------
X : pd.DataFrame
X : DataFrame
The data on which we apply the column selection.
y : pd.Series, default=None
y : Series, default=None
Ignored, present for compatibility.

Returns
Expand All @@ -99,42 +112,42 @@ def fit(self, X, y=None):
Raises
------
TypeError
If `X` is not a `pd.DataFrame` object.
If `X` is not a supported DataFrame.
KeyError
If one or more of the columns provided doesn't exist in the input DataFrame.
ValueError
If dropping the specified columns would result in an empty output DataFrame.
"""
self.columns_ = as_list(self.columns)
self._check_X_for_type(X)
X = nw.from_native(X)
self._check_column_names(X)
self.feature_names_ = X.columns.drop(self.columns_).tolist()
self.feature_names_ = [x for x in X.columns if x not in self.columns_]
self._check_column_length()
return self

def transform(self, X):
"""Returns a pandas DataFrame with only the specified columns.
"""Returns a DataFrame with the specified columns dropped.

Parameters
----------
X : pd.DataFrame
X : DataFrame
The data on which we apply the column selection.

Returns
-------
pd.DataFrame
DataFrame
The data with the specified columns dropped.

Raises
------
TypeError
If `X` is not a `pd.DataFrame` object.
If `X` is not a supported DataFrame object.
"""
check_is_fitted(self, ["feature_names_"])
self._check_X_for_type(X)
X = nw.from_native(X)
if self.columns_:
return X.drop(columns=self.columns_)
return X
return nw.to_native(X.drop(self.columns_))
return nw.to_native(X)

def get_feature_names(self):
"""Alias for `.feature_names_` attribute"""
Expand All @@ -151,12 +164,6 @@ def _check_column_names(self, X):
if len(non_existent_columns) > 0:
raise KeyError(f"{list(non_existent_columns)} column(s) not in DataFrame")

@staticmethod
def _check_X_for_type(X):
"""Checks if input of the Selector is of the required dtype"""
if not isinstance(X, pd.DataFrame):
raise TypeError("Provided variable X is not of type pandas.DataFrame")
Comment on lines -154 to -158
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm removing this here, as `nw.from_native` already raises a similar error. For example, if you tried passing in a NumPy array here, you'd get:

TypeError: Expected pandas-like dataframe, Polars dataframe, or Polars lazyframe, got: <class 'numpy.ndarray'>



class PandasTypeSelector(BaseEstimator, TransformerMixin):
"""The `PandasTypeSelector` transformer allows selecting columns in a pandas DataFrame based on their type.
Expand Down
38 changes: 33 additions & 5 deletions tests/test_preprocessing/test_columndropper.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import pandas as pd
import polars as pl
import pytest
from pandas.testing import assert_frame_equal
from pandas.testing import assert_frame_equal as pandas_assert_frame_equal
from polars.testing import assert_frame_equal as polars_assert_frame_equal
from sklearn.pipeline import make_pipeline

from sklego.preprocessing import ColumnDropper
Expand All @@ -19,6 +21,19 @@ def df():
)


@pytest.fixture()
def df_polars():
    """Provide a six-row Polars DataFrame with columns "a" through "e".

    Columns "a", "b" and "e" hold integers; "c" and "d" hold single-letter
    strings.
    """
    column_data = {
        "a": [1, 2, 3, 4, 5, 6],
        "b": [10, 9, 8, 7, 6, 5],
        "c": ["a", "b", "a", "b", "c", "c"],
        "d": ["b", "a", "a", "b", "a", "b"],
        "e": [0, 1, 0, 1, 0, 1],
    }
    return pl.DataFrame(column_data)


def test_drop_two(df):
result_df = ColumnDropper(["a", "b"]).fit_transform(df)
expected_df = pd.DataFrame(
Expand All @@ -29,7 +44,7 @@ def test_drop_two(df):
}
)

assert_frame_equal(result_df, expected_df)
pandas_assert_frame_equal(result_df, expected_df)


def test_drop_one(df):
Expand All @@ -43,7 +58,7 @@ def test_drop_one(df):
}
)

assert_frame_equal(result_df, expected_df)
pandas_assert_frame_equal(result_df, expected_df)


def test_drop_all(df):
Expand All @@ -53,7 +68,7 @@ def test_drop_all(df):

def test_drop_none(df):
result_df = ColumnDropper([]).fit_transform(df)
assert_frame_equal(result_df, df)
pandas_assert_frame_equal(result_df, df)


def test_drop_not_in_frame(df):
Expand All @@ -73,10 +88,23 @@ def test_drop_one_in_pipeline(df):
}
)

assert_frame_equal(result_df, expected_df)
pandas_assert_frame_equal(result_df, expected_df)


def test_get_feature_names():
    """After fitting with column "a" dropped, only "b" is reported as a feature name."""
    frame = pd.DataFrame({"a": [4, 5, 6], "b": ["4", "5", "6"]})
    fitted_dropper = ColumnDropper("a").fit(frame)
    remaining = fitted_dropper.get_feature_names()
    assert remaining == ["b"]


def test_drop_two_polars(df_polars):
    """Dropping "a" and "b" from the Polars fixture leaves columns "c", "d" and "e"."""
    dropper = ColumnDropper(["a", "b"])
    result_df = dropper.fit_transform(df_polars)

    # Expected frame is the fixture minus the two dropped columns, in the same order.
    remaining_columns = {
        "c": ["a", "b", "a", "b", "c", "c"],
        "d": ["b", "a", "a", "b", "a", "b"],
        "e": [0, 1, 0, 1, 0, 1],
    }
    expected_df = pl.DataFrame(remaining_columns)

    polars_assert_frame_equal(result_df, expected_df)