Make RegressionOutlier dataframe-agnostic (#665)

MarcoGorelli · web-flow · commit 94cf506de16f · 2024-05-11T17:54:46.000+02:00
* make regression outlier df-agnostic

* use eager-only mode (`eager_only=True`) when converting the input with Narwhals

* pass the native object (via `nw.to_native`) to `check_array`

* remove cudf, link to check_X_y
diff --git a/sklego/meta/regression_outlier_detector.py b/sklego/meta/regression_outlier_detector.py
@@ -1,5 +1,5 @@
+import narwhals as nw
 import numpy as np
-import pandas as pd
 from sklearn.base import BaseEstimator, OutlierMixin
 from sklearn.utils.validation import check_array, check_is_fitted
 
@@ -11,8 +11,11 @@ class RegressionOutlierDetector(BaseEstimator, OutlierMixin):
     ----------
     model : scikit-learn compatible regression model
         A regression model that will be used for prediction.
-    column : int
-        The index of the target column to predict in the input data.
+    column : int | str
+        This should be:
+
+            - The index of the target column to predict in the input data, when the input is an array.
+            - The name of the target column to predict in the input data, when the input is a dataframe.
     lower : float, default=2.0
         Lower threshold for outlier detection. The method used for detection depends on the `method` parameter.
     upper : float, default=2.0
@@ -32,6 +35,21 @@ class RegressionOutlierDetector(BaseEstimator, OutlierMixin):
         The standard deviation of the differences between true and predicted values.
     idx_ : int
         The index of the target column in the input data.
+
+    Notes
+    -----
+    Native cross-dataframe support is achieved using
+    [Narwhals](https://narwhals-dev.github.io/narwhals/){:target="_blank"}.
+    Supported dataframes are:
+
+    - pandas
+    - Polars (eager)
+    - Modin
+
+    See [Narwhals docs](https://narwhals-dev.github.io/narwhals/extending/){:target="_blank"} for an up-to-date list
+    (and to learn how you can add your dataframe library to it!). Note, however, that only dataframes
+    accepted by [sklearn.utils.check_X_y](https://scikit-learn.org/stable/modules/generated/sklearn.utils.check_X_y.html)
+    will work with this class.
     """
 
     def __init__(self, model, column, lower=2, upper=2, method="sd"):
@@ -112,8 +130,9 @@ def fit(self, X, y=None):
         ValueError
             If the `model` is not a regression estimator.
         """
-        self.idx_ = np.argmax([i == self.column for i in X.columns]) if isinstance(X, pd.DataFrame) else self.column
-        X = check_array(X, estimator=self)
+        X = nw.from_native(X, eager_only=True, strict=False)
+        self.idx_ = np.argmax([i == self.column for i in X.columns]) if isinstance(X, nw.DataFrame) else self.column
+        X = check_array(nw.to_native(X, strict=False), estimator=self)
         if not self._is_regression_model():
             raise ValueError("Passed model must be regression!")
         X, y = self.to_x_y(X)
diff --git a/tests/test_meta/test_regression_outlier.py b/tests/test_meta/test_regression_outlier.py
@@ -1,5 +1,6 @@
 import numpy as np
 import pandas as pd
+import polars as pl
 import pytest
 from sklearn.linear_model import LinearRegression, LogisticRegression
 
@@ -42,14 +43,15 @@ def test_obvious_example():
         assert preds[i] == -1
 
 
-def test_obvious_example_pandas():
+@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame])
+def test_obvious_example_dataframe(frame_func):
     # generate random data for illustrative example
     np.random.seed(42)
     x = np.random.normal(0, 1, 100)
     y = 1 + x + np.random.normal(0, 0.2, 100)
     for i in [20, 25, 50, 80]:
         y[i] += 2
-    X = pd.DataFrame({"x": x, "y": y})
+    X = frame_func({"x": x, "y": y})
 
     # fit and plot
     mod = RegressionOutlierDetector(LinearRegression(), column="y")
@@ -58,14 +60,15 @@ def test_obvious_example_pandas():
         assert preds[i] == -1
 
 
-def test_raises_error():
+@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame])
+def test_raises_error(frame_func):
     # generate random data for illustrative example
     np.random.seed(42)
     x = np.random.normal(0, 1, 100)
     y = 1 + x + np.random.normal(0, 0.2, 100)
     for i in [20, 25, 50, 80]:
         y[i] += 2
-    X = pd.DataFrame({"x": x, "y": y})
+    X = frame_func({"x": x, "y": y})
 
     with pytest.raises(ValueError):
         mod = RegressionOutlierDetector(LogisticRegression(), column="y")