diff --git a/docs/user-guide/preprocessing.md b/docs/user-guide/preprocessing.md index e0318d8fe..b84906f75 100644 --- a/docs/user-guide/preprocessing.md +++ b/docs/user-guide/preprocessing.md @@ -253,6 +253,14 @@ Now let's see what occurs when we add a constraint that enforces the feature to If these features are now passed to a model that supports monotonicity constraints then we can build models with guarantees. +## Outlier Removal + +The [`OutlierRemover`][outlier-remover-api] class is a transformer that uses an outlier detector estimator to remove outliers from your dataset, and does so during training only. This can be useful in scenarios where outliers in the training data can negatively impact the performance of your model. By removing these outliers during training, your model can learn from a "clean" dataset that may lead to better performance. + +It's important to note that this transformer only removes outliers during training. This means that when you use your trained model to predict on new data, the new data will not have any outliers removed. This is useful because in a real-world scenario, new data may contain outliers and you would want your model to be able to handle these cases. + +The `OutlierRemover` class is initialized with an `outlier_detector` estimator, and a boolean flag `refit`. The outlier detector should be a scikit-learn compatible estimator that implements `.fit()` and `.predict()` methods. The `refit` flag determines whether the underlying estimator is fitted during `OutlierRemover.fit()`. 
+ [estimator-transformer-api]: ../../api/meta#sklego.meta.estimator_transformer.EstimatorTransformer [meta-module]: ../../api/meta [id-transformer-api]: ../../api/preprocessing#sklego.preprocessing.identitytransformer.IdentityTransformer @@ -261,6 +269,7 @@ If these features are now passed to a model that supports monotonicity constrain [rbf-api]: ../../api/preprocessing#sklego.preprocessing.repeatingbasis.RepeatingBasisFunction [interval-encoder-api]: ../../api/preprocessing#sklego.preprocessing.intervalencoder.IntervalEncoder [decay-section]: ../../user-guide/meta#decayed-estimation +[outlier-remover-api]: ../../api/preprocessing#sklego.preprocessing.outlier_remover.OutlierRemover [formulaic-docs]: https://matthewwardrop.github.io/formulaic/ [formulaic-formulas]: https://matthewwardrop.github.io/formulaic/formulas/ diff --git a/sklego/mixture/bayesian_gmm_detector.py b/sklego/mixture/bayesian_gmm_detector.py index 331567064..5ce4452c0 100644 --- a/sklego/mixture/bayesian_gmm_detector.py +++ b/sklego/mixture/bayesian_gmm_detector.py @@ -12,7 +12,11 @@ class BayesianGMMOutlierDetector(OutlierMixin, BaseEstimator): """The `BayesianGMMOutlierDetector` trains a Bayesian Gaussian Mixture model on a dataset `X`. Once a density is trained we can evaluate the likelihood scores to see if it is deemed likely. - By giving a threshold this model might then label outliers if their likelihood score is too low. + By providing a `threshold` this model might then label outliers if their likelihood score is too low. + + !!! note + The parameters other than `threshold` and `method` are an exact copy of the parameters in + [sklearn.mixture.BayesianGaussianMixture]( https://scikit-learn.org/stable/modules/generated/sklearn.mixture.BayesianGaussianMixture.html). 
Parameters ---------- @@ -28,10 +32,6 @@ class BayesianGMMOutlierDetector(OutlierMixin, BaseEstimator): If you select `method="stddev"` then the threshold value represents the numbers of standard deviations before calling something an outlier. - !!! note - The other parameters are an exact copy of the parameters in - [sklearn.mixture.BayesianGaussianMixture]( https://scikit-learn.org/stable/modules/generated/sklearn.mixture.BayesianGaussianMixture.html). - Attributes ---------- gmm_ : BayesianGaussianMixture diff --git a/sklego/mixture/gmm_outlier_detector.py b/sklego/mixture/gmm_outlier_detector.py index b7339e729..dbefbac68 100644 --- a/sklego/mixture/gmm_outlier_detector.py +++ b/sklego/mixture/gmm_outlier_detector.py @@ -12,7 +12,11 @@ class GMMOutlierDetector(OutlierMixin, BaseEstimator): """The `GMMDetector` trains a Gaussian Mixture model on a dataset `X`. Once a density is trained we can evaluate the likelihood scores to see if it is deemed likely. - By giving a threshold this model might then label outliers if their likelihood score is too low. + By providing a `threshold` this model might then label outliers if their likelihood score is too low. + + !!! note + The parameters other than `threshold` and `method` are an exact copy of the parameters in + [sklearn.mixture.GaussianMixture]( https://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html). Parameters ---------- @@ -28,10 +32,6 @@ class GMMOutlierDetector(OutlierMixin, BaseEstimator): If you select `method="stddev"` then the threshold value represents the numbers of standard deviations before calling something an outlier. - !!! note - The other parameters are an exact copy of the parameters in - [sklearn.mixture.GaussianMixture]( https://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html). 
- Attributes ---------- gmm_ : GaussianMixture diff --git a/sklego/pandas_utils.py b/sklego/pandas_utils.py index a59abd733..d64323488 100644 --- a/sklego/pandas_utils.py +++ b/sklego/pandas_utils.py @@ -141,7 +141,8 @@ def log_step_extra( **log_func_kwargs: dict Keyword arguments to be passed to `log_functions` - Returns: + Returns + ------- Callable The decorated function. diff --git a/sklego/pipeline.py b/sklego/pipeline.py index 6677be293..9e2efb125 100644 --- a/sklego/pipeline.py +++ b/sklego/pipeline.py @@ -12,13 +12,6 @@ def default_log_callback(output, execution_time, **kwargs): """The default log callback which logs the step name, shape of the output and the execution time of the step. - Parameters - ---------- - output : tuple[np.ndarray | pd.DataFrame, estimator | transformer] - The output of the step and a step in the pipeline. - execution_time : float - The execution time of the step. - !!! info If you write your custom callback function the input is: @@ -31,6 +24,12 @@ def default_log_callback(output, execution_time, **kwargs): | `output` | T | The output of the function | | `execution_time` | float | The execution time of the step | + Parameters + ---------- + output : tuple[np.ndarray | pd.DataFrame, estimator | transformer] + The output of the step and a step in the pipeline. + execution_time : float + The execution time of the step. """ logger = logging.getLogger(__name__) step_result, step = output @@ -97,18 +96,11 @@ def _(self, func=None, *args, **kwargs): class DebugPipeline(Pipeline): """A pipeline that has a log statement in between each step, useful for debugging purposes. - Parameters - ---------- - log_callback : Callable | None, default=None - The callback function that logs information in between each intermediate step. - If set to `"default"`, `default_log_callback` is used. 
- - Notes - ----- See [`sklearn.pipeline.Pipeline`](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html#sklearn.pipeline.Pipeline) - for all other variables. + for all parameters other than `log_callback`. !!! note + This implementation is a hack on the original sklearn Pipeline. It aims to have the same behaviour as the original sklearn Pipeline, while changing minimal amount of code. @@ -124,6 +116,11 @@ class DebugPipeline(Pipeline): - The [`joblib.memory.Memory`](https://joblib.readthedocs.io/en/latest/generated/joblib.Memory.html) starts using a `_cache` method. + Parameters + ---------- + log_callback : Callable | None, default=None + The callback function that logs information in between each intermediate step. + If set to `"default"`, `default_log_callback` is used. Examples -------- diff --git a/sklego/preprocessing/outlier_remover.py b/sklego/preprocessing/outlier_remover.py index c073927e7..d3511c9e7 100644 --- a/sklego/preprocessing/outlier_remover.py +++ b/sklego/preprocessing/outlier_remover.py @@ -11,16 +11,36 @@ class OutlierRemover(TrainOnlyTransformerMixin, BaseEstimator): Parameters ---------- - outlier_detector : object + outlier_detector : scikit-learn compatible estimator An outlier detector that implements `.fit()` and `.predict()` methods. refit : bool, default=True - If True, fits the estimator during `pipeline.fit()`. If False, the estimator is not fitted during - `pipeline.fit()`. + Whether or not to fit the underlying estimator during `OutlierRemover(...).fit()`. Attributes ---------- estimator_ : object The fitted outlier detector. 
+ + Example + ------- + ```py + import numpy as np + + from sklearn.ensemble import IsolationForest + from sklego.preprocessing import OutlierRemover + + np.random.seed(0) + X = np.random.randn(10000, 2) + + isolation_forest = IsolationForest() + isolation_forest.fit(X) + detector_preds = isolation_forest.predict(X) + + outlier_remover = OutlierRemover(isolation_forest, refit=True) + outlier_remover.fit(X) + + X_trans = outlier_remover.transform_train(X) + ``` """ def __init__(self, outlier_detector, refit=True): @@ -42,23 +62,6 @@ def fit(self, X, y=None): ------- self : OutlierRemover The fitted transformer. - - Example - ------- - ```py - from sklego.preprocessing import OutlierRemover - from sklearn.ensemble import IsolationForest - - np.random.seed(0) - X = np.random.randn(10000, 2) - - isolation_forest = IsolationForest() - isolation_forest.fit(X) - detector_preds = isolator_forest.predict(X) - - outlier_remover = OutlierRemover(isolation_forest, refit=True) - outlier_remover.fit(X) - ``` """ self.estimator_ = clone(self.outlier_detector) if self.refit: @@ -78,23 +81,6 @@ def transform_train(self, X): ------- np.ndarray of shape (n_not_outliers, n_features) The data with the outliers removed, where `n_not_outliers = n_samples - n_outliers`. - Example - ------- - ```py - from sklego.preprocessing import OutlierRemover - from sklearn.ensemble import IsolationForest - - np.random.seed(0) - X = np.random.randn(10000, 2) - - isolation_forest = IsolationForest() - isolation_forest.fit(X) - detector_preds = isolator_forest.predict(X) - - outlier_remover = OutlierRemover(isolation_forest, refit=True) - outlier_remover.fit(X) - X_trans = outlier_remover.transform_train(X) - ``` """ check_is_fitted(self, "estimator_") predictions = self.estimator_.predict(X)