Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 57 additions & 2 deletions sklego/preprocessing/repeatingbasis.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,10 @@ def __init__(self, column=0, remainder="drop", n_periods=12, input_range=None, w
self.n_periods = n_periods
self.input_range = input_range
self.width = width

def get_feature_names_out(self, input_features=None):
feature_names = self.pipeline_.get_feature_names_out()
return feature_names

def fit(self, X, y=None):
"""Fit `RepeatingBasisFunction` transformer on input data `X`.
Expand All @@ -82,6 +86,7 @@ def fit(self, X, y=None):
self : RepeatingBasisFunction
The fitted transformer.
"""

self.pipeline_ = ColumnTransformer(
[
(
Expand All @@ -95,6 +100,7 @@ def fit(self, X, y=None):
)
],
remainder=self.remainder,
verbose_feature_names_out = False
)

self.pipeline_.fit(X, y)
Expand All @@ -116,6 +122,31 @@ def transform(self, X):
"""
check_is_fitted(self, ["pipeline_"])
return self.pipeline_.transform(X)

def inverse_transform(self, X):
"""Transform RBF features back to the input range. Outputs a numpy array.

Comment on lines +126 to +128
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we add what you mention in the description in the docstring Notes of the method?

In particular I am referring to:

Note that the transformation is only invertible if the original values are in the input_range (upper bound excluded). Otherwise, the reconstructed values are only equal modulo the width of the range.

Parameters
----------
X : 2D array-like of shape (n_samples, n_features)
Can be either a pandas.DataFrame or a numpy array.
The RBF columns to transform back must appear first.

Returns
-------
X_transformed : array-like with the reconstruction of the original values
in input_range in the first column. Will be of the same type as X.
"""

check_is_fitted(self, ["pipeline_"])

if isinstance(X,np.ndarray):
Xarr = check_array(X[:,:self.n_periods], estimator=self, ensure_2d=True)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would be more comfortable with something along the following lines:

If line 144 becomes

Xarr = check_array(X, estimator=self, ensure_2d=True, ensure_min_features=self.n_periods)[:, :self.n_periods]

which also convert to array (some) dataframe-like objects and maybe it's even possible to avoid checking for array instance.

new_x = self.pipeline_.named_transformers_['repeatingbasis'].inverse_transform(Xarr)
Xarr = np.hstack((new_x.reshape(-1, 1),X[:,self.n_periods:]))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are we returning the input as well? kind of related to the comment in tests

return Xarr
else:
raise TypeError("X must be a numpy array.")


class _RepeatingBasisFunction(TransformerMixin, BaseEstimator):
Expand Down Expand Up @@ -147,7 +178,10 @@ def __init__(self, n_periods: int = 12, input_range=None, width: float = 1.0):
self.n_periods = n_periods
self.input_range = input_range
self.width = width


def get_feature_names_out(self, input_features=None):
return [f"{input_features[0]}_rbf{str(i)}" for i in range(self.n_periods)]

def fit(self, X, y=None):
"""Fit the transformer to the input data and compute the basis functions.

Expand All @@ -163,6 +197,7 @@ def fit(self, X, y=None):
self : _RepeatingBasisFunction
The fitted transformer.
"""

X = check_array(X, estimator=self)

# find min and max for standardization if not given explicitly
Expand Down Expand Up @@ -208,7 +243,27 @@ def transform(self, X):

# apply rbf function to series for each basis
return self._rbf(base_distances)


def inverse_transform(self, X):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is awesome ✨

"""Transform features back to the input range.
"""

X = check_array(X, estimator=self, ensure_2d=True)
check_is_fitted(self, ["bases_", "width_"])
if X.shape[1] != self.n_periods:
raise ValueError(f"X should have exactly one column for each period, it has: {X.shape[1]}")
# Convert back to distances:
X = self.width_*np.sqrt(-np.log(X))
# Find closest base for each row:
min_col_indices = np.argmin(X, axis=1)
# Find direction by comparing distance to next and previous base (modulo circularity)
directions = np.sign([X[i,(min_col_indices[i] - 1) % self.n_periods]-X[i,(min_col_indices[i] + 1) % self.n_periods] for i in range(X.shape[0])])
# Retrieve original value
Y = self.bases_[min_col_indices] + directions*[X[i,min_col_indices[i]] for i in range(X.shape[0])]
Y[Y < 0] += 1
Y = (self.input_range[1] - self.input_range[0])*Y + self.input_range[0]
return Y

def _array_base_distance(self, arr: np.ndarray, base: float) -> np.ndarray:
"""Calculate the distances between all array values and the base, where 0 and 1 are assumed to be at the same
positions
Expand Down
17 changes: 17 additions & 0 deletions tests/test_preprocessing/test_repeatingbasisfunction.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,3 +50,20 @@ def test_when_rbf_helper_receives_more_than_one_col_raises_value_error(df):
rbf_helper_tf = _RepeatingBasisFunction()
with pytest.raises(ValueError):
rbf_helper_tf.fit(X, y)

def test_pandas_output(df):
X, y = df[["a", "b", "c", "d"]], df[["e"]]
tf = RepeatingBasisFunction(column=0, n_periods=4, remainder="passthrough")
tf.set_output(transform='pandas')
Z = tf.fit(X, y).transform(X)
assert isinstance(Z, pd.core.frame.DataFrame)

def test_inverse_transform(df):
X, y = df[["a", "b", "c", "d"]], df[["e"]]
tf = RepeatingBasisFunction(column=0, n_periods=4, input_range=(0,7), remainder="drop")
Z = tf.fit(X, y).transform(X)
assert np.allclose(
X["a"],
tf.pipeline_.named_transformers_['repeatingbasis'].inverse_transform(Z),
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This feels a bit too nested to me.

Why don't we

Suggested change
tf.pipeline_.named_transformers_['repeatingbasis'].inverse_transform(Z),
tf.inverse_transform(Z),

?

They should suppose to return the same or am I missing something?

rtol=1e-08,
atol=1e-12)