Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 7 additions & 35 deletions python/cuml/cuml/preprocessing/TargetEncoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,6 @@
from cuml.internals.outputs import reflect
from cuml.internals.validation import check_features, check_is_fitted

# Module-level flag to ensure deprecation warning only fires once per process
_COMBINATION_MODE_1D_WARNING_SHOWN = False


def get_stat_func(stat):
def func(ds):
Expand Down Expand Up @@ -165,21 +162,19 @@ class TargetEncoder(Base, InteropMixin):
--------
Converting a categorical feature to a numerical one

>>> import warnings
>>> warnings.filterwarnings('ignore', category=FutureWarning)
>>> from cudf import DataFrame, Series
>>> from cuml.preprocessing import TargetEncoder
>>> train = DataFrame({'category': ['a', 'b', 'b', 'a'],
... 'label': [1, 0, 1, 1]})
>>> test = DataFrame({'category': ['a', 'c', 'b', 'a']})

>>> encoder = TargetEncoder(output_type='numpy')
>>> train_encoded = encoder.fit_transform(train[["category"]], train.label)
>>> test_encoded = encoder.transform(test[["category"]])
>>> print(train_encoded)
[1. 1. 0. 1.]
>>> print(test_encoded)
[1. 0.75 0.5 1. ]
>>> encoded = encoder.fit_transform(train[["category"]], train.label)
>>> encoded
array([[1.],
[1.],
[0.],
[1.]])
"""

# InteropMixin requirements
Expand Down Expand Up @@ -792,36 +787,13 @@ def _is_train_df(self, df):
def _impute_and_sort(self, df):
"""
Impute and sort the result encoding in the same row order as input.

Returns 2D array (n_samples, 1) when in independent mode (sklearn
compatibility), otherwise returns 1D array (cuML native behavior).
"""

df[self.out_col] = df[self.out_col].nans_to_nulls()
df[self.out_col] = df[self.out_col].fillna(self.y_stat_val)
df = df.sort_values(self.id_col)
res = df[self.out_col].values.copy()
# Reshape to 2D (n_samples, 1) for sklearn compatibility when:
# - multi_feature_mode="independent" is set (by cuml.accel or user)
# - _independent_mode_fitted is True (multi-feature independent mode)
sklearn_compat = getattr(
self, "multi_feature_mode", "combination"
) == "independent" or getattr(self, "_independent_mode_fitted", False)
if sklearn_compat:
res = res.reshape(-1, 1)
else:
# Deprecation warning for 1D output in combination mode (once per process)
global _COMBINATION_MODE_1D_WARNING_SHOWN
if not _COMBINATION_MODE_1D_WARNING_SHOWN:
warnings.warn(
"TargetEncoder currently returns 1D output for combination mode "
"(multi_feature_mode='combination'). In version 26.04, the output "
"will change to 2D (n_samples, n_output_features) for consistency "
"with sklearn. Use .ravel() if you need 1D output.",
FutureWarning,
stacklevel=4,
)
_COMBINATION_MODE_1D_WARNING_SHOWN = True
res = res.reshape(-1, 1)
return CumlArray(res)

def _data_with_strings_to_cudf_dataframe(self, x):
Expand Down
79 changes: 43 additions & 36 deletions python/cuml/tests/test_target_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,21 +15,25 @@
"ignore:TargetEncoder currently returns 1D output:FutureWarning"
)

# TODO: many of these tests use `output_type="numpy"` to work around
# https://github.com/rapidsai/cuml/issues/7893. These can be
# reverted once that's resolved.


def test_targetencoder_deprecated_1d_input():
df = cudf.DataFrame(
{"category": ["a", "b", "b", "a"], "label": [1, 0, 1, 1]}
)

# Warns in fit_transform
encoder = TargetEncoder()
encoder = TargetEncoder(output_type="numpy")
Comment thread
jcrist marked this conversation as resolved.
with pytest.warns(FutureWarning, match="non-2-dimensional X"):
encoded = encoder.fit_transform(df.category, df.label)
answer = np.array([1.0, 1.0, 0.0, 1.0])
answer = np.array([1.0, 1.0, 0.0, 1.0])[:, None]
assert array_equal(encoded, answer)

# Warns in fit
encoder = TargetEncoder()
encoder = TargetEncoder(output_type="numpy")
with pytest.warns(FutureWarning, match="non-2-dimensional X"):
encoder.fit(df.category, df.label)

Expand All @@ -42,12 +46,12 @@ def test_targetencoder_deprecated_1d_input():
def test_targetencoder_fit_transform():
train = cudf.DataFrame({"category": ["a", "b", "b", "a"]})
label = cudf.Series([1, 0, 1, 1])
encoder = TargetEncoder()
encoder = TargetEncoder(output_type="numpy")
train_encoded = encoder.fit_transform(train, label)
answer = np.array([1.0, 1.0, 0.0, 1.0])
answer = np.array([1.0, 1.0, 0.0, 1.0])[:, None]
assert array_equal(train_encoded, answer)

encoder = TargetEncoder()
encoder = TargetEncoder(output_type="numpy")
encoder.fit(train, label)
train_encoded = encoder.transform(train)

Expand All @@ -58,13 +62,13 @@ def test_targetencoder_transform():
train = cudf.DataFrame({"category": ["a", "b", "b", "a"]})
label = cudf.Series([1, 0, 1, 1])
test = cudf.DataFrame({"category": ["b", "b", "a", "b"]})
encoder = TargetEncoder()
encoder = TargetEncoder(output_type="numpy")
encoder.fit_transform(train, label)
test_encoded = encoder.transform(test)
answer = np.array([0.5, 0.5, 1.0, 0.5])
answer = np.array([0.5, 0.5, 1.0, 0.5])[:, None]
assert array_equal(test_encoded, answer)

encoder = TargetEncoder()
encoder = TargetEncoder(output_type="numpy")
encoder.fit(train, label)
test_encoded = encoder.transform(test)
assert array_equal(test_encoded, answer)
Expand All @@ -88,7 +92,7 @@ def test_targetencoder_random(n_samples, dtype, stat):
df_test["row_id"] = cp.arange(len(df_test))
df_test = df_test.merge(dg, on="x", how="left")
df_test = df_test.sort_values("row_id")
answer = df_test["y"].fillna(eval(f"cp.{stat}")(y).item()).values
answer = df_test["y"].fillna(getattr(cp, stat)(y).item()).values[:, None]
assert array_equal(test_encoded, answer)


Expand All @@ -106,17 +110,19 @@ def test_targetencoder_multi_column():
test = cudf.DataFrame(
{"cat_1": ["b", "b", "a", "b"], "cat_2": [1, 2, 1, 2]}
)
encoder = TargetEncoder()
encoder = TargetEncoder(output_type="numpy")
train_encoded = encoder.fit_transform(
train[["cat_1", "cat_2"]], train.label
)
test_encoded = encoder.transform(test[["cat_1", "cat_2"]])
train_answer = np.array([2.0 / 3, 2.0 / 3, 1.0, 2.0 / 3, 2.0 / 3, 1.0])
test_answer = np.array([0.0, 1.0, 0.5, 1.0])
train_answer = np.array([2.0 / 3, 2.0 / 3, 1.0, 2.0 / 3, 2.0 / 3, 1.0])[
:, None
]
test_answer = np.array([0.0, 1.0, 0.5, 1.0])[:, None]
assert array_equal(train_encoded, train_answer)
assert array_equal(test_encoded, test_answer)

encoder = TargetEncoder()
encoder = TargetEncoder(output_type="numpy")
encoder.fit(train[["cat_1", "cat_2"]], train.label)
train_encoded = encoder.transform(train[["cat_1", "cat_2"]])
test_encoded = encoder.transform(test[["cat_1", "cat_2"]])
Expand All @@ -132,13 +138,13 @@ def test_targetencoder_newly_encountered():
train = cudf.DataFrame({"category": ["a", "b", "b", "a"]})
label = cudf.Series([1, 0, 1, 1])
test = cudf.DataFrame({"category": ["c", "b", "a", "d"]})
encoder = TargetEncoder()
encoder = TargetEncoder(output_type="numpy")
encoder.fit_transform(train, label)
test_encoded = encoder.transform(test)
answer = np.array([0.75, 0.5, 1.0, 0.75])
answer = np.array([0.75, 0.5, 1.0, 0.75])[:, None]
assert array_equal(test_encoded, answer)

encoder = TargetEncoder()
encoder = TargetEncoder(output_type="numpy")
encoder.fit(train, label)
test_encoded = encoder.transform(test)
assert array_equal(test_encoded, answer)
Expand All @@ -149,13 +155,13 @@ def test_one_category():
label = cudf.Series([3, 0, 0, 3])
test = cudf.DataFrame({"category": ["c", "b", "a", "d"]})

encoder = TargetEncoder()
encoder = TargetEncoder(output_type="numpy")
train_encoded = encoder.fit_transform(train, label)
answer = np.array([1.0, 2.0, 2.0, 1.0])
answer = np.array([1.0, 2.0, 2.0, 1.0])[:, None]
assert array_equal(train_encoded, answer)

test_encoded = encoder.transform(test)
answer = np.array([1.5, 1.5, 1.5, 1.5])
answer = np.array([1.5, 1.5, 1.5, 1.5])[:, None]
assert array_equal(test_encoded, answer)


Expand All @@ -167,12 +173,11 @@ def test_targetencoder_pandas():
train = pandas.DataFrame({"category": ["a", "b", "b", "a"]})
label = pandas.Series([1, 0, 1, 1])
test = pandas.DataFrame({"category": ["c", "b", "a", "d"]})
encoder = TargetEncoder()
encoder = TargetEncoder(output_type="numpy")
encoder.fit_transform(train, label)
test_encoded = encoder.transform(test)
answer = np.array([0.75, 0.5, 1.0, 0.75])
answer = np.array([0.75, 0.5, 1.0, 0.75])[:, None]
assert array_equal(test_encoded, answer)
assert isinstance(test_encoded, pandas.Series)


def test_targetencoder_numpy():
Expand All @@ -186,7 +191,7 @@ def test_targetencoder_numpy():
encoder = TargetEncoder()
encoder.fit_transform(x_train, y_train)
test_encoded = encoder.transform(x_test)
answer = np.array([1.0, 0.5, 0.75, 0.75])
answer = np.array([1.0, 0.5, 0.75, 0.75])[:, None]
assert array_equal(test_encoded, answer)
assert isinstance(test_encoded, np.ndarray)

Expand All @@ -202,7 +207,7 @@ def test_targetencoder_cupy():
encoder = TargetEncoder()
encoder.fit_transform(x_train, y_train)
test_encoded = encoder.transform(x_test)
answer = np.array([1.0, 0.5, 0.75, 0.75])
answer = np.array([1.0, 0.5, 0.75, 0.75])[:, None]
assert array_equal(test_encoded, answer)
assert isinstance(test_encoded, cp.ndarray)

Expand Down Expand Up @@ -241,12 +246,12 @@ def test_targetencoder_customized_fold_id():
train = cudf.DataFrame({"category": ["a", "b", "b", "a"]})
label = cudf.Series([1, 0, 1, 1])
fold_ids = [0, 1, 1, 2]
encoder = TargetEncoder(split_method="customize")
encoder = TargetEncoder(split_method="customize", output_type="numpy")
train_encoded = encoder.fit_transform(train, label, fold_ids=fold_ids)
answer = np.array([1.0, 0.75, 0.75, 1.0])
answer = np.array([1.0, 0.75, 0.75, 1.0])[:, None]
assert array_equal(train_encoded, answer)

encoder = TargetEncoder(split_method="customize")
encoder = TargetEncoder(split_method="customize", output_type="numpy")
encoder.fit(train, label, fold_ids=fold_ids)
train_encoded = encoder.transform(train)

Expand All @@ -256,12 +261,12 @@ def test_targetencoder_customized_fold_id():
def test_targetencoder_var():
train = cudf.DataFrame({"category": ["a", "b", "b", "b"]})
label = cudf.Series([1, 0, 1, 1])
encoder = TargetEncoder(stat="var")
encoder = TargetEncoder(stat="var", output_type="numpy")
train_encoded = encoder.fit_transform(train, label)
answer = np.array([0.25, 0.0, 0.5, 0.5])
answer = np.array([0.25, 0.0, 0.5, 0.5])[:, None]
assert array_equal(train_encoded, answer)

encoder = TargetEncoder(stat="var")
encoder = TargetEncoder(stat="var", output_type="numpy")
encoder.fit(train, label)
train_encoded = encoder.transform(train)

Expand All @@ -276,11 +281,11 @@ def test_transform_with_index():
X = df[["a"]]
y = df["b"]

t_enc = TargetEncoder()
t_enc = TargetEncoder(output_type="numpy")

t_enc.fit(X, y)
train_encoded = t_enc.transform(X)
ans = cp.asarray([0, 1, 0.5, 0.5])
ans = cp.asarray([0, 1, 0.5, 0.5])[:, None]
assert array_equal(train_encoded, ans)


Expand All @@ -302,12 +307,14 @@ def test_targetencoder_median():
{"category": ["a", "a", "a", "a", "b", "b", "b", "b"]}
)
label = cudf.Series([1, 22, 15, 17, 70, 9, 99, 56])
encoder = TargetEncoder(stat="median")
encoder = TargetEncoder(stat="median", output_type="numpy")
train_encoded = encoder.fit_transform(train, label)
answer = np.array([17.0, 15.0, 17.0, 15.0, 56.0, 70.0, 56.0, 70.0])
answer = np.array([17.0, 15.0, 17.0, 15.0, 56.0, 70.0, 56.0, 70.0])[
:, None
]
assert array_equal(train_encoded, answer)

encoder = TargetEncoder(stat="median")
encoder = TargetEncoder(stat="median", output_type="numpy")
encoder.fit(train, label)
train_encoded = encoder.transform(train)

Expand Down
Loading