diff --git a/python/cuml/cuml/preprocessing/TargetEncoder.py b/python/cuml/cuml/preprocessing/TargetEncoder.py index 5b2af8457f..f069a49c9a 100644 --- a/python/cuml/cuml/preprocessing/TargetEncoder.py +++ b/python/cuml/cuml/preprocessing/TargetEncoder.py @@ -20,9 +20,6 @@ from cuml.internals.outputs import reflect from cuml.internals.validation import check_features, check_is_fitted -# Module-level flag to ensure deprecation warning only fires once per process -_COMBINATION_MODE_1D_WARNING_SHOWN = False - def get_stat_func(stat): def func(ds): @@ -165,8 +162,6 @@ class TargetEncoder(Base, InteropMixin): -------- Converting a categorical implementation to a numerical one - >>> import warnings - >>> warnings.filterwarnings('ignore', category=FutureWarning) >>> from cudf import DataFrame, Series >>> from cuml.preprocessing import TargetEncoder >>> train = DataFrame({'category': ['a', 'b', 'b', 'a'], @@ -174,12 +169,12 @@ class TargetEncoder(Base, InteropMixin): >>> test = DataFrame({'category': ['a', 'c', 'b', 'a']}) >>> encoder = TargetEncoder(output_type='numpy') - >>> train_encoded = encoder.fit_transform(train[["category"]], train.label) - >>> test_encoded = encoder.transform(test[["category"]]) - >>> print(train_encoded) - [1. 1. 0. 1.] - >>> print(test_encoded) - [1. 0.75 0.5 1. ] + >>> encoded = encoder.fit_transform(train[["category"]], train.label) + >>> encoded + array([[1.], + [1.], + [0.], + [1.]]) """ # InteropMixin requirements @@ -792,36 +787,13 @@ def _is_train_df(self, df): def _impute_and_sort(self, df): """ Impute and sort the result encoding in the same row order as input. - - Returns 2D array (n_samples, 1) when in independent mode (sklearn - compatibility), otherwise returns 1D array (cuML native behavior). """ df[self.out_col] = df[self.out_col].nans_to_nulls() df[self.out_col] = df[self.out_col].fillna(self.y_stat_val) df = df.sort_values(self.id_col) res = df[self.out_col].values.copy() - # Reshape to 2D (n_samples, 1) for sklearn compatibility when: - # - multi_feature_mode="independent" is set (by cuml.accel or user) - # - _independent_mode_fitted is True (multi-feature independent mode) - sklearn_compat = getattr( - self, "multi_feature_mode", "combination" - ) == "independent" or getattr(self, "_independent_mode_fitted", False) - if sklearn_compat: - res = res.reshape(-1, 1) - else: - # Deprecation warning for 1D output in combination mode (once per process) - global _COMBINATION_MODE_1D_WARNING_SHOWN - if not _COMBINATION_MODE_1D_WARNING_SHOWN: - warnings.warn( - "TargetEncoder currently returns 1D output for combination mode " - "(multi_feature_mode='combination'). In version 26.04, the output " - "will change to 2D (n_samples, n_output_features) for consistency " - "with sklearn. Use .ravel() if you need 1D output.", - FutureWarning, - stacklevel=4, - ) - _COMBINATION_MODE_1D_WARNING_SHOWN = True + res = res.reshape(-1, 1) return CumlArray(res) def _data_with_strings_to_cudf_dataframe(self, x): diff --git a/python/cuml/tests/test_target_encoder.py b/python/cuml/tests/test_target_encoder.py index ab1aba1489..c9a1694c07 100644 --- a/python/cuml/tests/test_target_encoder.py +++ b/python/cuml/tests/test_target_encoder.py @@ -15,6 +15,10 @@ "ignore:TargetEncoder currently returns 1D output:FutureWarning" ) +# TODO: many of these tests use `output_type="numpy"` to work around +# https://github.com/rapidsai/cuml/issues/7893. These can be +# reverted once that's resolved. + def test_targetencoder_deprecated_1d_input(): df = cudf.DataFrame( @@ -22,14 +26,14 @@ def test_targetencoder_deprecated_1d_input(): ) # Warns in fit_transform - encoder = TargetEncoder() + encoder = TargetEncoder(output_type="numpy") with pytest.warns(FutureWarning, match="non-2-dimensional X"): encoded = encoder.fit_transform(df.category, df.label) - answer = np.array([1.0, 1.0, 0.0, 1.0]) + answer = np.array([1.0, 1.0, 0.0, 1.0])[:, None] assert array_equal(encoded, answer) # Warns in fit - encoder = TargetEncoder() + encoder = TargetEncoder(output_type="numpy") with pytest.warns(FutureWarning, match="non-2-dimensional X"): encoder.fit(df.category, df.label) @@ -42,12 +46,12 @@ def test_targetencoder_deprecated_1d_input(): def test_targetencoder_fit_transform(): train = cudf.DataFrame({"category": ["a", "b", "b", "a"]}) label = cudf.Series([1, 0, 1, 1]) - encoder = TargetEncoder() + encoder = TargetEncoder(output_type="numpy") train_encoded = encoder.fit_transform(train, label) - answer = np.array([1.0, 1.0, 0.0, 1.0]) + answer = np.array([1.0, 1.0, 0.0, 1.0])[:, None] assert array_equal(train_encoded, answer) - encoder = TargetEncoder() + encoder = TargetEncoder(output_type="numpy") encoder.fit(train, label) train_encoded = encoder.transform(train) @@ -58,13 +62,13 @@ def test_targetencoder_transform(): train = cudf.DataFrame({"category": ["a", "b", "b", "a"]}) label = cudf.Series([1, 0, 1, 1]) test = cudf.DataFrame({"category": ["b", "b", "a", "b"]}) - encoder = TargetEncoder() + encoder = TargetEncoder(output_type="numpy") encoder.fit_transform(train, label) test_encoded = encoder.transform(test) - answer = np.array([0.5, 0.5, 1.0, 0.5]) + answer = np.array([0.5, 0.5, 1.0, 0.5])[:, None] assert array_equal(test_encoded, answer) - encoder = TargetEncoder() + encoder = TargetEncoder(output_type="numpy") encoder.fit(train, label) test_encoded = encoder.transform(test) assert array_equal(test_encoded, answer) @@ -88,7 +92,7 @@ def test_targetencoder_random(n_samples, dtype, stat): df_test["row_id"] = cp.arange(len(df_test)) df_test = df_test.merge(dg, on="x", how="left") df_test = df_test.sort_values("row_id") - answer = df_test["y"].fillna(eval(f"cp.{stat}")(y).item()).values + answer = df_test["y"].fillna(getattr(cp, stat)(y).item()).values[:, None] assert array_equal(test_encoded, answer) @@ -106,17 +110,19 @@ def test_targetencoder_multi_column(): test = cudf.DataFrame( {"cat_1": ["b", "b", "a", "b"], "cat_2": [1, 2, 1, 2]} ) - encoder = TargetEncoder() + encoder = TargetEncoder(output_type="numpy") train_encoded = encoder.fit_transform( train[["cat_1", "cat_2"]], train.label ) test_encoded = encoder.transform(test[["cat_1", "cat_2"]]) - train_answer = np.array([2.0 / 3, 2.0 / 3, 1.0, 2.0 / 3, 2.0 / 3, 1.0]) - test_answer = np.array([0.0, 1.0, 0.5, 1.0]) + train_answer = np.array([2.0 / 3, 2.0 / 3, 1.0, 2.0 / 3, 2.0 / 3, 1.0])[ + :, None + ] + test_answer = np.array([0.0, 1.0, 0.5, 1.0])[:, None] assert array_equal(train_encoded, train_answer) assert array_equal(test_encoded, test_answer) - encoder = TargetEncoder() + encoder = TargetEncoder(output_type="numpy") encoder.fit(train[["cat_1", "cat_2"]], train.label) train_encoded = encoder.transform(train[["cat_1", "cat_2"]]) test_encoded = encoder.transform(test[["cat_1", "cat_2"]]) @@ -132,13 +138,13 @@ def test_targetencoder_newly_encountered(): train = cudf.DataFrame({"category": ["a", "b", "b", "a"]}) label = cudf.Series([1, 0, 1, 1]) test = cudf.DataFrame({"category": ["c", "b", "a", "d"]}) - encoder = TargetEncoder() + encoder = TargetEncoder(output_type="numpy") encoder.fit_transform(train, label) test_encoded = encoder.transform(test) - answer = np.array([0.75, 0.5, 1.0, 0.75]) + answer = np.array([0.75, 0.5, 1.0, 0.75])[:, None] assert array_equal(test_encoded, answer) - encoder = TargetEncoder() + encoder = TargetEncoder(output_type="numpy") encoder.fit(train, label) test_encoded = encoder.transform(test) assert array_equal(test_encoded, answer) @@ -149,13 +155,13 @@ def test_one_category(): label = cudf.Series([3, 0, 0, 3]) test = cudf.DataFrame({"category": ["c", "b", "a", "d"]}) - encoder = TargetEncoder() + encoder = TargetEncoder(output_type="numpy") train_encoded = encoder.fit_transform(train, label) - answer = np.array([1.0, 2.0, 2.0, 1.0]) + answer = np.array([1.0, 2.0, 2.0, 1.0])[:, None] assert array_equal(train_encoded, answer) test_encoded = encoder.transform(test) - answer = np.array([1.5, 1.5, 1.5, 1.5]) + answer = np.array([1.5, 1.5, 1.5, 1.5])[:, None] assert array_equal(test_encoded, answer) @@ -167,12 +173,11 @@ def test_targetencoder_pandas(): train = pandas.DataFrame({"category": ["a", "b", "b", "a"]}) label = pandas.Series([1, 0, 1, 1]) test = pandas.DataFrame({"category": ["c", "b", "a", "d"]}) - encoder = TargetEncoder() + encoder = TargetEncoder(output_type="numpy") encoder.fit_transform(train, label) test_encoded = encoder.transform(test) - answer = np.array([0.75, 0.5, 1.0, 0.75]) + answer = np.array([0.75, 0.5, 1.0, 0.75])[:, None] assert array_equal(test_encoded, answer) - assert isinstance(test_encoded, pandas.Series) def test_targetencoder_numpy(): @@ -186,7 +191,7 @@ def test_targetencoder_numpy(): encoder = TargetEncoder() encoder.fit_transform(x_train, y_train) test_encoded = encoder.transform(x_test) - answer = np.array([1.0, 0.5, 0.75, 0.75]) + answer = np.array([1.0, 0.5, 0.75, 0.75])[:, None] assert array_equal(test_encoded, answer) assert isinstance(test_encoded, np.ndarray) @@ -202,7 +207,7 @@ def test_targetencoder_cupy(): encoder = TargetEncoder() encoder.fit_transform(x_train, y_train) test_encoded = encoder.transform(x_test) - answer = np.array([1.0, 0.5, 0.75, 0.75]) + answer = np.array([1.0, 0.5, 0.75, 0.75])[:, None] assert array_equal(test_encoded, answer) assert isinstance(test_encoded, cp.ndarray) @@ -241,12 +246,12 @@ def test_targetencoder_customized_fold_id(): train = cudf.DataFrame({"category": ["a", "b", "b", "a"]}) label = cudf.Series([1, 0, 1, 1]) fold_ids = [0, 1, 1, 2] - encoder = TargetEncoder(split_method="customize") + encoder = TargetEncoder(split_method="customize", output_type="numpy") train_encoded = encoder.fit_transform(train, label, fold_ids=fold_ids) - answer = np.array([1.0, 0.75, 0.75, 1.0]) + answer = np.array([1.0, 0.75, 0.75, 1.0])[:, None] assert array_equal(train_encoded, answer) - encoder = TargetEncoder(split_method="customize") + encoder = TargetEncoder(split_method="customize", output_type="numpy") encoder.fit(train, label, fold_ids=fold_ids) train_encoded = encoder.transform(train) @@ -256,12 +261,12 @@ def test_targetencoder_customized_fold_id(): def test_targetencoder_var(): train = cudf.DataFrame({"category": ["a", "b", "b", "b"]}) label = cudf.Series([1, 0, 1, 1]) - encoder = TargetEncoder(stat="var") + encoder = TargetEncoder(stat="var", output_type="numpy") train_encoded = encoder.fit_transform(train, label) - answer = np.array([0.25, 0.0, 0.5, 0.5]) + answer = np.array([0.25, 0.0, 0.5, 0.5])[:, None] assert array_equal(train_encoded, answer) - encoder = TargetEncoder(stat="var") + encoder = TargetEncoder(stat="var", output_type="numpy") encoder.fit(train, label) train_encoded = encoder.transform(train) @@ -276,11 +281,11 @@ def test_transform_with_index(): X = df[["a"]] y = df["b"] - t_enc = TargetEncoder() + t_enc = TargetEncoder(output_type="numpy") t_enc.fit(X, y) train_encoded = t_enc.transform(X) - ans = cp.asarray([0, 1, 0.5, 0.5]) + ans = cp.asarray([0, 1, 0.5, 0.5])[:, None] assert array_equal(train_encoded, ans) @@ -302,12 +307,14 @@ def test_targetencoder_median(): {"category": ["a", "a", "a", "a", "b", "b", "b", "b"]} ) label = cudf.Series([1, 22, 15, 17, 70, 9, 99, 56]) - encoder = TargetEncoder(stat="median") + encoder = TargetEncoder(stat="median", output_type="numpy") train_encoded = encoder.fit_transform(train, label) - answer = np.array([17.0, 15.0, 17.0, 15.0, 56.0, 70.0, 56.0, 70.0]) + answer = np.array([17.0, 15.0, 17.0, 15.0, 56.0, 70.0, 56.0, 70.0])[ + :, None + ] assert array_equal(train_encoded, answer) - encoder = TargetEncoder(stat="median") + encoder = TargetEncoder(stat="median", output_type="numpy") encoder.fit(train, label) train_encoded = encoder.transform(train)