Skip to content
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -217,8 +217,9 @@ CUML_KERNEL void excess_sample_with_replacement_kernel(
// compute the mask
// compute the adjacent differences according to the functor
// TODO: Replace deprecated 'FlagHeads' with 'SubtractLeft' when it is available
// Use -1 as the initial value since it can't match any valid column index [0, n-1]
BlockAdjacentDifferenceT(temp_storage.diff)
.SubtractLeft(items, mask, CustomDifference<IdxT>(), mask[0]);
.SubtractLeft(items, mask, CustomDifference<IdxT>(), IdxT(-1));
Comment on lines +220 to +222
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This appears correct to me. The previous implementation was comparing the first randomly selected column index against the initial value of mask[0], which is always zero. Aside from the fact that comparing against a mask value makes no sense here, this also means column 0 would never be selected, because the items array is sorted.


__syncthreads();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,10 @@
- "sklearn.feature_selection.tests.test_rfe::test_rfe_cv_groups"
- "sklearn.feature_selection.tests.test_rfe::test_rfe_features_importance"
- "sklearn.inspection.tests.test_partial_dependence::test_recursion_decision_tree_vs_forest_and_gbdt[0]"
- "sklearn.inspection.tests.test_permutation_importance::test_permutation_importance_correlated_feature_regression_pandas[0.5-1]"
- "sklearn.inspection.tests.test_permutation_importance::test_permutation_importance_correlated_feature_regression_pandas[0.5-2]"
- "sklearn.inspection.tests.test_permutation_importance::test_permutation_importance_correlated_feature_regression_pandas[1.0-1]"
- "sklearn.inspection.tests.test_permutation_importance::test_permutation_importance_correlated_feature_regression_pandas[1.0-2]"
- "sklearn.inspection.tests.test_permutation_importance::test_robustness_to_high_cardinality_noisy_feature[0.5-1]"
- "sklearn.inspection.tests.test_permutation_importance::test_robustness_to_high_cardinality_noisy_feature[0.5-2]"
- "sklearn.inspection.tests.test_permutation_importance::test_robustness_to_high_cardinality_noisy_feature[1.0-1]"
Expand Down
32 changes: 16 additions & 16 deletions python/cuml/tests/explainer/test_explainer_kernel_shap.py
Original file line number Diff line number Diff line change
Expand Up @@ -519,24 +519,24 @@ def test_typeerror_input():
housing_regression_result = np.array(
[
[
-0.00182223,
-0.01232004,
-0.4782278,
0.04781425,
-0.01337761,
-0.34830606,
-0.4682865,
-0.20812261,
-0.8974524140357971,
0.001421511173248291,
-0.0688888430595398,
-0.03094351291656494,
-0.015949785709381104,
-0.23235774040222168,
-0.21568483114242554,
-0.0710676908493042,
],
[
-0.0013606,
0.0110372,
-0.445176,
-0.08268094,
0.00406259,
-0.02185595,
-0.47673094,
-0.13557231,
-0.744776725769043,
0.01672065258026123,
-0.1426766812801361,
0.06865900754928589,
-0.01718229055404663,
-0.06164264678955078,
-0.18163931369781494,
-0.039707064628601074,
],
],
dtype=np.float32,
Expand Down
59 changes: 59 additions & 0 deletions python/cuml/tests/test_random_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1331,3 +1331,62 @@ def test_rf_oob_score_binary_classification():

# OOB decision function should have 2 classes
assert clf.oob_decision_function_.shape[1] == 2


@pytest.mark.parametrize("datatype", [np.float32, np.float64])
@pytest.mark.parametrize("n_features", [10, 20, 50])
@pytest.mark.skipif(
    cudf_pandas_active,
    reason="cudf.pandas causes sklearn RF estimators crashes sometimes. "
    "Issue: https://github.com/rapidsai/cuml/issues/5991",
)
def test_rf_feature_zero_bias(datatype, n_features):
    """
    Test for feature 0 sampling bias in RandomForest.

    Builds a dataset in which feature 0 alone determines the label and
    every remaining column is pure noise. If feature 0 were severely
    under-sampled during tree construction, cuML's accuracy would fall
    well below scikit-learn's on the same data.

    Regression test for: https://github.com/rapidsai/cuml/issues/7422
    """
    n_samples = 5000

    # Deterministic data: the label is the sign of column 0 only.
    np.random.seed(42)
    X = np.random.randn(n_samples, n_features).astype(datatype)
    y = (X[:, 0] > 0).astype(np.int32)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    # Identical hyperparameters for both libraries so the comparison is fair.
    common_params = dict(
        n_estimators=100,
        max_features="sqrt",
        max_depth=5,
        min_samples_split=10,
        random_state=42,
    )

    # scikit-learn reference model.
    reference = skrfc(n_jobs=-1, **common_params)
    reference.fit(X_train, y_train)
    reference_acc = accuracy_score(y_test, reference.predict(X_test))

    # cuML model under test.
    candidate = curfc(**common_params)
    candidate.fit(X_train, y_train)
    candidate_acc = accuracy_score(y_test, candidate.predict(X_test))

    # If feature 0 is under-sampled, cuML accuracy drops far below the
    # sklearn baseline; allow a 10-point tolerance for normal variance.
    assert candidate_acc >= reference_acc - 0.10
Loading