Skip to content
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -217,8 +217,9 @@ CUML_KERNEL void excess_sample_with_replacement_kernel(
// compute the mask
// compute the adjacent differences according to the functor
// TODO: Replace deprecated 'FlagHeads' with 'SubtractLeft' when it is available
// Use -1 as the initial value since it can't match any valid column index [0, n-1]
BlockAdjacentDifferenceT(temp_storage.diff)
.SubtractLeft(items, mask, CustomDifference<IdxT>(), mask[0]);
.SubtractLeft(items, mask, CustomDifference<IdxT>(), IdxT(-1));
Comment on lines +220 to +222
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This appears correct to me. The previous implementation was comparing the first randomly selected column index against the initial value of mask[0], which is always zero. Aside from the fact that comparing against a mask value makes no sense here, this also means column 0 would never be selected, because the items array is sorted.


__syncthreads();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,10 @@
- "sklearn.feature_selection.tests.test_rfe::test_rfe_cv_groups"
- "sklearn.feature_selection.tests.test_rfe::test_rfe_features_importance"
- "sklearn.inspection.tests.test_partial_dependence::test_recursion_decision_tree_vs_forest_and_gbdt[0]"
- "sklearn.inspection.tests.test_permutation_importance::test_permutation_importance_correlated_feature_regression_pandas[0.5-1]"
- "sklearn.inspection.tests.test_permutation_importance::test_permutation_importance_correlated_feature_regression_pandas[0.5-2]"
- "sklearn.inspection.tests.test_permutation_importance::test_permutation_importance_correlated_feature_regression_pandas[1.0-1]"
- "sklearn.inspection.tests.test_permutation_importance::test_permutation_importance_correlated_feature_regression_pandas[1.0-2]"
- "sklearn.inspection.tests.test_permutation_importance::test_robustness_to_high_cardinality_noisy_feature[0.5-1]"
- "sklearn.inspection.tests.test_permutation_importance::test_robustness_to_high_cardinality_noisy_feature[0.5-2]"
- "sklearn.inspection.tests.test_permutation_importance::test_robustness_to_high_cardinality_noisy_feature[1.0-1]"
Expand Down
32 changes: 16 additions & 16 deletions python/cuml/tests/explainer/test_explainer_kernel_shap.py
Original file line number Diff line number Diff line change
Expand Up @@ -519,24 +519,24 @@ def test_typeerror_input():
housing_regression_result = np.array(
[
[
-0.00182223,
-0.01232004,
-0.4782278,
0.04781425,
-0.01337761,
-0.34830606,
-0.4682865,
-0.20812261,
-0.8974524140357971,
0.001421511173248291,
-0.0688888430595398,
-0.03094351291656494,
-0.015949785709381104,
-0.23235774040222168,
-0.21568483114242554,
-0.0710676908493042,
],
[
-0.0013606,
0.0110372,
-0.445176,
-0.08268094,
0.00406259,
-0.02185595,
-0.47673094,
-0.13557231,
-0.744776725769043,
0.01672065258026123,
-0.1426766812801361,
0.06865900754928589,
-0.01718229055404663,
-0.06164264678955078,
-0.18163931369781494,
-0.039707064628601074,
],
],
dtype=np.float32,
Expand Down
59 changes: 59 additions & 0 deletions python/cuml/tests/test_random_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1331,3 +1331,62 @@ def test_rf_oob_score_binary_classification():

# OOB decision function should have 2 classes
assert clf.oob_decision_function_.shape[1] == 2


@pytest.mark.parametrize("datatype", [np.float32, np.float64])
@pytest.mark.parametrize("n_features", [10, 20, 50])
@pytest.mark.skipif(
    cudf_pandas_active,
    reason="cudf.pandas causes sklearn RF estimators crashes sometimes. "
    "Issue: https://github.com/rapidsai/cuml/issues/5991",
)
def test_rf_feature_zero_bias(datatype, n_features):
    """
    Test for feature 0 sampling bias in RandomForest.

    Builds a dataset in which feature 0 alone determines the label and
    every remaining column is pure noise. If feature 0 were severely
    under-sampled during tree construction, cuML's accuracy would fall
    well below scikit-learn's on the same data.

    Regression test for: https://github.com/rapidsai/cuml/issues/7422
    """
    n_samples = 5000

    # Deterministic data: the label is the sign of column 0 only.
    np.random.seed(42)
    X = np.random.randn(n_samples, n_features).astype(datatype)
    y = (X[:, 0] > 0).astype(np.int32)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    # Identical hyperparameters for both libraries so the comparison is fair.
    common_params = dict(
        n_estimators=100,
        max_features="sqrt",
        max_depth=5,
        min_samples_split=10,
        random_state=42,
    )

    # scikit-learn reference model.
    reference = skrfc(n_jobs=-1, **common_params)
    reference.fit(X_train, y_train)
    reference_acc = accuracy_score(y_test, reference.predict(X_test))

    # cuML model under test.
    candidate = curfc(**common_params)
    candidate.fit(X_train, y_train)
    candidate_acc = accuracy_score(y_test, candidate.predict(X_test))

    # If feature 0 is under-sampled, cuML accuracy drops far below the
    # sklearn baseline; allow a 10-point tolerance for normal variance.
    assert candidate_acc >= reference_acc - 0.10
Loading