Compute components_ in DBSCAN (#6976)

jcrist · web-flow · commit 029c3cff7f5f · 2025-07-07T19:01:54.000Z
This adds support for computing the `components_` attribute of `DBSCAN`. The attribute is only stored if `calc_core_sample_indices=True` (the default), since both `core_sample_indices_` and `components_` are derived from the core samples. Because the number of core samples is much smaller than the number of samples (sometimes it's 0), in practice this adds negligible time and space overhead; the existing switch is still respected for cases where that overhead needs to be avoided. Fixes #6975. Authors: - Jim Crist-Harif (https://github.com/jcrist) Approvers: - Simon Adorf (https://github.com/csadorf) URL: #6976
diff --git a/docs/source/zero-code-change-limitations.rst b/docs/source/zero-code-change-limitations.rst
@@ -111,10 +111,6 @@ DBSCAN
 - If ``metric`` isn't one of the supported metrics (``"l2"``, ``"euclidean"``, ``"cosine"``, ``"precomputed"``).
 - If ``X`` is sparse.
 
-Additionally, the following fitted attributes are currently not computed:
-
-- ``components_``
-
 
 sklearn.decomposition
 ---------------------
diff --git a/python/cuml/cuml/accel/tests/scikit-learn/xfail-list.yaml b/python/cuml/cuml/accel/tests/scikit-learn/xfail-list.yaml
@@ -1245,7 +1245,6 @@
 - reason: cuml missing certain fit attributes
   marker: cuml_accel_missing_fit_attributes
   tests:
-  - "sklearn.tests.test_docstring_parameters::test_fit_docstring_attributes[DBSCAN-DBSCAN]"
   - "sklearn.tests.test_docstring_parameters::test_fit_docstring_attributes[ElasticNet-ElasticNet]"
   - "sklearn.tests.test_docstring_parameters::test_fit_docstring_attributes[Lasso-Lasso]"
   - "sklearn.tests.test_docstring_parameters::test_fit_docstring_attributes[SVC-SVC]"
@@ -1538,7 +1537,3 @@
   strict: false
   tests:
   - "sklearn.decomposition.tests.test_sparse_pca::test_equivalence_components_pca_spca[42]"
-- reason: 'cuml.accel bug: Missing components_ attribute'
-  tests:
-  - "sklearn.cluster.tests.test_dbscan::test_dbscan_no_core_samples[csr_array]"
-  - "sklearn.cluster.tests.test_dbscan::test_dbscan_no_core_samples[csr_matrix]"
diff --git a/python/cuml/cuml/cluster/dbscan.pyx b/python/cuml/cuml/cluster/dbscan.pyx
@@ -199,8 +199,9 @@ class DBSCAN(Base,
         :ref:`output-data-type-configuration` for more info.
     calc_core_sample_indices : (optional) boolean (default = True)
         Indicates whether the indices of the core samples should be calculated.
-        The the attribute `core_sample_indices_` will not be used, setting this
-        to False will avoid unnecessary kernel launches
+        If True (the default), ``core_sample_indices_`` and ``components_`` will
+        be computed and stored as fitted attributes. Set to False to avoid
+        computing these attributes, removing a small amount of overhead.
 
     Attributes
     ----------
@@ -210,7 +211,10 @@ class DBSCAN(Base,
         output_type.
     core_sample_indices_ : array-like or cuDF series
         The indices of the core samples. Only calculated if
-        calc_core_sample_indices==True
+        ``calc_core_sample_indices=True``.
+    components_ : array-like or cuDF DataFrame
+        Copy of each core sample found by training. Only calculated if
+        ``calc_core_sample_indices=True``.
 
     Notes
     -----
@@ -232,6 +236,7 @@ class DBSCAN(Base,
     """
 
     core_sample_indices_ = CumlArrayDescriptor(order="C")
+    components_ = CumlArrayDescriptor(order="C")
     labels_ = CumlArrayDescriptor(order="C")
 
     _cpu_class_path = "sklearn.cluster.DBSCAN"
@@ -274,13 +279,15 @@ class DBSCAN(Base,
     def _attrs_from_cpu(self, model):
         return {
             "core_sample_indices_": to_gpu(model.core_sample_indices_, order="C"),
+            "components_": to_gpu(model.components_, order="C"),
             "labels_": to_gpu(model.labels_, order="C"),
             **super()._attrs_from_cpu(model),
         }
 
     def _attrs_to_cpu(self, model):
         return {
             "core_sample_indices_": to_cpu(self.core_sample_indices_, order="C"),
+            "components_": to_cpu(self.components_, order="C"),
             "labels_": to_cpu(self.labels_, order="C"),
             **super()._attrs_to_cpu(model),
         }
@@ -305,12 +312,6 @@ class DBSCAN(Base,
         self.metric = metric
         self.algorithm = algorithm
 
-        # internal array attributes
-        self.labels_ = None
-
-        # One used when `self.calc_core_sample_indices == True`
-        self.core_sample_indices_ = None
-
         # C++ API expects this to be numeric.
         if self.max_mbytes_per_batch is None:
             self.max_mbytes_per_batch = 0
@@ -378,8 +379,7 @@ class DBSCAN(Base,
 
         # Create the output core_sample_indices only if needed
         if self.calc_core_sample_indices:
-            self.core_sample_indices_ = \
-                CumlArray.empty(n_rows, dtype=out_dtype)
+            self.core_sample_indices_ = CumlArray.empty(n_rows, dtype=out_dtype)
             core_sample_indices_ptr = self.core_sample_indices_.ptr
 
         if self.dtype == np.float32:
@@ -449,7 +449,6 @@ class DBSCAN(Base,
         # make sure that the `fit` is complete before the following
         # delete call happens
         self.handle.sync()
-        del X_m
 
         # Finally, resize the core_sample_indices array if necessary
         if self.calc_core_sample_indices:
@@ -459,13 +458,16 @@ class DBSCAN(Base,
                 # increasing, so the min index should be the first returned -1
                 min_index = cp.argmin(self.core_sample_indices_).item()
                 # Check for the case where there are no -1's
-                if ((min_index == 0 and
-                        self.core_sample_indices_[min_index].item() != -1)):
+                if ((min_index == 0 and self.core_sample_indices_[min_index].item() != -1)):
                     # Nothing to delete. The array has no -1's
                     pass
                 else:
-                    self.core_sample_indices_ = \
-                        self.core_sample_indices_[:min_index]
+                    self.core_sample_indices_ = self.core_sample_indices_[:min_index]
+
+                self.components_ = X_m.to_output("cupy")[self.core_sample_indices_]
+        else:
+            self.core_sample_indices_ = None
+            self.components_ = None
 
         return self
 
diff --git a/python/cuml/cuml/tests/test_dbscan.py b/python/cuml/cuml/tests/test_dbscan.py
@@ -329,6 +329,7 @@ def test_core_point_prop1():
     assert array_equal(
         cuml_dbscan.core_sample_indices_, sk_dbscan.core_sample_indices_
     )
+    assert array_equal(cuml_dbscan.components_, sk_dbscan.components_)
 
     # Check the labels are correct
     assert_dbscan_equal(
@@ -376,6 +377,7 @@ def test_core_point_prop2():
     assert array_equal(
         cuml_dbscan.core_sample_indices_, sk_dbscan.core_sample_indices_
     )
+    assert array_equal(cuml_dbscan.components_, sk_dbscan.components_)
 
     # Check the labels are correct
     assert_dbscan_equal(
@@ -429,6 +431,7 @@ def test_core_point_prop3():
     assert array_equal(
         cuml_dbscan.core_sample_indices_, sk_dbscan.core_sample_indices_
     )
+    assert array_equal(cuml_dbscan.components_, sk_dbscan.components_)
 
     # Check the labels are correct
     assert_dbscan_equal(
@@ -507,6 +510,7 @@ def test_dbscan_no_calc_core_point_indices():
 
     # Make sure we are None
     assert cuml_dbscan.core_sample_indices_ is None
+    assert cuml_dbscan.components_ is None
 
 
 def test_dbscan_on_empty_array():