Skip to content

Commit 029c3cf

Browse files
authored
Compute components_ in DBSCAN (#6976)
This adds support for computing the `components_` attribute of `DBSCAN`. The attribute is only stored if `calc_core_sample_indices=True` (the default), since both are related to the core samples. The number of core samples is much smaller than the number of samples (sometimes it's 0), so in practice this adds negligible time and space overhead; the computation still respects the existing switch, so it can be disabled if needed. Fixes #6975. Authors: - Jim Crist-Harif (https://github.com/jcrist) Approvers: - Simon Adorf (https://github.com/csadorf) URL: #6976
1 parent 83ff7b5 commit 029c3cf

4 files changed

Lines changed: 22 additions & 25 deletions

File tree

docs/source/zero-code-change-limitations.rst

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -111,10 +111,6 @@ DBSCAN
111111
- If ``metric`` isn't one of the supported metrics (``"l2"``, ``"euclidean"``, ``"cosine"``, ``"precomputed"``).
112112
- If ``X`` is sparse.
113113

114-
Additionally, the following fitted attributes are currently not computed:
115-
116-
- ``components_``
117-
118114

119115
sklearn.decomposition
120116
---------------------

python/cuml/cuml/accel/tests/scikit-learn/xfail-list.yaml

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1245,7 +1245,6 @@
12451245
- reason: cuml missing certain fit attributes
12461246
marker: cuml_accel_missing_fit_attributes
12471247
tests:
1248-
- "sklearn.tests.test_docstring_parameters::test_fit_docstring_attributes[DBSCAN-DBSCAN]"
12491248
- "sklearn.tests.test_docstring_parameters::test_fit_docstring_attributes[ElasticNet-ElasticNet]"
12501249
- "sklearn.tests.test_docstring_parameters::test_fit_docstring_attributes[Lasso-Lasso]"
12511250
- "sklearn.tests.test_docstring_parameters::test_fit_docstring_attributes[SVC-SVC]"
@@ -1538,7 +1537,3 @@
15381537
strict: false
15391538
tests:
15401539
- "sklearn.decomposition.tests.test_sparse_pca::test_equivalence_components_pca_spca[42]"
1541-
- reason: 'cuml.accel bug: Missing components_ attribute'
1542-
tests:
1543-
- "sklearn.cluster.tests.test_dbscan::test_dbscan_no_core_samples[csr_array]"
1544-
- "sklearn.cluster.tests.test_dbscan::test_dbscan_no_core_samples[csr_matrix]"

python/cuml/cuml/cluster/dbscan.pyx

Lines changed: 18 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -199,8 +199,9 @@ class DBSCAN(Base,
199199
:ref:`output-data-type-configuration` for more info.
200200
calc_core_sample_indices : (optional) boolean (default = True)
201201
Indicates whether the indices of the core samples should be calculated.
202-
The the attribute `core_sample_indices_` will not be used, setting this
203-
to False will avoid unnecessary kernel launches
202+
If True (the default), ``core_sample_indices_`` and ``components_`` will
203+
be computed and stored as fitted attributes. Set to False to avoid
204+
computing these attributes, removing a small amount of overhead.
204205
205206
Attributes
206207
----------
@@ -210,7 +211,10 @@ class DBSCAN(Base,
210211
output_type.
211212
core_sample_indices_ : array-like or cuDF series
212213
The indices of the core samples. Only calculated if
213-
calc_core_sample_indices==True
214+
``calc_core_sample_indices=True``.
215+
components_ : array-like or cuDF DataFrame
216+
Copy of each core sample found by training. Only calculated if
217+
``calc_core_sample_indices=True``.
214218
215219
Notes
216220
-----
@@ -232,6 +236,7 @@ class DBSCAN(Base,
232236
"""
233237

234238
core_sample_indices_ = CumlArrayDescriptor(order="C")
239+
components_ = CumlArrayDescriptor(order="C")
235240
labels_ = CumlArrayDescriptor(order="C")
236241

237242
_cpu_class_path = "sklearn.cluster.DBSCAN"
@@ -274,13 +279,15 @@ class DBSCAN(Base,
274279
def _attrs_from_cpu(self, model):
275280
return {
276281
"core_sample_indices_": to_gpu(model.core_sample_indices_, order="C"),
282+
"components_": to_gpu(model.components_, order="C"),
277283
"labels_": to_gpu(model.labels_, order="C"),
278284
**super()._attrs_from_cpu(model),
279285
}
280286

281287
def _attrs_to_cpu(self, model):
282288
return {
283289
"core_sample_indices_": to_cpu(self.core_sample_indices_, order="C"),
290+
"components_": to_cpu(self.components_, order="C"),
284291
"labels_": to_cpu(self.labels_, order="C"),
285292
**super()._attrs_to_cpu(model),
286293
}
@@ -305,12 +312,6 @@ class DBSCAN(Base,
305312
self.metric = metric
306313
self.algorithm = algorithm
307314

308-
# internal array attributes
309-
self.labels_ = None
310-
311-
# One used when `self.calc_core_sample_indices == True`
312-
self.core_sample_indices_ = None
313-
314315
# C++ API expects this to be numeric.
315316
if self.max_mbytes_per_batch is None:
316317
self.max_mbytes_per_batch = 0
@@ -378,8 +379,7 @@ class DBSCAN(Base,
378379

379380
# Create the output core_sample_indices only if needed
380381
if self.calc_core_sample_indices:
381-
self.core_sample_indices_ = \
382-
CumlArray.empty(n_rows, dtype=out_dtype)
382+
self.core_sample_indices_ = CumlArray.empty(n_rows, dtype=out_dtype)
383383
core_sample_indices_ptr = self.core_sample_indices_.ptr
384384

385385
if self.dtype == np.float32:
@@ -449,7 +449,6 @@ class DBSCAN(Base,
449449
# make sure that the `fit` is complete before the results
450450
# are read and post-processed below
451451
self.handle.sync()
452-
del X_m
453452

454453
# Finally, resize the core_sample_indices array if necessary
455454
if self.calc_core_sample_indices:
@@ -459,13 +458,16 @@ class DBSCAN(Base,
459458
# increasing, so the min index should be the first returned -1
460459
min_index = cp.argmin(self.core_sample_indices_).item()
461460
# Check for the case where there are no -1's
462-
if ((min_index == 0 and
463-
self.core_sample_indices_[min_index].item() != -1)):
461+
if ((min_index == 0 and self.core_sample_indices_[min_index].item() != -1)):
464462
# Nothing to delete. The array has no -1's
465463
pass
466464
else:
467-
self.core_sample_indices_ = \
468-
self.core_sample_indices_[:min_index]
465+
self.core_sample_indices_ = self.core_sample_indices_[:min_index]
466+
467+
self.components_ = X_m.to_output("cupy")[self.core_sample_indices_]
468+
else:
469+
self.core_sample_indices_ = None
470+
self.components_ = None
469471

470472
return self
471473

python/cuml/cuml/tests/test_dbscan.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,7 @@ def test_core_point_prop1():
329329
assert array_equal(
330330
cuml_dbscan.core_sample_indices_, sk_dbscan.core_sample_indices_
331331
)
332+
assert array_equal(cuml_dbscan.components_, sk_dbscan.components_)
332333

333334
# Check the labels are correct
334335
assert_dbscan_equal(
@@ -376,6 +377,7 @@ def test_core_point_prop2():
376377
assert array_equal(
377378
cuml_dbscan.core_sample_indices_, sk_dbscan.core_sample_indices_
378379
)
380+
assert array_equal(cuml_dbscan.components_, sk_dbscan.components_)
379381

380382
# Check the labels are correct
381383
assert_dbscan_equal(
@@ -429,6 +431,7 @@ def test_core_point_prop3():
429431
assert array_equal(
430432
cuml_dbscan.core_sample_indices_, sk_dbscan.core_sample_indices_
431433
)
434+
assert array_equal(cuml_dbscan.components_, sk_dbscan.components_)
432435

433436
# Check the labels are correct
434437
assert_dbscan_equal(
@@ -507,6 +510,7 @@ def test_dbscan_no_calc_core_point_indices():
507510

508511
# Make sure we are None
509512
assert cuml_dbscan.core_sample_indices_ is None
513+
assert cuml_dbscan.components_ is None
510514

511515

512516
def test_dbscan_on_empty_array():

0 commit comments

Comments
 (0)