diff --git a/docs/source/zero-code-change-limitations.rst b/docs/source/zero-code-change-limitations.rst index 669b48ad68..216b6cf977 100644 --- a/docs/source/zero-code-change-limitations.rst +++ b/docs/source/zero-code-change-limitations.rst @@ -111,10 +111,6 @@ DBSCAN - If ``metric`` isn't one of the supported metrics (``"l2"``, ``"euclidean"``, ``"cosine"``, ``"precomputed"``). - If ``X`` is sparse. -Additionally, the following fitted attributes are currently not computed: - -- ``components_`` - sklearn.decomposition --------------------- diff --git a/python/cuml/cuml/accel/tests/scikit-learn/xfail-list.yaml b/python/cuml/cuml/accel/tests/scikit-learn/xfail-list.yaml index ce021accc9..ace21c1c91 100644 --- a/python/cuml/cuml/accel/tests/scikit-learn/xfail-list.yaml +++ b/python/cuml/cuml/accel/tests/scikit-learn/xfail-list.yaml @@ -1245,7 +1245,6 @@ - reason: cuml missing certain fit attributes marker: cuml_accel_missing_fit_attributes tests: - - "sklearn.tests.test_docstring_parameters::test_fit_docstring_attributes[DBSCAN-DBSCAN]" - "sklearn.tests.test_docstring_parameters::test_fit_docstring_attributes[ElasticNet-ElasticNet]" - "sklearn.tests.test_docstring_parameters::test_fit_docstring_attributes[Lasso-Lasso]" - "sklearn.tests.test_docstring_parameters::test_fit_docstring_attributes[SVC-SVC]" @@ -1538,7 +1537,3 @@ strict: false tests: - "sklearn.decomposition.tests.test_sparse_pca::test_equivalence_components_pca_spca[42]" -- reason: 'cuml.accel bug: Missing components_ attribute' - tests: - - "sklearn.cluster.tests.test_dbscan::test_dbscan_no_core_samples[csr_array]" - - "sklearn.cluster.tests.test_dbscan::test_dbscan_no_core_samples[csr_matrix]" diff --git a/python/cuml/cuml/cluster/dbscan.pyx b/python/cuml/cuml/cluster/dbscan.pyx index 9bd0af96b1..ce8282470f 100644 --- a/python/cuml/cuml/cluster/dbscan.pyx +++ b/python/cuml/cuml/cluster/dbscan.pyx @@ -199,8 +199,9 @@ class DBSCAN(Base, :ref:`output-data-type-configuration` for more info. 
calc_core_sample_indices : (optional) boolean (default = True) Indicates whether the indices of the core samples should be calculated. - The the attribute `core_sample_indices_` will not be used, setting this - to False will avoid unnecessary kernel launches + If True (the default), ``core_sample_indices_`` and ``components_`` will + be computed and stored as fitted attributes. Set to False to avoid + computing these attributes, removing a small amount of overhead. Attributes ---------- @@ -210,7 +211,10 @@ class DBSCAN(Base, output_type. core_sample_indices_ : array-like or cuDF series The indices of the core samples. Only calculated if - calc_core_sample_indices==True + ``calc_core_sample_indices=True``. + components_ : array-like or cuDF DataFrame + Copy of each core sample found by training. Only calculated if + ``calc_core_sample_indices=True``. Notes ----- @@ -232,6 +236,7 @@ class DBSCAN(Base, """ core_sample_indices_ = CumlArrayDescriptor(order="C") + components_ = CumlArrayDescriptor(order="C") labels_ = CumlArrayDescriptor(order="C") _cpu_class_path = "sklearn.cluster.DBSCAN" @@ -274,6 +279,7 @@ class DBSCAN(Base, def _attrs_from_cpu(self, model): return { "core_sample_indices_": to_gpu(model.core_sample_indices_, order="C"), + "components_": to_gpu(model.components_, order="C"), "labels_": to_gpu(model.labels_, order="C"), **super()._attrs_from_cpu(model), } @@ -281,6 +287,7 @@ class DBSCAN(Base, def _attrs_to_cpu(self, model): return { "core_sample_indices_": to_cpu(self.core_sample_indices_, order="C"), + "components_": to_cpu(self.components_, order="C"), "labels_": to_cpu(self.labels_, order="C"), **super()._attrs_to_cpu(model), } @@ -305,12 +312,6 @@ class DBSCAN(Base, self.metric = metric self.algorithm = algorithm - # internal array attributes - self.labels_ = None - - # One used when `self.calc_core_sample_indices == True` - self.core_sample_indices_ = None - # C++ API expects this to be numeric. 
if self.max_mbytes_per_batch is None: self.max_mbytes_per_batch = 0 @@ -378,8 +379,7 @@ class DBSCAN(Base, # Create the output core_sample_indices only if needed if self.calc_core_sample_indices: - self.core_sample_indices_ = \ - CumlArray.empty(n_rows, dtype=out_dtype) + self.core_sample_indices_ = CumlArray.empty(n_rows, dtype=out_dtype) core_sample_indices_ptr = self.core_sample_indices_.ptr if self.dtype == np.float32: @@ -449,7 +449,6 @@ class DBSCAN(Base, # make sure that the `fit` is complete before the following # delete call happens self.handle.sync() - del X_m # Finally, resize the core_sample_indices array if necessary if self.calc_core_sample_indices: @@ -459,13 +458,16 @@ class DBSCAN(Base, # increasing, so the min index should be the first returned -1 min_index = cp.argmin(self.core_sample_indices_).item() # Check for the case where there are no -1's - if ((min_index == 0 and - self.core_sample_indices_[min_index].item() != -1)): + if ((min_index == 0 and self.core_sample_indices_[min_index].item() != -1)): # Nothing to delete. 
The array has no -1's pass else: - self.core_sample_indices_ = \ - self.core_sample_indices_[:min_index] + self.core_sample_indices_ = self.core_sample_indices_[:min_index] + + self.components_ = X_m.to_output("cupy")[self.core_sample_indices_] + else: + self.core_sample_indices_ = None + self.components_ = None return self diff --git a/python/cuml/cuml/tests/test_dbscan.py b/python/cuml/cuml/tests/test_dbscan.py index 0db726b17a..3d02c10dfd 100644 --- a/python/cuml/cuml/tests/test_dbscan.py +++ b/python/cuml/cuml/tests/test_dbscan.py @@ -329,6 +329,7 @@ def test_core_point_prop1(): assert array_equal( cuml_dbscan.core_sample_indices_, sk_dbscan.core_sample_indices_ ) + assert array_equal(cuml_dbscan.components_, sk_dbscan.components_) # Check the labels are correct assert_dbscan_equal( @@ -376,6 +377,7 @@ def test_core_point_prop2(): assert array_equal( cuml_dbscan.core_sample_indices_, sk_dbscan.core_sample_indices_ ) + assert array_equal(cuml_dbscan.components_, sk_dbscan.components_) # Check the labels are correct assert_dbscan_equal( @@ -429,6 +431,7 @@ def test_core_point_prop3(): assert array_equal( cuml_dbscan.core_sample_indices_, sk_dbscan.core_sample_indices_ ) + assert array_equal(cuml_dbscan.components_, sk_dbscan.components_) # Check the labels are correct assert_dbscan_equal( @@ -507,6 +510,7 @@ def test_dbscan_no_calc_core_point_indices(): # Make sure we are None assert cuml_dbscan.core_sample_indices_ is None + assert cuml_dbscan.components_ is None def test_dbscan_on_empty_array():