diff --git a/cpp/include/cuml/manifold/spectral_embedding.hpp b/cpp/include/cuml/manifold/spectral_embedding.hpp index 7bce566948..3b6a9a06e0 100644 --- a/cpp/include/cuml/manifold/spectral_embedding.hpp +++ b/cpp/include/cuml/manifold/spectral_embedding.hpp @@ -58,4 +58,11 @@ void transform(raft::resources const& handle, raft::device_coo_matrix_view connectivity_graph, raft::device_matrix_view embedding); +void transform(raft::resources const& handle, + ML::SpectralEmbedding::params config, + raft::device_vector_view rows, + raft::device_vector_view cols, + raft::device_vector_view vals, + raft::device_matrix_view embedding); + } // namespace ML::SpectralEmbedding diff --git a/cpp/src/spectral/spectral_embedding.cu b/cpp/src/spectral/spectral_embedding.cu index 8b014f816d..ca0fd4b23c 100644 --- a/cpp/src/spectral/spectral_embedding.cu +++ b/cpp/src/spectral/spectral_embedding.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -53,4 +54,22 @@ void transform(raft::resources const& handle, handle, to_cuvs(config), connectivity_graph, embedding); } +void transform(raft::resources const& handle, + ML::SpectralEmbedding::params config, + raft::device_vector_view rows, + raft::device_vector_view cols, + raft::device_vector_view vals, + raft::device_matrix_view embedding) +{ + auto connectivity_graph_view = raft::make_device_coo_matrix_view( + vals.data_handle(), + raft::make_device_coordinate_structure_view(rows.data_handle(), + cols.data_handle(), + embedding.extent(0), + embedding.extent(0), + vals.size())); + + ML::SpectralEmbedding::transform(handle, config, connectivity_graph_view, embedding); +} + } // namespace ML::SpectralEmbedding diff --git a/python/cuml/cuml/manifold/spectral_embedding.pyx b/python/cuml/cuml/manifold/spectral_embedding.pyx index d8c4d7da1a..a0f5587729 100644 --- a/python/cuml/cuml/manifold/spectral_embedding.pyx +++ b/python/cuml/cuml/manifold/spectral_embedding.pyx @@ -13,32 +13,33 @@ # See the License for the specific language governing permissions and # limitations under the License. # - import cupy as cp +import cupyx.scipy.sparse as cp_sp import numpy as np +import scipy.sparse as sp +from pylibraft.common.handle import Handle + +import cuml +from cuml.common import input_to_cuml_array +from cuml.common.array_descriptor import CumlArrayDescriptor +from cuml.internals.array import CumlArray +from cuml.internals.base import Base +from cuml.internals.mixins import CMajorInputTagMixin, SparseInputTagMixin +from cuml.internals.utils import check_random_seed from cython.operator cimport dereference as deref from libc.stdint cimport uint64_t, uintptr_t from libcpp cimport bool - -from pylibraft.common.handle import Handle - from pylibraft.common.cpp.mdspan cimport ( col_major, device_matrix_view, + device_vector_view, make_device_matrix_view, + make_device_vector_view, row_major, ) from pylibraft.common.handle cimport device_resources -import cuml -from cuml.common import input_to_cuml_array -from cuml.common.array_descriptor import CumlArrayDescriptor -from cuml.internals.array import CumlArray -from cuml.internals.base import Base -from cuml.internals.mixins import CMajorInputTagMixin, SparseInputTagMixin -from cuml.internals.utils import check_random_seed - cdef extern from "cuml/manifold/spectral_embedding.hpp" namespace "ML::SpectralEmbedding": @@ -55,11 +56,20 @@ cdef extern from "cuml/manifold/spectral_embedding.hpp" namespace "ML::SpectralE device_matrix_view[float, int, row_major] dataset, device_matrix_view[float, int, col_major] embedding) except + + cdef void transform( + const device_resources &handle, + params config, + device_vector_view[int, int] rows, + device_vector_view[int, int] cols, + device_vector_view[float, int] vals, + device_matrix_view[float, int, col_major] embedding) except + + @cuml.internals.api_return_array(get_output_type=True) def spectral_embedding(A, *, n_components=8, + affinity="nearest_neighbors", random_state=None, n_neighbors=None, norm_laplacian=True, @@ -77,10 +87,20 @@ def spectral_embedding(A, Parameters ---------- - A : array-like of shape (n_samples, n_features) - The input data. A k-NN graph will be constructed. + A : array-like or sparse matrix of shape (n_samples, n_features) or \ + (n_samples, n_samples) + If affinity is 'nearest_neighbors', this is the input data and a k-NN + graph will be constructed. If affinity is 'precomputed', this is the + affinity matrix. Supported formats for precomputed affinity: scipy + sparse (CSR, CSC, COO), cupy sparse (CSR, CSC, COO), dense numpy + arrays, or dense cupy arrays. n_components : int, default=8 The dimension of the projection subspace. + affinity : {'nearest_neighbors', 'precomputed'}, default='nearest_neighbors' + How to construct the affinity matrix. + - 'nearest_neighbors' : construct the affinity matrix by computing a + graph of nearest neighbors. + - 'precomputed' : interpret ``A`` as a precomputed affinity matrix. random_state : int, RandomState instance or None, default=None A pseudo random number generator used for the initialization. Use an int to make the results deterministic across calls. @@ -123,43 +143,74 @@ def spectral_embedding(A, >>> embedding.shape (100, 2) """ - if handle is None: handle = Handle() cdef device_resources *h = handle.getHandle() - A, _n_rows, _n_cols, _ = \ - input_to_cuml_array(A, order="C", check_dtype=np.float32, - convert_to_dtype=cp.float32) - A_ptr = A.ptr + if affinity == "nearest_neighbors": + A = input_to_cuml_array( + A, order="C", check_dtype=np.float32, convert_to_dtype=cp.float32 + ).array + elif affinity == "precomputed": + # Coerce `A` to a canonical float32 COO sparse matrix + if cp_sp.issparse(A): + A = A.tocoo() + if A.dtype != np.float32: + A = A.astype("float32") + elif sp.issparse(A): + A = cp_sp.coo_matrix(A, dtype="float32") + else: + A = cp_sp.coo_matrix(cp.asarray(A, dtype="float32")) + A.sum_duplicates() + else: + raise ValueError( + f"`affinity={affinity!r}` is not supported, expected one of " + "['nearest_neighbors', 'precomputed']" + ) cdef params config - - config.n_components = n_components config.seed = check_random_seed(random_state) - + config.norm_laplacian = norm_laplacian + config.drop_first = drop_first + config.n_components = n_components + 1 if drop_first else n_components config.n_neighbors = ( n_neighbors if n_neighbors is not None else max(int(A.shape[0] / 10), 1) ) - config.norm_laplacian = norm_laplacian - config.drop_first = drop_first - - if config.drop_first: - config.n_components += 1 - - eigenvectors = CumlArray.empty((A.shape[0], n_components), dtype=A.dtype, order='F') - - eigenvectors_ptr = eigenvectors.ptr + eigenvectors = CumlArray.empty( + (A.shape[0], n_components), dtype=np.float32, order='F' + ) - transform( - deref(h), config, - make_device_matrix_view[float, int, row_major]( - A_ptr, A.shape[0], A.shape[1]), - make_device_matrix_view[float, int, col_major]( - eigenvectors_ptr, A.shape[0], n_components)) + if affinity == "precomputed": + transform( + deref(h), + config, + make_device_vector_view(A.row.data.ptr, A.nnz), + make_device_vector_view(A.col.data.ptr, A.nnz), + make_device_vector_view(A.data.data.ptr, A.nnz), + make_device_matrix_view[float, int, col_major]( + eigenvectors.ptr, + eigenvectors.shape[0], + eigenvectors.shape[1], + ) + ) + else: + transform( + deref(h), + config, + make_device_matrix_view[float, int, row_major]( + A.ptr, + A.shape[0], + A.shape[1], + ), + make_device_matrix_view[float, int, col_major]( + eigenvectors.ptr, + eigenvectors.shape[0], + eigenvectors.shape[1], + ) + ) return eigenvectors @@ -180,6 +231,11 @@ class SpectralEmbedding(Base, ---------- n_components : int, default=2 The dimension of the projected subspace. + affinity : {'nearest_neighbors', 'precomputed'}, default='nearest_neighbors' + How to construct the affinity matrix. + - 'nearest_neighbors' : construct the affinity matrix by computing a + graph of nearest neighbors. + - 'precomputed' : interpret ``X`` as a precomputed affinity matrix. random_state : int, RandomState instance or None, default=None A pseudo random number generator used for the initialization. Use an int to make the results deterministic across calls. @@ -210,9 +266,6 @@ class SpectralEmbedding(Base, Notes ----- - Currently, cuML's SpectralEmbedding only supports the 'nearest_neighbors' - affinity mode, where a k-NN graph is constructed from the input data. - Spectral Embedding (Laplacian Eigenmaps) is most useful when the graph has one connected component. If there graph has many components, the first few eigenvectors will simply uncover the connected components of the graph. @@ -229,10 +282,12 @@ class SpectralEmbedding(Base, """ embedding_ = CumlArrayDescriptor() - def __init__(self, n_components=2, random_state=None, n_neighbors=None, + def __init__(self, n_components=2, affinity="nearest_neighbors", + random_state=None, n_neighbors=None, handle=None, verbose=False, output_type=None): super().__init__(handle=handle, verbose=verbose, output_type=output_type) self.n_components = n_components + self.affinity = affinity self.random_state = random_state self.n_neighbors = n_neighbors @@ -240,6 +295,7 @@ class SpectralEmbedding(Base, def _get_param_names(cls): return super()._get_param_names() + [ "n_components", + "affinity", "random_state", "n_neighbors" ] @@ -249,9 +305,13 @@ class SpectralEmbedding(Base, Parameters ---------- - X : array-like of shape (n_samples, n_features) + X : array-like or sparse matrix of shape (n_samples, n_features) or \ + (n_samples, n_samples) Training vector, where `n_samples` is the number of samples - and `n_features` is the number of features. + and `n_features` is the number of features. If affinity is + 'precomputed', X is the affinity matrix. Supported formats for + precomputed affinity: scipy sparse (CSR, CSC, COO), cupy sparse + (CSR, CSC, COO), dense numpy arrays, or dense cupy arrays. y : Ignored Not used, present for API consistency by convention. @@ -268,9 +328,13 @@ class SpectralEmbedding(Base, Parameters ---------- - X : array-like of shape (n_samples, n_features) + X : array-like or sparse matrix of shape (n_samples, n_features) or \ + (n_samples, n_samples) Training vector, where `n_samples` is the number of samples - and `n_features` is the number of features. + and `n_features` is the number of features. If affinity is + 'precomputed', X is the affinity matrix. Supported formats for + precomputed affinity: scipy sparse (CSR, CSC, COO), cupy sparse + (CSR, CSC, COO), dense numpy arrays, or dense cupy arrays. y : Ignored Not used, present for API consistency by convention. @@ -282,6 +346,7 @@ class SpectralEmbedding(Base, self.embedding_ = spectral_embedding( X, n_components=self.n_components, + affinity=self.affinity, random_state=self.random_state, n_neighbors=self.n_neighbors ) diff --git a/python/cuml/tests/test_spectral_embedding.py b/python/cuml/tests/test_spectral_embedding.py index 4189b7d359..1515d2619f 100644 --- a/python/cuml/tests/test_spectral_embedding.py +++ b/python/cuml/tests/test_spectral_embedding.py @@ -14,13 +14,17 @@ # import cupy as cp +import cupyx.scipy.sparse as cp_sp import numpy as np import pytest +import scipy.sparse as sp from sklearn.datasets import load_digits, make_s_curve, make_swiss_roll from sklearn.manifold import SpectralEmbedding as skSpectralEmbedding from sklearn.manifold import trustworthiness +from sklearn.neighbors import kneighbors_graph from cuml.manifold import SpectralEmbedding, spectral_embedding +from cuml.manifold.umap import fuzzy_simplicial_set from cuml.testing.datasets import make_classification_dataset # Test parameters @@ -66,51 +70,130 @@ def load_digits_dataset(n_samples=None): return digits.data -# Dataset configurations: (dataset_loader, dataset_name, n_samples, min_trustworthiness) -dataset_configs = [ - (generate_s_curve, 1500, 0.8), - (generate_s_curve, 2000, 0.8), - (generate_swiss_roll, 2000, 0.8), - (generate_swiss_roll, 3000, 0.8), - (generate_mnist_like_dataset, 5000, 0.8), - (load_digits_dataset, None, 0.8), -] - - @pytest.mark.parametrize( - "dataset_loader,n_samples,min_trustworthiness", - dataset_configs, + "affinity,graph_type", + [ + ("nearest_neighbors", None), # Use built-in nearest_neighbors affinity + ("precomputed", "binary_knn"), # Precomputed binary k-NN graph + ( + "precomputed", + "distance_knn", + ), # Precomputed k-NN graph with distances + ("precomputed", "fuzzy_knn"), # Precomputed fuzzy k-NN graph from UMAP + ], +) +@pytest.mark.parametrize( + "dataset_loader,n_samples", + [ + (generate_s_curve, 1500), + (generate_s_curve, 2000), + (generate_swiss_roll, 2000), + (generate_swiss_roll, 3000), + (generate_mnist_like_dataset, 5000), + (load_digits_dataset, None), + ], ) def test_spectral_embedding_trustworthiness( - dataset_loader, n_samples, min_trustworthiness + dataset_loader, n_samples, affinity, graph_type ): - """Test trustworthiness comparison between sklearn and cuML on various datasets.""" + """Test trustworthiness comparison between sklearn and cuML on various datasets. + + Tests different graph construction methods: + - nearest_neighbors affinity: Uses built-in k-NN graph construction + - precomputed with binary_knn: Binary connectivity k-NN graph + - precomputed with distance_knn: k-NN graph with distance weights + - precomputed with fuzzy_knn: Smooth weighted graph from UMAP's fuzzy simplicial set + """ # Load/generate dataset X = dataset_loader(n_samples) if n_samples else dataset_loader(None) - # sklearn embedding - sk_spectral = skSpectralEmbedding( - n_components=N_COMPONENTS, - n_neighbors=N_NEIGHBORS, - affinity="nearest_neighbors", - random_state=42, - n_jobs=-1, - ) - X_sklearn = sk_spectral.fit_transform(X) - - # cuML embedding - X_gpu = cp.asarray(X) - cuml_spectral = SpectralEmbedding( - n_components=N_COMPONENTS, n_neighbors=N_NEIGHBORS, random_state=42 - ) - X_cuml_gpu = cuml_spectral.fit_transform(X_gpu) - X_cuml = cp.asnumpy(X_cuml_gpu) + if affinity == "precomputed": + if graph_type == "fuzzy_knn": + # Use fuzzy_simplicial_set to create a smooth weighted KNN graph + X_gpu = cp.asarray(X, dtype=np.float32) + + # Create smooth KNN graph using fuzzy_simplicial_set + # This creates a weighted graph with fuzzy membership strengths + graph = fuzzy_simplicial_set( + X_gpu, + n_neighbors=N_NEIGHBORS, + random_state=42, + ) + + # sklearn embedding with precomputed fuzzy graph + sk_spectral = skSpectralEmbedding( + n_components=N_COMPONENTS, + affinity="precomputed", + random_state=42, + ) + X_sklearn = sk_spectral.fit_transform(graph.get()) + + # cuML embedding with precomputed fuzzy graph + cuml_spectral = SpectralEmbedding( + n_components=N_COMPONENTS, + affinity="precomputed", + random_state=42, + ) + X_cuml_gpu = cuml_spectral.fit_transform(graph) + X_cuml = cp.asnumpy(X_cuml_gpu) + + elif graph_type in ["binary_knn", "distance_knn"]: + # Create k-neighbors graph for precomputed affinity + mode = "connectivity" if graph_type == "binary_knn" else "distance" + knn_graph = kneighbors_graph( + X, + n_neighbors=N_NEIGHBORS, + mode=mode, + include_self=True, + ) + # Make symmetric + knn_graph = 0.5 * (knn_graph + knn_graph.T) + knn_coo = knn_graph.tocoo() + + # sklearn embedding with precomputed + sk_spectral = skSpectralEmbedding( + n_components=N_COMPONENTS, + affinity="precomputed", + random_state=42, + ) + X_sklearn = sk_spectral.fit_transform(knn_coo) + + # cuML embedding with precomputed + cuml_spectral = SpectralEmbedding( + n_components=N_COMPONENTS, + affinity="precomputed", + random_state=42, + ) + X_cuml_gpu = cuml_spectral.fit_transform(knn_coo) + X_cuml = cp.asnumpy(X_cuml_gpu) + else: + # sklearn embedding with nearest_neighbors + sk_spectral = skSpectralEmbedding( + n_components=N_COMPONENTS, + n_neighbors=N_NEIGHBORS, + affinity="nearest_neighbors", + random_state=42, + n_jobs=-1, + ) + X_sklearn = sk_spectral.fit_transform(X) + + # cuML embedding with nearest_neighbors + X_gpu = cp.asarray(X) + cuml_spectral = SpectralEmbedding( + n_components=N_COMPONENTS, + affinity="nearest_neighbors", + n_neighbors=N_NEIGHBORS, + random_state=42, + ) + X_cuml_gpu = cuml_spectral.fit_transform(X_gpu) + X_cuml = cp.asnumpy(X_cuml_gpu) # Calculate trustworthiness scores trust_sklearn = trustworthiness(X, X_sklearn, n_neighbors=N_NEIGHBORS) trust_cuml = trustworthiness(X, X_cuml, n_neighbors=N_NEIGHBORS) # Assertions + min_trustworthiness = 0.8 assert trust_sklearn > min_trustworthiness assert trust_cuml > min_trustworthiness @@ -141,6 +224,14 @@ def test_spectral_embedding_function_api(): assert cp.allclose(embedding1, embedding2) +def test_spectral_embedding_invalid_affinity(): + X, _ = make_s_curve(n_samples=200, noise=0.05, random_state=42) + with pytest.raises( + ValueError, match="`affinity='oops!'` is not supported" + ): + spectral_embedding(X, affinity="oops!") + + @pytest.mark.parametrize( "input_type,expected_type", [ @@ -175,3 +266,76 @@ def test_output_type_handling(input_type, expected_type): ).fit_transform(X) assert isinstance(out, expected_type) assert out.shape == (n_samples, 2) + + +@pytest.mark.parametrize( + "converter", + [ + pytest.param(lambda x: x.toarray(), id="numpy"), + pytest.param(lambda x: cp.asarray(x.toarray()), id="cupy"), + pytest.param(sp.coo_matrix, id="scipy_coo"), + pytest.param(sp.csr_matrix, id="scipy_csr"), + pytest.param(sp.csc_matrix, id="scipy_csc"), + pytest.param(cp_sp.coo_matrix, id="cupy_coo"), + pytest.param(cp_sp.csr_matrix, id="cupy_csr"), + pytest.param(cp_sp.csc_matrix, id="cupy_csc"), + ], +) +@pytest.mark.parametrize("dtype", ["float32", "float64"]) +def test_precomputed_matrix_formats(converter, dtype): + """Test that various matrix formats work correctly with precomputed affinity. + + This test verifies that SpectralEmbedding works with all combinations of: + - Matrix formats: COO, CSR, CSC, and dense + - Libraries: scipy and cupy + - dtypes: float32 and float64 + + It also ensures the embeddings have good trustworthiness scores. + """ + + # Generate test data using existing helper function + n_samples = 1000 + X_np = generate_s_curve(n_samples) + + # Create a symmetric k-NN affinity graph + knn_graph = kneighbors_graph( + X_np, + n_neighbors=N_NEIGHBORS, + mode="connectivity", + include_self=True, + ) + knn_graph = 0.5 * (knn_graph + knn_graph.T) + + # Convert to the desired format + affinity_matrix = converter(knn_graph).astype(dtype) + + # Test with SpectralEmbedding class + model = SpectralEmbedding( + n_components=2, affinity="precomputed", random_state=42 + ) + embedding_class = model.fit_transform(affinity_matrix) + + # Test with spectral_embedding function + embedding_func = spectral_embedding( + affinity_matrix, + n_components=2, + affinity="precomputed", + random_state=42, + ) + + # Verify output shapes + assert embedding_class.shape == (n_samples, 2) + assert embedding_func.shape == (n_samples, 2) + + # Calculate and print trustworthiness scores + trust_class = trustworthiness( + X_np, cp.asnumpy(embedding_class), n_neighbors=N_NEIGHBORS + ) + trust_func = trustworthiness( + X_np, cp.asnumpy(embedding_func), n_neighbors=N_NEIGHBORS + ) + + # Verify embeddings have good quality + min_trust = 0.8 + assert trust_class > min_trust + assert trust_func > min_trust