Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions cpp/include/cuml/manifold/spectral_embedding.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,4 +58,11 @@ void transform(raft::resources const& handle,
raft::device_coo_matrix_view<float, int, int, int> connectivity_graph,
raft::device_matrix_view<float, int, raft::col_major> embedding);

void transform(raft::resources const& handle,
ML::SpectralEmbedding::params config,
raft::device_vector_view<int, int> rows,
raft::device_vector_view<int, int> cols,
raft::device_vector_view<float, int> vals,
raft::device_matrix_view<float, int, raft::col_major> embedding);
Comment on lines +63 to +66
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like the API won't work with datasets having more elements (nnz) than std::numeric_limits<int>::max. Would be great to update the cuVS and cuML APIs to allow larger matrices (extent as uint64_t). Maybe as a follow-up PR?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, tracking here rapidsai/cuvs#1243.


} // namespace ML::SpectralEmbedding
19 changes: 19 additions & 0 deletions cpp/src/spectral/spectral_embedding.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#include <cuml/manifold/spectral_embedding.hpp>

#include <raft/core/device_coo_matrix.hpp>
#include <raft/core/device_mdspan.hpp>
#include <raft/core/resources.hpp>

Expand Down Expand Up @@ -53,4 +54,22 @@ void transform(raft::resources const& handle,
handle, to_cuvs(config), connectivity_graph, embedding);
}

void transform(raft::resources const& handle,
ML::SpectralEmbedding::params config,
raft::device_vector_view<int, int> rows,
raft::device_vector_view<int, int> cols,
raft::device_vector_view<float, int> vals,
raft::device_matrix_view<float, int, raft::col_major> embedding)
{
auto connectivity_graph_view = raft::make_device_coo_matrix_view<float, int, int, int>(
vals.data_handle(),
raft::make_device_coordinate_structure_view<int, int, int>(rows.data_handle(),
cols.data_handle(),
embedding.extent(0),
embedding.extent(0),
Comment thread
viclafargue marked this conversation as resolved.
vals.size()));

ML::SpectralEmbedding::transform(handle, config, connectivity_graph_view, embedding);
}

} // namespace ML::SpectralEmbedding
89 changes: 73 additions & 16 deletions python/cuml/cuml/manifold/spectral_embedding.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,16 @@ from pylibraft.common.handle import Handle
from pylibraft.common.cpp.mdspan cimport (
col_major,
device_matrix_view,
device_vector_view,
make_device_matrix_view,
make_device_vector_view,
row_major,
)
from pylibraft.common.handle cimport device_resources

from cupyx.scipy.sparse import coo_matrix as cupy_coo
from scipy.sparse import coo_matrix as scipy_coo

import cuml
from cuml.common import input_to_cuml_array
from cuml.common.array_descriptor import CumlArrayDescriptor
Expand All @@ -55,11 +60,20 @@ cdef extern from "cuml/manifold/spectral_embedding.hpp" namespace "ML::SpectralE
device_matrix_view[float, int, row_major] dataset,
device_matrix_view[float, int, col_major] embedding) except +

cdef void transform(
const device_resources &handle,
params config,
device_vector_view[int, int] rows,
device_vector_view[int, int] cols,
device_vector_view[float, int] vals,
device_matrix_view[float, int, col_major] embedding) except +


@cuml.internals.api_return_array(get_output_type=True)
def spectral_embedding(A,
*,
n_components=8,
affinity="nearest_neighbors",
random_state=None,
n_neighbors=None,
norm_laplacian=True,
Expand All @@ -81,6 +95,12 @@ def spectral_embedding(A,
The input data. A k-NN graph will be constructed.
n_components : int, default=8
The dimension of the projection subspace.
affinity : {'nearest_neighbors', 'precomputed'} or callable, \
default='nearest_neighbors'
How to construct the affinity matrix.
- 'nearest_neighbors' : construct the affinity matrix by computing a
graph of nearest neighbors.
- 'precomputed' : interpret ``X`` as a precomputed affinity matrix.
random_state : int, RandomState instance or None, default=None
A pseudo random number generator used for the initialization.
Use an int to make the results deterministic across calls.
Expand Down Expand Up @@ -128,10 +148,31 @@ def spectral_embedding(A,
handle = Handle()
cdef device_resources *h = <device_resources*><size_t>handle.getHandle()

A, _n_rows, _n_cols, _ = \
input_to_cuml_array(A, order="C", check_dtype=np.float32,
convert_to_dtype=cp.float32)
A_ptr = <uintptr_t>A.ptr
if affinity == "precomputed":
assert isinstance(A, (scipy_coo, cupy_coo))

rows = A.row
cols = A.col
vals = A.data
_n_rows = A.shape[0]
nnz = A.nnz

rows = input_to_cuml_array(rows, order="C",
check_dtype=np.int32, convert_to_dtype=cp.int32)[0]
cols = input_to_cuml_array(cols, order="C",
check_dtype=np.int32, convert_to_dtype=cp.int32)[0]
vals = input_to_cuml_array(vals, order="C",
check_dtype=np.float32, convert_to_dtype=cp.float32)[0]

rows_ptr = <uintptr_t>rows.ptr
cols_ptr = <uintptr_t>cols.ptr
vals_ptr = <uintptr_t>vals.ptr
Comment thread
jcrist marked this conversation as resolved.
Outdated

else:
A, _n_rows, _n_cols, _ = \
input_to_cuml_array(A, order="C", check_dtype=np.float32,
convert_to_dtype=cp.float32)
A_ptr = <uintptr_t>A.ptr

cdef params config

Expand All @@ -141,7 +182,7 @@ def spectral_embedding(A,
config.n_neighbors = (
n_neighbors
if n_neighbors is not None
else max(int(A.shape[0] / 10), 1)
else max(int(_n_rows / 10), 1)
)

config.norm_laplacian = norm_laplacian
Expand All @@ -150,16 +191,25 @@ def spectral_embedding(A,
if config.drop_first:
config.n_components += 1

eigenvectors = CumlArray.empty((A.shape[0], n_components), dtype=A.dtype, order='F')
eigenvectors = CumlArray.empty((_n_rows, n_components), dtype=np.float32, order='F')

eigenvectors_ptr = <uintptr_t>eigenvectors.ptr

transform(
deref(h), config,
make_device_matrix_view[float, int, row_major](
<float *>A_ptr, <int> A.shape[0], <int> A.shape[1]),
make_device_matrix_view[float, int, col_major](
<float *>eigenvectors_ptr, <int> A.shape[0], <int> n_components))
if affinity == "precomputed":
transform(
deref(h), config,
make_device_vector_view(<int *>rows_ptr, <int> nnz),
make_device_vector_view(<int *>cols_ptr, <int> nnz),
make_device_vector_view(<float *>vals_ptr, <int> nnz),
make_device_matrix_view[float, int, col_major](
<float *>eigenvectors_ptr, <int> _n_rows, <int> n_components))
else:
transform(
deref(h), config,
make_device_matrix_view[float, int, row_major](
<float *>A_ptr, <int> _n_rows, <int> _n_cols),
make_device_matrix_view[float, int, col_major](
<float *>eigenvectors_ptr, <int> _n_rows, <int> n_components))

return eigenvectors

Expand All @@ -180,6 +230,12 @@ class SpectralEmbedding(Base,
----------
n_components : int, default=2
The dimension of the projected subspace.
affinity : {'nearest_neighbors', 'precomputed'} or callable, \
default='nearest_neighbors'
How to construct the affinity matrix.
- 'nearest_neighbors' : construct the affinity matrix by computing a
graph of nearest neighbors.
- 'precomputed' : interpret ``X`` as a precomputed affinity matrix.
random_state : int, RandomState instance or None, default=None
A pseudo random number generator used for the initialization.
Use an int to make the results deterministic across calls.
Expand Down Expand Up @@ -210,9 +266,6 @@ class SpectralEmbedding(Base,

Notes
-----
Currently, cuML's SpectralEmbedding only supports the 'nearest_neighbors'
affinity mode, where a k-NN graph is constructed from the input data.

Spectral Embedding (Laplacian Eigenmaps) is most useful when the graph
has one connected component. If there graph has many components, the first
few eigenvectors will simply uncover the connected components of the graph.
Expand All @@ -229,17 +282,20 @@ class SpectralEmbedding(Base,
"""
embedding_ = CumlArrayDescriptor()

def __init__(self, n_components=2, random_state=None, n_neighbors=None,
def __init__(self, n_components=2, affinity="nearest_neighbors",
random_state=None, n_neighbors=None,
handle=None, verbose=False, output_type=None):
super().__init__(handle=handle, verbose=verbose, output_type=output_type)
self.n_components = n_components
self.affinity = affinity
self.random_state = random_state
self.n_neighbors = n_neighbors

@classmethod
def _get_param_names(cls):
return super()._get_param_names() + [
"n_components",
"affinity",
"random_state",
"n_neighbors"
]
Expand Down Expand Up @@ -282,6 +338,7 @@ class SpectralEmbedding(Base,
self.embedding_ = spectral_embedding(
X,
n_components=self.n_components,
affinity=self.affinity,
random_state=self.random_state,
n_neighbors=self.n_neighbors
)
Expand Down
142 changes: 110 additions & 32 deletions python/cuml/tests/test_spectral_embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,10 @@
from sklearn.datasets import load_digits, make_s_curve, make_swiss_roll
from sklearn.manifold import SpectralEmbedding as skSpectralEmbedding
from sklearn.manifold import trustworthiness
from sklearn.neighbors import kneighbors_graph

from cuml.manifold import SpectralEmbedding, spectral_embedding
from cuml.manifold.umap import fuzzy_simplicial_set
from cuml.testing.datasets import make_classification_dataset

# Test parameters
Expand Down Expand Up @@ -66,51 +68,127 @@ def load_digits_dataset(n_samples=None):
return digits.data


# Dataset configurations: (dataset_loader, dataset_name, n_samples, min_trustworthiness)
dataset_configs = [
(generate_s_curve, 1500, 0.8),
(generate_s_curve, 2000, 0.8),
(generate_swiss_roll, 2000, 0.8),
(generate_swiss_roll, 3000, 0.8),
(generate_mnist_like_dataset, 5000, 0.8),
(load_digits_dataset, None, 0.8),
]


@pytest.mark.parametrize(
"dataset_loader,n_samples,min_trustworthiness",
dataset_configs,
"affinity,graph_type",
[
("nearest_neighbors", None), # Use built-in nearest_neighbors affinity
("precomputed", "binary_knn"), # Precomputed binary k-NN graph
("precomputed", "fuzzy_knn"), # Precomputed fuzzy k-NN graph from UMAP
],
Comment on lines +73 to +83
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we also add ("precomputed", "regular_knn") with mode="distance" to check that it is as good as ("nearest_neighbors", None).

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Addressed in ba3601d

)
@pytest.mark.parametrize(
"dataset_loader,n_samples",
[
(generate_s_curve, 1500),
(generate_s_curve, 2000),
(generate_swiss_roll, 2000),
(generate_swiss_roll, 3000),
(generate_mnist_like_dataset, 5000),
(load_digits_dataset, None),
],
)
def test_spectral_embedding_trustworthiness(
dataset_loader, n_samples, min_trustworthiness
dataset_loader, n_samples, affinity, graph_type
):
Comment on lines 82 to 98
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would be great to quickly check if it also behave as expected with a smooth KNN such as one produced by the fuzzy_simplicial_set function.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Addressed in 9373046

"""Test trustworthiness comparison between sklearn and cuML on various datasets."""
"""Test trustworthiness comparison between sklearn and cuML on various datasets.

Tests different graph construction methods:
- nearest_neighbors affinity: Uses built-in k-NN graph construction
- precomputed with binary_knn: Binary connectivity k-NN graph
- precomputed with fuzzy_knn: Smooth weighted graph from UMAP's fuzzy simplicial set
"""
# Load/generate dataset
X = dataset_loader(n_samples) if n_samples else dataset_loader(None)

# sklearn embedding
sk_spectral = skSpectralEmbedding(
n_components=N_COMPONENTS,
n_neighbors=N_NEIGHBORS,
affinity="nearest_neighbors",
random_state=42,
n_jobs=-1,
)
X_sklearn = sk_spectral.fit_transform(X)

# cuML embedding
X_gpu = cp.asarray(X)
cuml_spectral = SpectralEmbedding(
n_components=N_COMPONENTS, n_neighbors=N_NEIGHBORS, random_state=42
)
X_cuml_gpu = cuml_spectral.fit_transform(X_gpu)
X_cuml = cp.asnumpy(X_cuml_gpu)
if affinity == "precomputed":
if graph_type == "fuzzy_knn":
# Use fuzzy_simplicial_set to create a smooth weighted KNN graph
X_gpu = cp.asarray(X, dtype=np.float32)

# Create smooth KNN graph using fuzzy_simplicial_set
# This creates a weighted graph with fuzzy membership strengths
graph = fuzzy_simplicial_set(
X_gpu,
n_neighbors=N_NEIGHBORS,
random_state=42,
)

# Convert to dense for sklearn
graph_dense = graph.toarray()

# sklearn embedding with precomputed fuzzy graph
sk_spectral = skSpectralEmbedding(
n_components=N_COMPONENTS,
affinity="precomputed",
random_state=42,
)
X_sklearn = sk_spectral.fit_transform(graph_dense.get())
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Doesn't the Scikit-Learn implementation handle sparse arrays here?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, addressed here 4560a49


# cuML embedding with precomputed fuzzy graph
cuml_spectral = SpectralEmbedding(
n_components=N_COMPONENTS,
affinity="precomputed",
random_state=42,
)
X_cuml_gpu = cuml_spectral.fit_transform(graph)
X_cuml = cp.asnumpy(X_cuml_gpu)

elif graph_type == "binary_knn":
# Create k-neighbors graph for precomputed affinity
knn_graph = kneighbors_graph(
X,
n_neighbors=N_NEIGHBORS,
mode="connectivity",
include_self=True,
)
# Make symmetric
knn_graph = 0.5 * (knn_graph + knn_graph.T)
knn_coo = knn_graph.tocoo()

# sklearn embedding with precomputed
sk_spectral = skSpectralEmbedding(
n_components=N_COMPONENTS,
affinity="precomputed",
random_state=42,
)
X_sklearn = sk_spectral.fit_transform(knn_coo)

# cuML embedding with precomputed
cuml_spectral = SpectralEmbedding(
n_components=N_COMPONENTS,
affinity="precomputed",
random_state=42,
)
X_cuml_gpu = cuml_spectral.fit_transform(knn_coo)
X_cuml = cp.asnumpy(X_cuml_gpu)
else:
# sklearn embedding with nearest_neighbors
sk_spectral = skSpectralEmbedding(
n_components=N_COMPONENTS,
n_neighbors=N_NEIGHBORS,
affinity="nearest_neighbors",
random_state=42,
n_jobs=-1,
)
X_sklearn = sk_spectral.fit_transform(X)

# cuML embedding with nearest_neighbors
X_gpu = cp.asarray(X)
cuml_spectral = SpectralEmbedding(
n_components=N_COMPONENTS,
affinity="nearest_neighbors",
n_neighbors=N_NEIGHBORS,
random_state=42,
)
X_cuml_gpu = cuml_spectral.fit_transform(X_gpu)
X_cuml = cp.asnumpy(X_cuml_gpu)

# Calculate trustworthiness scores
trust_sklearn = trustworthiness(X, X_sklearn, n_neighbors=N_NEIGHBORS)
trust_cuml = trustworthiness(X, X_cuml, n_neighbors=N_NEIGHBORS)

# Assertions
min_trustworthiness = 0.8
assert trust_sklearn > min_trustworthiness
assert trust_cuml > min_trustworthiness

Expand Down