-
Notifications
You must be signed in to change notification settings - Fork 623
Spectral Embedding argument affinity={"precomputed", "nearest_neighbors"}
#7117
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 8 commits
01cb5cb
50ccec0
8d8a904
63f7fa6
4df79b5
ddf2a26
9373046
19c0d11
ba3601d
83951a4
4560a49
706b843
2b5bbd6
42d4eae
884e282
9873ab1
bc838e3
1942d7d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,8 +19,10 @@ | |
| from sklearn.datasets import load_digits, make_s_curve, make_swiss_roll | ||
| from sklearn.manifold import SpectralEmbedding as skSpectralEmbedding | ||
| from sklearn.manifold import trustworthiness | ||
| from sklearn.neighbors import kneighbors_graph | ||
|
|
||
| from cuml.manifold import SpectralEmbedding, spectral_embedding | ||
| from cuml.manifold.umap import fuzzy_simplicial_set | ||
| from cuml.testing.datasets import make_classification_dataset | ||
|
|
||
| # Test parameters | ||
|
|
@@ -66,51 +68,127 @@ def load_digits_dataset(n_samples=None): | |
| return digits.data | ||
|
|
||
|
|
||
| # Dataset configurations: (dataset_loader, dataset_name, n_samples, min_trustworthiness) | ||
| dataset_configs = [ | ||
| (generate_s_curve, 1500, 0.8), | ||
| (generate_s_curve, 2000, 0.8), | ||
| (generate_swiss_roll, 2000, 0.8), | ||
| (generate_swiss_roll, 3000, 0.8), | ||
| (generate_mnist_like_dataset, 5000, 0.8), | ||
| (load_digits_dataset, None, 0.8), | ||
| ] | ||
|
|
||
|
|
||
| @pytest.mark.parametrize( | ||
| "dataset_loader,n_samples,min_trustworthiness", | ||
| dataset_configs, | ||
| "affinity,graph_type", | ||
| [ | ||
| ("nearest_neighbors", None), # Use built-in nearest_neighbors affinity | ||
| ("precomputed", "binary_knn"), # Precomputed binary k-NN graph | ||
| ("precomputed", "fuzzy_knn"), # Precomputed fuzzy k-NN graph from UMAP | ||
| ], | ||
|
Comment on lines
+73
to
+83
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could we also add
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Addressed in ba3601d |
||
| ) | ||
| @pytest.mark.parametrize( | ||
| "dataset_loader,n_samples", | ||
| [ | ||
| (generate_s_curve, 1500), | ||
| (generate_s_curve, 2000), | ||
| (generate_swiss_roll, 2000), | ||
| (generate_swiss_roll, 3000), | ||
| (generate_mnist_like_dataset, 5000), | ||
| (load_digits_dataset, None), | ||
| ], | ||
| ) | ||
| def test_spectral_embedding_trustworthiness( | ||
| dataset_loader, n_samples, min_trustworthiness | ||
| dataset_loader, n_samples, affinity, graph_type | ||
| ): | ||
|
Comment on lines
82
to
98
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It would be great to quickly check whether it also behaves as expected with a smooth KNN graph, such as the one produced by UMAP's `fuzzy_simplicial_set` function.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Addressed in 9373046 |
||
| """Test trustworthiness comparison between sklearn and cuML on various datasets.""" | ||
| """Test trustworthiness comparison between sklearn and cuML on various datasets. | ||
|
|
||
| Tests different graph construction methods: | ||
| - nearest_neighbors affinity: Uses built-in k-NN graph construction | ||
| - precomputed with binary_knn: Binary connectivity k-NN graph | ||
| - precomputed with fuzzy_knn: Smooth weighted graph from UMAP's fuzzy simplicial set | ||
| """ | ||
| # Load/generate dataset | ||
| X = dataset_loader(n_samples) if n_samples else dataset_loader(None) | ||
|
|
||
| # sklearn embedding | ||
| sk_spectral = skSpectralEmbedding( | ||
| n_components=N_COMPONENTS, | ||
| n_neighbors=N_NEIGHBORS, | ||
| affinity="nearest_neighbors", | ||
| random_state=42, | ||
| n_jobs=-1, | ||
| ) | ||
| X_sklearn = sk_spectral.fit_transform(X) | ||
|
|
||
| # cuML embedding | ||
| X_gpu = cp.asarray(X) | ||
| cuml_spectral = SpectralEmbedding( | ||
| n_components=N_COMPONENTS, n_neighbors=N_NEIGHBORS, random_state=42 | ||
| ) | ||
| X_cuml_gpu = cuml_spectral.fit_transform(X_gpu) | ||
| X_cuml = cp.asnumpy(X_cuml_gpu) | ||
| if affinity == "precomputed": | ||
| if graph_type == "fuzzy_knn": | ||
| # Use fuzzy_simplicial_set to create a smooth weighted KNN graph | ||
| X_gpu = cp.asarray(X, dtype=np.float32) | ||
|
|
||
| # Create smooth KNN graph using fuzzy_simplicial_set | ||
| # This creates a weighted graph with fuzzy membership strengths | ||
| graph = fuzzy_simplicial_set( | ||
| X_gpu, | ||
| n_neighbors=N_NEIGHBORS, | ||
| random_state=42, | ||
| ) | ||
|
|
||
| # Convert to dense for sklearn | ||
| graph_dense = graph.toarray() | ||
|
|
||
| # sklearn embedding with precomputed fuzzy graph | ||
| sk_spectral = skSpectralEmbedding( | ||
| n_components=N_COMPONENTS, | ||
| affinity="precomputed", | ||
| random_state=42, | ||
| ) | ||
| X_sklearn = sk_spectral.fit_transform(graph_dense.get()) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Doesn't the scikit-learn implementation handle sparse arrays here?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, addressed here 4560a49 |
||
|
|
||
| # cuML embedding with precomputed fuzzy graph | ||
| cuml_spectral = SpectralEmbedding( | ||
| n_components=N_COMPONENTS, | ||
| affinity="precomputed", | ||
| random_state=42, | ||
| ) | ||
| X_cuml_gpu = cuml_spectral.fit_transform(graph) | ||
| X_cuml = cp.asnumpy(X_cuml_gpu) | ||
|
|
||
| elif graph_type == "binary_knn": | ||
| # Create k-neighbors graph for precomputed affinity | ||
| knn_graph = kneighbors_graph( | ||
| X, | ||
| n_neighbors=N_NEIGHBORS, | ||
| mode="connectivity", | ||
| include_self=True, | ||
| ) | ||
| # Make symmetric | ||
| knn_graph = 0.5 * (knn_graph + knn_graph.T) | ||
| knn_coo = knn_graph.tocoo() | ||
|
|
||
| # sklearn embedding with precomputed | ||
| sk_spectral = skSpectralEmbedding( | ||
| n_components=N_COMPONENTS, | ||
| affinity="precomputed", | ||
| random_state=42, | ||
| ) | ||
| X_sklearn = sk_spectral.fit_transform(knn_coo) | ||
|
|
||
| # cuML embedding with precomputed | ||
| cuml_spectral = SpectralEmbedding( | ||
| n_components=N_COMPONENTS, | ||
| affinity="precomputed", | ||
| random_state=42, | ||
| ) | ||
| X_cuml_gpu = cuml_spectral.fit_transform(knn_coo) | ||
| X_cuml = cp.asnumpy(X_cuml_gpu) | ||
| else: | ||
| # sklearn embedding with nearest_neighbors | ||
| sk_spectral = skSpectralEmbedding( | ||
| n_components=N_COMPONENTS, | ||
| n_neighbors=N_NEIGHBORS, | ||
| affinity="nearest_neighbors", | ||
| random_state=42, | ||
| n_jobs=-1, | ||
| ) | ||
| X_sklearn = sk_spectral.fit_transform(X) | ||
|
|
||
| # cuML embedding with nearest_neighbors | ||
| X_gpu = cp.asarray(X) | ||
| cuml_spectral = SpectralEmbedding( | ||
| n_components=N_COMPONENTS, | ||
| affinity="nearest_neighbors", | ||
| n_neighbors=N_NEIGHBORS, | ||
| random_state=42, | ||
| ) | ||
| X_cuml_gpu = cuml_spectral.fit_transform(X_gpu) | ||
| X_cuml = cp.asnumpy(X_cuml_gpu) | ||
|
|
||
| # Calculate trustworthiness scores | ||
| trust_sklearn = trustworthiness(X, X_sklearn, n_neighbors=N_NEIGHBORS) | ||
| trust_cuml = trustworthiness(X, X_cuml, n_neighbors=N_NEIGHBORS) | ||
|
|
||
| # Assertions | ||
| min_trustworthiness = 0.8 | ||
| assert trust_sklearn > min_trustworthiness | ||
| assert trust_cuml > min_trustworthiness | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It looks like the API won't work with datasets having more elements (nnz) than
`std::numeric_limits<int>::max`. It would be great to update the cuVS and cuML APIs to allow larger matrices (extent as `uint64_t`). Maybe as a follow-up PR?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, tracking here rapidsai/cuvs#1243.