diff --git a/python/cuml/cuml/manifold/simpl_set.pyx b/python/cuml/cuml/manifold/simpl_set.pyx index 14dfa1414b..c77858e565 100644 --- a/python/cuml/cuml/manifold/simpl_set.pyx +++ b/python/cuml/cuml/manifold/simpl_set.pyx @@ -25,7 +25,7 @@ cupyx = gpu_only_import('cupyx') from cuml.manifold.umap_utils cimport * from cuml.manifold.umap_utils import GraphHolder, find_ab_params, \ - metric_parsing + coerce_metric from cuml.internals.input_utils import input_to_cuml_array, is_array_like from cuml.internals.array import CumlArray @@ -158,10 +158,7 @@ def fuzzy_simplicial_set(X, umap_params.deterministic = deterministic umap_params.set_op_mix_ratio = set_op_mix_ratio umap_params.local_connectivity = local_connectivity - try: - umap_params.metric = metric_parsing[metric.lower()] - except KeyError: - raise ValueError(f"Invalid value for metric: {metric}") + umap_params.metric = coerce_metric(metric) if metric_kwds is None: umap_params.p = 2.0 else: @@ -353,10 +350,8 @@ def simplicial_set_embedding( umap_params.init = 1 else: raise ValueError("Invalid initialization strategy") - try: - umap_params.metric = metric_parsing[metric.lower()] - except KeyError: - raise ValueError(f"Invalid value for metric: {metric}") + + umap_params.metric = coerce_metric(metric) if metric_kwds is None: umap_params.p = 2.0 else: diff --git a/python/cuml/cuml/manifold/umap.pyx b/python/cuml/cuml/manifold/umap.pyx index 0f1e693785..3fbb5de09d 100644 --- a/python/cuml/cuml/manifold/umap.pyx +++ b/python/cuml/cuml/manifold/umap.pyx @@ -74,11 +74,8 @@ IF GPUBUILD == 1: from libc.stdlib cimport free from cuml.manifold.umap_utils cimport * from pylibraft.common.handle cimport handle_t - from cuml.manifold.umap_utils import GraphHolder, find_ab_params, \ - metric_parsing, DENSE_SUPPORTED_METRICS, SPARSE_SUPPORTED_METRICS - - from cuml.manifold.simpl_set import fuzzy_simplicial_set, \ - simplicial_set_embedding + from cuml.manifold.umap_utils import GraphHolder, find_ab_params, coerce_metric + from 
cuml.manifold.simpl_set import fuzzy_simplicial_set, simplicial_set_embedding cdef extern from "cuml/manifold/umap.hpp" namespace "ML::UMAP": @@ -483,56 +480,45 @@ class UMAP(UniversalBase, umap_params.verbosity = self.verbose umap_params.a = self.a umap_params.b = self.b + umap_params.target_n_neighbors = self.target_n_neighbors + umap_params.target_weight = self.target_weight + umap_params.random_state = check_random_seed(self.random_state) + umap_params.deterministic = self.deterministic + if self.init == "spectral": umap_params.init = 1 else: # self.init == "random" umap_params.init = 0 - umap_params.target_n_neighbors = self.target_n_neighbors + if self.target_metric == "euclidean": umap_params.target_metric = MetricType.EUCLIDEAN else: # self.target_metric == "categorical" umap_params.target_metric = MetricType.CATEGORICAL - if self.build_algo == "brute_force_knn": - umap_params.build_algo = graph_build_algo.BRUTE_FORCE_KNN - else: # self.init == "nn_descent" - umap_params.build_algo = graph_build_algo.NN_DESCENT - if self.build_kwds is None: - umap_params.nn_descent_params.graph_degree = 64 - umap_params.nn_descent_params.intermediate_graph_degree = 128 - umap_params.nn_descent_params.max_iterations = 20 - umap_params.nn_descent_params.termination_threshold = 0.0001 - umap_params.nn_descent_params.return_distances = True - umap_params.nn_descent_params.n_clusters = 1 - else: - umap_params.nn_descent_params.graph_degree = self.build_kwds.get("nnd_graph_degree", 64) - umap_params.nn_descent_params.intermediate_graph_degree = self.build_kwds.get("nnd_intermediate_graph_degree", 128) - umap_params.nn_descent_params.max_iterations = self.build_kwds.get("nnd_max_iterations", 20) - umap_params.nn_descent_params.termination_threshold = self.build_kwds.get("nnd_termination_threshold", 0.0001) - umap_params.nn_descent_params.return_distances = self.build_kwds.get("nnd_return_distances", True) - if self.build_kwds.get("nnd_n_clusters", 1) < 1: - logger.info("Negative 
number of nnd_n_clusters not allowed. Changing nnd_n_clusters to 1") - umap_params.nn_descent_params.n_clusters = self.build_kwds.get("nnd_n_clusters", 1) - umap_params.target_weight = self.target_weight - umap_params.random_state = check_random_seed(self.random_state) - umap_params.deterministic = self.deterministic - - try: - umap_params.metric = metric_parsing[self.metric.lower()] - if sparse: - if umap_params.metric not in SPARSE_SUPPORTED_METRICS: - raise NotImplementedError(f"Metric '{self.metric}' not supported for sparse inputs.") - elif umap_params.metric not in DENSE_SUPPORTED_METRICS: - raise NotImplementedError(f"Metric '{self.metric}' not supported for dense inputs.") - - except KeyError: - raise ValueError(f"Invalid value for metric: {self.metric}") + umap_params.metric = coerce_metric( + self.metric, sparse=sparse, build_algo=self.build_algo + ) if self.metric_kwds is None: umap_params.p = 2.0 else: umap_params.p = self.metric_kwds.get('p') + if self.build_algo == "brute_force_knn": + umap_params.build_algo = graph_build_algo.BRUTE_FORCE_KNN + else: + umap_params.build_algo = graph_build_algo.NN_DESCENT + build_kwds = self.build_kwds or {} + umap_params.nn_descent_params.graph_degree = build_kwds.get("nnd_graph_degree", 64) + umap_params.nn_descent_params.intermediate_graph_degree = build_kwds.get("nnd_intermediate_graph_degree", 128) + umap_params.nn_descent_params.max_iterations = build_kwds.get("nnd_max_iterations", 20) + umap_params.nn_descent_params.termination_threshold = build_kwds.get("nnd_termination_threshold", 0.0001) + umap_params.nn_descent_params.return_distances = build_kwds.get("nnd_return_distances", True) + umap_params.nn_descent_params.n_clusters = build_kwds.get("nnd_n_clusters", 1) + # Forward metric & metric_kwds to nn_descent + umap_params.nn_descent_params.metric = umap_params.metric + umap_params.nn_descent_params.metric_arg = umap_params.p + cdef uintptr_t callback_ptr = 0 if self.callback: callback_ptr = 
self.callback.get_native_callback() diff --git a/python/cuml/cuml/manifold/umap_utils.pxd b/python/cuml/cuml/manifold/umap_utils.pxd index 498e495733..c82f0244d4 100644 --- a/python/cuml/cuml/manifold/umap_utils.pxd +++ b/python/cuml/cuml/manifold/umap_utils.pxd @@ -24,6 +24,7 @@ from libc.stdint cimport uint64_t, uintptr_t, int64_t from libcpp cimport bool from libcpp.memory cimport shared_ptr from cuml.metrics.distance_type cimport DistanceType +from cuml.metrics.raft_distance_type cimport DistanceType as RaftDistanceType from cuml.internals.logger cimport level_enum cdef extern from "cuml/manifold/umapparams.h" namespace "ML::UMAPParams": @@ -39,6 +40,7 @@ cdef extern from "cuml/common/callback.hpp" namespace "ML::Internals": cdef cppclass GraphBasedDimRedCallback + cdef extern from "raft/neighbors/nn_descent_types.hpp" namespace "raft::neighbors::experimental::nn_descent": cdef struct index_params: uint64_t graph_degree, @@ -47,6 +49,8 @@ cdef extern from "raft/neighbors/nn_descent_types.hpp" namespace "raft::neighbor float termination_threshold, bool return_distances, uint64_t n_clusters, + RaftDistanceType metric, + float metric_arg cdef extern from "cuml/manifold/umapparams.h" namespace "ML": diff --git a/python/cuml/cuml/manifold/umap_utils.pyx b/python/cuml/cuml/manifold/umap_utils.pyx index bfad75cbdb..60463b8a18 100644 --- a/python/cuml/cuml/manifold/umap_utils.pyx +++ b/python/cuml/cuml/manifold/umap_utils.pyx @@ -16,6 +16,8 @@ # distutils: language = c++ +from typing import Literal + from rmm.pylibrmm.memory_resource cimport get_current_device_resource from pylibraft.common.handle cimport handle_t from cuml.manifold.umap_utils cimport * @@ -134,7 +136,7 @@ def find_ab_params(spread, min_dist): return params[0], params[1] -metric_parsing = { +_METRICS = { "l2": DistanceType.L2SqrtExpanded, "euclidean": DistanceType.L2SqrtExpanded, "sqeuclidean": DistanceType.L2Expanded, @@ -153,32 +155,61 @@ metric_parsing = { "canberra": DistanceType.Canberra } 
def coerce_metric(
    metric: str,
    sparse: bool = False,
    build_algo: Literal["brute_force_knn", "nn_descent"] = "brute_force_knn",
) -> DistanceType:
    """Coerce a metric string to a `DistanceType`.

    Also checks that the metric is valid and supported for the requested
    input kind and graph build algorithm.

    Parameters
    ----------
    metric : str
        Metric name. It is lower-cased before being looked up in
        ``_METRICS``, so the match is case-insensitive.
    sparse : bool, default=False
        Selects the ``"sparse"`` (True) or ``"dense"`` (False) entry of
        ``_SUPPORTED_METRICS`` to validate against.
    build_algo : {"brute_force_knn", "nn_descent"}, default="brute_force_knn"
        Selects the top-level entry of ``_SUPPORTED_METRICS`` to validate
        against.

    Returns
    -------
    DistanceType
        The value that ``_METRICS`` maps the lower-cased name to.

    Raises
    ------
    ValueError
        If ``metric`` is not a key of ``_METRICS``, or ``build_algo`` is not
        a key of ``_SUPPORTED_METRICS``.
    NotImplementedError
        If the metric is known but is not in the supported set for the
        given ``build_algo`` and input kind.
    """
    try:
        out = _METRICS[metric.lower()]
    except KeyError:
        # `from None`: the failed dict lookup is an implementation detail and
        # would only add noise to the traceback shown to the user.
        raise ValueError(f"Invalid value for metric: {metric!r}") from None

    try:
        supported_by_kind = _SUPPORTED_METRICS[build_algo]
    except KeyError:
        # Report an unknown build algorithm the same way as an unknown
        # metric, instead of leaking a bare KeyError to the caller.
        raise ValueError(
            f"Invalid value for build_algo: {build_algo!r}"
        ) from None

    kind = "sparse" if sparse else "dense"
    if out not in supported_by_kind[kind]:
        raise NotImplementedError(
            f"Metric {metric!r} not supported for {kind} inputs with {build_algo=}"
        )

    return out
@pytest.mark.parametrize("input_type", ["dataframe", "ndarray"]) @pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) -@pytest.mark.skipif( - IS_ARM, reason="https://github.com/rapidsai/cuml/issues/5441" -) def test_umap_data_formats( input_type, should_downcast, @@ -344,9 +329,6 @@ def test_umap_data_formats( @pytest.mark.parametrize("target_metric", ["categorical", "euclidean"]) @pytest.mark.filterwarnings("ignore:(.*)connected(.*):UserWarning:sklearn[.*]") @pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) -@pytest.mark.skipif( - IS_ARM, reason="https://github.com/rapidsai/cuml/issues/5441" -) def test_umap_fit_transform_score_default(target_metric, build_algo): n_samples = 500 @@ -546,9 +528,6 @@ def test_umap_transform_trustworthiness_with_consistency_enabled(): @pytest.mark.filterwarnings("ignore:(.*)zero(.*)::scipy[.*]|umap[.*]") -@pytest.mark.skipif( - IS_ARM, reason="https://github.com/rapidsai/cuml/issues/5441" -) @pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) def test_exp_decay_params(build_algo): def compare_exp_decay_params(a=None, b=None, min_dist=0.1, spread=1.0): @@ -693,9 +672,6 @@ def correctness_sparse(a, b, atol=0.1, rtol=0.2, threshold=0.95): @pytest.mark.parametrize("n_rows", [200, 800]) @pytest.mark.parametrize("n_features", [8, 32]) @pytest.mark.parametrize("n_neighbors", [8, 16]) -@pytest.mark.skipif( - IS_ARM, reason="https://github.com/rapidsai/cuml/issues/5441" -) def test_fuzzy_simplicial_set(n_rows, n_features, n_neighbors): n_clusters = 30 random_state = 42 @@ -723,28 +699,39 @@ def test_fuzzy_simplicial_set(n_rows, n_features, n_neighbors): @pytest.mark.parametrize( - "metric,supported", + "metric,build_algo,supported", [ - ("l2", True), - ("euclidean", True), - ("sqeuclidean", True), - ("l1", True), - ("manhattan", True), - ("minkowski", True), - ("chebyshev", True), - ("cosine", True), - ("correlation", True), - ("jaccard", False), - ("hamming", True), - 
("canberra", True), + ("l2", "brute_force_knn", True), + ("euclidean", "brute_force_knn", True), + ("sqeuclidean", "brute_force_knn", True), + ("l1", "brute_force_knn", True), + ("manhattan", "brute_force_knn", True), + ("minkowski", "brute_force_knn", True), + ("chebyshev", "brute_force_knn", True), + ("cosine", "brute_force_knn", True), + ("correlation", "brute_force_knn", True), + ("jaccard", "brute_force_knn", False), + ("hamming", "brute_force_knn", True), + ("canberra", "brute_force_knn", True), + ("l2", "nn_descent", True), + ("euclidean", "nn_descent", True), + ("sqeuclidean", "nn_descent", False), + ("l1", "nn_descent", False), + ("manhattan", "nn_descent", False), + ("minkowski", "nn_descent", False), + ("chebyshev", "nn_descent", False), + ("cosine", "nn_descent", False), + ("correlation", "nn_descent", False), + ("jaccard", "nn_descent", False), + ("hamming", "nn_descent", False), + ("canberra", "nn_descent", False), ], ) -@pytest.mark.skipif( - IS_ARM, reason="https://github.com/rapidsai/cuml/issues/5441" -) -def test_umap_distance_metrics_fit_transform_trust(metric, supported): +def test_umap_distance_metrics_fit_transform_trust( + metric, build_algo, supported +): data, labels = make_blobs( - n_samples=1000, n_features=64, centers=5, random_state=42 + n_samples=500, n_features=64, centers=5, random_state=42 ) if metric == "jaccard": @@ -754,7 +741,11 @@ def test_umap_distance_metrics_fit_transform_trust(metric, supported): n_neighbors=10, min_dist=0.01, metric=metric, init="random" ) cuml_model = cuUMAP( - n_neighbors=10, min_dist=0.01, metric=metric, init="random" + n_neighbors=10, + min_dist=0.01, + metric=metric, + init="random", + build_algo=build_algo, ) if not supported: with pytest.raises(NotImplementedError): @@ -792,9 +783,6 @@ def test_umap_distance_metrics_fit_transform_trust(metric, supported): ("canberra", True, True), ], ) -@pytest.mark.skipif( - IS_ARM, reason="https://github.com/rapidsai/cuml/issues/5441" -) def 
test_umap_distance_metrics_fit_transform_trust_on_sparse_input( metric, supported, umap_learn_supported ):