|
| 1 | +/* |
| 2 | + * Copyright (c) 2025, NVIDIA CORPORATION. |
| 3 | + * |
| 4 | + * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | + * you may not use this file except in compliance with the License. |
| 6 | + * You may obtain a copy of the License at |
| 7 | + * |
| 8 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | + * |
| 10 | + * Unless required by applicable law or agreed to in writing, software |
| 11 | + * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | + * See the License for the specific language governing permissions and |
| 14 | + * limitations under the License. |
| 15 | + */ |
| 16 | + |
| 17 | +#pragma once |
| 18 | + |
| 19 | +#include <cuvs/core/c_api.h> |
| 20 | +#include <cuvs/distance/distance.h> |
| 21 | +#include <dlpack/dlpack.h> |
| 22 | +#include <stdint.h> |
| 23 | + |
| 24 | +#ifdef __cplusplus |
| 25 | +extern "C" { |
| 26 | +#endif |
| 27 | + |
| 28 | +/** |
| 29 | + * @defgroup kmeans_c_params k-means hyperparameters |
| 30 | + * @{ |
| 31 | + */ |
| 32 | + |
/**
 * @brief Strategy used to pick the initial k-means centroids.
 *
 * Declared through a typedef so that the bare name `cuvsKMeansInitMethod` is a
 * usable type in C as well as in C++ (in C an enum tag on its own does not
 * introduce a type name). The tag is kept, so `enum cuvsKMeansInitMethod`
 * remains valid too. Enumerator values are written out explicitly and equal
 * the values the implicit numbering produced before.
 */
typedef enum cuvsKMeansInitMethod {
  /**
   * Sample the centroids using the kmeans++ strategy
   */
  KMeansPlusPlus = 0,

  /**
   * Sample the centroids uniformly at random
   */
  Random = 1,

  /**
   * User provides the array of initial centroids
   */
  Array = 2
} cuvsKMeansInitMethod;
| 49 | + |
| 50 | +/** |
| 51 | + * @brief Hyper-parameters for the kmeans algorithm |
| 52 | + */ |
| 53 | +struct cuvsKMeansParams { |
| 54 | + cuvsDistanceType metric; |
| 55 | + |
| 56 | + /** |
| 57 | + * The number of clusters to form as well as the number of centroids to generate (default:8). |
| 58 | + */ |
| 59 | + int n_clusters; |
| 60 | + |
| 61 | + /** |
| 62 | + * Method for initialization, defaults to k-means++: |
| 63 | + * - cuvsKMeansInitMethod::KMeansPlusPlus (k-means++): Use scalable k-means++ algorithm |
| 64 | + * to select the initial cluster centers. |
| 65 | + * - cuvsKMeansInitMethod::Random (random): Choose 'n_clusters' observations (rows) at |
| 66 | + * random from the input data for the initial centroids. |
| 67 | + * - cuvsKMeansInitMethod::Array (ndarray): Use 'centroids' as initial cluster centers. |
| 68 | + */ |
| 69 | + cuvsKMeansInitMethod init; |
| 70 | + |
| 71 | + /** |
| 72 | + * Maximum number of iterations of the k-means algorithm for a single run. |
| 73 | + */ |
| 74 | + int max_iter; |
| 75 | + |
| 76 | + /** |
| 77 | + * Relative tolerance with regards to inertia to declare convergence. |
| 78 | + */ |
| 79 | + double tol; |
| 80 | + |
| 81 | + /** |
| 82 | + * Number of instance k-means algorithm will be run with different seeds. |
| 83 | + */ |
| 84 | + int n_init; |
| 85 | + |
| 86 | + /** |
| 87 | + * Oversampling factor for use in the k-means|| algorithm |
| 88 | + */ |
| 89 | + double oversampling_factor; |
| 90 | + |
| 91 | + /** |
| 92 | + * batch_samples and batch_centroids are used to tile 1NN computation which is |
| 93 | + * useful to optimize/control the memory footprint |
| 94 | + * Default tile is [batch_samples x n_clusters] i.e. when batch_centroids is 0 |
| 95 | + * then don't tile the centroids |
| 96 | + */ |
| 97 | + int batch_samples; |
| 98 | + |
| 99 | + /** |
| 100 | + * if 0 then batch_centroids = n_clusters |
| 101 | + */ |
| 102 | + int batch_centroids; |
| 103 | + |
| 104 | + bool inertia_check; |
| 105 | + |
| 106 | + /** |
| 107 | + * Whether to use hierarchical (balanced) kmeans or not |
| 108 | + */ |
| 109 | + bool hierarchical; |
| 110 | + |
| 111 | + /** |
| 112 | + * For hierarchical k-means , defines the number of training iterations |
| 113 | + */ |
| 114 | + int hierarchical_n_iters; |
| 115 | +}; |
| 116 | + |
| 117 | +typedef struct cuvsKMeansParams* cuvsKMeansParams_t; |
| 118 | + |
| 119 | +/** |
| 120 | + * @brief Allocate KMeans params, and populate with default values |
| 121 | + * |
| 122 | + * @param[in] params cuvsKMeansParams_t to allocate |
| 123 | + * @return cuvsError_t |
| 124 | + */ |
| 125 | +cuvsError_t cuvsKMeansParamsCreate(cuvsKMeansParams_t* params); |
| 126 | + |
| 127 | +/** |
| 128 | + * @brief De-allocate KMeans params |
| 129 | + * |
| 130 | + * @param[in] params |
| 131 | + * @return cuvsError_t |
| 132 | + */ |
| 133 | +cuvsError_t cuvsKMeansParamsDestroy(cuvsKMeansParams_t params); |
| 134 | + |
| 135 | +/** |
| 136 | + * @} |
| 137 | + */ |
| 138 | + |
| 139 | +/** |
| 140 | + * @defgroup kmeans_c k-means clustering APIs |
| 141 | + * @{ |
| 142 | + */ |
| 143 | + |
| 144 | +/** |
| 145 | + * @brief Find clusters with k-means algorithm. |
| 146 | + * |
| 147 | + * Initial centroids are chosen with k-means++ algorithm. Empty |
| 148 | + * clusters are reinitialized by choosing new centroids with |
| 149 | + * k-means++ algorithm. |
| 150 | + * |
| 151 | + * @param[in] res opaque C handle |
| 152 | + * @param[in] params Parameters for KMeans model. |
| 153 | + * @param[in] X Training instances to cluster. The data must |
| 154 | + * be in row-major format. |
| 155 | + * [dim = n_samples x n_features] |
| 156 | + * @param[in] sample_weight Optional weights for each observation in X. |
| 157 | + * [len = n_samples] |
| 158 | + * @param[inout] centroids [in] When init is InitMethod::Array, use |
| 159 | + * centroids as the initial cluster centers. |
| 160 | + * [out] The generated centroids from the |
| 161 | + * kmeans algorithm are stored at the address |
| 162 | + * pointed by 'centroids'. |
| 163 | + * [dim = n_clusters x n_features] |
| 164 | + * @param[out] inertia Sum of squared distances of samples to their |
| 165 | + * closest cluster center. |
| 166 | + * @param[out] n_iter Number of iterations run. |
| 167 | + */ |
| 168 | +cuvsError_t cuvsKMeansFit(cuvsResources_t res, |
| 169 | + cuvsKMeansParams_t params, |
| 170 | + DLManagedTensor* X, |
| 171 | + DLManagedTensor* sample_weight, |
| 172 | + DLManagedTensor* centroids, |
| 173 | + double* inertia, |
| 174 | + int* n_iter); |
| 175 | + |
| 176 | +/** |
| 177 | + * @brief Predict the closest cluster each sample in X belongs to. |
| 178 | + * |
| 179 | + * @param[in] res opaque C handle |
| 180 | + * @param[in] params Parameters for KMeans model. |
| 181 | + * @param[in] X New data to predict. |
| 182 | + * [dim = n_samples x n_features] |
| 183 | + * @param[in] sample_weight Optional weights for each observation in X. |
| 184 | + * [len = n_samples] |
| 185 | + * @param[in] centroids Cluster centroids. The data must be in |
| 186 | + * row-major format. |
| 187 | + * [dim = n_clusters x n_features] |
| 188 | + * @param[in] normalize_weight True if the weights should be normalized |
| 189 | + * @param[out] labels Index of the cluster each sample in X |
| 190 | + * belongs to. |
| 191 | + * [len = n_samples] |
| 192 | + * @param[out] inertia Sum of squared distances of samples to |
| 193 | + * their closest cluster center. |
| 194 | + */ |
| 195 | +cuvsError_t cuvsKMeansPredict(cuvsResources_t res, |
| 196 | + cuvsKMeansParams_t params, |
| 197 | + DLManagedTensor* X, |
| 198 | + DLManagedTensor* sample_weight, |
| 199 | + DLManagedTensor* centroids, |
| 200 | + DLManagedTensor* labels, |
| 201 | + bool normalize_weight, |
| 202 | + double* inertia); |
| 203 | + |
| 204 | +/** |
| 205 | + * @brief Compute cluster cost |
| 206 | + * |
| 207 | + * @param[in] res opaque C handle |
| 208 | + * @param[in] X Training instances to cluster. The data must |
| 209 | + * be in row-major format. |
| 210 | + * [dim = n_samples x n_features] |
| 211 | + * @param[in] centroids Cluster centroids. The data must be in |
| 212 | + * row-major format. |
| 213 | + * [dim = n_clusters x n_features] |
| 214 | + * @param[out] cost Resulting cluster cost |
| 215 | + * |
| 216 | + */ |
| 217 | +cuvsError_t cuvsKMeansClusterCost(cuvsResources_t res, |
| 218 | + DLManagedTensor* X, |
| 219 | + DLManagedTensor* centroids, |
| 220 | + double* cost); |
| 221 | +/** |
| 222 | + * @} |
| 223 | + */ |
| 224 | + |
| 225 | +#ifdef __cplusplus |
| 226 | +} |
| 227 | +#endif |
0 commit comments