Skip to content

Commit f2d70ae

Browse files
authored
Expose kmeans to python (#729)
Authors: - Ben Frederickson (https://github.com/benfred) - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: #729
1 parent bd1c31c commit f2d70ae

22 files changed

Lines changed: 1401 additions & 37 deletions

cpp/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -312,6 +312,7 @@ if(BUILD_SHARED_LIBS)
312312
add_library(
313313
cuvs_objs OBJECT
314314
src/cluster/kmeans_balanced_fit_float.cu
315+
src/cluster/kmeans_cluster_cost.cu
315316
src/cluster/kmeans_fit_mg_float.cu
316317
src/cluster/kmeans_fit_mg_double.cu
317318
src/cluster/kmeans_fit_double.cu
@@ -699,6 +700,7 @@ target_compile_definitions(cuvs::cuvs INTERFACE $<$<BOOL:${CUVS_NVTX}>:NVTX_ENAB
699700
add_library(
700701
cuvs_c SHARED
701702
src/core/c_api.cpp
703+
src/cluster/kmeans_c.cpp
702704
src/neighbors/brute_force_c.cpp
703705
src/neighbors/ivf_flat_c.cpp
704706
src/neighbors/ivf_pq_c.cpp

cpp/include/cuvs/cluster/kmeans.h

Lines changed: 227 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,227 @@
1+
/*
2+
* Copyright (c) 2025, NVIDIA CORPORATION.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#pragma once
18+
19+
#include <cuvs/core/c_api.h>
20+
#include <cuvs/distance/distance.h>
21+
#include <dlpack/dlpack.h>
22+
#include <stdint.h>
23+
24+
#ifdef __cplusplus
25+
extern "C" {
26+
#endif
27+
28+
/**
29+
* @defgroup kmeans_c_params k-means hyperparameters
30+
* @{
31+
*/
32+
33+
/**
 * @brief Strategy used to choose the initial cluster centroids.
 *
 * Declared through a typedef so that the bare name `cuvsKMeansInitMethod` is a
 * valid type name in C as well as in C++. The enum tag is retained, so
 * `enum cuvsKMeansInitMethod` continues to work too.
 */
typedef enum cuvsKMeansInitMethod {
  /**
   * Sample the centroids using the kmeans++ strategy
   */
  KMeansPlusPlus = 0,

  /**
   * Sample the centroids uniformly at random
   */
  Random = 1,

  /**
   * User provides the array of initial centroids
   */
  Array = 2
} cuvsKMeansInitMethod;
49+
50+
/**
51+
* @brief Hyper-parameters for the kmeans algorithm
52+
*/
53+
struct cuvsKMeansParams {
54+
cuvsDistanceType metric;
55+
56+
/**
57+
* The number of clusters to form as well as the number of centroids to generate (default:8).
58+
*/
59+
int n_clusters;
60+
61+
/**
62+
* Method for initialization, defaults to k-means++:
63+
* - cuvsKMeansInitMethod::KMeansPlusPlus (k-means++): Use scalable k-means++ algorithm
64+
* to select the initial cluster centers.
65+
* - cuvsKMeansInitMethod::Random (random): Choose 'n_clusters' observations (rows) at
66+
* random from the input data for the initial centroids.
67+
* - cuvsKMeansInitMethod::Array (ndarray): Use 'centroids' as initial cluster centers.
68+
*/
69+
cuvsKMeansInitMethod init;
70+
71+
/**
72+
* Maximum number of iterations of the k-means algorithm for a single run.
73+
*/
74+
int max_iter;
75+
76+
/**
77+
* Relative tolerance with regards to inertia to declare convergence.
78+
*/
79+
double tol;
80+
81+
/**
82+
* Number of instance k-means algorithm will be run with different seeds.
83+
*/
84+
int n_init;
85+
86+
/**
87+
* Oversampling factor for use in the k-means|| algorithm
88+
*/
89+
double oversampling_factor;
90+
91+
/**
92+
* batch_samples and batch_centroids are used to tile 1NN computation which is
93+
* useful to optimize/control the memory footprint
94+
* Default tile is [batch_samples x n_clusters] i.e. when batch_centroids is 0
95+
* then don't tile the centroids
96+
*/
97+
int batch_samples;
98+
99+
/**
100+
* if 0 then batch_centroids = n_clusters
101+
*/
102+
int batch_centroids;
103+
104+
bool inertia_check;
105+
106+
/**
107+
* Whether to use hierarchical (balanced) kmeans or not
108+
*/
109+
bool hierarchical;
110+
111+
/**
112+
* For hierarchical k-means , defines the number of training iterations
113+
*/
114+
int hierarchical_n_iters;
115+
};
116+
117+
typedef struct cuvsKMeansParams* cuvsKMeansParams_t;
118+
119+
/**
120+
* @brief Allocate KMeans params, and populate with default values
121+
*
122+
* @param[out] params cuvsKMeansParams_t to allocate
123+
* @return cuvsError_t
124+
*/
125+
cuvsError_t cuvsKMeansParamsCreate(cuvsKMeansParams_t* params);
126+
127+
/**
128+
* @brief De-allocate KMeans params
129+
*
130+
* @param[in] params cuvsKMeansParams_t to de-allocate
131+
* @return cuvsError_t
132+
*/
133+
cuvsError_t cuvsKMeansParamsDestroy(cuvsKMeansParams_t params);
134+
135+
/**
136+
* @}
137+
*/
138+
139+
/**
140+
* @defgroup kmeans_c k-means clustering APIs
141+
* @{
142+
*/
143+
144+
/**
145+
* @brief Find clusters with k-means algorithm.
146+
*
147+
* Initial centroids are chosen with k-means++ algorithm. Empty
148+
* clusters are reinitialized by choosing new centroids with
149+
* k-means++ algorithm.
150+
*
151+
* @param[in] res opaque C handle
152+
* @param[in] params Parameters for KMeans model.
153+
* @param[in] X Training instances to cluster. The data must
154+
* be in row-major format.
155+
* [dim = n_samples x n_features]
156+
* @param[in] sample_weight Optional weights for each observation in X.
157+
* [len = n_samples]
158+
* @param[inout] centroids [in] When params->init is Array, use
159+
* centroids as the initial cluster centers.
160+
* [out] The generated centroids from the
161+
* kmeans algorithm are stored at the address
162+
* pointed by 'centroids'.
163+
* [dim = n_clusters x n_features]
164+
* @param[out] inertia Sum of squared distances of samples to their
165+
* closest cluster center.
166+
* @param[out] n_iter Number of iterations run.
167+
*/
168+
cuvsError_t cuvsKMeansFit(cuvsResources_t res,
169+
cuvsKMeansParams_t params,
170+
DLManagedTensor* X,
171+
DLManagedTensor* sample_weight,
172+
DLManagedTensor* centroids,
173+
double* inertia,
174+
int* n_iter);
175+
176+
/**
177+
* @brief Predict the closest cluster each sample in X belongs to.
178+
*
179+
* @param[in] res opaque C handle
180+
* @param[in] params Parameters for KMeans model.
181+
* @param[in] X New data to predict.
182+
* [dim = n_samples x n_features]
183+
* @param[in] sample_weight Optional weights for each observation in X.
184+
* [len = n_samples]
185+
* @param[in] centroids Cluster centroids. The data must be in
186+
* row-major format.
187+
* [dim = n_clusters x n_features]
188+
* @param[in] normalize_weight True if the weights should be normalized
189+
* @param[out] labels Index of the cluster each sample in X
190+
* belongs to.
191+
* [len = n_samples]
192+
* @param[out] inertia Sum of squared distances of samples to
193+
* their closest cluster center.
194+
*/
195+
cuvsError_t cuvsKMeansPredict(cuvsResources_t res,
196+
cuvsKMeansParams_t params,
197+
DLManagedTensor* X,
198+
DLManagedTensor* sample_weight,
199+
DLManagedTensor* centroids,
200+
DLManagedTensor* labels,
201+
bool normalize_weight,
202+
double* inertia);
203+
204+
/**
205+
* @brief Compute cluster cost
206+
*
207+
* @param[in] res opaque C handle
208+
* @param[in] X Training instances to cluster. The data must
209+
* be in row-major format.
210+
* [dim = n_samples x n_features]
211+
* @param[in] centroids Cluster centroids. The data must be in
212+
* row-major format.
213+
* [dim = n_clusters x n_features]
214+
* @param[out] cost Resulting cluster cost
215+
*
216+
*/
217+
cuvsError_t cuvsKMeansClusterCost(cuvsResources_t res,
218+
DLManagedTensor* X,
219+
DLManagedTensor* centroids,
220+
double* cost);
221+
/**
222+
* @}
223+
*/
224+
225+
#ifdef __cplusplus
226+
}
227+
#endif

0 commit comments

Comments
 (0)