Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 17 additions & 2 deletions c/include/cuvs/cluster/kmeans.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ typedef enum {
Array = 2
} cuvsKMeansInitMethod;


/**
* @brief Hyper-parameters for the kmeans algorithm
*/
Expand Down Expand Up @@ -90,6 +91,7 @@ struct cuvsKMeansParams {
*/
int batch_centroids;

/** Check inertia during iterations for early convergence. */
bool inertia_check;

/**
Expand All @@ -101,6 +103,12 @@ struct cuvsKMeansParams {
* For hierarchical k-means , defines the number of training iterations
*/
int hierarchical_n_iters;

/**
* Number of samples to process per GPU batch for the batched (host-data) API.
* When set to 0, defaults to n_samples (process all at once).
*/
int64_t streaming_batch_size;
};

typedef struct cuvsKMeansParams* cuvsKMeansParams_t;
Expand Down Expand Up @@ -142,18 +150,24 @@ typedef enum { CUVS_KMEANS_TYPE_KMEANS = 0, CUVS_KMEANS_TYPE_KMEANS_BALANCED = 1
* clusters are reinitialized by choosing new centroids with
* k-means++ algorithm.
*
* X may reside on either host (CPU) or device (GPU) memory.
* When X is on the host the data is streamed to the GPU in
* batches controlled by params->streaming_batch_size.
*
* @param[in] res opaque C handle
* @param[in] params Parameters for KMeans model.
* @param[in] X Training instances to cluster. The data must
* be in row-major format.
* be in row-major format. May be on host or
* device memory.
* [dim = n_samples x n_features]
* @param[in] sample_weight Optional weights for each observation in X.
* Must be on the same memory space as X.
* [len = n_samples]
* @param[inout] centroids [in] When init is InitMethod::Array, use
* centroids as the initial cluster centers.
* [out] The generated centroids from the
* kmeans algorithm are stored at the address
* pointed by 'centroids'.
* pointed by 'centroids'. Must be on device.
* [dim = n_clusters x n_features]
* @param[out] inertia Sum of squared distances of samples to their
* closest cluster center.
Expand Down Expand Up @@ -212,6 +226,7 @@ cuvsError_t cuvsKMeansClusterCost(cuvsResources_t res,
DLManagedTensor* X,
DLManagedTensor* centroids,
double* cost);

/**
* @}
*/
Expand Down
103 changes: 75 additions & 28 deletions c/src/cluster/kmeans.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
*/

#include <cstdint>

#include <dlpack/dlpack.h>

#include <cuvs/cluster/kmeans.h>
Expand All @@ -17,16 +18,18 @@ namespace {

cuvs::cluster::kmeans::params convert_params(const cuvsKMeansParams& params)
{
auto kmeans_params = cuvs::cluster::kmeans::params();
kmeans_params.metric = static_cast<cuvs::distance::DistanceType>(params.metric);
kmeans_params.init = static_cast<cuvs::cluster::kmeans::params::InitMethod>(params.init);
kmeans_params.n_clusters = params.n_clusters;
kmeans_params.max_iter = params.max_iter;
kmeans_params.tol = params.tol;
auto kmeans_params = cuvs::cluster::kmeans::params();
kmeans_params.metric = static_cast<cuvs::distance::DistanceType>(params.metric);
kmeans_params.init = static_cast<cuvs::cluster::kmeans::params::InitMethod>(params.init);
kmeans_params.n_clusters = params.n_clusters;
kmeans_params.max_iter = params.max_iter;
kmeans_params.tol = params.tol;
kmeans_params.n_init = params.n_init;
kmeans_params.oversampling_factor = params.oversampling_factor;
kmeans_params.batch_samples = params.batch_samples;
kmeans_params.batch_centroids = params.batch_centroids;
kmeans_params.inertia_check = params.inertia_check;
kmeans_params.streaming_batch_size = params.streaming_batch_size;
return kmeans_params;
}

Expand All @@ -38,7 +41,7 @@ cuvs::cluster::kmeans::balanced_params convert_balanced_params(const cuvsKMeansP
return kmeans_params;
}

template <typename T, typename IdxT = int32_t>
template <typename T, typename IdxT = int64_t>
void _fit(cuvsResources_t res,
const cuvsKMeansParams& params,
DLManagedTensor* X_tensor,
Expand All @@ -50,7 +53,51 @@ void _fit(cuvsResources_t res,
auto X = X_tensor->dl_tensor;
auto res_ptr = reinterpret_cast<raft::resources*>(res);

if (cuvs::core::is_dlpack_device_compatible(X)) {
if (!cuvs::core::is_dlpack_device_compatible(X)) {
auto n_samples = static_cast<IdxT>(X.shape[0]);
auto n_features = static_cast<IdxT>(X.shape[1]);

if (params.hierarchical) {
RAFT_FAIL("hierarchical kmeans is not supported with host data");
}

auto centroids_dl = centroids_tensor->dl_tensor;
if (!cuvs::core::is_dlpack_device_compatible(centroids_dl)) {
RAFT_FAIL("centroids must be on device memory");
}

auto X_view = raft::make_host_matrix_view<T const, IdxT>(
reinterpret_cast<T const*>(X.data), n_samples, n_features);
auto centroids_view =
cuvs::core::from_dlpack<raft::device_matrix_view<T, IdxT, raft::row_major>>(
centroids_tensor);

std::optional<raft::host_vector_view<T const, IdxT>> sample_weight;
if (sample_weight_tensor != NULL) {
auto sw = sample_weight_tensor->dl_tensor;
if (!cuvs::core::is_dlpack_host_compatible(sw)) {
RAFT_FAIL("sample_weight must be host accessible when X is on host");
}
sample_weight = raft::make_host_vector_view<T const, IdxT>(
reinterpret_cast<T const*>(sw.data), n_samples);
}

T inertia_temp;
IdxT n_iter_temp;

auto kmeans_params = convert_params(params);
cuvs::cluster::kmeans::fit(*res_ptr,
kmeans_params,
X_view,
sample_weight,
centroids_view,
raft::make_host_scalar_view<T>(&inertia_temp),
raft::make_host_scalar_view<IdxT>(&n_iter_temp));

*inertia = inertia_temp;
*n_iter = n_iter_temp;

} else {
using const_mdspan_type = raft::device_matrix_view<T const, IdxT, raft::row_major>;
using mdspan_type = raft::device_matrix_view<T, IdxT, raft::row_major>;

Expand Down Expand Up @@ -85,13 +132,11 @@ void _fit(cuvsResources_t res,
cuvs::core::from_dlpack<const_mdspan_type>(X_tensor),
sample_weight,
cuvs::core::from_dlpack<mdspan_type>(centroids_tensor),
raft::make_host_scalar_view<T, IdxT>(&inertia_temp),
raft::make_host_scalar_view<IdxT, IdxT>(&n_iter_temp));
raft::make_host_scalar_view<T>(&inertia_temp),
raft::make_host_scalar_view<IdxT>(&n_iter_temp));
*inertia = inertia_temp;
*n_iter = n_iter_temp;
}
} else {
RAFT_FAIL("X dataset must be accessible on device memory");
}
}

Expand Down Expand Up @@ -143,7 +188,7 @@ void _predict(cuvsResources_t res,
cuvs::core::from_dlpack<const_mdspan_type>(centroids_tensor),
cuvs::core::from_dlpack<labels_mdspan_type>(labels_tensor),
normalize_weight,
raft::make_host_scalar_view<T, IdxT>(&inertia_temp));
raft::make_host_scalar_view<T>(&inertia_temp));
*inertia = inertia_temp;
}
} else {
Expand All @@ -168,7 +213,7 @@ void _cluster_cost(cuvsResources_t res,
cuvs::cluster::kmeans::cluster_cost(*res_ptr,
cuvs::core::from_dlpack<mdspan_type>(X_tensor),
cuvs::core::from_dlpack<mdspan_type>(centroids_tensor),
raft::make_host_scalar_view<T, IdxT>(&cost_temp));
raft::make_host_scalar_view<T>(&cost_temp));
} else {
RAFT_FAIL("X dataset must be accessible on device memory");
}
Expand All @@ -182,17 +227,20 @@ extern "C" cuvsError_t cuvsKMeansParamsCreate(cuvsKMeansParams_t* params)
return cuvs::core::translate_exceptions([=] {
cuvs::cluster::kmeans::params cpp_params;
cuvs::cluster::kmeans::balanced_params cpp_balanced_params;
*params =
new cuvsKMeansParams{.metric = static_cast<cuvsDistanceType>(cpp_params.metric),
.n_clusters = cpp_params.n_clusters,
.init = static_cast<cuvsKMeansInitMethod>(cpp_params.init),
.max_iter = cpp_params.max_iter,
.tol = cpp_params.tol,
.oversampling_factor = cpp_params.oversampling_factor,
.batch_samples = cpp_params.batch_samples,
.inertia_check = cpp_params.inertia_check,
.hierarchical = false,
.hierarchical_n_iters = static_cast<int>(cpp_balanced_params.n_iters)};
*params = new cuvsKMeansParams{
.metric = static_cast<cuvsDistanceType>(cpp_params.metric),
.n_clusters = cpp_params.n_clusters,
.init = static_cast<cuvsKMeansInitMethod>(cpp_params.init),
.max_iter = cpp_params.max_iter,
.tol = cpp_params.tol,
.n_init = cpp_params.n_init,
.oversampling_factor = cpp_params.oversampling_factor,
.batch_samples = cpp_params.batch_samples,
.batch_centroids = cpp_params.batch_centroids,
.inertia_check = cpp_params.inertia_check,
.hierarchical = false,
.hierarchical_n_iters = static_cast<int>(cpp_balanced_params.n_iters),
.streaming_batch_size = cpp_params.streaming_batch_size};
});
}

Expand Down Expand Up @@ -235,10 +283,9 @@ extern "C" cuvsError_t cuvsKMeansPredict(cuvsResources_t res,
return cuvs::core::translate_exceptions([=] {
auto dataset = X->dl_tensor;
if (dataset.dtype.code == kDLFloat && dataset.dtype.bits == 32) {
_predict<float>(res, *params, X, sample_weight, centroids, labels, normalize_weight, inertia);
_predict<float>(res, *params, X, sample_weight, centroids, labels, normalize_weight, inertia);
} else if (dataset.dtype.code == kDLFloat && dataset.dtype.bits == 64) {
_predict<double>(
res, *params, X, sample_weight, centroids, labels, normalize_weight, inertia);
_predict<double>(res, *params, X, sample_weight, centroids, labels, normalize_weight, inertia);
} else {
RAFT_FAIL("Unsupported dataset DLtensor dtype: %d and bits: %d",
dataset.dtype.code,
Expand Down
94 changes: 93 additions & 1 deletion cpp/include/cuvs/cluster/kmeans.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,15 +100,31 @@ struct params : base_params {
* useful to optimize/control the memory footprint
* Default tile is [batch_samples x n_clusters] i.e. when batch_centroids is 0
* then don't tile the centroids
*
* NB: These parameters are unrelated to streaming_batch_size, which controls how many
* samples to transfer from host to device per batch when processing out-of-core
* data.
*/
int batch_samples = 1 << 15;

/**
* if 0 then batch_centroids = n_clusters
*/
int batch_centroids = 0; //
int batch_centroids = 0;

/**
* If true, check inertia during iterations for early convergence.
*/
bool inertia_check = false;

/**
* Number of samples to process per GPU batch when fitting with host data.
* When set to 0, defaults to n_samples (process all at once).
* Only used by the batched (host-data) code path and ignored by device-data
* overloads.
* Default: 0 (process all data at once).
*/
int64_t streaming_batch_size = 0;
};

/**
Expand Down Expand Up @@ -141,6 +157,82 @@ enum class kmeans_type { KMeans = 0, KMeansBalanced = 1 };
* @{
*/

/**
* @brief Find clusters with k-means algorithm using batched processing of host data.
*
* TODO: Evaluate replacing the extent type with int64_t. Reference issue:
* https://github.com/rapidsai/cuvs/issues/1961
*
* This overload supports out-of-core computation where the dataset resides
* on the host. Data is processed in GPU-sized batches, streaming from host to device.
* The batch size is controlled by params.streaming_batch_size.
*
* @code{.cpp}
* #include <raft/core/resources.hpp>
* #include <cuvs/cluster/kmeans.hpp>
* using namespace cuvs::cluster;
* ...
* raft::resources handle;
* cuvs::cluster::kmeans::params params;
* params.n_clusters = 100;
* params.streaming_batch_size = 100000;
* float inertia;
* int64_t n_iter;
*
* // Data on host
* std::vector<float> h_X(n_samples * n_features);
* auto X = raft::make_host_matrix_view<const float, int64_t>(h_X.data(), n_samples, n_features);
*
* // Centroids on device
* auto centroids = raft::make_device_matrix<float, int64_t>(handle, params.n_clusters,
* n_features);
*
* kmeans::fit(handle,
* params,
* X,
* std::nullopt,
* centroids.view(),
* raft::make_host_scalar_view(&inertia),
* raft::make_host_scalar_view(&n_iter));
* @endcode
*
* @param[in] handle The raft handle.
* @param[in] params Parameters for KMeans model. Batch size is read from
* params.streaming_batch_size.
* @param[in] X Training instances on HOST memory. The data must
* be in row-major format.
* [dim = n_samples x n_features]
* @param[in] sample_weight Optional weights for each observation in X (on host).
* [len = n_samples]
* @param[inout] centroids [in] When init is InitMethod::Array, use
* centroids as the initial cluster centers.
* [out] The generated centroids from the
* kmeans algorithm are stored at the address
* pointed by 'centroids'.
* [dim = n_clusters x n_features]
* @param[out] inertia Sum of squared distances of samples to their
* closest cluster center.
* @param[out] n_iter Number of iterations run.
*/
void fit(raft::resources const& handle,
const cuvs::cluster::kmeans::params& params,
raft::host_matrix_view<const float, int64_t> X,
std::optional<raft::host_vector_view<const float, int64_t>> sample_weight,
raft::device_matrix_view<float, int64_t> centroids,
raft::host_scalar_view<float> inertia,
raft::host_scalar_view<int64_t> n_iter);

/**
* @brief Find clusters with k-means algorithm using batched processing of host data.
*/
void fit(raft::resources const& handle,
const cuvs::cluster::kmeans::params& params,
raft::host_matrix_view<const double, int64_t> X,
std::optional<raft::host_vector_view<const double, int64_t>> sample_weight,
raft::device_matrix_view<double, int64_t> centroids,
raft::host_scalar_view<double> inertia,
raft::host_scalar_view<int64_t> n_iter);

/**
* @brief Find clusters with k-means algorithm.
* Initial centroids are chosen with k-means++ algorithm. Empty
Expand Down
Loading
Loading