rapidsai · rapids-bot · Mar 31, 2026 · Mar 31, 2026 · Apr 1, 2026 · Apr 1, 2026
@@ -36,6 +36,7 @@ typedef enum {
   Array = 2
 } cuvsKMeansInitMethod;
 
+
 /**
  * @brief Hyper-parameters for the kmeans algorithm
  */
@@ -90,6 +91,7 @@ struct cuvsKMeansParams {
    */
   int batch_centroids;
 
+  /** Check inertia during iterations for early convergence. */
   bool inertia_check;
 
   /**
@@ -101,6 +103,12 @@ struct cuvsKMeansParams {
    * For hierarchical k-means , defines the number of training iterations
    */
   int hierarchical_n_iters;
+
+  /**
+   * Number of samples to process per GPU batch for the batched (host-data) API.
+   * When set to 0, defaults to n_samples (process all at once).
+   */
+   int64_t streaming_batch_size;
 };
 
 typedef struct cuvsKMeansParams* cuvsKMeansParams_t;
@@ -142,18 +150,24 @@ typedef enum { CUVS_KMEANS_TYPE_KMEANS = 0, CUVS_KMEANS_TYPE_KMEANS_BALANCED = 1
  *   clusters are reinitialized by choosing new centroids with
  *   k-means++ algorithm.
  *
+ *   X may reside on either host (CPU) or device (GPU) memory.
+ *   When X is on the host the data is streamed to the GPU in
+ *   batches controlled by params->streaming_batch_size.
+ *
  * @param[in]     res           opaque C handle
  * @param[in]     params        Parameters for KMeans model.
  * @param[in]     X             Training instances to cluster. The data must
- *                              be in row-major format.
+ *                              be in row-major format. May be on host or
+ *                              device memory.
  *                              [dim = n_samples x n_features]
  * @param[in]     sample_weight Optional weights for each observation in X.
+ *                              Must be on the same memory space as X.
  *                              [len = n_samples]
  * @param[inout]  centroids     [in] When init is InitMethod::Array, use
  *                              centroids as the initial cluster centers.
  *                              [out] The generated centroids from the
  *                              kmeans algorithm are stored at the address
- *                              pointed by 'centroids'.
+ *                              pointed by 'centroids'. Must be on device.
  *                              [dim = n_clusters x n_features]
  * @param[out]    inertia       Sum of squared distances of samples to their
  *                              closest cluster center.
@@ -212,6 +226,7 @@ cuvsError_t cuvsKMeansClusterCost(cuvsResources_t res,
                                   DLManagedTensor* X,
                                   DLManagedTensor* centroids,
                                   double* cost);
+
 /**
  * @}
  */

@@ -4,6 +4,7 @@
  */
 
 #include <cstdint>
+
 #include <dlpack/dlpack.h>
 
 #include <cuvs/cluster/kmeans.h>
@@ -17,16 +18,18 @@ namespace {
 
 cuvs::cluster::kmeans::params convert_params(const cuvsKMeansParams& params)
 {
-  auto kmeans_params       = cuvs::cluster::kmeans::params();
-  kmeans_params.metric     = static_cast<cuvs::distance::DistanceType>(params.metric);
-  kmeans_params.init       = static_cast<cuvs::cluster::kmeans::params::InitMethod>(params.init);
-  kmeans_params.n_clusters = params.n_clusters;
-  kmeans_params.max_iter   = params.max_iter;
-  kmeans_params.tol        = params.tol;
+  auto kmeans_params                = cuvs::cluster::kmeans::params();
+  kmeans_params.metric              = static_cast<cuvs::distance::DistanceType>(params.metric);
+  kmeans_params.init = static_cast<cuvs::cluster::kmeans::params::InitMethod>(params.init);
+  kmeans_params.n_clusters          = params.n_clusters;
+  kmeans_params.max_iter            = params.max_iter;
+  kmeans_params.tol                 = params.tol;
+  kmeans_params.n_init              = params.n_init;
   kmeans_params.oversampling_factor = params.oversampling_factor;
   kmeans_params.batch_samples       = params.batch_samples;
   kmeans_params.batch_centroids     = params.batch_centroids;
   kmeans_params.inertia_check       = params.inertia_check;
+  kmeans_params.streaming_batch_size  = params.streaming_batch_size;
   return kmeans_params;
 }
 
@@ -38,7 +41,7 @@ cuvs::cluster::kmeans::balanced_params convert_balanced_params(const cuvsKMeansP
   return kmeans_params;
 }
 
-template <typename T, typename IdxT = int32_t>
+template <typename T, typename IdxT = int64_t>
 void _fit(cuvsResources_t res,
           const cuvsKMeansParams& params,
           DLManagedTensor* X_tensor,
@@ -50,7 +53,51 @@ void _fit(cuvsResources_t res,
   auto X       = X_tensor->dl_tensor;
   auto res_ptr = reinterpret_cast<raft::resources*>(res);
 
-  if (cuvs::core::is_dlpack_device_compatible(X)) {
+  if (!cuvs::core::is_dlpack_device_compatible(X)) {
+    auto n_samples  = static_cast<IdxT>(X.shape[0]);
+    auto n_features = static_cast<IdxT>(X.shape[1]);
+
+    if (params.hierarchical) {
+      RAFT_FAIL("hierarchical kmeans is not supported with host data");
+    }
+
+    auto centroids_dl = centroids_tensor->dl_tensor;
+    if (!cuvs::core::is_dlpack_device_compatible(centroids_dl)) {
+      RAFT_FAIL("centroids must be on device memory");
+    }
+
+    auto X_view = raft::make_host_matrix_view<T const, IdxT>(
+      reinterpret_cast<T const*>(X.data), n_samples, n_features);
+    auto centroids_view =
+      cuvs::core::from_dlpack<raft::device_matrix_view<T, IdxT, raft::row_major>>(
+        centroids_tensor);
+
+    std::optional<raft::host_vector_view<T const, IdxT>> sample_weight;
+    if (sample_weight_tensor != NULL) {
+      auto sw = sample_weight_tensor->dl_tensor;
+      if (!cuvs::core::is_dlpack_host_compatible(sw)) {
+        RAFT_FAIL("sample_weight must be host accessible when X is on host");
+      }
+      sample_weight = raft::make_host_vector_view<T const, IdxT>(
+        reinterpret_cast<T const*>(sw.data), n_samples);
+    }
+
+    T inertia_temp;
+    IdxT n_iter_temp;
+
+    auto kmeans_params = convert_params(params);
+    cuvs::cluster::kmeans::fit(*res_ptr,
+                               kmeans_params,
+                               X_view,
+                               sample_weight,
+                               centroids_view,
+                               raft::make_host_scalar_view<T>(&inertia_temp),
+                               raft::make_host_scalar_view<IdxT>(&n_iter_temp));
+
+    *inertia = inertia_temp;
+    *n_iter  = n_iter_temp;
+
+  } else {
     using const_mdspan_type = raft::device_matrix_view<T const, IdxT, raft::row_major>;
     using mdspan_type       = raft::device_matrix_view<T, IdxT, raft::row_major>;
 
@@ -85,13 +132,11 @@ void _fit(cuvsResources_t res,
                                  cuvs::core::from_dlpack<const_mdspan_type>(X_tensor),
                                  sample_weight,
                                  cuvs::core::from_dlpack<mdspan_type>(centroids_tensor),
-                                 raft::make_host_scalar_view<T, IdxT>(&inertia_temp),
-                                 raft::make_host_scalar_view<IdxT, IdxT>(&n_iter_temp));
+                                 raft::make_host_scalar_view<T>(&inertia_temp),
+                                 raft::make_host_scalar_view<IdxT>(&n_iter_temp));
       *inertia = inertia_temp;
       *n_iter  = n_iter_temp;
     }
-  } else {
-    RAFT_FAIL("X dataset must be accessible on device memory");
   }
 }
 
@@ -143,7 +188,7 @@ void _predict(cuvsResources_t res,
                                      cuvs::core::from_dlpack<const_mdspan_type>(centroids_tensor),
                                      cuvs::core::from_dlpack<labels_mdspan_type>(labels_tensor),
                                      normalize_weight,
-                                     raft::make_host_scalar_view<T, IdxT>(&inertia_temp));
+                                     raft::make_host_scalar_view<T>(&inertia_temp));
       *inertia = inertia_temp;
     }
   } else {
@@ -168,7 +213,7 @@ void _cluster_cost(cuvsResources_t res,
     cuvs::cluster::kmeans::cluster_cost(*res_ptr,
                                         cuvs::core::from_dlpack<mdspan_type>(X_tensor),
                                         cuvs::core::from_dlpack<mdspan_type>(centroids_tensor),
-                                        raft::make_host_scalar_view<T, IdxT>(&cost_temp));
+                                        raft::make_host_scalar_view<T>(&cost_temp));
   } else {
     RAFT_FAIL("X dataset must be accessible on device memory");
   }
@@ -182,17 +227,20 @@ extern "C" cuvsError_t cuvsKMeansParamsCreate(cuvsKMeansParams_t* params)
   return cuvs::core::translate_exceptions([=] {
     cuvs::cluster::kmeans::params cpp_params;
     cuvs::cluster::kmeans::balanced_params cpp_balanced_params;
-    *params =
-      new cuvsKMeansParams{.metric     = static_cast<cuvsDistanceType>(cpp_params.metric),
-                           .n_clusters = cpp_params.n_clusters,
-                           .init       = static_cast<cuvsKMeansInitMethod>(cpp_params.init),
-                           .max_iter   = cpp_params.max_iter,
-                           .tol        = cpp_params.tol,
-                           .oversampling_factor  = cpp_params.oversampling_factor,
-                           .batch_samples        = cpp_params.batch_samples,
-                           .inertia_check        = cpp_params.inertia_check,
-                           .hierarchical         = false,
-                           .hierarchical_n_iters = static_cast<int>(cpp_balanced_params.n_iters)};
+    *params = new cuvsKMeansParams{
+      .metric               = static_cast<cuvsDistanceType>(cpp_params.metric),
+      .n_clusters           = cpp_params.n_clusters,
+      .init                 = static_cast<cuvsKMeansInitMethod>(cpp_params.init),
+      .max_iter             = cpp_params.max_iter,
+      .tol                  = cpp_params.tol,
+      .n_init               = cpp_params.n_init,
+      .oversampling_factor  = cpp_params.oversampling_factor,
+      .batch_samples        = cpp_params.batch_samples,
+      .batch_centroids      = cpp_params.batch_centroids,
+      .inertia_check        = cpp_params.inertia_check,
+      .hierarchical         = false,
+      .hierarchical_n_iters = static_cast<int>(cpp_balanced_params.n_iters),
+      .streaming_batch_size           = cpp_params.streaming_batch_size};
   });
 }
 
@@ -235,10 +283,9 @@ extern "C" cuvsError_t cuvsKMeansPredict(cuvsResources_t res,
   return cuvs::core::translate_exceptions([=] {
     auto dataset = X->dl_tensor;
     if (dataset.dtype.code == kDLFloat && dataset.dtype.bits == 32) {
-      _predict<float>(res, *params, X, sample_weight, centroids, labels, normalize_weight, inertia);
+        _predict<float>(res, *params, X, sample_weight, centroids, labels, normalize_weight, inertia);
     } else if (dataset.dtype.code == kDLFloat && dataset.dtype.bits == 64) {
-      _predict<double>(
-        res, *params, X, sample_weight, centroids, labels, normalize_weight, inertia);
+        _predict<double>(res, *params, X, sample_weight, centroids, labels, normalize_weight, inertia);
     } else {
       RAFT_FAIL("Unsupported dataset DLtensor dtype: %d and bits: %d",
                 dataset.dtype.code,

@@ -100,15 +100,31 @@ struct params : base_params {
    * useful to optimize/control the memory footprint
    * Default tile is [batch_samples x n_clusters] i.e. when batch_centroids is 0
    * then don't tile the centroids
+   *
+   * NB: These parameters are unrelated to streaming_batch_size, which controls how many
+   * samples to transfer from host to device per batch when processing out-of-core
+   * data.
    */
   int batch_samples = 1 << 15;
 
   /**
    * if 0 then batch_centroids = n_clusters
    */
-  int batch_centroids = 0;  //
+  int batch_centroids = 0;
 
+  /**
+   * If true, check inertia during iterations for early convergence.
+   */
   bool inertia_check = false;
+
+  /**
+   * Number of samples to process per GPU batch when fitting with host data.
+   * When set to 0, defaults to n_samples (process all at once).
+   * Only used by the batched (host-data) code path and ignored by device-data
+   * overloads.
+   * Default: 0 (process all data at once).
+   */
+  int64_t streaming_batch_size = 0;
 };
 
 /**
@@ -141,6 +157,82 @@ enum class kmeans_type { KMeans = 0, KMeansBalanced = 1 };
  * @{
  */
 
+/**
+ * @brief Find clusters with k-means algorithm using batched processing of host data.
+ *
+ * TODO: Evaluate replacing the extent type with int64_t. Reference issue:
+ * https://github.com/rapidsai/cuvs/issues/1961
+ *
+ * This overload supports out-of-core computation where the dataset resides
+ * on the host. Data is processed in GPU-sized batches, streaming from host to device.
+ * The batch size is controlled by params.streaming_batch_size.
+ *
+ * @code{.cpp}
+ *   #include <raft/core/resources.hpp>
+ *   #include <cuvs/cluster/kmeans.hpp>
+ *   using namespace cuvs::cluster;
+ *   ...
+ *   raft::resources handle;
+ *   cuvs::cluster::kmeans::params params;
+ *   params.n_clusters = 100;
+ *   params.streaming_batch_size = 100000;
+ *   float inertia;
+ *   int64_t n_iter;
+ *
+ *   // Data on host
+ *   std::vector<float> h_X(n_samples * n_features);
+ *   auto X = raft::make_host_matrix_view<const float, int64_t>(h_X.data(), n_samples, n_features);
+ *
+ *   // Centroids on device
+ *   auto centroids = raft::make_device_matrix<float, int64_t>(handle, params.n_clusters,
+ * n_features);
+ *
+ *   kmeans::fit(handle,
+ *               params,
+ *               X,
+ *               std::nullopt,
+ *               centroids.view(),
+ *               raft::make_host_scalar_view(&inertia),
+ *               raft::make_host_scalar_view(&n_iter));
+ * @endcode
+ *
+ * @param[in]     handle        The raft handle.
+ * @param[in]     params        Parameters for KMeans model. Batch size is read from
+ *                              params.streaming_batch_size.
+ * @param[in]     X             Training instances on HOST memory. The data must
+ *                              be in row-major format.
+ *                              [dim = n_samples x n_features]
+ * @param[in]     sample_weight Optional weights for each observation in X (on host).
+ *                              [len = n_samples]
+ * @param[inout]  centroids     [in] When init is InitMethod::Array, use
+ *                              centroids as the initial cluster centers.
+ *                              [out] The generated centroids from the
+ *                              kmeans algorithm are stored at the address
+ *                              pointed by 'centroids'.
+ *                              [dim = n_clusters x n_features]
+ * @param[out]    inertia       Sum of squared distances of samples to their
+ *                              closest cluster center.
+ * @param[out]    n_iter        Number of iterations run.
+ */
+void fit(raft::resources const& handle,
+         const cuvs::cluster::kmeans::params& params,
+         raft::host_matrix_view<const float, int64_t> X,
+         std::optional<raft::host_vector_view<const float, int64_t>> sample_weight,
+         raft::device_matrix_view<float, int64_t> centroids,
+         raft::host_scalar_view<float> inertia,
+         raft::host_scalar_view<int64_t> n_iter);
+
+/**
+ * @brief Find clusters with k-means algorithm using batched processing of host data.
+ */
+void fit(raft::resources const& handle,
+         const cuvs::cluster::kmeans::params& params,
+         raft::host_matrix_view<const double, int64_t> X,
+         std::optional<raft::host_vector_view<const double, int64_t>> sample_weight,
+         raft::device_matrix_view<double, int64_t> centroids,
+         raft::host_scalar_view<double> inertia,
+         raft::host_scalar_view<int64_t> n_iter);
+
 /**
  * @brief Find clusters with k-means algorithm.
  *   Initial centroids are chosen with k-means++ algorithm. Empty