rapidsai · tarang-jain · Apr 1, 2026
@@ -286,14 +286,13 @@ auto make_strided_dataset(const raft::resources& res, const SrcT& src, uint32_t
                                 0,
                                 out_array.size() * sizeof(value_type),
                                 raft::resource::get_cuda_stream(res)));
-  RAFT_CUDA_TRY(cudaMemcpy2DAsync(out_array.data_handle(),
-                                  sizeof(value_type) * required_stride,
-                                  src.data_handle(),
-                                  sizeof(value_type) * src_stride,
-                                  sizeof(value_type) * src.extent(1),
-                                  src.extent(0),
-                                  cudaMemcpyDefault,
-                                  raft::resource::get_cuda_stream(res)));
+  raft::copy_matrix(out_array.data_handle(),
+                    required_stride,
+                    src.data_handle(),
+                    src_stride,
+                    src.extent(1),
+                    src.extent(0),
+                    raft::resource::get_cuda_stream(res));
 
   return std::make_unique<out_owning_type>(std::move(out_array), out_layout);
 }
@@ -357,14 +356,13 @@ auto make_strided_dataset(
                                 0,
                                 out_array.size() * sizeof(value_type),
                                 raft::resource::get_cuda_stream(res)));
-  RAFT_CUDA_TRY(cudaMemcpy2DAsync(out_array.data_handle(),
-                                  sizeof(value_type) * required_stride,
-                                  src.data_handle(),
-                                  sizeof(value_type) * src_stride,
-                                  sizeof(value_type) * src.extent(1),
-                                  src.extent(0),
-                                  cudaMemcpyDefault,
-                                  raft::resource::get_cuda_stream(res)));
+  raft::copy_matrix(out_array.data_handle(),
+                    required_stride,
+                    src.data_handle(),
+                    src_stride,
+                    src.extent(1),
+                    src.extent(0),
+                    raft::resource::get_cuda_stream(res));
 
   return std::make_unique<out_owning_type>(std::move(out_array), out_layout);
 }

@@ -12,6 +12,7 @@
 #include <raft/core/resources.hpp>
 #include <raft/matrix/init.cuh>
 #include <raft/stats/histogram.cuh>
+#include <raft/util/cudart_utils.hpp>
 
 #include <rmm/device_buffer.hpp>
 
@@ -93,14 +94,13 @@ void add_node_core(
   for (const auto& batch : additional_dataset_batch) {
     // Step 1: Obtain K (=base_degree) nearest neighbors of the new vectors by CAGRA search
     // Create queries
-    RAFT_CUDA_TRY(cudaMemcpy2DAsync(queries.data_handle(),
-                                    sizeof(T) * dim,
-                                    batch.data(),
-                                    sizeof(T) * additional_dataset_view.stride(0),
-                                    sizeof(T) * dim,
-                                    batch.size(),
-                                    cudaMemcpyDefault,
-                                    raft::resource::get_cuda_stream(handle)));
+    raft::copy_matrix(queries.data_handle(),
+                      dim,
+                      batch.data(),
+                      additional_dataset_view.stride(0),
+                      dim,
+                      batch.size(),
+                      raft::resource::get_cuda_stream(handle));
 
     const auto queries_view = raft::make_device_matrix_view<const T, std::int64_t>(
       queries.data_handle(), batch.size(), dim);
@@ -407,23 +407,20 @@ void extend_core(
     // The padding area must be filled with zeros.!!!!!!!!!!!!!!!!!!!
     memset(host_updated_dataset.data_handle(), 0, sizeof(T) * host_updated_dataset.size());
 
-    RAFT_CUDA_TRY(cudaMemcpy2DAsync(host_updated_dataset.data_handle(),
-                                    sizeof(T) * stride,
-                                    strided_dset->view().data_handle(),
-                                    sizeof(T) * stride,
-                                    sizeof(T) * dim,
-                                    initial_dataset_size,
-                                    cudaMemcpyDefault,
-                                    raft::resource::get_cuda_stream(handle)));
-    RAFT_CUDA_TRY(
-      cudaMemcpy2DAsync(host_updated_dataset.data_handle() + initial_dataset_size * stride,
-                        sizeof(T) * stride,
-                        additional_dataset.data_handle(),
-                        sizeof(T) * additional_dataset.stride(0),
-                        sizeof(T) * dim,
-                        num_new_nodes,
-                        cudaMemcpyDefault,
-                        raft::resource::get_cuda_stream(handle)));
+    raft::copy_matrix(host_updated_dataset.data_handle(),
+                      stride,
+                      strided_dset->view().data_handle(),
+                      stride,
+                      dim,
+                      initial_dataset_size,
+                      raft::resource::get_cuda_stream(handle));
+    raft::copy_matrix(host_updated_dataset.data_handle() + initial_dataset_size * stride,
+                      stride,
+                      additional_dataset.data_handle(),
+                      additional_dataset.stride(0),
+                      dim,
+                      num_new_nodes,
+                      raft::resource::get_cuda_stream(handle));
 
     if (new_dataset_buffer_view.has_value()) {
       updated_dataset_view = new_dataset_buffer_view.value();

@@ -13,6 +13,7 @@
 #include <raft/core/mdspan_types.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/serialize.hpp>
+#include <raft/util/cudart_utils.hpp>
 
 #include "../../../core/nvtx.hpp"
 #include "../dataset_serialize.hpp"
@@ -169,14 +170,13 @@ void serialize_to_hnswlib(
                  static_cast<size_t>(dataset.extent(0)),
                  static_cast<size_t>(dataset.extent(1)));
     host_dataset = raft::make_host_matrix<T, int64_t>(dataset.extent(0), dataset.extent(1));
-    RAFT_CUDA_TRY(cudaMemcpy2DAsync(host_dataset.data_handle(),
-                                    sizeof(T) * host_dataset.extent(1),
-                                    dataset.data_handle(),
-                                    sizeof(T) * dataset.stride(0),
-                                    sizeof(T) * host_dataset.extent(1),
-                                    dataset.extent(0),
-                                    cudaMemcpyDefault,
-                                    raft::resource::get_cuda_stream(res)));
+    raft::copy_matrix(host_dataset.data_handle(),
+                      host_dataset.extent(1),
+                      dataset.data_handle(),
+                      dataset.stride(0),
+                      host_dataset.extent(1),
+                      dataset.extent(0),
+                      raft::resource::get_cuda_stream(res));
     raft::resource::sync_stream(res);
     host_dataset_view = raft::make_const_mdspan(host_dataset.view());
   }

@@ -10,6 +10,7 @@
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/host_mdarray.hpp>
 #include <raft/matrix/init.cuh>
+#include <raft/util/cudart_utils.hpp>
 #include <raft/util/integer_utils.hpp>
 
 #include <rmm/resource_ref.hpp>
@@ -284,14 +285,13 @@ void copy_with_padding(
   } else {
     // copy with padding
     raft::matrix::fill(res, dst.view(), T(0));
-    RAFT_CUDA_TRY(cudaMemcpy2DAsync(dst.data_handle(),
-                                    sizeof(T) * dst.extent(1),
-                                    src.data_handle(),
-                                    sizeof(T) * src.extent(1),
-                                    sizeof(T) * src.extent(1),
-                                    src.extent(0),
-                                    cudaMemcpyDefault,
-                                    raft::resource::get_cuda_stream(res)));
+    raft::copy_matrix(dst.data_handle(),
+                      dst.extent(1),
+                      src.data_handle(),
+                      src.extent(1),
+                      src.extent(1),
+                      src.extent(0),
+                      raft::resource::get_cuda_stream(res));
   }
 }
 }  // namespace cuvs::neighbors::cagra::detail
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
@@ -9,6 +9,7 @@
 #include <raft/core/host_mdarray.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/core/serialize.hpp>
+#include <raft/util/cudart_utils.hpp>
 
 #include <raft/core/logger.hpp>
 
@@ -44,14 +45,13 @@ void serialize(const raft::resources& res,
   // Remove padding before saving the dataset
   auto src = dataset.view();
   auto dst = raft::make_host_matrix<DataT, IdxT>(n_rows, dim);
-  RAFT_CUDA_TRY(cudaMemcpy2DAsync(dst.data_handle(),
-                                  sizeof(DataT) * dim,
-                                  src.data_handle(),
-                                  sizeof(DataT) * stride,
-                                  sizeof(DataT) * dim,
-                                  n_rows,
-                                  cudaMemcpyDefault,
-                                  raft::resource::get_cuda_stream(res)));
+  raft::copy_matrix(dst.data_handle(),
+                    dim,
+                    src.data_handle(),
+                    stride,
+                    dim,
+                    n_rows,
+                    raft::resource::get_cuda_stream(res));
   raft::resource::sync_stream(res);
   raft::serialize_mdspan(res, os, dst.view());
 }

@@ -18,6 +18,7 @@
 #include <raft/core/host_mdspan.hpp>
 #include <raft/core/logger.hpp>
 #include <raft/core/pinned_mdarray.hpp>
+#include <raft/util/cudart_utils.hpp>
 
 #include <hnswlib/hnswalg.h>
 #include <hnswlib/hnswlib.h>
@@ -233,14 +234,13 @@ std::enable_if_t<hierarchy == HnswHierarchy::CPU, std::unique_ptr<index<T>>> fro
                  static_cast<size_t>(cagra_dataset.extent(1)));
     host_dataset =
       raft::make_host_matrix<T, int64_t>(cagra_dataset.extent(0), cagra_dataset.extent(1));
-    RAFT_CUDA_TRY(cudaMemcpy2DAsync(host_dataset.data_handle(),
-                                    sizeof(T) * host_dataset.extent(1),
-                                    cagra_dataset.data_handle(),
-                                    sizeof(T) * cagra_dataset.stride(0),
-                                    sizeof(T) * host_dataset.extent(1),
-                                    cagra_dataset.extent(0),
-                                    cudaMemcpyDefault,
-                                    raft::resource::get_cuda_stream(res)));
+    raft::copy_matrix(host_dataset.data_handle(),
+                      host_dataset.extent(1),
+                      cagra_dataset.data_handle(),
+                      cagra_dataset.stride(0),
+                      host_dataset.extent(1),
+                      cagra_dataset.extent(0),
+                      raft::resource::get_cuda_stream(res));
     raft::resource::sync_stream(res);
     host_dataset_view = host_dataset.view();
   }
@@ -1048,14 +1048,13 @@ std::enable_if_t<hierarchy == HnswHierarchy::GPU, std::unique_ptr<index<T>>> fro
     }
   } else {
     common::nvtx::range<common::nvtx::domain::cuvs> copy_scope("get_linklist0<device>");
-    RAFT_CUDA_TRY(cudaMemcpy2DAsync(appr_algo->get_linklist0(0) + 1,
-                                    appr_algo->size_data_per_element_,
-                                    graph_ptr,
-                                    degree * sizeof(uint32_t),
-                                    degree * sizeof(uint32_t),
-                                    n_rows,
-                                    cudaMemcpyDefault,
-                                    raft::resource::get_cuda_stream(res)));
+    raft::copy_matrix(reinterpret_cast<char*>(appr_algo->get_linklist0(0) + 1),
+                      appr_algo->size_data_per_element_,
+                      reinterpret_cast<const char*>(graph_ptr),
+                      degree * sizeof(uint32_t),
+                      degree * sizeof(uint32_t),
+                      n_rows,
+                      raft::resource::get_cuda_stream(res));
 #pragma omp parallel for num_threads(num_threads)
     for (int64_t i = 0; i < n_rows; i++) {
       appr_algo->setListCount(appr_algo->get_linklist0(i), degree);

@@ -16,6 +16,7 @@
 #include <raft/core/nvtx.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/serialize.hpp>
+#include <raft/util/cudart_utils.hpp>
 
 #include "../dataset_serialize.hpp"
 
@@ -156,15 +157,13 @@ void serialize_sector_aligned(raft::resources const& res,
   if (!dataset_strided) { RAFT_FAIL("Invalid dataset"); }
   auto d_data = dataset_strided->view();
   auto h_data = raft::make_host_matrix<T, int64_t>(npts, ndims);
-  auto stride = dataset_strided->stride();
-  RAFT_CUDA_TRY(cudaMemcpy2DAsync(h_data.data_handle(),
-                                  sizeof(T) * ndims,
-                                  d_data.data_handle(),
-                                  sizeof(T) * stride,
-                                  sizeof(T) * ndims,
-                                  npts,
-                                  cudaMemcpyDefault,
-                                  raft::resource::get_cuda_stream(res)));
+  raft::copy_matrix(h_data.data_handle(),
+                    ndims,
+                    d_data.data_handle(),
+                    dataset_strided->stride(),
+                    ndims,
+                    npts,
+                    raft::resource::get_cuda_stream(res));
   raft::resource::sync_stream(res);
 
   // buffers

@@ -20,6 +20,7 @@
 #include <raft/core/resource/cuda_stream_pool.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/linalg/map.cuh>
+#include <raft/util/cudart_utils.hpp>
 #include <raft/util/integer_utils.hpp>
 #include <raft/util/pow2_utils.cuh>
 #include <raft/util/vectorized.cuh>
@@ -56,14 +57,13 @@ auto subsample(raft::resources const& res,
   size_t trainset_ratio = dataset.extent(0) / n_samples;
   auto result = raft::make_device_matrix<value_type, index_type>(res, n_samples, dataset.extent(1));
 
-  RAFT_CUDA_TRY(cudaMemcpy2DAsync(result.data_handle(),
-                                  sizeof(value_type) * dim,
-                                  dataset.data_handle(),
-                                  sizeof(value_type) * dim * trainset_ratio,
-                                  sizeof(value_type) * dim,
-                                  n_samples,
-                                  cudaMemcpyDefault,
-                                  raft::resource::get_cuda_stream(res)));
+  raft::copy_matrix(result.data_handle(),
+                    dim,
+                    dataset.data_handle(),
+                    dim * trainset_ratio,
+                    dim,
+                    n_samples,
+                    raft::resource::get_cuda_stream(res));
   return result;
 }
 

@@ -32,6 +32,7 @@
 #include <raft/linalg/norm.cuh>
 #include <raft/matrix/init.cuh>
 #include <raft/stats/histogram.cuh>
+#include <raft/util/cudart_utils.hpp>
 #include <raft/util/pow2_utils.cuh>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -411,14 +412,13 @@ inline auto build(raft::resources const& handle,
     rmm::device_uvector<T> trainset(
       n_rows_train * index.dim(), stream, raft::resource::get_large_workspace_resource(handle));
     // TODO: a proper sampling
-    RAFT_CUDA_TRY(cudaMemcpy2DAsync(trainset.data(),
-                                    sizeof(T) * index.dim(),
-                                    dataset,
-                                    sizeof(T) * index.dim() * trainset_ratio,
-                                    sizeof(T) * index.dim(),
-                                    n_rows_train,
-                                    cudaMemcpyDefault,
-                                    stream));
+    raft::copy_matrix(trainset.data(),
+                      index.dim(),
+                      dataset,
+                      index.dim() * trainset_ratio,
+                      index.dim(),
+                      n_rows_train,
+                      stream);
     auto trainset_const_view =
       raft::make_device_matrix_view<const T, IdxT>(trainset.data(), n_rows_train, index.dim());
     auto centers_view = raft::make_device_matrix_view<float, IdxT>(