diff --git a/cpp/include/cuvs/neighbors/common.hpp b/cpp/include/cuvs/neighbors/common.hpp index 3909098398..c7111aaf4a 100644 --- a/cpp/include/cuvs/neighbors/common.hpp +++ b/cpp/include/cuvs/neighbors/common.hpp @@ -286,14 +286,13 @@ auto make_strided_dataset(const raft::resources& res, const SrcT& src, uint32_t 0, out_array.size() * sizeof(value_type), raft::resource::get_cuda_stream(res))); - RAFT_CUDA_TRY(cudaMemcpy2DAsync(out_array.data_handle(), - sizeof(value_type) * required_stride, - src.data_handle(), - sizeof(value_type) * src_stride, - sizeof(value_type) * src.extent(1), - src.extent(0), - cudaMemcpyDefault, - raft::resource::get_cuda_stream(res))); + raft::copy_matrix(out_array.data_handle(), + required_stride, + src.data_handle(), + src_stride, + src.extent(1), + src.extent(0), + raft::resource::get_cuda_stream(res)); return std::make_unique(std::move(out_array), out_layout); } @@ -357,14 +356,13 @@ auto make_strided_dataset( 0, out_array.size() * sizeof(value_type), raft::resource::get_cuda_stream(res))); - RAFT_CUDA_TRY(cudaMemcpy2DAsync(out_array.data_handle(), - sizeof(value_type) * required_stride, - src.data_handle(), - sizeof(value_type) * src_stride, - sizeof(value_type) * src.extent(1), - src.extent(0), - cudaMemcpyDefault, - raft::resource::get_cuda_stream(res))); + raft::copy_matrix(out_array.data_handle(), + required_stride, + src.data_handle(), + src_stride, + src.extent(1), + src.extent(0), + raft::resource::get_cuda_stream(res)); return std::make_unique(std::move(out_array), out_layout); } diff --git a/cpp/src/neighbors/detail/cagra/add_nodes.cuh b/cpp/src/neighbors/detail/cagra/add_nodes.cuh index 8d6ac67d83..71ec2457f6 100644 --- a/cpp/src/neighbors/detail/cagra/add_nodes.cuh +++ b/cpp/src/neighbors/detail/cagra/add_nodes.cuh @@ -12,6 +12,7 @@ #include #include #include +#include #include @@ -93,14 +94,13 @@ void add_node_core( for (const auto& batch : additional_dataset_batch) { // Step 1: Obtain K (=base_degree) nearest neighbors of the new vectors by CAGRA search // Create queries - RAFT_CUDA_TRY(cudaMemcpy2DAsync(queries.data_handle(), - sizeof(T) * dim, - batch.data(), - sizeof(T) * additional_dataset_view.stride(0), - sizeof(T) * dim, - batch.size(), - cudaMemcpyDefault, - raft::resource::get_cuda_stream(handle))); + raft::copy_matrix(queries.data_handle(), + dim, + batch.data(), + additional_dataset_view.stride(0), + dim, + batch.size(), + raft::resource::get_cuda_stream(handle)); const auto queries_view = raft::make_device_matrix_view( queries.data_handle(), batch.size(), dim); @@ -407,23 +407,20 @@ void extend_core( // The padding area must be filled with zeros.!!!!!!!!!!!!!!!!!!! memset(host_updated_dataset.data_handle(), 0, sizeof(T) * host_updated_dataset.size()); - RAFT_CUDA_TRY(cudaMemcpy2DAsync(host_updated_dataset.data_handle(), - sizeof(T) * stride, - strided_dset->view().data_handle(), - sizeof(T) * stride, - sizeof(T) * dim, - initial_dataset_size, - cudaMemcpyDefault, - raft::resource::get_cuda_stream(handle))); - RAFT_CUDA_TRY( - cudaMemcpy2DAsync(host_updated_dataset.data_handle() + initial_dataset_size * stride, - sizeof(T) * stride, - additional_dataset.data_handle(), - sizeof(T) * additional_dataset.stride(0), - sizeof(T) * dim, - num_new_nodes, - cudaMemcpyDefault, - raft::resource::get_cuda_stream(handle))); + raft::copy_matrix(host_updated_dataset.data_handle(), + stride, + strided_dset->view().data_handle(), + stride, + dim, + initial_dataset_size, + raft::resource::get_cuda_stream(handle)); + raft::copy_matrix(host_updated_dataset.data_handle() + initial_dataset_size * stride, + stride, + additional_dataset.data_handle(), + additional_dataset.stride(0), + dim, + num_new_nodes, + raft::resource::get_cuda_stream(handle)); if (new_dataset_buffer_view.has_value()) { updated_dataset_view = new_dataset_buffer_view.value(); diff --git a/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh b/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh index 4e6de1d2b6..323184e757 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh @@ -13,6 +13,7 @@ #include #include #include +#include #include "../../../core/nvtx.hpp" #include "../dataset_serialize.hpp" @@ -169,14 +170,13 @@ void serialize_to_hnswlib( static_cast(dataset.extent(0)), static_cast(dataset.extent(1))); host_dataset = raft::make_host_matrix(dataset.extent(0), dataset.extent(1)); - RAFT_CUDA_TRY(cudaMemcpy2DAsync(host_dataset.data_handle(), - sizeof(T) * host_dataset.extent(1), - dataset.data_handle(), - sizeof(T) * dataset.stride(0), - sizeof(T) * host_dataset.extent(1), - dataset.extent(0), - cudaMemcpyDefault, - raft::resource::get_cuda_stream(res))); + raft::copy_matrix(host_dataset.data_handle(), + host_dataset.extent(1), + dataset.data_handle(), + dataset.stride(0), + host_dataset.extent(1), + dataset.extent(0), + raft::resource::get_cuda_stream(res)); raft::resource::sync_stream(res); host_dataset_view = raft::make_const_mdspan(host_dataset.view()); } diff --git a/cpp/src/neighbors/detail/cagra/utils.hpp b/cpp/src/neighbors/detail/cagra/utils.hpp index 75c51f4da7..59b983b511 100644 --- a/cpp/src/neighbors/detail/cagra/utils.hpp +++ b/cpp/src/neighbors/detail/cagra/utils.hpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -284,14 +285,13 @@ void copy_with_padding( } else { // copy with padding raft::matrix::fill(res, dst.view(), T(0)); - RAFT_CUDA_TRY(cudaMemcpy2DAsync(dst.data_handle(), - sizeof(T) * dst.extent(1), - src.data_handle(), - sizeof(T) * src.extent(1), - sizeof(T) * src.extent(1), - src.extent(0), - cudaMemcpyDefault, - raft::resource::get_cuda_stream(res))); + raft::copy_matrix(dst.data_handle(), + dst.extent(1), + src.data_handle(), + src.extent(1), + src.extent(1), + src.extent(0), + raft::resource::get_cuda_stream(res)); } } } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/dataset_serialize.hpp b/cpp/src/neighbors/detail/dataset_serialize.hpp index 7da60ff906..be11f2da53 100644 --- a/cpp/src/neighbors/detail/dataset_serialize.hpp +++ b/cpp/src/neighbors/detail/dataset_serialize.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #pragma once @@ -9,6 +9,7 @@ #include #include #include +#include #include @@ -44,14 +45,13 @@ void serialize(const raft::resources& res, // Remove padding before saving the dataset auto src = dataset.view(); auto dst = raft::make_host_matrix(n_rows, dim); - RAFT_CUDA_TRY(cudaMemcpy2DAsync(dst.data_handle(), - sizeof(DataT) * dim, - src.data_handle(), - sizeof(DataT) * stride, - sizeof(DataT) * dim, - n_rows, - cudaMemcpyDefault, - raft::resource::get_cuda_stream(res))); + raft::copy_matrix(dst.data_handle(), + dim, + src.data_handle(), + stride, + dim, + n_rows, + raft::resource::get_cuda_stream(res)); raft::resource::sync_stream(res); raft::serialize_mdspan(res, os, dst.view()); } diff --git a/cpp/src/neighbors/detail/hnsw.hpp b/cpp/src/neighbors/detail/hnsw.hpp index bdb3697809..5c8630632c 100644 --- a/cpp/src/neighbors/detail/hnsw.hpp +++ b/cpp/src/neighbors/detail/hnsw.hpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -233,14 +234,13 @@ std::enable_if_t>> fro static_cast(cagra_dataset.extent(1))); host_dataset = raft::make_host_matrix(cagra_dataset.extent(0), cagra_dataset.extent(1)); - RAFT_CUDA_TRY(cudaMemcpy2DAsync(host_dataset.data_handle(), - sizeof(T) * host_dataset.extent(1), - cagra_dataset.data_handle(), - sizeof(T) * cagra_dataset.stride(0), - sizeof(T) * host_dataset.extent(1), - cagra_dataset.extent(0), - cudaMemcpyDefault, - raft::resource::get_cuda_stream(res))); + raft::copy_matrix(host_dataset.data_handle(), + host_dataset.extent(1), + cagra_dataset.data_handle(), + cagra_dataset.stride(0), + host_dataset.extent(1), + cagra_dataset.extent(0), + raft::resource::get_cuda_stream(res)); raft::resource::sync_stream(res); host_dataset_view = host_dataset.view(); } @@ -1048,14 +1048,13 @@ std::enable_if_t>> fro } } else { common::nvtx::range copy_scope("get_linklist0"); - RAFT_CUDA_TRY(cudaMemcpy2DAsync(appr_algo->get_linklist0(0) + 1, - appr_algo->size_data_per_element_, - graph_ptr, - degree * sizeof(uint32_t), - degree * sizeof(uint32_t), - n_rows, - cudaMemcpyDefault, - raft::resource::get_cuda_stream(res))); + raft::copy_matrix(reinterpret_cast(appr_algo->get_linklist0(0) + 1), + appr_algo->size_data_per_element_, + reinterpret_cast(graph_ptr), + degree * sizeof(uint32_t), + degree * sizeof(uint32_t), + n_rows, + raft::resource::get_cuda_stream(res)); #pragma omp parallel for num_threads(num_threads) for (int64_t i = 0; i < n_rows; i++) { appr_algo->setListCount(appr_algo->get_linklist0(i), degree); diff --git a/cpp/src/neighbors/detail/vamana/vamana_serialize.cuh b/cpp/src/neighbors/detail/vamana/vamana_serialize.cuh index 887c9eb448..b9f20e3063 100644 --- a/cpp/src/neighbors/detail/vamana/vamana_serialize.cuh +++ b/cpp/src/neighbors/detail/vamana/vamana_serialize.cuh @@ -16,6 +16,7 @@ #include #include #include +#include #include "../dataset_serialize.hpp" @@ -156,15 +157,13 @@ void serialize_sector_aligned(raft::resources const& res, if (!dataset_strided) { RAFT_FAIL("Invalid dataset"); } auto d_data = dataset_strided->view(); auto h_data = raft::make_host_matrix(npts, ndims); - auto stride = dataset_strided->stride(); - RAFT_CUDA_TRY(cudaMemcpy2DAsync(h_data.data_handle(), - sizeof(T) * ndims, - d_data.data_handle(), - sizeof(T) * stride, - sizeof(T) * ndims, - npts, - cudaMemcpyDefault, - raft::resource::get_cuda_stream(res))); + raft::copy_matrix(h_data.data_handle(), + ndims, + d_data.data_handle(), + dataset_strided->stride(), + ndims, + npts, + raft::resource::get_cuda_stream(res)); raft::resource::sync_stream(res); // buffers diff --git a/cpp/src/neighbors/detail/vpq_dataset.cuh b/cpp/src/neighbors/detail/vpq_dataset.cuh index fc114dd215..cbe06f5ca4 100644 --- a/cpp/src/neighbors/detail/vpq_dataset.cuh +++ b/cpp/src/neighbors/detail/vpq_dataset.cuh @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -56,14 +57,13 @@ auto subsample(raft::resources const& res, size_t trainset_ratio = dataset.extent(0) / n_samples; auto result = raft::make_device_matrix(res, n_samples, dataset.extent(1)); - RAFT_CUDA_TRY(cudaMemcpy2DAsync(result.data_handle(), - sizeof(value_type) * dim, - dataset.data_handle(), - sizeof(value_type) * dim * trainset_ratio, - sizeof(value_type) * dim, - n_samples, - cudaMemcpyDefault, - raft::resource::get_cuda_stream(res))); + raft::copy_matrix(result.data_handle(), + dim, + dataset.data_handle(), + dim * trainset_ratio, + dim, + n_samples, + raft::resource::get_cuda_stream(res)); return result; } diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh b/cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh index 06862c083d..229c703505 100644 --- a/cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh +++ b/cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -411,14 +412,13 @@ inline auto build(raft::resources const& handle, rmm::device_uvector trainset( n_rows_train * index.dim(), stream, raft::resource::get_large_workspace_resource(handle)); // TODO: a proper sampling - RAFT_CUDA_TRY(cudaMemcpy2DAsync(trainset.data(), - sizeof(T) * index.dim(), - dataset, - sizeof(T) * index.dim() * trainset_ratio, - sizeof(T) * index.dim(), - n_rows_train, - cudaMemcpyDefault, - stream)); + raft::copy_matrix(trainset.data(), + index.dim(), + dataset, + index.dim() * trainset_ratio, + index.dim(), + n_rows_train, + stream); auto trainset_const_view = raft::make_device_matrix_view(trainset.data(), n_rows_train, index.dim()); auto centers_view = raft::make_device_matrix_view( diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh index 05f83141ad..05c0176faa 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include @@ -263,28 +264,15 @@ inline void pad_centers_with_norms(raft::resources const& res, // We rely on this to enable padded tensor gemm kernels during coarse search. cuvs::spatial::knn::detail::utils::memzero(padded_centers, n_lists * dim_ext, stream); // combine cluster_centers and their norms - RAFT_CUDA_TRY(cudaMemcpy2DAsync(padded_centers, - sizeof(float) * dim_ext, - centers, - sizeof(float) * dim, - sizeof(float) * dim, - n_lists, - cudaMemcpyDefault, - stream)); + raft::copy_matrix(padded_centers, dim_ext, centers, dim, dim, n_lists, stream); rmm::device_uvector center_norms(n_lists, stream); raft::linalg::norm( res, raft::make_device_matrix_view(centers, n_lists, dim), raft::make_device_vector_view(center_norms.data(), n_lists)); - RAFT_CUDA_TRY(cudaMemcpy2DAsync(padded_centers + dim, - sizeof(float) * dim_ext, - center_norms.data(), - sizeof(float), - sizeof(float), - n_lists, - cudaMemcpyDefault, - stream)); + raft::copy_matrix( + padded_centers + dim, dim_ext, center_norms.data(), size_t(1), size_t(1), n_lists, stream); } template @@ -1102,14 +1090,13 @@ void extend(raft::resources const& handle, // the kmeans_balanced::predict. Thus, we need the restructuring raft::copy. rmm::device_uvector cluster_centers( size_t(n_clusters) * size_t(index->dim()), stream, device_memory); - RAFT_CUDA_TRY(cudaMemcpy2DAsync(cluster_centers.data(), - sizeof(float) * index->dim(), - index->centers().data_handle(), - sizeof(float) * index->dim_ext(), - sizeof(float) * index->dim(), - n_clusters, - cudaMemcpyDefault, - stream)); + raft::copy_matrix(cluster_centers.data(), + index->dim(), + index->centers().data_handle(), + index->dim_ext(), + index->dim(), + n_clusters, + stream); vec_batches.prefetch_next_batch(); for (const auto& batch : vec_batches) { auto batch_data_view = raft::make_device_matrix_view( @@ -1656,13 +1643,12 @@ inline void extract_centers(raft::resources const& res, cluster_centers.extent(1) == index.dim(), "Number of columns in the output buffer for cluster centers and index dim are different"); auto stream = raft::resource::get_cuda_stream(res); - RAFT_CUDA_TRY(cudaMemcpy2DAsync(cluster_centers.data_handle(), - sizeof(float) * index.dim(), - index.centers().data_handle(), - sizeof(float) * index.dim_ext(), - sizeof(float) * index.dim(), - index.n_lists(), - cudaMemcpyDefault, - stream)); + raft::copy_matrix(cluster_centers.data_handle(), + index.dim(), + index.centers().data_handle(), + index.dim_ext(), + index.dim(), + index.n_lists(), + stream); } } // namespace cuvs::neighbors::ivf_pq::detail diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_build_common.cu b/cpp/src/neighbors/ivf_pq/ivf_pq_build_common.cu index 54cc5fee87..2deab7a073 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_build_common.cu +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_build_common.cu @@ -9,6 +9,7 @@ #include #include #include +#include namespace cuvs::neighbors::ivf_pq { namespace helpers { @@ -248,16 +249,7 @@ void make_rotation_matrix(raft::resources const& handle, float* mat = inplace ? rotation_matrix : buf.data(); raft::random::normal(handle, rng, mat, n * n, 0.0f, 1.0f); raft::linalg::detail::qrGetQ_inplace(handle, mat, n, n, stream); - if (!inplace) { - RAFT_CUDA_TRY(cudaMemcpy2DAsync(rotation_matrix, - sizeof(float) * n_cols, - mat, - sizeof(float) * n, - sizeof(float) * n_cols, - n_rows, - cudaMemcpyDefault, - stream)); - } + if (!inplace) { raft::copy_matrix(rotation_matrix, n_cols, mat, n, n_cols, n_rows, stream); } } else { uint32_t stride = n + 1; auto rotation_matrix_view = diff --git a/cpp/src/preprocessing/quantize/detail/binary.cuh b/cpp/src/preprocessing/quantize/detail/binary.cuh index 3c61aba562..fe8288a7cb 100644 --- a/cpp/src/preprocessing/quantize/detail/binary.cuh +++ b/cpp/src/preprocessing/quantize/detail/binary.cuh @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -238,14 +239,13 @@ auto train(raft::resources const& res, thrust::sort(thrust::device, start_ptr, start_ptr + num_samples); } - RAFT_CUDA_TRY(cudaMemcpy2DAsync(threshold_ptr + dim_offset, - sizeof(T), - sampled_dataset_chunk.data_handle() + (num_samples - 1) / 2, - num_samples * sizeof(T), - sizeof(T), - dim_chunk, - cudaMemcpyDefault, - raft::resource::get_cuda_stream(res))); + raft::copy_matrix(threshold_ptr + dim_offset, + size_t(1), + sampled_dataset_chunk.data_handle() + (num_samples - 1) / 2, + num_samples, + size_t(1), + dim_chunk, + raft::resource::get_cuda_stream(res)); } } return quantizer; diff --git a/cpp/src/preprocessing/quantize/detail/pq.cuh b/cpp/src/preprocessing/quantize/detail/pq.cuh index e04681b974..77fb0ac4f9 100644 --- a/cpp/src/preprocessing/quantize/detail/pq.cuh +++ b/cpp/src/preprocessing/quantize/detail/pq.cuh @@ -13,6 +13,7 @@ #include #include #include +#include #include "../../../cluster/kmeans_balanced.cuh" @@ -99,14 +100,13 @@ auto train_pq_subspaces( } for (ix_t m = 0; m < pq_dim; m++) { - RAFT_CUDA_TRY(cudaMemcpy2DAsync(sub_dataset.data_handle(), - sizeof(MathT) * pq_len, - trainset_ptr + m * pq_len, - sizeof(MathT) * dim, - sizeof(MathT) * pq_len, - n_rows_train, - cudaMemcpyDefault, - raft::resource::get_cuda_stream(res))); + raft::copy_matrix(sub_dataset.data_handle(), + pq_len, + trainset_ptr + m * pq_len, + dim, + pq_len, + n_rows_train, + raft::resource::get_cuda_stream(res)); auto pq_centers_subspace_view = raft::make_device_matrix_view( pq_centers.data_handle() + m * pq_n_centers * pq_len, pq_n_centers, pq_len); cuvs::neighbors::detail::train_pq_centers(