From fcffe8a17ee87873b8aeb2dede29ce79b7f9eb79 Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Mon, 16 Mar 2026 20:11:53 +0000
Subject: [PATCH 01/21] Always build with JIT+LTO

Since https://github.com/rapidsai/cuvs/pull/1909, we've been able to use
older versions of the CUDA driver, since we no longer rely on
`cudaLibraryEnumerateKernels()`. Since
https://github.com/rapidsai/cuvs/pull/1918, we've been using static
cudart, which allows us to run on platforms with versions of CUDA older
than 12.8 installed, since the runtime library API is now bundled with
cuvs. Always build with JIT+LTO so that we can get the full compile time
and binary size benefits in CUDA 12 too.
---
 .../all_cuda-129_arch-aarch64.yaml            |  1 +
 .../all_cuda-129_arch-x86_64.yaml             |  1 +
 .../bench_ann_cuda-129_arch-aarch64.yaml      |  1 +
 .../bench_ann_cuda-129_arch-x86_64.yaml       |  1 +
 .../go_cuda-129_arch-aarch64.yaml             |  1 +
 .../environments/go_cuda-129_arch-x86_64.yaml |  1 +
 .../rust_cuda-129_arch-aarch64.yaml           |  1 +
 .../rust_cuda-129_arch-x86_64.yaml            |  1 +
 conda/recipes/libcuvs/recipe.yaml             | 52 +++++--------------
 cpp/CMakeLists.txt                            |  8 +--
 dependencies.yaml                             | 11 +---
 11 files changed, 24 insertions(+), 55 deletions(-)

diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml
index 91c16a5478..2cfa357ea5 100644
--- a/conda/environments/all_cuda-129_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-129_arch-aarch64.yaml
@@ -30,6 +30,7 @@ dependencies:
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
+- libnvjitlink-dev
 - librmm==26.4.*,>=0.0.0a0
 - make
 - nccl>=2.19
diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml
index 7eae2608ea..29c4318ff9 100644
--- a/conda/environments/all_cuda-129_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-129_arch-x86_64.yaml
@@ -30,6 +30,7 @@ dependencies:
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
+- libnvjitlink-dev
 - librmm==26.4.*,>=0.0.0a0
 - make
 - nccl>=2.19
diff --git a/conda/environments/bench_ann_cuda-129_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-129_arch-aarch64.yaml
index ab7833b9e1..e4ee1f9449 100644
--- a/conda/environments/bench_ann_cuda-129_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-129_arch-aarch64.yaml
@@ -29,6 +29,7 @@ dependencies:
 - libcusolver-dev
 - libcusparse-dev
 - libcuvs==26.4.*,>=0.0.0a0
+- libnvjitlink-dev
 - librmm==26.4.*,>=0.0.0a0
 - matplotlib-base>=3.9
 - nccl>=2.19
diff --git a/conda/environments/bench_ann_cuda-129_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-129_arch-x86_64.yaml
index e451e258eb..f738028490 100644
--- a/conda/environments/bench_ann_cuda-129_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-129_arch-x86_64.yaml
@@ -31,6 +31,7 @@ dependencies:
 - libcusolver-dev
 - libcusparse-dev
 - libcuvs==26.4.*,>=0.0.0a0
+- libnvjitlink-dev
 - librmm==26.4.*,>=0.0.0a0
 - matplotlib-base>=3.9
 - mkl-devel=2023
diff --git a/conda/environments/go_cuda-129_arch-aarch64.yaml b/conda/environments/go_cuda-129_arch-aarch64.yaml
index 55842e86b8..60455b7cec 100644
--- a/conda/environments/go_cuda-129_arch-aarch64.yaml
+++ b/conda/environments/go_cuda-129_arch-aarch64.yaml
@@ -25,6 +25,7 @@ dependencies:
 - libcusolver-dev
 - libcusparse-dev
 - libcuvs==26.4.*,>=0.0.0a0
+- libnvjitlink-dev
 - libraft==26.4.*,>=0.0.0a0
 - nccl>=2.19
 - ninja
diff --git a/conda/environments/go_cuda-129_arch-x86_64.yaml b/conda/environments/go_cuda-129_arch-x86_64.yaml
index 2854de33b7..cd6911797d 100644
--- a/conda/environments/go_cuda-129_arch-x86_64.yaml
+++ b/conda/environments/go_cuda-129_arch-x86_64.yaml
@@ -25,6 +25,7 @@ dependencies:
 - libcusolver-dev
 - libcusparse-dev
 - libcuvs==26.4.*,>=0.0.0a0
+- libnvjitlink-dev
 - libraft==26.4.*,>=0.0.0a0
 - nccl>=2.19
 - ninja
diff --git a/conda/environments/rust_cuda-129_arch-aarch64.yaml b/conda/environments/rust_cuda-129_arch-aarch64.yaml
index 0aa5a7ea6f..1f84c62a68 100644
--- a/conda/environments/rust_cuda-129_arch-aarch64.yaml
+++ b/conda/environments/rust_cuda-129_arch-aarch64.yaml
@@ -22,6 +22,7 @@ dependencies:
 - libcusolver-dev
 - libcusparse-dev
 - libcuvs==26.4.*,>=0.0.0a0
+- libnvjitlink-dev
 - libraft==26.4.*,>=0.0.0a0
 - make
 - nccl>=2.19
diff --git a/conda/environments/rust_cuda-129_arch-x86_64.yaml b/conda/environments/rust_cuda-129_arch-x86_64.yaml
index b9dabfafa7..55d79b591a 100644
--- a/conda/environments/rust_cuda-129_arch-x86_64.yaml
+++ b/conda/environments/rust_cuda-129_arch-x86_64.yaml
@@ -22,6 +22,7 @@ dependencies:
 - libcusolver-dev
 - libcusparse-dev
 - libcuvs==26.4.*,>=0.0.0a0
+- libnvjitlink-dev
 - libraft==26.4.*,>=0.0.0a0
 - make
 - nccl>=2.19
diff --git a/conda/recipes/libcuvs/recipe.yaml b/conda/recipes/libcuvs/recipe.yaml
index 4453b18393..71b2ec0b4d 100644
--- a/conda/recipes/libcuvs/recipe.yaml
+++ b/conda/recipes/libcuvs/recipe.yaml
@@ -72,9 +72,7 @@ cache:
       - ninja
       - ${{ stdlib("c") }}
     host:
-      - if: cuda_major == "13"
-        then:
-          - libnvjitlink-dev
+      - libnvjitlink-dev
       - librmm =${{ minor_version }}
       - libraft-headers =${{ minor_version }}
       - nccl ${{ nccl_version }}
@@ -121,9 +119,7 @@ outputs:
         - libcurand-dev
         - libcusolver-dev
         - libcusparse-dev
-        - if: cuda_major == "13"
-          then:
-            - libnvjitlink-dev
+        - libnvjitlink-dev
       run:
         - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }}
         - libraft-headers =${{ minor_version }}
@@ -133,9 +129,7 @@ outputs:
         - libcurand
         - libcusolver
         - libcusparse
-        - if: cuda_major == "13"
-          then:
-            - libnvjitlink
+        - libnvjitlink
       ignore_run_exports:
         by_name:
           - cuda-cudart
@@ -149,9 +143,7 @@ outputs:
           - librmm
           - mkl
           - nccl
-          - if: cuda_major == "13"
-            then:
-              - libnvjitlink
+          - libnvjitlink
     about:
       homepage: ${{ load_from_file("python/libcuvs/pyproject.toml").project.urls.Homepage }}
       license: ${{ load_from_file("python/libcuvs/pyproject.toml").project.license }}
@@ -188,9 +180,7 @@ outputs:
         - libcurand-dev
         - libcusolver-dev
         - libcusparse-dev
-        - if: cuda_major == "13"
-          then:
-            - libnvjitlink-dev
+        - libnvjitlink-dev
       run:
         - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }}
         - ${{ pin_subpackage("libcuvs-headers", exact=True) }}
@@ -201,9 +191,7 @@ outputs:
         - libcurand
         - libcusolver
         - libcusparse
-        - if: cuda_major == "13"
-          then:
-            - libnvjitlink
+        - libnvjitlink
       ignore_run_exports:
         by_name:
           - cuda-cudart
@@ -217,9 +205,7 @@ outputs:
           - librmm
           - mkl
           - nccl
-          - if: cuda_major == "13"
-            then:
-              - libnvjitlink
+          - libnvjitlink
     about:
       homepage: ${{ load_from_file("python/libcuvs/pyproject.toml").project.urls.Homepage }}
       license: ${{ load_from_file("python/libcuvs/pyproject.toml").project.license }}
@@ -254,9 +240,7 @@ outputs:
         - libcurand-dev
         - libcusolver-dev
         - libcusparse-dev
-        - if: cuda_major == "13"
-          then:
-            - libnvjitlink-dev
+        - libnvjitlink-dev
       run:
         - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }}
         - ${{ pin_subpackage("libcuvs-headers", exact=True) }}
@@ -267,9 +251,7 @@ outputs:
         - libcurand
         - libcusolver
         - libcusparse
-        - if: cuda_major == "13"
-          then:
-            - libnvjitlink
+        - libnvjitlink
       ignore_run_exports:
         by_name:
           - cuda-cudart
@@ -280,9 +262,7 @@ outputs:
           - libcurand
           - libcusolver
           - libcusparse
-          - if: cuda_major == "13"
-            then:
-              - libnvjitlink
+          - libnvjitlink
           - librmm
           - mkl
           - nccl
@@ -419,9 +399,7 @@ outputs:
         - libcurand-dev
         - libcusolver-dev
         - libcusparse-dev
-        - if: cuda_major == "13"
-          then:
-            - libnvjitlink-dev
+        - libnvjitlink-dev
       run:
         - ${{ pin_subpackage("libcuvs-headers", exact=True) }}
         - ${{ pin_subpackage("libcuvs", exact=True) }}
@@ -431,9 +409,7 @@ outputs:
         - libcurand
         - libcusolver
         - libcusparse
-        - if: cuda_major == "13"
-          then:
-            - libnvjitlink
+        - libnvjitlink
       ignore_run_exports:
         by_name:
           - cuda-cudart
@@ -444,9 +420,7 @@ outputs:
           - libcurand
           - libcusolver
           - libcusparse
-          - if: cuda_major == "13"
-            then:
-              - libnvjitlink
+          - libnvjitlink
           - librmm
           - mkl
           - nccl
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index f1aeafdda2..e8ffe449f7 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -355,12 +355,8 @@ if(NOT BUILD_CPU_ONLY)
     )
   endif()
 
-  set(JIT_LTO_TARGET_ARCHITECTURE "")
-  set(JIT_LTO_COMPILATION OFF)
-  if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
-    set(JIT_LTO_TARGET_ARCHITECTURE "75-real")
-    set(JIT_LTO_COMPILATION ON)
-  endif()
+  set(JIT_LTO_TARGET_ARCHITECTURE "75-real")
+  set(JIT_LTO_COMPILATION ON)
 
   if(JIT_LTO_COMPILATION)
     # Generate interleaved scan kernel files at build time
diff --git a/dependencies.yaml b/dependencies.yaml
index f925b41927..274977d5c5 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -326,16 +326,7 @@ dependencies:
           - libcurand-dev
           - libcusolver-dev
           - libcusparse-dev
-    specific:
-      - output_types: conda
-        matrices:
-          - matrix:
-              cuda: "13.*"
-            packages:
-              - libnvjitlink-dev
-          - matrix:
-              cuda: "12.*"
-            packages:
+          - libnvjitlink-dev
   cuda_wheels:
     specific:
       - output_types: [requirements, pyproject]

From 6c91f9de8d831f75eef719e6b743f44e3d83de2a Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Mon, 16 Mar 2026 23:26:42 +0000
Subject: [PATCH 02/21] Use the driver API instead

---
 cpp/CMakeLists.txt                            |  2 +
 .../cuvs/detail/jit_lto/AlgorithmLauncher.hpp | 16 +++---
 cpp/include/cuvs/detail/jit_lto/cu_try.hpp    | 25 +++++++++
 cpp/src/detail/jit_lto/AlgorithmLauncher.cpp  | 55 ++++++++++---------
 cpp/src/detail/jit_lto/AlgorithmPlanner.cpp   | 28 ++++++----
 .../ivf_flat_interleaved_scan_jit.cuh         | 14 ++---
 6 files changed, 87 insertions(+), 53 deletions(-)
 create mode 100644 cpp/include/cuvs/detail/jit_lto/cu_try.hpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index e8ffe449f7..fcf2b30368 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -776,6 +776,7 @@ if(NOT BUILD_CPU_ONLY)
              $<BUILD_LOCAL_INTERFACE:$<TARGET_NAME_IF_EXISTS:NCCL::NCCL>>
              $<BUILD_LOCAL_INTERFACE:$<TARGET_NAME_IF_EXISTS:hnswlib::hnswlib>>
              $<$<BOOL:${CUVS_NVTX}>:CUDA::nvtx3>
+             CUDA::cuda_driver
       PRIVATE
         $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
         $<COMPILE_ONLY:nvidia::cutlass::cutlass>
@@ -838,6 +839,7 @@ SECTIONS
              ${CUVS_CTK_MATH_DEPENDENCIES}
              $<TARGET_NAME_IF_EXISTS:NCCL::NCCL> # needs to be public for DT_NEEDED
              $<BUILD_LOCAL_INTERFACE:$<TARGET_NAME_IF_EXISTS:hnswlib::hnswlib>> # header only
+             CUDA::cuda_driver
       PRIVATE
         $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
         $<$<BOOL:${JIT_LTO_COMPILATION}>:CUDA::nvJitLink>
diff --git a/cpp/include/cuvs/detail/jit_lto/AlgorithmLauncher.hpp b/cpp/include/cuvs/detail/jit_lto/AlgorithmLauncher.hpp
index 18e7f7cb2f..3749a77773 100644
--- a/cpp/include/cuvs/detail/jit_lto/AlgorithmLauncher.hpp
+++ b/cpp/include/cuvs/detail/jit_lto/AlgorithmLauncher.hpp
@@ -11,13 +11,13 @@
 #include <unordered_map>
 #include <vector_types.h>
 
-#include <cuda_runtime.h>
+#include <cuda.h>
 #include <memory>
 
 struct AlgorithmLauncher {
-  AlgorithmLauncher() : kernel{nullptr}, library{nullptr} {}
+  AlgorithmLauncher() : function{nullptr}, library{nullptr} {}
 
-  AlgorithmLauncher(cudaKernel_t k, cudaLibrary_t lib);
+  AlgorithmLauncher(CUfunction f, CUlibrary lib);
 
   ~AlgorithmLauncher();
 
@@ -28,18 +28,18 @@ struct AlgorithmLauncher {
   AlgorithmLauncher& operator=(AlgorithmLauncher&& other) noexcept;
 
   template <typename... Args>
-  void dispatch(cudaStream_t stream, dim3 grid, dim3 block, std::size_t shared_mem, Args&&... args)
+  void dispatch(CUstream stream, dim3 grid, dim3 block, std::size_t shared_mem, Args&&... args)
   {
     void* kernel_args[] = {const_cast<void*>(static_cast<void const*>(&args))...};
     this->call(stream, grid, block, shared_mem, kernel_args);
   }
 
-  cudaKernel_t get_kernel() { return this->kernel; }
+  CUfunction get_function() { return this->function; }
 
  private:
-  void call(cudaStream_t stream, dim3 grid, dim3 block, std::size_t shared_mem, void** args);
-  cudaKernel_t kernel;
-  cudaLibrary_t library;
+  void call(CUstream stream, dim3 grid, dim3 block, std::size_t shared_mem, void** args);
+  CUfunction function;
+  CUlibrary library;
 };
 
 std::unordered_map<std::string, std::shared_ptr<AlgorithmLauncher>>& get_cached_launchers();
diff --git a/cpp/include/cuvs/detail/jit_lto/cu_try.hpp b/cpp/include/cuvs/detail/jit_lto/cu_try.hpp
new file mode 100644
index 0000000000..041d482846
--- /dev/null
+++ b/cpp/include/cuvs/detail/jit_lto/cu_try.hpp
@@ -0,0 +1,25 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include <raft/util/cuda_rt_essentials.hpp>
+
+#include <cuda.h>
+
+#define CU_TRY(call)                                                                      \
+  do {                                                                                    \
+    CUresult const status = call;                                                         \
+    if (status != CUDA_SUCCESS) {                                                         \
+      std::string msg{};                                                                  \
+      const char* name;                                                                   \
+      cuGetErrorName(status, &name);                                                      \
+      const char* str;                                                                    \
+      cuGetErrorString(status, &str);                                                     \
+      SET_ERROR_MSG(                                                                      \
+        msg, "CUDA error encountered at: ", "call='%s', Reason=%s:%s", #call, name, str); \
+      throw raft::cuda_error(msg);                                                        \
+    }                                                                                     \
+  } while (0)
diff --git a/cpp/src/detail/jit_lto/AlgorithmLauncher.cpp b/cpp/src/detail/jit_lto/AlgorithmLauncher.cpp
index 0402ef8304..f490ec9dcd 100644
--- a/cpp/src/detail/jit_lto/AlgorithmLauncher.cpp
+++ b/cpp/src/detail/jit_lto/AlgorithmLauncher.cpp
@@ -4,52 +4,55 @@
  */
 
 #include <cuvs/detail/jit_lto/AlgorithmLauncher.hpp>
+#include <cuvs/detail/jit_lto/cu_try.hpp>
 
-#include <raft/util/cuda_rt_essentials.hpp>
-
-AlgorithmLauncher::AlgorithmLauncher(cudaKernel_t k, cudaLibrary_t lib) : kernel{k}, library{lib} {}
+AlgorithmLauncher::AlgorithmLauncher(CUfunction f, CUlibrary lib) : function{f}, library{lib} {}
 
 AlgorithmLauncher::~AlgorithmLauncher()
 {
-  if (library != nullptr) { (void)cudaLibraryUnload(library); }
+  if (library != nullptr) { (void)cuLibraryUnload(library); }
 }
 
 AlgorithmLauncher::AlgorithmLauncher(AlgorithmLauncher&& other) noexcept
-  : kernel{other.kernel}, library{other.library}
+  : function{other.function}, library{other.library}
 {
-  other.kernel  = nullptr;
-  other.library = nullptr;
+  other.function = nullptr;
+  other.library  = nullptr;
 }
 
 AlgorithmLauncher& AlgorithmLauncher::operator=(AlgorithmLauncher&& other) noexcept
 {
   if (this != &other) {
     // Unload current library if it exists
-    if (library != nullptr) { cudaLibraryUnload(library); }
-    kernel        = other.kernel;
-    library       = other.library;
-    other.kernel  = nullptr;
-    other.library = nullptr;
+    if (library != nullptr) { cuLibraryUnload(library); }
+    function       = other.function;
+    library        = other.library;
+    other.function = nullptr;
+    other.library  = nullptr;
   }
   return *this;
 }
 
 void AlgorithmLauncher::call(
-  cudaStream_t stream, dim3 grid, dim3 block, std::size_t shared_mem, void** kernel_args)
+  CUstream stream, dim3 grid, dim3 block, std::size_t shared_mem, void** kernel_args)
 {
-  cudaLaunchAttribute attribute[1];
-  attribute[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
-  attribute[0].val.programmaticStreamSerializationAllowed = 1;
-
-  cudaLaunchConfig_t config;
-  config.gridDim          = grid;
-  config.blockDim         = block;
-  config.stream           = stream;
-  config.attrs            = attribute;
-  config.numAttrs         = 1;
-  config.dynamicSmemBytes = shared_mem;
-
-  RAFT_CUDA_TRY(cudaLaunchKernelExC(&config, kernel, kernel_args));
+  CUlaunchAttribute attribute[1];
+  attribute[0].id = CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION;
+  attribute[0].value.programmaticStreamSerializationAllowed = 1;
+
+  CUlaunchConfig config;
+  config.gridDimX       = grid.x;
+  config.gridDimY       = grid.y;
+  config.gridDimZ       = grid.z;
+  config.blockDimX      = block.x;
+  config.blockDimY      = block.y;
+  config.blockDimZ      = block.z;
+  config.hStream        = stream;
+  config.attrs          = attribute;
+  config.numAttrs       = 1;
+  config.sharedMemBytes = shared_mem;
+
+  CU_TRY(cuLaunchKernelEx(&config, function, kernel_args, nullptr));
 }
 
 std::unordered_map<std::string, std::shared_ptr<AlgorithmLauncher>>& get_cached_launchers()
diff --git a/cpp/src/detail/jit_lto/AlgorithmPlanner.cpp b/cpp/src/detail/jit_lto/AlgorithmPlanner.cpp
index dbb1f09c30..11d1638481 100644
--- a/cpp/src/detail/jit_lto/AlgorithmPlanner.cpp
+++ b/cpp/src/detail/jit_lto/AlgorithmPlanner.cpp
@@ -15,7 +15,9 @@
 
 #include <cuvs/detail/jit_lto/AlgorithmPlanner.hpp>
 #include <cuvs/detail/jit_lto/FragmentDatabase.hpp>
+#include <cuvs/detail/jit_lto/cu_try.hpp>
 
+#include "cuda.h"
 #include "cuda_runtime.h"
 #include "nvJitLink.h"
 
@@ -69,12 +71,12 @@ std::shared_ptr<AlgorithmLauncher> AlgorithmPlanner::get_launcher()
 
 std::shared_ptr<AlgorithmLauncher> AlgorithmPlanner::build()
 {
-  int device = 0;
-  int major  = 0;
-  int minor  = 0;
-  RAFT_CUDA_TRY(cudaGetDevice(&device));
-  RAFT_CUDA_TRY(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device));
-  RAFT_CUDA_TRY(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device));
+  CUdevice device;
+  int major = 0;
+  int minor = 0;
+  CU_TRY(cuDeviceGet(&device, 0));
+  CU_TRY(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
+  CU_TRY(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
 
   std::string archs = "-arch=sm_" + std::to_string((major * 10 + minor));
 
@@ -106,12 +108,14 @@ std::shared_ptr<AlgorithmLauncher> AlgorithmPlanner::build()
   RAFT_EXPECTS(result == NVJITLINK_SUCCESS, "nvJitLinkDestroy failed");
 
   // cubin is linked, so now load it
-  cudaLibrary_t library;
-  RAFT_CUDA_TRY(
-    cudaLibraryLoadData(&library, cubin.get(), nullptr, nullptr, 0, nullptr, nullptr, 0));
+  CUlibrary library;
+  CU_TRY(cuLibraryLoadData(&library, cubin.get(), nullptr, nullptr, 0, nullptr, nullptr, 0));
 
-  cudaKernel_t kernel;
-  RAFT_CUDA_TRY(cudaLibraryGetKernel(&kernel, library, this->entrypoint.c_str()));
+  CUkernel kernel;
+  CU_TRY(cuLibraryGetKernel(&kernel, library, this->entrypoint.c_str()));
 
-  return std::make_shared<AlgorithmLauncher>(kernel, library);
+  CUfunction function;
+  CU_TRY(cuKernelGetFunction(&function, kernel));
+
+  return std::make_shared<AlgorithmLauncher>(function, library);
 }
diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan_jit.cuh b/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan_jit.cuh
index be8652dd59..c1f67094c7 100644
--- a/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan_jit.cuh
+++ b/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan_jit.cuh
@@ -8,6 +8,7 @@
 #include "../ivf_common.cuh"
 #include "jit_lto_kernels/interleaved_scan_planner.hpp"
 #include <cstdint>
+#include <cuvs/detail/jit_lto/cu_try.hpp>
 #include <cuvs/detail/jit_lto/ivf_flat/interleaved_scan_tags.hpp>
 #include <cuvs/neighbors/common.hpp>
 #include <cuvs/neighbors/ivf_flat.hpp>
@@ -16,7 +17,6 @@
 #include <cuvs/distance/distance.hpp>
 #include <raft/core/logger.hpp>
 #include <raft/core/operators.hpp>
-#include <raft/util/cuda_rt_essentials.hpp>  // RAFT_CUDA_TRY
 #include <raft/util/pow2_utils.cuh>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -102,14 +102,14 @@ constexpr auto get_post_lambda_name()
 inline uint32_t configure_launch_x(uint32_t numQueries,
                                    uint32_t n_probes,
                                    int32_t sMemSize,
-                                   cudaKernel_t func)
+                                   CUfunction func)
 {
-  int dev_id;
-  RAFT_CUDA_TRY(cudaGetDevice(&dev_id));
+  CUdevice dev_id;
+  CU_TRY(cuDeviceGet(&dev_id, 0));
   int num_sms;
-  RAFT_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id));
+  CU_TRY(cuDeviceGetAttribute(&num_sms, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev_id));
   int num_blocks_per_sm = 0;
-  RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+  CU_TRY(cuOccupancyMaxActiveBlocksPerMultiprocessor(
     &num_blocks_per_sm, func, kThreadsPerBlock, sMemSize));
 
   size_t min_grid_size = num_sms * num_blocks_per_sm;
@@ -178,7 +178,7 @@ void launch_kernel(const index<T, IdxT>& index,
 
   if (grid_dim_x == 0) {
     grid_dim_x = configure_launch_x(
-      std::min(kMaxGridY, num_queries), n_probes, smem_size, kernel_launcher->get_kernel());
+      std::min(kMaxGridY, num_queries), n_probes, smem_size, kernel_launcher->get_function());
     return;
   }
 

From e858407d3156a02babfdd28daf406a28aa5a2f0c Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Mon, 16 Mar 2026 23:33:41 +0000
Subject: [PATCH 03/21] Conda recipe

---
 conda/recipes/libcuvs/recipe.yaml | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/conda/recipes/libcuvs/recipe.yaml b/conda/recipes/libcuvs/recipe.yaml
index 71b2ec0b4d..d6fc8b6891 100644
--- a/conda/recipes/libcuvs/recipe.yaml
+++ b/conda/recipes/libcuvs/recipe.yaml
@@ -78,6 +78,7 @@ cache:
       - nccl ${{ nccl_version }}
       - cuda-version =${{ cuda_version }}
       - cuda-cudart-dev
+      - cuda-driver-dev
       - cuda-profiler-api
       - libcublas-dev
       - libcurand-dev
@@ -114,6 +115,7 @@ outputs:
         - nccl ${{ nccl_version }}
         - cuda-version =${{ cuda_version }}
         - cuda-cudart-dev
+        - cuda-driver-dev
         - cuda-profiler-api
         - libcublas-dev
         - libcurand-dev
@@ -133,6 +135,7 @@ outputs:
       ignore_run_exports:
         by_name:
           - cuda-cudart
+          - cuda-driver
           - cuda-version
           - libaio
           - libboost
@@ -175,6 +178,7 @@ outputs:
         - nccl ${{ nccl_version }}
         - cuda-version =${{ cuda_version }}
         - cuda-cudart-dev
+        - cuda-driver-dev
         - cuda-profiler-api
         - libcublas-dev
         - libcurand-dev
@@ -195,6 +199,7 @@ outputs:
       ignore_run_exports:
         by_name:
           - cuda-cudart
+          - cuda-driver
           - cuda-version
           - libaio
           - libboost
@@ -235,6 +240,7 @@ outputs:
         - nccl ${{ nccl_version }}
         - cuda-version =${{ cuda_version }}
         - cuda-cudart-dev
+        - cuda-driver-dev
         - cuda-profiler-api
         - libcublas-dev
         - libcurand-dev
@@ -255,6 +261,7 @@ outputs:
       ignore_run_exports:
         by_name:
           - cuda-cudart
+          - cuda-driver
           - cuda-version
           - libaio
           - libboost
@@ -294,6 +301,7 @@ outputs:
         - cuda-version =${{ cuda_version }}
         - openblas  # required by some CPU algos in benchmarks
         - cuda-cudart-dev
+        - cuda-driver-dev
         - cuda-profiler-api
         - libcublas-dev
         - libcurand-dev
@@ -312,6 +320,7 @@ outputs:
       ignore_run_exports:
         by_name:
           - cuda-cudart
+          - cuda-driver
           - cuda-version
           - libaio
           - libboost
@@ -394,6 +403,7 @@ outputs:
         - librmm =${{ minor_version }}
         - nccl ${{ nccl_version }}
         - cuda-cudart-dev
+        - cuda-driver-dev
         - cuda-profiler-api
         - libcublas-dev
         - libcurand-dev
@@ -413,6 +423,7 @@ outputs:
       ignore_run_exports:
         by_name:
           - cuda-cudart
+          - cuda-driver
           - cuda-version
           - libaio
           - libboost
@@ -455,6 +466,7 @@ outputs:
         - ${{ pin_subpackage("libcuvs-headers", exact=True) }}
         - ${{ pin_subpackage("libcuvs", exact=True) }}
         - cuda-cudart-dev
+        - cuda-driver-dev
         - cuda-profiler-api
         - cuda-version =${{ cuda_version }}
         - libcublas-dev
@@ -486,6 +498,7 @@ outputs:
       ignore_run_exports:
         by_name:
           - cuda-cudart
+          - cuda-driver
           - cuda-version
           - libcublas
           - libcurand

From 1972a74c9f5115013a4d4b0d69f1165649095660 Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Mon, 16 Mar 2026 23:38:46 +0000
Subject: [PATCH 04/21] deps

---
 conda/environments/all_cuda-129_arch-aarch64.yaml       | 1 +
 conda/environments/all_cuda-129_arch-x86_64.yaml        | 1 +
 conda/environments/all_cuda-131_arch-aarch64.yaml       | 1 +
 conda/environments/all_cuda-131_arch-x86_64.yaml        | 1 +
 conda/environments/bench_ann_cuda-129_arch-aarch64.yaml | 1 +
 conda/environments/bench_ann_cuda-129_arch-x86_64.yaml  | 1 +
 conda/environments/bench_ann_cuda-131_arch-aarch64.yaml | 1 +
 conda/environments/bench_ann_cuda-131_arch-x86_64.yaml  | 1 +
 conda/environments/go_cuda-129_arch-aarch64.yaml        | 1 +
 conda/environments/go_cuda-129_arch-x86_64.yaml         | 1 +
 conda/environments/go_cuda-131_arch-aarch64.yaml        | 1 +
 conda/environments/go_cuda-131_arch-x86_64.yaml         | 1 +
 conda/environments/rust_cuda-129_arch-aarch64.yaml      | 1 +
 conda/environments/rust_cuda-129_arch-x86_64.yaml       | 1 +
 conda/environments/rust_cuda-131_arch-aarch64.yaml      | 1 +
 conda/environments/rust_cuda-131_arch-x86_64.yaml       | 1 +
 dependencies.yaml                                       | 1 +
 17 files changed, 17 insertions(+)

diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml
index 2cfa357ea5..2e675c6e63 100644
--- a/conda/environments/all_cuda-129_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-129_arch-aarch64.yaml
@@ -11,6 +11,7 @@ dependencies:
 - clang-tools==20.1.4
 - clang==20.1.4
 - cmake>=3.30.4
+- cuda-driver-dev
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml
index 29c4318ff9..924fba96c9 100644
--- a/conda/environments/all_cuda-129_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-129_arch-x86_64.yaml
@@ -11,6 +11,7 @@ dependencies:
 - clang-tools==20.1.4
 - clang==20.1.4
 - cmake>=3.30.4
+- cuda-driver-dev
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
diff --git a/conda/environments/all_cuda-131_arch-aarch64.yaml b/conda/environments/all_cuda-131_arch-aarch64.yaml
index 25c575c085..541f1ce78d 100644
--- a/conda/environments/all_cuda-131_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-131_arch-aarch64.yaml
@@ -11,6 +11,7 @@ dependencies:
 - clang-tools==20.1.4
 - clang==20.1.4
 - cmake>=3.30.4
+- cuda-driver-dev
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
diff --git a/conda/environments/all_cuda-131_arch-x86_64.yaml b/conda/environments/all_cuda-131_arch-x86_64.yaml
index 97f553e4d2..140a2cc740 100644
--- a/conda/environments/all_cuda-131_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-131_arch-x86_64.yaml
@@ -11,6 +11,7 @@ dependencies:
 - clang-tools==20.1.4
 - clang==20.1.4
 - cmake>=3.30.4
+- cuda-driver-dev
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
diff --git a/conda/environments/bench_ann_cuda-129_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-129_arch-aarch64.yaml
index e4ee1f9449..9f26cb8636 100644
--- a/conda/environments/bench_ann_cuda-129_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-129_arch-aarch64.yaml
@@ -10,6 +10,7 @@ dependencies:
 - clang==20.1.4
 - click
 - cmake>=3.30.4
+- cuda-driver-dev
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
diff --git a/conda/environments/bench_ann_cuda-129_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-129_arch-x86_64.yaml
index f738028490..d2d8fb20ca 100644
--- a/conda/environments/bench_ann_cuda-129_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-129_arch-x86_64.yaml
@@ -10,6 +10,7 @@ dependencies:
 - clang==20.1.4
 - click
 - cmake>=3.30.4
+- cuda-driver-dev
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
diff --git a/conda/environments/bench_ann_cuda-131_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-131_arch-aarch64.yaml
index 944d1aa8dc..69dbea6f9e 100644
--- a/conda/environments/bench_ann_cuda-131_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-131_arch-aarch64.yaml
@@ -10,6 +10,7 @@ dependencies:
 - clang==20.1.4
 - click
 - cmake>=3.30.4
+- cuda-driver-dev
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
diff --git a/conda/environments/bench_ann_cuda-131_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-131_arch-x86_64.yaml
index dd2497821a..9e1f435d92 100644
--- a/conda/environments/bench_ann_cuda-131_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-131_arch-x86_64.yaml
@@ -10,6 +10,7 @@ dependencies:
 - clang==20.1.4
 - click
 - cmake>=3.30.4
+- cuda-driver-dev
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
diff --git a/conda/environments/go_cuda-129_arch-aarch64.yaml b/conda/environments/go_cuda-129_arch-aarch64.yaml
index 60455b7cec..41cca9dc65 100644
--- a/conda/environments/go_cuda-129_arch-aarch64.yaml
+++ b/conda/environments/go_cuda-129_arch-aarch64.yaml
@@ -11,6 +11,7 @@ dependencies:
 - clang==20.1.4
 - cmake>=3.30.4
 - cuda-cudart-dev
+- cuda-driver-dev
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
diff --git a/conda/environments/go_cuda-129_arch-x86_64.yaml b/conda/environments/go_cuda-129_arch-x86_64.yaml
index cd6911797d..11ab42b6cc 100644
--- a/conda/environments/go_cuda-129_arch-x86_64.yaml
+++ b/conda/environments/go_cuda-129_arch-x86_64.yaml
@@ -11,6 +11,7 @@ dependencies:
 - clang==20.1.4
 - cmake>=3.30.4
 - cuda-cudart-dev
+- cuda-driver-dev
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
diff --git a/conda/environments/go_cuda-131_arch-aarch64.yaml b/conda/environments/go_cuda-131_arch-aarch64.yaml
index 135f6a88cc..0b6305d4bf 100644
--- a/conda/environments/go_cuda-131_arch-aarch64.yaml
+++ b/conda/environments/go_cuda-131_arch-aarch64.yaml
@@ -11,6 +11,7 @@ dependencies:
 - clang==20.1.4
 - cmake>=3.30.4
 - cuda-cudart-dev
+- cuda-driver-dev
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
diff --git a/conda/environments/go_cuda-131_arch-x86_64.yaml b/conda/environments/go_cuda-131_arch-x86_64.yaml
index df6a779331..5099c2bb87 100644
--- a/conda/environments/go_cuda-131_arch-x86_64.yaml
+++ b/conda/environments/go_cuda-131_arch-x86_64.yaml
@@ -11,6 +11,7 @@ dependencies:
 - clang==20.1.4
 - cmake>=3.30.4
 - cuda-cudart-dev
+- cuda-driver-dev
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
diff --git a/conda/environments/rust_cuda-129_arch-aarch64.yaml b/conda/environments/rust_cuda-129_arch-aarch64.yaml
index 1f84c62a68..f0080eeb7d 100644
--- a/conda/environments/rust_cuda-129_arch-aarch64.yaml
+++ b/conda/environments/rust_cuda-129_arch-aarch64.yaml
@@ -10,6 +10,7 @@ dependencies:
 - clang==20.1.4
 - cmake>=3.30.4
 - cuda-cudart-dev
+- cuda-driver-dev
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
diff --git a/conda/environments/rust_cuda-129_arch-x86_64.yaml b/conda/environments/rust_cuda-129_arch-x86_64.yaml
index 55d79b591a..f661febdfe 100644
--- a/conda/environments/rust_cuda-129_arch-x86_64.yaml
+++ b/conda/environments/rust_cuda-129_arch-x86_64.yaml
@@ -10,6 +10,7 @@ dependencies:
 - clang==20.1.4
 - cmake>=3.30.4
 - cuda-cudart-dev
+- cuda-driver-dev
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
diff --git a/conda/environments/rust_cuda-131_arch-aarch64.yaml b/conda/environments/rust_cuda-131_arch-aarch64.yaml
index 062cbc8ea0..7f5b84fc18 100644
--- a/conda/environments/rust_cuda-131_arch-aarch64.yaml
+++ b/conda/environments/rust_cuda-131_arch-aarch64.yaml
@@ -10,6 +10,7 @@ dependencies:
 - clang==20.1.4
 - cmake>=3.30.4
 - cuda-cudart-dev
+- cuda-driver-dev
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
diff --git a/conda/environments/rust_cuda-131_arch-x86_64.yaml b/conda/environments/rust_cuda-131_arch-x86_64.yaml
index 2b96d4a64e..b2a5820c7d 100644
--- a/conda/environments/rust_cuda-131_arch-x86_64.yaml
+++ b/conda/environments/rust_cuda-131_arch-x86_64.yaml
@@ -10,6 +10,7 @@ dependencies:
 - clang==20.1.4
 - cmake>=3.30.4
 - cuda-cudart-dev
+- cuda-driver-dev
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
diff --git a/dependencies.yaml b/dependencies.yaml
index 274977d5c5..9d47f84516 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -320,6 +320,7 @@ dependencies:
     common:
       - output_types: [conda]
         packages:
+          - cuda-driver-dev
           - cuda-nvtx-dev
           - cuda-profiler-api
           - libcublas-dev

From 45033074a9a63fba845df9f1ad322be602ebc713 Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Mon, 16 Mar 2026 23:43:29 +0000
Subject: [PATCH 05/21] PRIVATE

---
 cpp/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index fcf2b30368..2a944dd7e1 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -776,13 +776,13 @@ if(NOT BUILD_CPU_ONLY)
              $<BUILD_LOCAL_INTERFACE:$<TARGET_NAME_IF_EXISTS:NCCL::NCCL>>
              $<BUILD_LOCAL_INTERFACE:$<TARGET_NAME_IF_EXISTS:hnswlib::hnswlib>>
              $<$<BOOL:${CUVS_NVTX}>:CUDA::nvtx3>
-             CUDA::cuda_driver
       PRIVATE
         $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
         $<COMPILE_ONLY:nvidia::cutlass::cutlass>
         $<COMPILE_ONLY:cuco::cuco>
         $<$<BOOL:${JIT_LTO_COMPILATION}>:CUDA::nvJitLink>
         $<$<BOOL:${JIT_LTO_COMPILATION}>:$<LINK_LIBRARY:WHOLE_ARCHIVE,cuvs::cuvs_jit_lto_kernels>>
+        CUDA::cuda_driver
     )
 
     # ensure CUDA symbols aren't relocated to the middle of the debug build binaries
@@ -839,7 +839,6 @@ SECTIONS
              ${CUVS_CTK_MATH_DEPENDENCIES}
              $<TARGET_NAME_IF_EXISTS:NCCL::NCCL> # needs to be public for DT_NEEDED
              $<BUILD_LOCAL_INTERFACE:$<TARGET_NAME_IF_EXISTS:hnswlib::hnswlib>> # header only
-             CUDA::cuda_driver
       PRIVATE
         $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
         $<$<BOOL:${JIT_LTO_COMPILATION}>:CUDA::nvJitLink>
@@ -847,6 +846,7 @@ SECTIONS
         $<COMPILE_ONLY:nvidia::cutlass::cutlass>
         $<COMPILE_ONLY:cuco::cuco>
         $<$<BOOL:${JIT_LTO_COMPILATION}>:$<LINK_LIBRARY:WHOLE_ARCHIVE,cuvs::cuvs_jit_lto_kernels>>
+        CUDA::cuda_driver
     )
   endif()
 

From a42ede0a2144df64a47d74228e361332680f32d6 Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Mon, 16 Mar 2026 23:58:08 +0000
Subject: [PATCH 06/21] auditwheel

---
 ci/build_wheel.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh
index cd307a6a77..06891222b0 100755
--- a/ci/build_wheel.sh
+++ b/ci/build_wheel.sh
@@ -37,6 +37,7 @@ cd "${package_dir}"
 EXCLUDE_ARGS=(
   --exclude "libcublas.so.*"
   --exclude "libcublasLt.so.*"
+  --exclude "libcuda.so.*"
   --exclude "libcurand.so.*"
   --exclude "libcusolver.so.*"
   --exclude "libcusparse.so.*"

From e26519ff0cf152523940feeeafef723e9fdb71a9 Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Tue, 17 Mar 2026 02:43:08 +0000
Subject: [PATCH 07/21] Conda recipe

---
 conda/recipes/libcuvs/recipe.yaml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/conda/recipes/libcuvs/recipe.yaml b/conda/recipes/libcuvs/recipe.yaml
index d6fc8b6891..18c4795046 100644
--- a/conda/recipes/libcuvs/recipe.yaml
+++ b/conda/recipes/libcuvs/recipe.yaml
@@ -127,6 +127,7 @@ outputs:
         - libraft-headers =${{ minor_version }}
         - librmm =${{ minor_version }}
         - nccl
+        - cuda-driver
         - libcublas
         - libcurand
         - libcusolver
@@ -191,6 +192,7 @@ outputs:
         - libraft-headers =${{ minor_version }}
         - librmm =${{ minor_version }}
         - nccl
+        - cuda-driver
         - libcublas
         - libcurand
         - libcusolver
@@ -253,6 +255,7 @@ outputs:
         - libraft-headers =${{ minor_version }}
         - librmm =${{ minor_version }}
         - nccl
+        - cuda-driver
         - libcublas
         - libcurand
         - libcusolver
@@ -313,6 +316,7 @@ outputs:
         - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }}
         - libraft-headers =${{ minor_version }}
         - nccl
+        - cuda-driver
         - libcublas
         - libcurand
         - libcusolver
@@ -415,6 +419,7 @@ outputs:
         - ${{ pin_subpackage("libcuvs", exact=True) }}
         - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }}
         - nccl
+        - cuda-driver
         - libcublas
         - libcurand
         - libcusolver
@@ -487,6 +492,7 @@ outputs:
         - ${{ pin_subpackage("libcuvs-headers", exact=True) }}
         - ${{ pin_subpackage("libcuvs", exact=True) }}
         - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }}
+        - cuda-driver
         - libcublas
         - libcurand
         - libcusolver

From 326905586334f35e260f8b52967e3a12e7446e2c Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Tue, 17 Mar 2026 03:21:39 +0000
Subject: [PATCH 08/21] Revert "Conda recipe"

This reverts commit e26519ff0cf152523940feeeafef723e9fdb71a9.
---
 conda/recipes/libcuvs/recipe.yaml | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/conda/recipes/libcuvs/recipe.yaml b/conda/recipes/libcuvs/recipe.yaml
index 7e948d2e60..a2848326c9 100644
--- a/conda/recipes/libcuvs/recipe.yaml
+++ b/conda/recipes/libcuvs/recipe.yaml
@@ -128,7 +128,6 @@ outputs:
         - librmm =${{ minor_version }}
         - nccl
         - cuda-cudart
-        - cuda-driver
         - libcublas
         - libcurand
         - libcusolver
@@ -194,7 +193,6 @@ outputs:
         - librmm =${{ minor_version }}
         - nccl
         - cuda-cudart
-        - cuda-driver
         - libcublas
         - libcurand
         - libcusolver
@@ -258,7 +256,6 @@ outputs:
         - librmm =${{ minor_version }}
         - nccl
         - cuda-cudart
-        - cuda-driver
         - libcublas
         - libcurand
         - libcusolver
@@ -320,7 +317,6 @@ outputs:
         - libraft-headers =${{ minor_version }}
         - nccl
         - cuda-cudart
-        - cuda-driver
         - libcublas
         - libcurand
         - libcusolver
@@ -424,7 +420,6 @@ outputs:
         - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }}
         - nccl
         - cuda-cudart
-        - cuda-driver
         - libcublas
         - libcurand
         - libcusolver
@@ -498,7 +493,6 @@ outputs:
         - ${{ pin_subpackage("libcuvs", exact=True) }}
         - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }}
         - cuda-cudart
-        - cuda-driver
         - libcublas
         - libcurand
         - libcusolver

From 07c50e6e5277f968b46345d75bc95c0bddbf9d16 Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Tue, 17 Mar 2026 03:23:42 +0000
Subject: [PATCH 09/21] COMPILE_ONLY

---
 cpp/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 99cd511e6c..77472adacd 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -784,7 +784,7 @@ if(NOT BUILD_CPU_ONLY)
         $<COMPILE_ONLY:cuco::cuco>
         $<$<BOOL:${JIT_LTO_COMPILATION}>:CUDA::nvJitLink>
         $<$<BOOL:${JIT_LTO_COMPILATION}>:$<LINK_LIBRARY:WHOLE_ARCHIVE,cuvs::cuvs_jit_lto_kernels>>
-        CUDA::cuda_driver
+        $<COMPILE_ONLY:CUDA::cuda_driver>
     )
 
     # ensure CUDA symbols aren't relocated to the middle of the debug build binaries
@@ -848,7 +848,7 @@ SECTIONS
         $<COMPILE_ONLY:nvidia::cutlass::cutlass>
         $<COMPILE_ONLY:cuco::cuco>
         $<$<BOOL:${JIT_LTO_COMPILATION}>:$<LINK_LIBRARY:WHOLE_ARCHIVE,cuvs::cuvs_jit_lto_kernels>>
-        CUDA::cuda_driver
+        $<COMPILE_ONLY:CUDA::cuda_driver>
     )
   endif()
 

From 788fd34e2e05a6b601755d81a1f122e76a004bed Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Tue, 17 Mar 2026 03:45:58 +0000
Subject: [PATCH 10/21] PUBLIC

---
 cpp/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 77472adacd..50df7c1436 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -774,6 +774,7 @@ if(NOT BUILD_CPU_ONLY)
       PUBLIC rmm::rmm
              raft::raft
              cuvs::cuvs_cpp_headers
+             CUDA::cuda_driver
              ${CUVS_CTK_MATH_DEPENDENCIES}
              $<BUILD_LOCAL_INTERFACE:$<TARGET_NAME_IF_EXISTS:NCCL::NCCL>>
              $<BUILD_LOCAL_INTERFACE:$<TARGET_NAME_IF_EXISTS:hnswlib::hnswlib>>
@@ -784,7 +785,6 @@ if(NOT BUILD_CPU_ONLY)
         $<COMPILE_ONLY:cuco::cuco>
         $<$<BOOL:${JIT_LTO_COMPILATION}>:CUDA::nvJitLink>
         $<$<BOOL:${JIT_LTO_COMPILATION}>:$<LINK_LIBRARY:WHOLE_ARCHIVE,cuvs::cuvs_jit_lto_kernels>>
-        $<COMPILE_ONLY:CUDA::cuda_driver>
     )
 
     # ensure CUDA symbols aren't relocated to the middle of the debug build binaries
@@ -838,6 +838,7 @@ SECTIONS
       PUBLIC rmm::rmm
              raft::raft
              cuvs::cuvs_cpp_headers
+             CUDA::cuda_driver
              ${CUVS_CTK_MATH_DEPENDENCIES}
              $<TARGET_NAME_IF_EXISTS:NCCL::NCCL> # needs to be public for DT_NEEDED
              $<BUILD_LOCAL_INTERFACE:$<TARGET_NAME_IF_EXISTS:hnswlib::hnswlib>> # header only
@@ -848,7 +849,6 @@ SECTIONS
         $<COMPILE_ONLY:nvidia::cutlass::cutlass>
         $<COMPILE_ONLY:cuco::cuco>
         $<$<BOOL:${JIT_LTO_COMPILATION}>:$<LINK_LIBRARY:WHOLE_ARCHIVE,cuvs::cuvs_jit_lto_kernels>>
-        $<COMPILE_ONLY:CUDA::cuda_driver>
     )
   endif()
 

From e16b88fea390f17d64113929b242923a56ddbfcb Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Tue, 17 Mar 2026 14:16:07 +0000
Subject: [PATCH 11/21] Revert "Use the driver API instead"

This reverts commit 6c91f9de8d831f75eef719e6b743f44e3d83de2a.
---
 .../cuvs/detail/jit_lto/AlgorithmLauncher.hpp | 16 +++---
 cpp/include/cuvs/detail/jit_lto/cu_try.hpp    | 25 ---------
 cpp/src/detail/jit_lto/AlgorithmLauncher.cpp  | 55 +++++++++----------
 cpp/src/detail/jit_lto/AlgorithmPlanner.cpp   | 28 ++++------
 .../ivf_flat_interleaved_scan_jit.cuh         | 14 ++---
 5 files changed, 53 insertions(+), 85 deletions(-)
 delete mode 100644 cpp/include/cuvs/detail/jit_lto/cu_try.hpp

diff --git a/cpp/include/cuvs/detail/jit_lto/AlgorithmLauncher.hpp b/cpp/include/cuvs/detail/jit_lto/AlgorithmLauncher.hpp
index 3749a77773..18e7f7cb2f 100644
--- a/cpp/include/cuvs/detail/jit_lto/AlgorithmLauncher.hpp
+++ b/cpp/include/cuvs/detail/jit_lto/AlgorithmLauncher.hpp
@@ -11,13 +11,13 @@
 #include <unordered_map>
 #include <vector_types.h>
 
-#include <cuda.h>
+#include <cuda_runtime.h>
 #include <memory>
 
 struct AlgorithmLauncher {
-  AlgorithmLauncher() : function{nullptr}, library{nullptr} {}
+  AlgorithmLauncher() : kernel{nullptr}, library{nullptr} {}
 
-  AlgorithmLauncher(CUfunction f, CUlibrary lib);
+  AlgorithmLauncher(cudaKernel_t k, cudaLibrary_t lib);
 
   ~AlgorithmLauncher();
 
@@ -28,18 +28,18 @@ struct AlgorithmLauncher {
   AlgorithmLauncher& operator=(AlgorithmLauncher&& other) noexcept;
 
   template <typename... Args>
-  void dispatch(CUstream stream, dim3 grid, dim3 block, std::size_t shared_mem, Args&&... args)
+  void dispatch(cudaStream_t stream, dim3 grid, dim3 block, std::size_t shared_mem, Args&&... args)
   {
     void* kernel_args[] = {const_cast<void*>(static_cast<void const*>(&args))...};
     this->call(stream, grid, block, shared_mem, kernel_args);
   }
 
-  CUfunction get_function() { return this->function; }
+  cudaKernel_t get_kernel() { return this->kernel; }
 
  private:
-  void call(CUstream stream, dim3 grid, dim3 block, std::size_t shared_mem, void** args);
-  CUfunction function;
-  CUlibrary library;
+  void call(cudaStream_t stream, dim3 grid, dim3 block, std::size_t shared_mem, void** args);
+  cudaKernel_t kernel;
+  cudaLibrary_t library;
 };
 
 std::unordered_map<std::string, std::shared_ptr<AlgorithmLauncher>>& get_cached_launchers();
diff --git a/cpp/include/cuvs/detail/jit_lto/cu_try.hpp b/cpp/include/cuvs/detail/jit_lto/cu_try.hpp
deleted file mode 100644
index 041d482846..0000000000
--- a/cpp/include/cuvs/detail/jit_lto/cu_try.hpp
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-#pragma once
-
-#include <raft/util/cuda_rt_essentials.hpp>
-
-#include <cuda.h>
-
-#define CU_TRY(call)                                                                      \
-  do {                                                                                    \
-    CUresult const status = call;                                                         \
-    if (status != CUDA_SUCCESS) {                                                         \
-      std::string msg{};                                                                  \
-      const char* name;                                                                   \
-      cuGetErrorName(status, &name);                                                      \
-      const char* str;                                                                    \
-      cuGetErrorString(status, &str);                                                     \
-      SET_ERROR_MSG(                                                                      \
-        msg, "CUDA error encountered at: ", "call='%s', Reason=%s:%s", #call, name, str); \
-      throw raft::cuda_error(msg);                                                        \
-    }                                                                                     \
-  } while (0)
diff --git a/cpp/src/detail/jit_lto/AlgorithmLauncher.cpp b/cpp/src/detail/jit_lto/AlgorithmLauncher.cpp
index f490ec9dcd..0402ef8304 100644
--- a/cpp/src/detail/jit_lto/AlgorithmLauncher.cpp
+++ b/cpp/src/detail/jit_lto/AlgorithmLauncher.cpp
@@ -4,55 +4,52 @@
  */
 
 #include <cuvs/detail/jit_lto/AlgorithmLauncher.hpp>
-#include <cuvs/detail/jit_lto/cu_try.hpp>
 
-AlgorithmLauncher::AlgorithmLauncher(CUfunction f, CUlibrary lib) : function{f}, library{lib} {}
+#include <raft/util/cuda_rt_essentials.hpp>
+
+AlgorithmLauncher::AlgorithmLauncher(cudaKernel_t k, cudaLibrary_t lib) : kernel{k}, library{lib} {}
 
 AlgorithmLauncher::~AlgorithmLauncher()
 {
-  if (library != nullptr) { (void)cuLibraryUnload(library); }
+  if (library != nullptr) { (void)cudaLibraryUnload(library); }
 }
 
 AlgorithmLauncher::AlgorithmLauncher(AlgorithmLauncher&& other) noexcept
-  : function{other.function}, library{other.library}
+  : kernel{other.kernel}, library{other.library}
 {
-  other.function = nullptr;
-  other.library  = nullptr;
+  other.kernel  = nullptr;
+  other.library = nullptr;
 }
 
 AlgorithmLauncher& AlgorithmLauncher::operator=(AlgorithmLauncher&& other) noexcept
 {
   if (this != &other) {
     // Unload current library if it exists
-    if (library != nullptr) { cuLibraryUnload(library); }
-    function       = other.function;
-    library        = other.library;
-    other.function = nullptr;
-    other.library  = nullptr;
+    if (library != nullptr) { cudaLibraryUnload(library); }
+    kernel        = other.kernel;
+    library       = other.library;
+    other.kernel  = nullptr;
+    other.library = nullptr;
   }
   return *this;
 }
 
 void AlgorithmLauncher::call(
-  CUstream stream, dim3 grid, dim3 block, std::size_t shared_mem, void** kernel_args)
+  cudaStream_t stream, dim3 grid, dim3 block, std::size_t shared_mem, void** kernel_args)
 {
-  CUlaunchAttribute attribute[1];
-  attribute[0].id = CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION;
-  attribute[0].value.programmaticStreamSerializationAllowed = 1;
-
-  CUlaunchConfig config;
-  config.gridDimX       = grid.x;
-  config.gridDimY       = grid.y;
-  config.gridDimZ       = grid.z;
-  config.blockDimX      = block.x;
-  config.blockDimY      = block.y;
-  config.blockDimZ      = block.z;
-  config.hStream        = stream;
-  config.attrs          = attribute;
-  config.numAttrs       = 1;
-  config.sharedMemBytes = shared_mem;
-
-  CU_TRY(cuLaunchKernelEx(&config, function, kernel_args, nullptr));
+  cudaLaunchAttribute attribute[1];
+  attribute[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
+  attribute[0].val.programmaticStreamSerializationAllowed = 1;
+
+  cudaLaunchConfig_t config;
+  config.gridDim          = grid;
+  config.blockDim         = block;
+  config.stream           = stream;
+  config.attrs            = attribute;
+  config.numAttrs         = 1;
+  config.dynamicSmemBytes = shared_mem;
+
+  RAFT_CUDA_TRY(cudaLaunchKernelExC(&config, kernel, kernel_args));
 }
 
 std::unordered_map<std::string, std::shared_ptr<AlgorithmLauncher>>& get_cached_launchers()
diff --git a/cpp/src/detail/jit_lto/AlgorithmPlanner.cpp b/cpp/src/detail/jit_lto/AlgorithmPlanner.cpp
index 11d1638481..dbb1f09c30 100644
--- a/cpp/src/detail/jit_lto/AlgorithmPlanner.cpp
+++ b/cpp/src/detail/jit_lto/AlgorithmPlanner.cpp
@@ -15,9 +15,7 @@
 
 #include <cuvs/detail/jit_lto/AlgorithmPlanner.hpp>
 #include <cuvs/detail/jit_lto/FragmentDatabase.hpp>
-#include <cuvs/detail/jit_lto/cu_try.hpp>
 
-#include "cuda.h"
 #include "cuda_runtime.h"
 #include "nvJitLink.h"
 
@@ -71,12 +69,12 @@ std::shared_ptr<AlgorithmLauncher> AlgorithmPlanner::get_launcher()
 
 std::shared_ptr<AlgorithmLauncher> AlgorithmPlanner::build()
 {
-  CUdevice device;
-  int major = 0;
-  int minor = 0;
-  CU_TRY(cuDeviceGet(&device, 0));
-  CU_TRY(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
-  CU_TRY(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
+  int device = 0;
+  int major  = 0;
+  int minor  = 0;
+  RAFT_CUDA_TRY(cudaGetDevice(&device));
+  RAFT_CUDA_TRY(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device));
+  RAFT_CUDA_TRY(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device));
 
   std::string archs = "-arch=sm_" + std::to_string((major * 10 + minor));
 
@@ -108,14 +106,12 @@ std::shared_ptr<AlgorithmLauncher> AlgorithmPlanner::build()
   RAFT_EXPECTS(result == NVJITLINK_SUCCESS, "nvJitLinkDestroy failed");
 
   // cubin is linked, so now load it
-  CUlibrary library;
-  CU_TRY(cuLibraryLoadData(&library, cubin.get(), nullptr, nullptr, 0, nullptr, nullptr, 0));
+  cudaLibrary_t library;
+  RAFT_CUDA_TRY(
+    cudaLibraryLoadData(&library, cubin.get(), nullptr, nullptr, 0, nullptr, nullptr, 0));
 
-  CUkernel kernel;
-  CU_TRY(cuLibraryGetKernel(&kernel, library, this->entrypoint.c_str()));
+  cudaKernel_t kernel;
+  RAFT_CUDA_TRY(cudaLibraryGetKernel(&kernel, library, this->entrypoint.c_str()));
 
-  CUfunction function;
-  CU_TRY(cuKernelGetFunction(&function, kernel));
-
-  return std::make_shared<AlgorithmLauncher>(function, library);
+  return std::make_shared<AlgorithmLauncher>(kernel, library);
 }
diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan_jit.cuh b/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan_jit.cuh
index c1f67094c7..be8652dd59 100644
--- a/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan_jit.cuh
+++ b/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan_jit.cuh
@@ -8,7 +8,6 @@
 #include "../ivf_common.cuh"
 #include "jit_lto_kernels/interleaved_scan_planner.hpp"
 #include <cstdint>
-#include <cuvs/detail/jit_lto/cu_try.hpp>
 #include <cuvs/detail/jit_lto/ivf_flat/interleaved_scan_tags.hpp>
 #include <cuvs/neighbors/common.hpp>
 #include <cuvs/neighbors/ivf_flat.hpp>
@@ -17,6 +16,7 @@
 #include <cuvs/distance/distance.hpp>
 #include <raft/core/logger.hpp>
 #include <raft/core/operators.hpp>
+#include <raft/util/cuda_rt_essentials.hpp>  // RAFT_CUDA_TRY
 #include <raft/util/pow2_utils.cuh>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -102,14 +102,14 @@ constexpr auto get_post_lambda_name()
 inline uint32_t configure_launch_x(uint32_t numQueries,
                                    uint32_t n_probes,
                                    int32_t sMemSize,
-                                   CUfunction func)
+                                   cudaKernel_t func)
 {
-  CUdevice dev_id;
-  CU_TRY(cuDeviceGet(&dev_id, 0));
+  int dev_id;
+  RAFT_CUDA_TRY(cudaGetDevice(&dev_id));
   int num_sms;
-  CU_TRY(cuDeviceGetAttribute(&num_sms, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev_id));
+  RAFT_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id));
   int num_blocks_per_sm = 0;
-  CU_TRY(cuOccupancyMaxActiveBlocksPerMultiprocessor(
+  RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
     &num_blocks_per_sm, func, kThreadsPerBlock, sMemSize));
 
   size_t min_grid_size = num_sms * num_blocks_per_sm;
@@ -178,7 +178,7 @@ void launch_kernel(const index<T, IdxT>& index,
 
   if (grid_dim_x == 0) {
     grid_dim_x = configure_launch_x(
-      std::min(kMaxGridY, num_queries), n_probes, smem_size, kernel_launcher->get_function());
+      std::min(kMaxGridY, num_queries), n_probes, smem_size, kernel_launcher->get_kernel());
     return;
   }
 

From 96e91620981c70111836dfd83dbc0dbd8f556a0b Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Tue, 17 Mar 2026 14:20:02 +0000
Subject: [PATCH 12/21] Remove driver dep

---
 ci/build_wheel.sh                                   |  1 -
 conda/environments/all_cuda-129_arch-aarch64.yaml   |  1 -
 conda/environments/all_cuda-129_arch-x86_64.yaml    |  1 -
 conda/environments/all_cuda-131_arch-aarch64.yaml   |  1 -
 conda/environments/all_cuda-131_arch-x86_64.yaml    |  1 -
 .../bench_ann_cuda-129_arch-aarch64.yaml            |  1 -
 .../bench_ann_cuda-129_arch-x86_64.yaml             |  1 -
 .../bench_ann_cuda-131_arch-aarch64.yaml            |  1 -
 .../bench_ann_cuda-131_arch-x86_64.yaml             |  1 -
 conda/environments/go_cuda-129_arch-aarch64.yaml    |  1 -
 conda/environments/go_cuda-129_arch-x86_64.yaml     |  1 -
 conda/environments/go_cuda-131_arch-aarch64.yaml    |  1 -
 conda/environments/go_cuda-131_arch-x86_64.yaml     |  1 -
 conda/environments/rust_cuda-129_arch-aarch64.yaml  |  1 -
 conda/environments/rust_cuda-129_arch-x86_64.yaml   |  1 -
 conda/environments/rust_cuda-131_arch-aarch64.yaml  |  1 -
 conda/environments/rust_cuda-131_arch-x86_64.yaml   |  1 -
 conda/recipes/libcuvs/recipe.yaml                   | 13 -------------
 cpp/CMakeLists.txt                                  |  2 --
 dependencies.yaml                                   |  1 -
 20 files changed, 33 deletions(-)

diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh
index 06891222b0..cd307a6a77 100755
--- a/ci/build_wheel.sh
+++ b/ci/build_wheel.sh
@@ -37,7 +37,6 @@ cd "${package_dir}"
 EXCLUDE_ARGS=(
   --exclude "libcublas.so.*"
   --exclude "libcublasLt.so.*"
-  --exclude "libcuda.so.*"
   --exclude "libcurand.so.*"
   --exclude "libcusolver.so.*"
   --exclude "libcusparse.so.*"
diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml
index 6f1988ae51..c6762b4d2c 100644
--- a/conda/environments/all_cuda-129_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-129_arch-aarch64.yaml
@@ -12,7 +12,6 @@ dependencies:
 - clang==20.1.4
 - cmake>=3.30.4
 - cuda-cudart-dev
-- cuda-driver-dev
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml
index 3e2c5af6f9..767aa5ce32 100644
--- a/conda/environments/all_cuda-129_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-129_arch-x86_64.yaml
@@ -12,7 +12,6 @@ dependencies:
 - clang==20.1.4
 - cmake>=3.30.4
 - cuda-cudart-dev
-- cuda-driver-dev
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
diff --git a/conda/environments/all_cuda-131_arch-aarch64.yaml b/conda/environments/all_cuda-131_arch-aarch64.yaml
index 6289ea4974..5c348e8f34 100644
--- a/conda/environments/all_cuda-131_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-131_arch-aarch64.yaml
@@ -12,7 +12,6 @@ dependencies:
 - clang==20.1.4
 - cmake>=3.30.4
 - cuda-cudart-dev
-- cuda-driver-dev
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
diff --git a/conda/environments/all_cuda-131_arch-x86_64.yaml b/conda/environments/all_cuda-131_arch-x86_64.yaml
index a9f29ae857..5020088a7c 100644
--- a/conda/environments/all_cuda-131_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-131_arch-x86_64.yaml
@@ -12,7 +12,6 @@ dependencies:
 - clang==20.1.4
 - cmake>=3.30.4
 - cuda-cudart-dev
-- cuda-driver-dev
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
diff --git a/conda/environments/bench_ann_cuda-129_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-129_arch-aarch64.yaml
index f6b8523e1e..2794d571fb 100644
--- a/conda/environments/bench_ann_cuda-129_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-129_arch-aarch64.yaml
@@ -11,7 +11,6 @@ dependencies:
 - click
 - cmake>=3.30.4
 - cuda-cudart-dev
-- cuda-driver-dev
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
diff --git a/conda/environments/bench_ann_cuda-129_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-129_arch-x86_64.yaml
index 60793ee8c3..0eb77c1730 100644
--- a/conda/environments/bench_ann_cuda-129_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-129_arch-x86_64.yaml
@@ -11,7 +11,6 @@ dependencies:
 - click
 - cmake>=3.30.4
 - cuda-cudart-dev
-- cuda-driver-dev
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
diff --git a/conda/environments/bench_ann_cuda-131_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-131_arch-aarch64.yaml
index d57e588a32..2a7f1cd9ea 100644
--- a/conda/environments/bench_ann_cuda-131_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-131_arch-aarch64.yaml
@@ -11,7 +11,6 @@ dependencies:
 - click
 - cmake>=3.30.4
 - cuda-cudart-dev
-- cuda-driver-dev
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
diff --git a/conda/environments/bench_ann_cuda-131_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-131_arch-x86_64.yaml
index 212b5aa6cc..07fb692de1 100644
--- a/conda/environments/bench_ann_cuda-131_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-131_arch-x86_64.yaml
@@ -11,7 +11,6 @@ dependencies:
 - click
 - cmake>=3.30.4
 - cuda-cudart-dev
-- cuda-driver-dev
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
diff --git a/conda/environments/go_cuda-129_arch-aarch64.yaml b/conda/environments/go_cuda-129_arch-aarch64.yaml
index 41cca9dc65..60455b7cec 100644
--- a/conda/environments/go_cuda-129_arch-aarch64.yaml
+++ b/conda/environments/go_cuda-129_arch-aarch64.yaml
@@ -11,7 +11,6 @@ dependencies:
 - clang==20.1.4
 - cmake>=3.30.4
 - cuda-cudart-dev
-- cuda-driver-dev
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
diff --git a/conda/environments/go_cuda-129_arch-x86_64.yaml b/conda/environments/go_cuda-129_arch-x86_64.yaml
index 11ab42b6cc..cd6911797d 100644
--- a/conda/environments/go_cuda-129_arch-x86_64.yaml
+++ b/conda/environments/go_cuda-129_arch-x86_64.yaml
@@ -11,7 +11,6 @@ dependencies:
 - clang==20.1.4
 - cmake>=3.30.4
 - cuda-cudart-dev
-- cuda-driver-dev
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
diff --git a/conda/environments/go_cuda-131_arch-aarch64.yaml b/conda/environments/go_cuda-131_arch-aarch64.yaml
index 0b6305d4bf..135f6a88cc 100644
--- a/conda/environments/go_cuda-131_arch-aarch64.yaml
+++ b/conda/environments/go_cuda-131_arch-aarch64.yaml
@@ -11,7 +11,6 @@ dependencies:
 - clang==20.1.4
 - cmake>=3.30.4
 - cuda-cudart-dev
-- cuda-driver-dev
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
diff --git a/conda/environments/go_cuda-131_arch-x86_64.yaml b/conda/environments/go_cuda-131_arch-x86_64.yaml
index 5099c2bb87..df6a779331 100644
--- a/conda/environments/go_cuda-131_arch-x86_64.yaml
+++ b/conda/environments/go_cuda-131_arch-x86_64.yaml
@@ -11,7 +11,6 @@ dependencies:
 - clang==20.1.4
 - cmake>=3.30.4
 - cuda-cudart-dev
-- cuda-driver-dev
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
diff --git a/conda/environments/rust_cuda-129_arch-aarch64.yaml b/conda/environments/rust_cuda-129_arch-aarch64.yaml
index f0080eeb7d..1f84c62a68 100644
--- a/conda/environments/rust_cuda-129_arch-aarch64.yaml
+++ b/conda/environments/rust_cuda-129_arch-aarch64.yaml
@@ -10,7 +10,6 @@ dependencies:
 - clang==20.1.4
 - cmake>=3.30.4
 - cuda-cudart-dev
-- cuda-driver-dev
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
diff --git a/conda/environments/rust_cuda-129_arch-x86_64.yaml b/conda/environments/rust_cuda-129_arch-x86_64.yaml
index f661febdfe..55d79b591a 100644
--- a/conda/environments/rust_cuda-129_arch-x86_64.yaml
+++ b/conda/environments/rust_cuda-129_arch-x86_64.yaml
@@ -10,7 +10,6 @@ dependencies:
 - clang==20.1.4
 - cmake>=3.30.4
 - cuda-cudart-dev
-- cuda-driver-dev
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
diff --git a/conda/environments/rust_cuda-131_arch-aarch64.yaml b/conda/environments/rust_cuda-131_arch-aarch64.yaml
index 7f5b84fc18..062cbc8ea0 100644
--- a/conda/environments/rust_cuda-131_arch-aarch64.yaml
+++ b/conda/environments/rust_cuda-131_arch-aarch64.yaml
@@ -10,7 +10,6 @@ dependencies:
 - clang==20.1.4
 - cmake>=3.30.4
 - cuda-cudart-dev
-- cuda-driver-dev
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
diff --git a/conda/environments/rust_cuda-131_arch-x86_64.yaml b/conda/environments/rust_cuda-131_arch-x86_64.yaml
index b2a5820c7d..2b96d4a64e 100644
--- a/conda/environments/rust_cuda-131_arch-x86_64.yaml
+++ b/conda/environments/rust_cuda-131_arch-x86_64.yaml
@@ -10,7 +10,6 @@ dependencies:
 - clang==20.1.4
 - cmake>=3.30.4
 - cuda-cudart-dev
-- cuda-driver-dev
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
diff --git a/conda/recipes/libcuvs/recipe.yaml b/conda/recipes/libcuvs/recipe.yaml
index a2848326c9..b81f0f90cd 100644
--- a/conda/recipes/libcuvs/recipe.yaml
+++ b/conda/recipes/libcuvs/recipe.yaml
@@ -78,7 +78,6 @@ cache:
       - nccl ${{ nccl_version }}
       - cuda-version =${{ cuda_version }}
       - cuda-cudart-dev
-      - cuda-driver-dev
       - cuda-profiler-api
       - libcublas-dev
       - libcurand-dev
@@ -115,7 +114,6 @@ outputs:
         - nccl ${{ nccl_version }}
         - cuda-version =${{ cuda_version }}
         - cuda-cudart-dev
-        - cuda-driver-dev
         - cuda-profiler-api
         - libcublas-dev
         - libcurand-dev
@@ -136,7 +134,6 @@ outputs:
       ignore_run_exports:
         by_name:
           - cuda-cudart
-          - cuda-driver
           - cuda-version
           - libaio
           - libboost
@@ -179,7 +176,6 @@ outputs:
         - nccl ${{ nccl_version }}
         - cuda-version =${{ cuda_version }}
         - cuda-cudart-dev
-        - cuda-driver-dev
         - cuda-profiler-api
         - libcublas-dev
         - libcurand-dev
@@ -201,7 +197,6 @@ outputs:
       ignore_run_exports:
         by_name:
           - cuda-cudart
-          - cuda-driver
           - cuda-version
           - libaio
           - libboost
@@ -242,7 +237,6 @@ outputs:
         - nccl ${{ nccl_version }}
         - cuda-version =${{ cuda_version }}
         - cuda-cudart-dev
-        - cuda-driver-dev
         - cuda-profiler-api
         - libcublas-dev
         - libcurand-dev
@@ -264,7 +258,6 @@ outputs:
       ignore_run_exports:
         by_name:
           - cuda-cudart
-          - cuda-driver
           - cuda-version
           - libaio
           - libboost
@@ -304,7 +297,6 @@ outputs:
         - cuda-version =${{ cuda_version }}
         - openblas  # required by some CPU algos in benchmarks
         - cuda-cudart-dev
-        - cuda-driver-dev
         - cuda-profiler-api
         - libcublas-dev
         - libcurand-dev
@@ -324,7 +316,6 @@ outputs:
       ignore_run_exports:
         by_name:
           - cuda-cudart
-          - cuda-driver
           - cuda-version
           - libaio
           - libboost
@@ -407,7 +398,6 @@ outputs:
         - librmm =${{ minor_version }}
         - nccl ${{ nccl_version }}
         - cuda-cudart-dev
-        - cuda-driver-dev
         - cuda-profiler-api
         - libcublas-dev
         - libcurand-dev
@@ -428,7 +418,6 @@ outputs:
       ignore_run_exports:
         by_name:
           - cuda-cudart
-          - cuda-driver
           - cuda-version
           - libaio
           - libboost
@@ -471,7 +460,6 @@ outputs:
         - ${{ pin_subpackage("libcuvs-headers", exact=True) }}
         - ${{ pin_subpackage("libcuvs", exact=True) }}
         - cuda-cudart-dev
-        - cuda-driver-dev
         - cuda-profiler-api
         - cuda-version =${{ cuda_version }}
         - libcublas-dev
@@ -504,7 +492,6 @@ outputs:
       ignore_run_exports:
         by_name:
           - cuda-cudart
-          - cuda-driver
           - cuda-version
           - libcublas
           - libcurand
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 50df7c1436..570e77c3bb 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -774,7 +774,6 @@ if(NOT BUILD_CPU_ONLY)
       PUBLIC rmm::rmm
              raft::raft
              cuvs::cuvs_cpp_headers
-             CUDA::cuda_driver
              ${CUVS_CTK_MATH_DEPENDENCIES}
              $<BUILD_LOCAL_INTERFACE:$<TARGET_NAME_IF_EXISTS:NCCL::NCCL>>
              $<BUILD_LOCAL_INTERFACE:$<TARGET_NAME_IF_EXISTS:hnswlib::hnswlib>>
@@ -838,7 +837,6 @@ SECTIONS
       PUBLIC rmm::rmm
              raft::raft
              cuvs::cuvs_cpp_headers
-             CUDA::cuda_driver
              ${CUVS_CTK_MATH_DEPENDENCIES}
              $<TARGET_NAME_IF_EXISTS:NCCL::NCCL> # needs to be public for DT_NEEDED
              $<BUILD_LOCAL_INTERFACE:$<TARGET_NAME_IF_EXISTS:hnswlib::hnswlib>> # header only
diff --git a/dependencies.yaml b/dependencies.yaml
index 23003ffa95..d6e2d406c2 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -318,7 +318,6 @@ dependencies:
     common:
       - output_types: [conda]
         packages:
-          - cuda-driver-dev
           - cuda-nvtx-dev
           - cuda-cudart-dev
           - cuda-profiler-api

From 8027d975b3d593e49e16848ef2e0a3c176d6e5b1 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Mon, 16 Mar 2026 11:33:12 -0500
Subject: [PATCH 13/21] Default to static linking of libcudart (#1627)

- Enable static linking of libcudart by default (`CUDA_STATIC_RUNTIME=ON`)
- Remove `cuda-cudart` from conda recipe run requirements (no longer needed when statically linked)

This is part of a RAPIDS-wide effort to switch to static CUDA runtime linking. See https://github.com/rapidsai/build-planning/issues/235 for tracking.

- `cpp/CMakeLists.txt`: Change `CUDA_STATIC_RUNTIME` default from OFF to ON
- `conda/recipes/cuvs/recipe.yaml`: Remove `cuda-cudart` from run deps
- `conda/recipes/libcuvs/recipe.yaml`: Remove `cuda-cudart` from run deps (4 outputs)

Note: Python builds already use `CUDA_STATIC_RUNTIME=ON` (set in `python/libcuvs/CMakeLists.txt`).

Authors:
  - Bradley Dice (https://github.com/bdice)
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

Approvers:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)
  - Robert Maynard (https://github.com/robertmaynard)
  - Ben Frederickson (https://github.com/benfred)

URL: https://github.com/rapidsai/cuvs/pull/1627
---
 c/tests/CMakeLists.txt                                 |  2 +-
 ci/build_go.sh                                         |  2 +-
 conda/environments/all_cuda-129_arch-aarch64.yaml      |  1 -
 conda/environments/all_cuda-129_arch-x86_64.yaml       |  1 -
 conda/environments/all_cuda-131_arch-aarch64.yaml      |  1 -
 conda/environments/all_cuda-131_arch-x86_64.yaml       |  1 -
 .../environments/bench_ann_cuda-129_arch-aarch64.yaml  |  1 -
 conda/environments/bench_ann_cuda-129_arch-x86_64.yaml |  1 -
 .../environments/bench_ann_cuda-131_arch-aarch64.yaml  |  1 -
 conda/environments/bench_ann_cuda-131_arch-x86_64.yaml |  1 -
 conda/recipes/cuvs/recipe.yaml                         |  1 -
 conda/recipes/libcuvs/recipe.yaml                      |  6 ------
 cpp/CMakeLists.txt                                     |  4 +---
 cpp/cmake/thirdparty/get_faiss.cmake                   |  2 +-
 dependencies.yaml                                      |  8 +++++++-
 docs/source/build.rst                                  |  7 +------
 examples/c/CMakeLists.txt                              | 10 +++++-----
 examples/go/README.md                                  |  2 +-
 go/dlpack.go                                           |  1 +
 python/libcuvs/CMakeLists.txt                          |  1 -
 rust/cuvs-sys/build.rs                                 |  5 ++++-
 21 files changed, 23 insertions(+), 36 deletions(-)

diff --git a/c/tests/CMakeLists.txt b/c/tests/CMakeLists.txt
index 343e70ef63..2d09490975 100644
--- a/c/tests/CMakeLists.txt
+++ b/c/tests/CMakeLists.txt
@@ -94,7 +94,7 @@ endif()
 
 ConfigureTest(NAME cuvs_c_headers PATH core/headers.c)
 ConfigureTest(NAME cuvs_c_test PATH core/c_api.c)
-target_link_libraries(cuvs_c_test PRIVATE CUDA::cudart)
+target_link_libraries(cuvs_c_test PRIVATE CUDA::cudart_static)
 ConfigureTest(NAME cuvs_c_neighbors_test PATH neighbors/c_api.c)
 
 # ##################################################################################################
diff --git a/ci/build_go.sh b/ci/build_go.sh
index af3ed10c88..80370048ff 100755
--- a/ci/build_go.sh
+++ b/ci/build_go.sh
@@ -31,7 +31,7 @@ set -eu
 rapids-print-env
 
 export CGO_CFLAGS="-I${CONDA_PREFIX}/include"
-export CGO_LDFLAGS="-L${CONDA_PREFIX}/lib -lcudart -lcuvs -lcuvs_c"
+export CGO_LDFLAGS="-L${CONDA_PREFIX}/lib -lcudart_static -ldl -lrt -lcuvs -lcuvs_c"
 export LD_LIBRARY_PATH="$CONDA_PREFIX/lib:$LD_LIBRARY_PATH"
 export CC=clang
 
diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml
index 74f13bad68..d3a6d4dd8d 100644
--- a/conda/environments/all_cuda-129_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-129_arch-aarch64.yaml
@@ -11,7 +11,6 @@ dependencies:
 - clang-tools==20.1.4
 - clang==20.1.4
 - cmake>=3.30.4
-- cuda-cudart-dev
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml
index f726fbd93f..7da3405cec 100644
--- a/conda/environments/all_cuda-129_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-129_arch-x86_64.yaml
@@ -11,7 +11,6 @@ dependencies:
 - clang-tools==20.1.4
 - clang==20.1.4
 - cmake>=3.30.4
-- cuda-cudart-dev
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
diff --git a/conda/environments/all_cuda-131_arch-aarch64.yaml b/conda/environments/all_cuda-131_arch-aarch64.yaml
index d35632064f..94129ae55a 100644
--- a/conda/environments/all_cuda-131_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-131_arch-aarch64.yaml
@@ -11,7 +11,6 @@ dependencies:
 - clang-tools==20.1.4
 - clang==20.1.4
 - cmake>=3.30.4
-- cuda-cudart-dev
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
diff --git a/conda/environments/all_cuda-131_arch-x86_64.yaml b/conda/environments/all_cuda-131_arch-x86_64.yaml
index 367cf0ec6e..23150f3ba4 100644
--- a/conda/environments/all_cuda-131_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-131_arch-x86_64.yaml
@@ -11,7 +11,6 @@ dependencies:
 - clang-tools==20.1.4
 - clang==20.1.4
 - cmake>=3.30.4
-- cuda-cudart-dev
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
diff --git a/conda/environments/bench_ann_cuda-129_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-129_arch-aarch64.yaml
index 26df7c96c9..5dd1abf183 100644
--- a/conda/environments/bench_ann_cuda-129_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-129_arch-aarch64.yaml
@@ -10,7 +10,6 @@ dependencies:
 - clang==20.1.4
 - click
 - cmake>=3.30.4
-- cuda-cudart-dev
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
diff --git a/conda/environments/bench_ann_cuda-129_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-129_arch-x86_64.yaml
index d63791f084..75d36d1f33 100644
--- a/conda/environments/bench_ann_cuda-129_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-129_arch-x86_64.yaml
@@ -10,7 +10,6 @@ dependencies:
 - clang==20.1.4
 - click
 - cmake>=3.30.4
-- cuda-cudart-dev
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
diff --git a/conda/environments/bench_ann_cuda-131_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-131_arch-aarch64.yaml
index d1e3a5bae4..3758fab63b 100644
--- a/conda/environments/bench_ann_cuda-131_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-131_arch-aarch64.yaml
@@ -10,7 +10,6 @@ dependencies:
 - clang==20.1.4
 - click
 - cmake>=3.30.4
-- cuda-cudart-dev
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
diff --git a/conda/environments/bench_ann_cuda-131_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-131_arch-x86_64.yaml
index 4b09d1b885..5f93dbb946 100644
--- a/conda/environments/bench_ann_cuda-131_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-131_arch-x86_64.yaml
@@ -10,7 +10,6 @@ dependencies:
 - clang==20.1.4
 - click
 - cmake>=3.30.4
-- cuda-cudart-dev
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
diff --git a/conda/recipes/cuvs/recipe.yaml b/conda/recipes/cuvs/recipe.yaml
index dcce17cb13..690a2cf1f9 100644
--- a/conda/recipes/cuvs/recipe.yaml
+++ b/conda/recipes/cuvs/recipe.yaml
@@ -91,7 +91,6 @@ requirements:
     - if: cuda_major == "12"
       then: cuda-python >=12.9.2,<13.0
       else: cuda-python >=13.0.1,<14.0
-    - cuda-cudart
   ignore_run_exports:
     by_name:
       - cuda-version
diff --git a/conda/recipes/libcuvs/recipe.yaml b/conda/recipes/libcuvs/recipe.yaml
index abd3031a94..4453b18393 100644
--- a/conda/recipes/libcuvs/recipe.yaml
+++ b/conda/recipes/libcuvs/recipe.yaml
@@ -129,7 +129,6 @@ outputs:
         - libraft-headers =${{ minor_version }}
         - librmm =${{ minor_version }}
         - nccl
-        - cuda-cudart
         - libcublas
         - libcurand
         - libcusolver
@@ -198,7 +197,6 @@ outputs:
         - libraft-headers =${{ minor_version }}
         - librmm =${{ minor_version }}
         - nccl
-        - cuda-cudart
         - libcublas
         - libcurand
         - libcusolver
@@ -265,7 +263,6 @@ outputs:
         - libraft-headers =${{ minor_version }}
         - librmm =${{ minor_version }}
         - nccl
-        - cuda-cudart
         - libcublas
         - libcurand
         - libcusolver
@@ -328,7 +325,6 @@ outputs:
         - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }}
         - libraft-headers =${{ minor_version }}
         - nccl
-        - cuda-cudart
         - libcublas
         - libcurand
         - libcusolver
@@ -431,7 +427,6 @@ outputs:
         - ${{ pin_subpackage("libcuvs", exact=True) }}
         - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }}
         - nccl
-        - cuda-cudart
         - libcublas
         - libcurand
         - libcusolver
@@ -506,7 +501,6 @@ outputs:
         - ${{ pin_subpackage("libcuvs-headers", exact=True) }}
         - ${{ pin_subpackage("libcuvs", exact=True) }}
         - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }}
-        - cuda-cudart
         - libcublas
         - libcurand
         - libcusolver
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 7f1ce7666b..13db980b22 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -57,7 +57,6 @@ option(CUDA_ENABLE_KERNELINFO "Enable kernel resource usage info" OFF)
 option(CUDA_ENABLE_LINEINFO
        "Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler)" OFF
 )
-option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF)
 option(CUDA_STATIC_MATH_LIBRARIES "Statically link the CUDA math libraries" OFF)
 option(CUVS_STATIC_RAPIDS_LIBRARIES "Build and statically link RAPIDS libraries" OFF)
 option(CUDA_LOG_COMPILE_TIME "Write a log of compilation times to nvcc_compile_log.csv" OFF)
@@ -116,7 +115,6 @@ message(VERBOSE "cuVS: Disable OpenMP: ${DISABLE_OPENMP}")
 message(VERBOSE "cuVS: Enable kernel resource usage info: ${CUDA_ENABLE_KERNELINFO}")
 message(VERBOSE "cuVS: Enable lineinfo in nvcc: ${CUDA_ENABLE_LINEINFO}")
 message(VERBOSE "cuVS: Enable nvtx markers: ${CUVS_NVTX}")
-message(VERBOSE "cuVS: Statically link the CUDA runtime: ${CUDA_STATIC_RUNTIME}")
 message(VERBOSE "cuVS: Statically link the CUDA math libraries: ${CUDA_STATIC_MATH_LIBRARIES}")
 message(VERBOSE "cuVS: Build and statically link RAPIDS libraries: ${CUVS_STATIC_RAPIDS_LIBRARIES}")
 
@@ -153,7 +151,7 @@ endif()
 
 if(NOT BUILD_CPU_ONLY)
   # CUDA runtime
-  rapids_cuda_init_runtime(USE_STATIC ${CUDA_STATIC_RUNTIME})
+  rapids_cuda_init_runtime(USE_STATIC ON)
   # * find CUDAToolkit package
   # * determine GPU architectures
   # * enable the CMake CUDA language
diff --git a/cpp/cmake/thirdparty/get_faiss.cmake b/cpp/cmake/thirdparty/get_faiss.cmake
index 75a4473276..dfdc1a4d31 100644
--- a/cpp/cmake/thirdparty/get_faiss.cmake
+++ b/cpp/cmake/thirdparty/get_faiss.cmake
@@ -48,7 +48,7 @@ function(find_and_configure_faiss)
     "FAISS_ENABLE_CUVS ${PKG_ENABLE_GPU}"
     "FAISS_ENABLE_PYTHON OFF"
     "FAISS_OPT_LEVEL ${CUVS_FAISS_OPT_LEVEL}"
-    "FAISS_USE_CUDA_TOOLKIT_STATIC ${CUDA_STATIC_RUNTIME}"
+    "FAISS_USE_CUDA_TOOLKIT_STATIC ON"
     "BUILD_TESTING OFF"
     "CMAKE_MESSAGE_LOG_LEVEL VERBOSE"
     )
diff --git a/dependencies.yaml b/dependencies.yaml
index e855a93f3e..df6159084b 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -96,6 +96,7 @@ files:
       - cuda_version
       - rapids_build
       - rust
+      - depends_on_cudart
       - depends_on_libcuvs
       - depends_on_libraft
       - depends_on_nccl
@@ -110,6 +111,7 @@ files:
       - cuda_version
       - rapids_build
       - go
+      - depends_on_cudart
       - depends_on_libcuvs
       - depends_on_libraft
       - depends_on_nccl
@@ -319,7 +321,6 @@ dependencies:
       - output_types: [conda]
         packages:
           - cuda-nvtx-dev
-          - cuda-cudart-dev
           - cuda-profiler-api
           - libcublas-dev
           - libcurand-dev
@@ -357,6 +358,11 @@ dependencies:
           - matrix:
             packages:
               - cuda-toolkit[cublas,curand,cusolver,cusparse,nvjitlink]>=12,<14
+  depends_on_cudart:
+    common:
+      - output_types: conda
+        packages:
+          - cuda-cudart-dev
   depends_on_cupy:
     common:
       - output_types: conda
diff --git a/docs/source/build.rst b/docs/source/build.rst
index dcef10c96b..5e863e40f4 100644
--- a/docs/source/build.rst
+++ b/docs/source/build.rst
@@ -205,7 +205,7 @@ After building the C and C++ libraries, the Golang library can be built with the
 
     export CUDA_HOME="/usr/local/cuda" # or wherever your CUDA installation is.
     export CGO_CFLAGS="-I${CONDA_PREFIX}/include -I${CUDA_HOME}/include"
-    export CGO_LDFLAGS="-L${CONDA_PREFIX}/lib -lcudart -lcuvs -lcuvs_c"
+    export CGO_LDFLAGS="-L${CONDA_PREFIX}/lib -lcuvs -lcuvs_c"
     export LD_LIBRARY_PATH="$CONDA_PREFIX/lib:$LD_LIBRARY_PATH"
     export CC=clang
 
@@ -259,11 +259,6 @@ cuVS has the following configurable cmake flags available:
    - OFF
    - Enable the `-lineinfo` option for nvcc
 
- * - CUDA_STATIC_RUNTIME
-   - ON, OFF
-   - OFF
-   - Statically link the CUDA runtime
-
  * - CUDA_STATIC_MATH_LIBRARIES
    - ON, OFF
    - OFF
diff --git a/examples/c/CMakeLists.txt b/examples/c/CMakeLists.txt
index feb7a03309..61092834c7 100644
--- a/examples/c/CMakeLists.txt
+++ b/examples/c/CMakeLists.txt
@@ -34,29 +34,29 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wextra -Werror")
 add_executable(CAGRA_C_EXAMPLE src/cagra_c_example.c)
 target_include_directories(CAGRA_C_EXAMPLE PUBLIC "$<BUILD_INTERFACE:${DLPACK_INCLUDE_DIR}>")
 target_link_libraries(
-  CAGRA_C_EXAMPLE PRIVATE cuvs::c_api CUDA::cudart $<TARGET_NAME_IF_EXISTS:conda_env>
+  CAGRA_C_EXAMPLE PRIVATE cuvs::c_api CUDA::cudart_static $<TARGET_NAME_IF_EXISTS:conda_env>
 )
 
 add_executable(L2_C_EXAMPLE src/L2_c_example.c)
 target_include_directories(L2_C_EXAMPLE PUBLIC "$<BUILD_INTERFACE:${DLPACK_INCLUDE_DIR}>")
 target_link_libraries(
-  L2_C_EXAMPLE PRIVATE cuvs::c_api CUDA::cudart $<TARGET_NAME_IF_EXISTS:conda_env>
+  L2_C_EXAMPLE PRIVATE cuvs::c_api CUDA::cudart_static $<TARGET_NAME_IF_EXISTS:conda_env>
 )
 
 add_executable(IVF_FLAT_C_EXAMPLE src/ivf_flat_c_example.c)
 target_include_directories(IVF_FLAT_C_EXAMPLE PUBLIC "$<BUILD_INTERFACE:${DLPACK_INCLUDE_DIR}>")
 target_link_libraries(
-  IVF_FLAT_C_EXAMPLE PRIVATE cuvs::c_api CUDA::cudart $<TARGET_NAME_IF_EXISTS:conda_env>
+  IVF_FLAT_C_EXAMPLE PRIVATE cuvs::c_api CUDA::cudart_static $<TARGET_NAME_IF_EXISTS:conda_env>
 )
 
 add_executable(IVF_PQ_C_EXAMPLE src/ivf_pq_c_example.c)
 target_include_directories(IVF_PQ_C_EXAMPLE PUBLIC "$<BUILD_INTERFACE:${DLPACK_INCLUDE_DIR}>")
 target_link_libraries(
-  IVF_PQ_C_EXAMPLE PRIVATE cuvs::c_api CUDA::cudart $<TARGET_NAME_IF_EXISTS:conda_env>
+  IVF_PQ_C_EXAMPLE PRIVATE cuvs::c_api CUDA::cudart_static $<TARGET_NAME_IF_EXISTS:conda_env>
 )
 
 add_executable(BRUTEFORCE_C_EXAMPLE src/bruteforce_c_example.c)
 target_include_directories(BRUTEFORCE_C_EXAMPLE PUBLIC "$<BUILD_INTERFACE:${DLPACK_INCLUDE_DIR}>")
 target_link_libraries(
-  BRUTEFORCE_C_EXAMPLE PRIVATE cuvs::c_api CUDA::cudart $<TARGET_NAME_IF_EXISTS:conda_env>
+  BRUTEFORCE_C_EXAMPLE PRIVATE cuvs::c_api CUDA::cudart_static $<TARGET_NAME_IF_EXISTS:conda_env>
 )
diff --git a/examples/go/README.md b/examples/go/README.md
index 798a024de4..819b05c700 100644
--- a/examples/go/README.md
+++ b/examples/go/README.md
@@ -17,7 +17,7 @@ You may prefer to use `mamba`, as it provides significant speedup over `conda`.
 1. Set up the required environment variables:
 ```bash
 export CGO_CFLAGS="-I${CONDA_PREFIX}/include"
-export CGO_LDFLAGS="-L${CONDA_PREFIX}/lib -lcudart -lcuvs -lcuvs_c"
+export CGO_LDFLAGS="-L${CONDA_PREFIX}/lib -lcudart_static -ldl -lrt -lcuvs -lcuvs_c"
 export LD_LIBRARY_PATH="$CONDA_PREFIX/lib:$LD_LIBRARY_PATH"
 export CC=clang
 ```
diff --git a/go/dlpack.go b/go/dlpack.go
index 6fe619fd35..fcb9632586 100644
--- a/go/dlpack.go
+++ b/go/dlpack.go
@@ -3,6 +3,7 @@ package cuvs
 // #include <stdlib.h>
 // #include <dlpack/dlpack.h>
 // #include <cuvs/core/c_api.h>
+// #include <cuda_runtime_api.h>
 import "C"
 
 import (
diff --git a/python/libcuvs/CMakeLists.txt b/python/libcuvs/CMakeLists.txt
index 318e82a2b9..bac8373cf7 100644
--- a/python/libcuvs/CMakeLists.txt
+++ b/python/libcuvs/CMakeLists.txt
@@ -31,7 +31,6 @@ endif()
 unset(cuvs_FOUND)
 
 # --- CUDA --- #
-set(CUDA_STATIC_RUNTIME ON)
 set(CUDA_STATIC_MATH_LIBRARIES OFF)
 
 # --- RAFT ---#
diff --git a/rust/cuvs-sys/build.rs b/rust/cuvs-sys/build.rs
index 99e811eeb9..cec80eb736 100644
--- a/rust/cuvs-sys/build.rs
+++ b/rust/cuvs-sys/build.rs
@@ -15,8 +15,11 @@ fn main() {
         "cargo:rustc-link-search=native={}/lib",
         cuvs_build.display()
     );
+    if let Ok(conda_prefix) = env::var("CONDA_PREFIX") {
+        println!("cargo:rustc-link-search=native={}/lib", conda_prefix);
+    }
     println!("cargo:rustc-link-lib=dylib=cuvs_c");
-    println!("cargo:rustc-link-lib=dylib=cudart");
+    println!("cargo:rustc-link-lib=static=cudart_static");
 
     // we need some extra flags both to link against cuvs, and also to run bindgen
     // specifically we need to:

From 56229e8788a7520d840f2decf25bb8784f70c7dc Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Wed, 18 Mar 2026 16:57:10 +0000
Subject: [PATCH 14/21] Opt out of rmm's cudart dependency

---
 cpp/CMakeLists.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 13db980b22..0eaf1388bf 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -773,6 +773,7 @@ if(NOT BUILD_CPU_ONLY)
       PRIVATE $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX> $<COMPILE_ONLY:nvidia::cutlass::cutlass>
               $<COMPILE_ONLY:cuco::cuco> $<$<BOOL:${JIT_LTO_COMPILATION}>:CUDA::nvJitLink>
     )
+    set_property(TARGET cuvs PROPERTY NO_CUDART_DEP ON)
 
     # ensure CUDA symbols aren't relocated to the middle of the debug build binaries
     file(
@@ -834,6 +835,7 @@ SECTIONS
               $<COMPILE_ONLY:nvidia::cutlass::cutlass>
               $<COMPILE_ONLY:cuco::cuco>
     )
+    set_property(TARGET cuvs_static PROPERTY NO_CUDART_DEP ON)
   endif()
 
   # ################################################################################################

From 0a0540afa3d8421a851d367dcc6c2dfefeddff4d Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Wed, 18 Mar 2026 17:48:52 +0000
Subject: [PATCH 15/21] Make rmm interface dependency COMPILE_ONLY

---
 cpp/CMakeLists.txt | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 0eaf1388bf..b9885eb553 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -763,15 +763,18 @@ if(NOT BUILD_CPU_ONLY)
 
     target_link_libraries(
       cuvs
-      PUBLIC rmm::rmm
-             raft::raft
+      INTERFACE $<COMPILE_ONLY:rmm::rmm>
+      PUBLIC raft::raft
              cuvs::cuvs_cpp_headers
              ${CUVS_CTK_MATH_DEPENDENCIES}
              $<BUILD_LOCAL_INTERFACE:$<TARGET_NAME_IF_EXISTS:NCCL::NCCL>>
              $<BUILD_LOCAL_INTERFACE:$<TARGET_NAME_IF_EXISTS:hnswlib::hnswlib>>
              $<$<BOOL:${CUVS_NVTX}>:CUDA::nvtx3>
-      PRIVATE $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX> $<COMPILE_ONLY:nvidia::cutlass::cutlass>
-              $<COMPILE_ONLY:cuco::cuco> $<$<BOOL:${JIT_LTO_COMPILATION}>:CUDA::nvJitLink>
+      PRIVATE rmm::rmm
+              $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
+              $<COMPILE_ONLY:nvidia::cutlass::cutlass>
+              $<COMPILE_ONLY:cuco::cuco>
+              $<$<BOOL:${JIT_LTO_COMPILATION}>:CUDA::nvJitLink>
     )
     set_property(TARGET cuvs PROPERTY NO_CUDART_DEP ON)
 
@@ -823,13 +826,14 @@ SECTIONS
 
     target_link_libraries(
       cuvs_static
-      PUBLIC rmm::rmm
-             raft::raft
+      INTERFACE $<COMPILE_ONLY:rmm::rmm>
+      PUBLIC raft::raft
              cuvs::cuvs_cpp_headers
              ${CUVS_CTK_MATH_DEPENDENCIES}
              $<TARGET_NAME_IF_EXISTS:NCCL::NCCL> # needs to be public for DT_NEEDED
              $<BUILD_LOCAL_INTERFACE:$<TARGET_NAME_IF_EXISTS:hnswlib::hnswlib>> # header only
-      PRIVATE $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
+      PRIVATE rmm::rmm
+              $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
               $<$<BOOL:${JIT_LTO_COMPILATION}>:CUDA::nvJitLink>
               $<$<BOOL:${CUVS_NVTX}>:CUDA::nvtx3>
               $<COMPILE_ONLY:nvidia::cutlass::cutlass>

From 17c5cd7d835b1e6bdad281f9c8e5a6dbc14c990f Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Thu, 19 Mar 2026 12:13:25 -0400
Subject: [PATCH 16/21] Push


From b6560bee00c8161a5587472221b0835176b326bb Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Mon, 23 Mar 2026 21:09:52 +0000
Subject: [PATCH 17/21] Debugging

---
 cpp/src/detail/jit_lto/AlgorithmPlanner.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cpp/src/detail/jit_lto/AlgorithmPlanner.cpp b/cpp/src/detail/jit_lto/AlgorithmPlanner.cpp
index 6622476687..3bdd988907 100644
--- a/cpp/src/detail/jit_lto/AlgorithmPlanner.cpp
+++ b/cpp/src/detail/jit_lto/AlgorithmPlanner.cpp
@@ -4,6 +4,7 @@
  */
 
 #include <chrono>
+#include <iostream>
 #include <iterator>
 #include <memory>
 #include <mutex>
@@ -65,6 +66,7 @@ std::shared_ptr<AlgorithmLauncher> AlgorithmPlanner::build()
   RAFT_CUDA_TRY(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device));
 
   std::string archs = "-arch=sm_" + std::to_string((major * 10 + minor));
+  std::cerr << "Passing argument to nvJitLink: " << archs << "\n";
 
   // Load the generated LTO IR and link them together
   nvJitLinkHandle handle;

From 84ddcf9752068805632eba66be186a66fc36fc5b Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Tue, 24 Mar 2026 13:37:45 +0000
Subject: [PATCH 18/21] Downgrade to compute 7.0 for CUDA 12

---
 cpp/CMakeLists.txt                          | 5 ++++-
 cpp/src/detail/jit_lto/AlgorithmPlanner.cpp | 2 --
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index d4dbdbfe02..bc51d58444 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -355,9 +355,12 @@ if(NOT BUILD_CPU_ONLY)
     )
   endif()
 
-  set(JIT_LTO_TARGET_ARCHITECTURE "75-real")
+  set(JIT_LTO_TARGET_ARCHITECTURE "70-real")
   set(JIT_LTO_COMPILATION ON)
   set(jit_lto_files)
+  if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
+    set(JIT_LTO_TARGET_ARCHITECTURE "75-real")
+  endif()
 
   if(JIT_LTO_COMPILATION)
     # Generate interleaved scan kernel files at build time
diff --git a/cpp/src/detail/jit_lto/AlgorithmPlanner.cpp b/cpp/src/detail/jit_lto/AlgorithmPlanner.cpp
index 3bdd988907..6622476687 100644
--- a/cpp/src/detail/jit_lto/AlgorithmPlanner.cpp
+++ b/cpp/src/detail/jit_lto/AlgorithmPlanner.cpp
@@ -4,7 +4,6 @@
  */
 
 #include <chrono>
-#include <iostream>
 #include <iterator>
 #include <memory>
 #include <mutex>
@@ -66,7 +65,6 @@ std::shared_ptr<AlgorithmLauncher> AlgorithmPlanner::build()
   RAFT_CUDA_TRY(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device));
 
   std::string archs = "-arch=sm_" + std::to_string((major * 10 + minor));
-  std::cerr << "Passing argument to nvJitLink: " << archs << "\n";
 
   // Load the generated LTO IR and link them together
   nvJitLinkHandle handle;

From a8493a3d670ef898828e1139c6158eca78ffd2a9 Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Tue, 24 Mar 2026 16:35:13 +0000
Subject: [PATCH 19/21] Remove JIT_LTO_COMPILATION variable

---
 cpp/CMakeLists.txt | 189 +++++++++++++++++++++------------------------
 1 file changed, 87 insertions(+), 102 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index bc51d58444..d6f82e8914 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -356,95 +356,89 @@ if(NOT BUILD_CPU_ONLY)
   endif()
 
   set(JIT_LTO_TARGET_ARCHITECTURE "70-real")
-  set(JIT_LTO_COMPILATION ON)
-  set(jit_lto_files)
   if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
     set(JIT_LTO_TARGET_ARCHITECTURE "75-real")
   endif()
 
-  if(JIT_LTO_COMPILATION)
-    # Generate interleaved scan kernel files at build time
-    include(cmake/modules/generate_jit_lto_kernels.cmake)
+  # Generate interleaved scan kernel files at build time
+  include(cmake/modules/generate_jit_lto_kernels.cmake)
 
-    add_library(jit_lto_kernel_usage_requirements INTERFACE)
-    target_include_directories(
-      jit_lto_kernel_usage_requirements
-      INTERFACE "${CMAKE_CURRENT_SOURCE_DIR}/include" "${CMAKE_CURRENT_SOURCE_DIR}/src"
-                "${CMAKE_CURRENT_SOURCE_DIR}/../c/include"
-    )
-    target_compile_options(
-      jit_lto_kernel_usage_requirements INTERFACE "$<$<COMPILE_LANGUAGE:CXX>:${CUVS_CXX_FLAGS}>"
-                                                  "$<$<COMPILE_LANGUAGE:CUDA>:${CUVS_CUDA_FLAGS}>"
-    )
-    target_compile_features(jit_lto_kernel_usage_requirements INTERFACE cuda_std_20)
-    target_link_libraries(
-      jit_lto_kernel_usage_requirements INTERFACE rmm::rmm raft::raft CCCL::CCCL
-    )
-
-    block(PROPAGATE interleaved_scan_files metric_files filter_files post_lambda_files)
-    set(CMAKE_CUDA_ARCHITECTURES ${JIT_LTO_TARGET_ARCHITECTURE})
-    generate_jit_lto_kernels(
-      interleaved_scan_files
-      NAME_FORMAT
-        "interleaved_scan_capacity_@capacity@_veclen_@veclen@_@ascending_descending@_@compute_norm_name@_data_@type_abbrev@_acc_@acc_abbrev@_idx_@idx_abbrev@"
-      MATRIX_JSON_FILE
-        "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/interleaved_scan_matrix.json"
-      KERNEL_INPUT_FILE
-        "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/interleaved_scan_kernel.cu.in"
-      EMBEDDED_INPUT_FILE
-        "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/interleaved_scan_embedded.cpp.in"
-      OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/interleaved_scan"
-      KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
-    )
-    generate_jit_lto_kernels(
-      metric_files
-      NAME_FORMAT "metric_@metric_name@_veclen_@veclen@_data_@type_abbrev@_acc_@acc_abbrev@"
-      MATRIX_JSON_FILE
-        "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/metric_matrix.json"
-      KERNEL_INPUT_FILE
-        "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/metric_kernel.cu.in"
-      EMBEDDED_INPUT_FILE
-        "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/metric_embedded.cpp.in"
-      OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/metric"
-      KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
-    )
-    generate_jit_lto_kernels(
-      filter_files
-      NAME_FORMAT "@filter_name@"
-      MATRIX_JSON_FILE
-        "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/filter_matrix.json"
-      KERNEL_INPUT_FILE
-        "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/filter_kernel.cu.in"
-      EMBEDDED_INPUT_FILE
-        "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/filter_embedded.cpp.in"
-      OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/filter"
-      KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
-    )
-    generate_jit_lto_kernels(
-      post_lambda_files
-      NAME_FORMAT "@post_lambda_name@"
-      MATRIX_JSON_FILE
-        "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/post_lambda_matrix.json"
-      KERNEL_INPUT_FILE
-        "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/post_lambda_kernel.cu.in"
-      EMBEDDED_INPUT_FILE
-        "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/post_lambda_embedded.cpp.in"
-      OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/post_lambda"
-      KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
-    )
-    endblock()
-
-    set(jit_lto_files
-        ${interleaved_scan_files}
-        ${metric_files}
-        ${filter_files}
-        ${post_lambda_files}
-        src/detail/jit_lto/AlgorithmLauncher.cpp
-        src/detail/jit_lto/AlgorithmPlanner.cpp
-        src/detail/jit_lto/FragmentEntry.cpp
-        src/detail/jit_lto/nvjitlink_checker.cpp
-    )
-  endif()
+  add_library(jit_lto_kernel_usage_requirements INTERFACE)
+  target_include_directories(
+    jit_lto_kernel_usage_requirements
+    INTERFACE "${CMAKE_CURRENT_SOURCE_DIR}/include" "${CMAKE_CURRENT_SOURCE_DIR}/src"
+              "${CMAKE_CURRENT_SOURCE_DIR}/../c/include"
+  )
+  target_compile_options(
+    jit_lto_kernel_usage_requirements INTERFACE "$<$<COMPILE_LANGUAGE:CXX>:${CUVS_CXX_FLAGS}>"
+                                                "$<$<COMPILE_LANGUAGE:CUDA>:${CUVS_CUDA_FLAGS}>"
+  )
+  target_compile_features(jit_lto_kernel_usage_requirements INTERFACE cuda_std_20)
+  target_link_libraries(jit_lto_kernel_usage_requirements INTERFACE rmm::rmm raft::raft CCCL::CCCL)
+
+  block(PROPAGATE interleaved_scan_files metric_files filter_files post_lambda_files)
+  set(CMAKE_CUDA_ARCHITECTURES ${JIT_LTO_TARGET_ARCHITECTURE})
+  generate_jit_lto_kernels(
+    interleaved_scan_files
+    NAME_FORMAT
+      "interleaved_scan_capacity_@capacity@_veclen_@veclen@_@ascending_descending@_@compute_norm_name@_data_@type_abbrev@_acc_@acc_abbrev@_idx_@idx_abbrev@"
+    MATRIX_JSON_FILE
+      "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/interleaved_scan_matrix.json"
+    KERNEL_INPUT_FILE
+      "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/interleaved_scan_kernel.cu.in"
+    EMBEDDED_INPUT_FILE
+      "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/interleaved_scan_embedded.cpp.in"
+    OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/interleaved_scan"
+    KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
+  )
+  generate_jit_lto_kernels(
+    metric_files
+    NAME_FORMAT "metric_@metric_name@_veclen_@veclen@_data_@type_abbrev@_acc_@acc_abbrev@"
+    MATRIX_JSON_FILE
+      "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/metric_matrix.json"
+    KERNEL_INPUT_FILE
+      "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/metric_kernel.cu.in"
+    EMBEDDED_INPUT_FILE
+      "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/metric_embedded.cpp.in"
+    OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/metric"
+    KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
+  )
+  generate_jit_lto_kernels(
+    filter_files
+    NAME_FORMAT "@filter_name@"
+    MATRIX_JSON_FILE
+      "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/filter_matrix.json"
+    KERNEL_INPUT_FILE
+      "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/filter_kernel.cu.in"
+    EMBEDDED_INPUT_FILE
+      "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/filter_embedded.cpp.in"
+    OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/filter"
+    KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
+  )
+  generate_jit_lto_kernels(
+    post_lambda_files
+    NAME_FORMAT "@post_lambda_name@"
+    MATRIX_JSON_FILE
+      "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/post_lambda_matrix.json"
+    KERNEL_INPUT_FILE
+      "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/post_lambda_kernel.cu.in"
+    EMBEDDED_INPUT_FILE
+      "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/post_lambda_embedded.cpp.in"
+    OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/post_lambda"
+    KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
+  )
+  endblock()
+
+  set(jit_lto_files
+      ${interleaved_scan_files}
+      ${metric_files}
+      ${filter_files}
+      ${post_lambda_files}
+      src/detail/jit_lto/AlgorithmLauncher.cpp
+      src/detail/jit_lto/AlgorithmPlanner.cpp
+      src/detail/jit_lto/FragmentEntry.cpp
+      src/detail/jit_lto/nvjitlink_checker.cpp
+  )
 
   add_library(
     cuvs_objs OBJECT
@@ -676,10 +670,8 @@ if(NOT BUILD_CPU_ONLY)
   )
 
   target_compile_definitions(
-    cuvs_objs
-    PRIVATE $<$<BOOL:${BUILD_CAGRA_HNSWLIB}>:CUVS_BUILD_CAGRA_HNSWLIB>
-            $<$<BOOL:${CUVS_NVTX}>:NVTX_ENABLED>
-            $<$<BOOL:${JIT_LTO_COMPILATION}>:CUVS_ENABLE_JIT_LTO>
+    cuvs_objs PRIVATE $<$<BOOL:${BUILD_CAGRA_HNSWLIB}>:CUVS_BUILD_CAGRA_HNSWLIB>
+                      $<$<BOOL:${CUVS_NVTX}>:NVTX_ENABLED> CUVS_ENABLE_JIT_LTO
   )
 
   target_link_libraries(
@@ -754,10 +746,8 @@ if(NOT BUILD_CPU_ONLY)
               "$<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CONFIG:Debug>>:${CUVS_DEBUG_CUDA_FLAGS}>"
     )
     target_compile_definitions(
-      cuvs
-      PUBLIC $<$<BOOL:${BUILD_CAGRA_HNSWLIB}>:CUVS_BUILD_CAGRA_HNSWLIB>
-             $<$<BOOL:${CUVS_NVTX}>:NVTX_ENABLED>
-             $<$<BOOL:${JIT_LTO_COMPILATION}>:CUVS_ENABLE_JIT_LTO>
+      cuvs PUBLIC $<$<BOOL:${BUILD_CAGRA_HNSWLIB}>:CUVS_BUILD_CAGRA_HNSWLIB>
+                  $<$<BOOL:${CUVS_NVTX}>:NVTX_ENABLED> CUVS_ENABLE_JIT_LTO
     )
 
     target_link_libraries(
@@ -769,11 +759,8 @@ if(NOT BUILD_CPU_ONLY)
              $<BUILD_LOCAL_INTERFACE:$<TARGET_NAME_IF_EXISTS:NCCL::NCCL>>
              $<BUILD_LOCAL_INTERFACE:$<TARGET_NAME_IF_EXISTS:hnswlib::hnswlib>>
              $<$<BOOL:${CUVS_NVTX}>:CUDA::nvtx3>
-      PRIVATE rmm::rmm
-              $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
-              $<COMPILE_ONLY:nvidia::cutlass::cutlass>
-              $<COMPILE_ONLY:cuco::cuco>
-              $<$<BOOL:${JIT_LTO_COMPILATION}>:CUDA::nvJitLink>
+      PRIVATE rmm::rmm $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
+              $<COMPILE_ONLY:nvidia::cutlass::cutlass> $<COMPILE_ONLY:cuco::cuco> CUDA::nvJitLink
     )
     set_property(TARGET cuvs PROPERTY NO_CUDART_DEP ON)
 
@@ -812,10 +799,8 @@ SECTIONS
 
     target_compile_options(cuvs_static PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${CUVS_CXX_FLAGS}>")
     target_compile_definitions(
-      cuvs_static
-      PUBLIC $<$<BOOL:${BUILD_CAGRA_HNSWLIB}>:CUVS_BUILD_CAGRA_HNSWLIB>
-             $<$<BOOL:${CUVS_NVTX}>:NVTX_ENABLED>
-             $<$<BOOL:${JIT_LTO_COMPILATION}>:CUVS_ENABLE_JIT_LTO>
+      cuvs_static PUBLIC $<$<BOOL:${BUILD_CAGRA_HNSWLIB}>:CUVS_BUILD_CAGRA_HNSWLIB>
+                         $<$<BOOL:${CUVS_NVTX}>:NVTX_ENABLED> CUVS_ENABLE_JIT_LTO
     )
 
     target_include_directories(cuvs_static INTERFACE "$<INSTALL_INTERFACE:include>")
@@ -833,7 +818,7 @@ SECTIONS
              $<BUILD_LOCAL_INTERFACE:$<TARGET_NAME_IF_EXISTS:hnswlib::hnswlib>> # header only
       PRIVATE rmm::rmm
               $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
-              $<$<BOOL:${JIT_LTO_COMPILATION}>:CUDA::nvJitLink>
+              CUDA::nvJitLink
               $<$<BOOL:${CUVS_NVTX}>:CUDA::nvtx3>
               $<COMPILE_ONLY:nvidia::cutlass::cutlass>
               $<COMPILE_ONLY:cuco::cuco>

From 997ab66afb9a8181d82c1b27c9557e56bcfe31b2 Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Tue, 24 Mar 2026 16:41:22 +0000
Subject: [PATCH 20/21] Remove CUVS_ENABLE_JIT_LTO preprocessor definition

---
 cpp/CMakeLists.txt                            |    6 +-
 .../ivf_flat/ivf_flat_interleaved_scan.cuh    | 1336 -----------------
 ...vf_flat_interleaved_scan_explicit_inst.cuh |    4 -
 3 files changed, 3 insertions(+), 1343 deletions(-)
 delete mode 100644 cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan.cuh

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index d6f82e8914..8f2067fa78 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -671,7 +671,7 @@ if(NOT BUILD_CPU_ONLY)
 
   target_compile_definitions(
     cuvs_objs PRIVATE $<$<BOOL:${BUILD_CAGRA_HNSWLIB}>:CUVS_BUILD_CAGRA_HNSWLIB>
-                      $<$<BOOL:${CUVS_NVTX}>:NVTX_ENABLED> CUVS_ENABLE_JIT_LTO
+                      $<$<BOOL:${CUVS_NVTX}>:NVTX_ENABLED>
   )
 
   target_link_libraries(
@@ -747,7 +747,7 @@ if(NOT BUILD_CPU_ONLY)
     )
     target_compile_definitions(
       cuvs PUBLIC $<$<BOOL:${BUILD_CAGRA_HNSWLIB}>:CUVS_BUILD_CAGRA_HNSWLIB>
-                  $<$<BOOL:${CUVS_NVTX}>:NVTX_ENABLED> CUVS_ENABLE_JIT_LTO
+                  $<$<BOOL:${CUVS_NVTX}>:NVTX_ENABLED>
     )
 
     target_link_libraries(
@@ -800,7 +800,7 @@ SECTIONS
     target_compile_options(cuvs_static PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${CUVS_CXX_FLAGS}>")
     target_compile_definitions(
       cuvs_static PUBLIC $<$<BOOL:${BUILD_CAGRA_HNSWLIB}>:CUVS_BUILD_CAGRA_HNSWLIB>
-                         $<$<BOOL:${CUVS_NVTX}>:NVTX_ENABLED> CUVS_ENABLE_JIT_LTO
+                         $<$<BOOL:${CUVS_NVTX}>:NVTX_ENABLED>
     )
 
     target_include_directories(cuvs_static INTERFACE "$<INSTALL_INTERFACE:include>")
diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan.cuh b/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan.cuh
deleted file mode 100644
index 4c0bb3644a..0000000000
--- a/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan.cuh
+++ /dev/null
@@ -1,1336 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-#pragma once
-
-#include "../ivf_common.cuh"
-#include "../sample_filter.cuh"
-#include <cuvs/neighbors/common.hpp>
-#include <cuvs/neighbors/ivf_flat.hpp>
-
-#include "../detail/ann_utils.cuh"
-#include <cuvs/distance/distance.hpp>
-#include <raft/core/logger.hpp>
-#include <raft/core/operators.hpp>
-#include <raft/matrix/detail/select_warpsort.cuh>
-#include <raft/util/cuda_rt_essentials.hpp>  // RAFT_CUDA_TRY
-#include <raft/util/device_loads_stores.cuh>
-#include <raft/util/integer_utils.hpp>
-#include <raft/util/pow2_utils.cuh>
-#include <raft/util/vectorized.cuh>
-
-#include <rmm/cuda_stream_view.hpp>
-
-namespace cuvs::neighbors::ivf_flat::detail {
-
-using namespace cuvs::spatial::knn::detail;  // NOLINT
-
-constexpr int kThreadsPerBlock = 128;
-
-/**
- * @brief Copy `n` elements per block from one place to another.
- *
- * @param[out] out target pointer (unique per block)
- * @param[in] in source pointer
- * @param n number of elements to copy
- */
-template <int VecBytes = 16, typename T>
-__device__ inline void copy_vectorized(T* out, const T* in, uint32_t n)
-{
-  constexpr int VecElems = VecBytes / sizeof(T);  // NOLINT
-  using align_bytes      = raft::Pow2<(size_t)VecBytes>;
-  if constexpr (VecElems > 1) {
-    using align_elems = raft::Pow2<VecElems>;
-    if (!align_bytes::areSameAlignOffsets(out, in)) {
-      return copy_vectorized<(VecBytes >> 1), T>(out, in, n);
-    }
-    {  // process unaligned head
-      uint32_t head = align_bytes::roundUp(in) - in;
-      if (head > 0) {
-        copy_vectorized<sizeof(T), T>(out, in, head);
-        n -= head;
-        in += head;
-        out += head;
-      }
-    }
-    {  // process main part vectorized
-      using vec_t = typename raft::IOType<T, VecElems>::Type;
-      copy_vectorized<sizeof(vec_t), vec_t>(
-        reinterpret_cast<vec_t*>(out), reinterpret_cast<const vec_t*>(in), align_elems::div(n));
-    }
-    {  // process unaligned tail
-      uint32_t tail = align_elems::mod(n);
-      if (tail > 0) {
-        n -= tail;
-        copy_vectorized<sizeof(T), T>(out + n, in + n, tail);
-      }
-    }
-  }
-  if constexpr (VecElems <= 1) {
-    for (int i = threadIdx.x; i < n; i += blockDim.x) {
-      out[i] = in[i];
-    }
-  }
-}
-
-/**
- * @brief Load a part of a vector from the index and from query, compute the (part of the) distance
- * between them, and aggregate it using the provided Lambda; one structure per thread, per query,
- * and per index item.
- *
- * @tparam kUnroll elements per loop (normally, kUnroll = WarpSize / Veclen)
- * @tparam Lambda computing the part of the distance for one dimension and aggregating it:
- *                void (AccT& acc, AccT x, AccT y)
- * @tparam Veclen size of the vectorized load
- * @tparam T type of the data in the query and the index
- * @tparam AccT type of the accumulated value (an optimization for 8bit values to be loaded as 32bit
- * values)
- */
-template <int kUnroll, typename Lambda, int Veclen, typename T, typename AccT, bool ComputeNorm>
-struct loadAndComputeDist {
-  Lambda compute_dist;
-  AccT& dist;
-  AccT& norm_query;
-  AccT& norm_data;
-
-  __device__ __forceinline__
-  loadAndComputeDist(AccT& dist, Lambda op, AccT& norm_query, AccT& norm_data)
-    : dist(dist), compute_dist(op), norm_query(norm_query), norm_data(norm_data)
-  {
-  }
-
-  /**
-   * Load parts of vectors from the index and query and accumulates the partial distance.
-   * This version assumes the query is stored in shared memory.
-   * Every thread here processes exactly kUnroll * Veclen elements independently of others.
-   */
-  template <typename IdxT>
-  __device__ __forceinline__ void runLoadShmemCompute(const T* const& data,
-                                                      const T* query_shared,
-                                                      IdxT loadIndex,
-                                                      IdxT shmemIndex)
-  {
-#pragma unroll
-    for (int j = 0; j < kUnroll; ++j) {
-      T encV[Veclen];
-      raft::ldg(encV, data + (loadIndex + j * kIndexGroupSize) * Veclen);
-      T queryRegs[Veclen];
-      raft::lds(queryRegs, &query_shared[shmemIndex + j * Veclen]);
-#pragma unroll
-      for (int k = 0; k < Veclen; ++k) {
-        compute_dist(dist, queryRegs[k], encV[k]);
-        if constexpr (ComputeNorm) {
-          norm_query += queryRegs[k] * queryRegs[k];
-          norm_data += encV[k] * encV[k];
-        }
-      }
-    }
-  }
-
-  /**
-   * Load parts of vectors from the index and query and accumulates the partial distance.
-   * This version assumes the query is stored in the global memory and is different for every
-   * thread. One warp loads exactly WarpSize query elements at once and then reshuffles them into
-   * corresponding threads (`WarpSize / (kUnroll * Veclen)` elements per thread at once).
-   */
-  template <typename IdxT>
-  __device__ __forceinline__ void runLoadShflAndCompute(const T*& data,
-                                                        const T* query,
-                                                        IdxT baseLoadIndex,
-                                                        const int lane_id)
-  {
-    T queryReg               = query[baseLoadIndex + lane_id];
-    constexpr int stride     = kUnroll * Veclen;
-    constexpr int totalIter  = raft::WarpSize / stride;
-    constexpr int gmemStride = stride * kIndexGroupSize;
-#pragma unroll
-    for (int i = 0; i < totalIter; ++i, data += gmemStride) {
-#pragma unroll
-      for (int j = 0; j < kUnroll; ++j) {
-        T encV[Veclen];
-        raft::ldg(encV, data + (lane_id + j * kIndexGroupSize) * Veclen);
-        const int d = (i * kUnroll + j) * Veclen;
-#pragma unroll
-        for (int k = 0; k < Veclen; ++k) {
-          T q = raft::shfl(queryReg, d + k, raft::WarpSize);
-          compute_dist(dist, q, encV[k]);
-          if constexpr (ComputeNorm) {
-            norm_query += q * q;
-            norm_data += encV[k] * encV[k];
-          }
-        }
-      }
-    }
-  }
-
-  /**
-   * Load parts of vectors from the index and query and accumulates the partial distance.
-   * This version augments `runLoadShflAndCompute` when `dim` is not a multiple of `WarpSize`.
-   */
-  __device__ __forceinline__ void runLoadShflAndComputeRemainder(
-    const T*& data, const T* query, const int lane_id, const int dim, const int dimBlocks)
-  {
-    const int loadDim     = dimBlocks + lane_id;
-    T queryReg            = loadDim < dim ? query[loadDim] : T{0};
-    const int loadDataIdx = lane_id * Veclen;
-    for (int d = 0; d < dim - dimBlocks; d += Veclen, data += kIndexGroupSize * Veclen) {
-      T enc[Veclen];
-      raft::ldg(enc, data + loadDataIdx);
-#pragma unroll
-      for (int k = 0; k < Veclen; k++) {
-        T q = raft::shfl(queryReg, d + k, raft::WarpSize);
-        compute_dist(dist, q, enc[k]);
-        if constexpr (ComputeNorm) {
-          norm_query += q * q;
-          norm_data += enc[k] * enc[k];
-        }
-      }
-    }
-  }
-};
-
-// This handles uint8_t 8, 16 Veclens
-template <int kUnroll, typename Lambda, int uint8_veclen, bool ComputeNorm>
-struct loadAndComputeDist<kUnroll, Lambda, uint8_veclen, uint8_t, uint32_t, ComputeNorm> {
-  Lambda compute_dist;
-  uint32_t& dist;
-  uint32_t& norm_query;
-  uint32_t& norm_data;
-
-  __device__ __forceinline__
-  loadAndComputeDist(uint32_t& dist, Lambda op, uint32_t& norm_query, uint32_t& norm_data)
-    : dist(dist), compute_dist(op), norm_query(norm_query), norm_data(norm_data)
-  {
-  }
-
-  __device__ __forceinline__ void runLoadShmemCompute(const uint8_t* const& data,
-                                                      const uint8_t* query_shared,
-                                                      int loadIndex,
-                                                      int shmemIndex)
-  {
-    constexpr int veclen_int = uint8_veclen / 4;  // converting uint8_t veclens to int
-    loadIndex                = loadIndex * veclen_int;
-#pragma unroll
-    for (int j = 0; j < kUnroll; ++j) {
-      uint32_t encV[veclen_int];
-      raft::ldg(
-        encV,
-        reinterpret_cast<unsigned const*>(data) + loadIndex + j * kIndexGroupSize * veclen_int);
-      uint32_t queryRegs[veclen_int];
-      raft::lds(queryRegs,
-                reinterpret_cast<unsigned const*>(query_shared + shmemIndex) + j * veclen_int);
-#pragma unroll
-      for (int k = 0; k < veclen_int; k++) {
-        compute_dist(dist, queryRegs[k], encV[k]);
-        if constexpr (ComputeNorm) {
-          norm_query = raft::dp4a(queryRegs[k], queryRegs[k], norm_query);
-          norm_data  = raft::dp4a(encV[k], encV[k], norm_data);
-        }
-      }
-    }
-  }
-  __device__ __forceinline__ void runLoadShflAndCompute(const uint8_t*& data,
-                                                        const uint8_t* query,
-                                                        int baseLoadIndex,
-                                                        const int lane_id)
-  {
-    constexpr int veclen_int = uint8_veclen / 4;  // converting uint8_t veclens to int
-    uint32_t queryReg =
-      (lane_id < 8) ? reinterpret_cast<unsigned const*>(query + baseLoadIndex)[lane_id] : 0;
-    constexpr int stride = kUnroll * uint8_veclen;
-
-#pragma unroll
-    for (int i = 0; i < raft::WarpSize / stride; ++i, data += stride * kIndexGroupSize) {
-#pragma unroll
-      for (int j = 0; j < kUnroll; ++j) {
-        uint32_t encV[veclen_int];
-        raft::ldg(
-          encV,
-          reinterpret_cast<unsigned const*>(data) + (lane_id + j * kIndexGroupSize) * veclen_int);
-        const int d = (i * kUnroll + j) * veclen_int;
-#pragma unroll
-        for (int k = 0; k < veclen_int; ++k) {
-          uint32_t q = raft::shfl(queryReg, d + k, raft::WarpSize);
-          compute_dist(dist, q, encV[k]);
-          if constexpr (ComputeNorm) {
-            norm_query = raft::dp4a(q, q, norm_query);
-            norm_data  = raft::dp4a(encV[k], encV[k], norm_data);
-          }
-        }
-      }
-    }
-  }
-
-  __device__ __forceinline__ void runLoadShflAndComputeRemainder(const uint8_t*& data,
-                                                                 const uint8_t* query,
-                                                                 const int lane_id,
-                                                                 const int dim,
-                                                                 const int dimBlocks)
-  {
-    constexpr int veclen_int = uint8_veclen / 4;
-    const int loadDim        = dimBlocks + lane_id * 4;  // Here 4 is for 1 - int
-    uint32_t queryReg = loadDim < dim ? reinterpret_cast<uint32_t const*>(query + loadDim)[0] : 0;
-    for (int d = 0; d < dim - dimBlocks;
-         d += uint8_veclen, data += kIndexGroupSize * uint8_veclen) {
-      uint32_t enc[veclen_int];
-      raft::ldg(enc, reinterpret_cast<uint32_t const*>(data) + lane_id * veclen_int);
-#pragma unroll
-      for (int k = 0; k < veclen_int; k++) {
-        uint32_t q = raft::shfl(queryReg, (d / 4) + k, raft::WarpSize);
-        compute_dist(dist, q, enc[k]);
-        if constexpr (ComputeNorm) {
-          norm_query = raft::dp4a(q, q, norm_query);
-          norm_data  = raft::dp4a(enc[k], enc[k], norm_data);
-        }
-      }
-    }
-  }
-};
-
-// Keep this specialized uint8 Veclen = 4, because compiler is generating suboptimal code while
-// using above common template of int2/int4
-template <int kUnroll, typename Lambda, bool ComputeNorm>
-struct loadAndComputeDist<kUnroll, Lambda, 4, uint8_t, uint32_t, ComputeNorm> {
-  Lambda compute_dist;
-  uint32_t& dist;
-  uint32_t& norm_query;
-  uint32_t& norm_data;
-
-  __device__ __forceinline__
-  loadAndComputeDist(uint32_t& dist, Lambda op, uint32_t& norm_query, uint32_t& norm_data)
-    : dist(dist), compute_dist(op), norm_query(norm_query), norm_data(norm_data)
-  {
-  }
-
-  __device__ __forceinline__ void runLoadShmemCompute(const uint8_t* const& data,
-                                                      const uint8_t* query_shared,
-                                                      int loadIndex,
-                                                      int shmemIndex)
-  {
-#pragma unroll
-    for (int j = 0; j < kUnroll; ++j) {
-      uint32_t encV      = reinterpret_cast<unsigned const*>(data)[loadIndex + j * kIndexGroupSize];
-      uint32_t queryRegs = reinterpret_cast<unsigned const*>(query_shared + shmemIndex)[j];
-      compute_dist(dist, queryRegs, encV);
-      if constexpr (ComputeNorm) {
-        norm_query = raft::dp4a(queryRegs, queryRegs, norm_query);
-        norm_data  = raft::dp4a(encV, encV, norm_data);
-      }
-    }
-  }
-  __device__ __forceinline__ void runLoadShflAndCompute(const uint8_t*& data,
-                                                        const uint8_t* query,
-                                                        int baseLoadIndex,
-                                                        const int lane_id)
-  {
-    uint32_t queryReg =
-      (lane_id < 8) ? reinterpret_cast<unsigned const*>(query + baseLoadIndex)[lane_id] : 0;
-    constexpr int veclen = 4;
-    constexpr int stride = kUnroll * veclen;
-
-#pragma unroll
-    for (int i = 0; i < raft::WarpSize / stride; ++i, data += stride * kIndexGroupSize) {
-#pragma unroll
-      for (int j = 0; j < kUnroll; ++j) {
-        uint32_t encV = reinterpret_cast<unsigned const*>(data)[lane_id + j * kIndexGroupSize];
-        uint32_t q    = raft::shfl(queryReg, i * kUnroll + j, raft::WarpSize);
-        compute_dist(dist, q, encV);
-        if constexpr (ComputeNorm) {
-          norm_query = raft::dp4a(q, q, norm_query);
-          norm_data  = raft::dp4a(encV, encV, norm_data);
-        }
-      }
-    }
-  }
-
-  __device__ __forceinline__ void runLoadShflAndComputeRemainder(const uint8_t*& data,
-                                                                 const uint8_t* query,
-                                                                 const int lane_id,
-                                                                 const int dim,
-                                                                 const int dimBlocks)
-  {
-    constexpr int veclen = 4;
-    const int loadDim    = dimBlocks + lane_id;
-    uint32_t queryReg    = loadDim < dim ? reinterpret_cast<unsigned const*>(query)[loadDim] : 0;
-    for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) {
-      uint32_t enc = reinterpret_cast<unsigned const*>(data)[lane_id];
-      uint32_t q   = raft::shfl(queryReg, d / veclen, raft::WarpSize);
-      compute_dist(dist, q, enc);
-      if constexpr (ComputeNorm) {
-        norm_query = raft::dp4a(q, q, norm_query);
-        norm_data  = raft::dp4a(enc, enc, norm_data);
-      }
-    }
-  }
-};
-
-template <int kUnroll, typename Lambda, bool ComputeNorm>
-struct loadAndComputeDist<kUnroll, Lambda, 2, uint8_t, uint32_t, ComputeNorm> {
-  Lambda compute_dist;
-  uint32_t& dist;
-  uint32_t& norm_query;
-  uint32_t& norm_data;
-
-  __device__ __forceinline__
-  loadAndComputeDist(uint32_t& dist, Lambda op, uint32_t& norm_query, uint32_t& norm_data)
-    : dist(dist), compute_dist(op), norm_query(norm_query), norm_data(norm_data)
-  {
-  }
-
-  __device__ __forceinline__ void runLoadShmemCompute(const uint8_t* const& data,
-                                                      const uint8_t* query_shared,
-                                                      int loadIndex,
-                                                      int shmemIndex)
-  {
-#pragma unroll
-    for (int j = 0; j < kUnroll; ++j) {
-      uint32_t encV      = reinterpret_cast<uint16_t const*>(data)[loadIndex + j * kIndexGroupSize];
-      uint32_t queryRegs = reinterpret_cast<uint16_t const*>(query_shared + shmemIndex)[j];
-      compute_dist(dist, queryRegs, encV);
-      if constexpr (ComputeNorm) {
-        norm_query = raft::dp4a(queryRegs, queryRegs, norm_query);
-        norm_data  = raft::dp4a(encV, encV, norm_data);
-      }
-    }
-  }
-
-  __device__ __forceinline__ void runLoadShflAndCompute(const uint8_t*& data,
-                                                        const uint8_t* query,
-                                                        int baseLoadIndex,
-                                                        const int lane_id)
-  {
-    uint32_t queryReg =
-      (lane_id < 16) ? reinterpret_cast<uint16_t const*>(query + baseLoadIndex)[lane_id] : 0;
-    constexpr int veclen = 2;
-    constexpr int stride = kUnroll * veclen;
-
-#pragma unroll
-    for (int i = 0; i < raft::WarpSize / stride; ++i, data += stride * kIndexGroupSize) {
-#pragma unroll
-      for (int j = 0; j < kUnroll; ++j) {
-        uint32_t encV = reinterpret_cast<uint16_t const*>(data)[lane_id + j * kIndexGroupSize];
-        uint32_t q    = raft::shfl(queryReg, i * kUnroll + j, raft::WarpSize);
-        compute_dist(dist, q, encV);
-        if constexpr (ComputeNorm) {
-          norm_query = raft::dp4a(q, q, norm_query);
-          norm_data  = raft::dp4a(encV, encV, norm_data);
-        }
-      }
-    }
-  }
-
-  __device__ __forceinline__ void runLoadShflAndComputeRemainder(const uint8_t*& data,
-                                                                 const uint8_t* query,
-                                                                 const int lane_id,
-                                                                 const int dim,
-                                                                 const int dimBlocks)
-  {
-    constexpr int veclen = 2;
-    int loadDim          = dimBlocks + lane_id * veclen;
-    uint32_t queryReg = loadDim < dim ? reinterpret_cast<uint16_t const*>(query + loadDim)[0] : 0;
-    for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) {
-      uint32_t enc = reinterpret_cast<uint16_t const*>(data)[lane_id];
-      uint32_t q   = raft::shfl(queryReg, d / veclen, raft::WarpSize);
-      compute_dist(dist, q, enc);
-      if constexpr (ComputeNorm) {
-        norm_query = raft::dp4a(q, q, norm_query);
-        norm_data  = raft::dp4a(enc, enc, norm_data);
-      }
-    }
-  }
-};
-
-template <int kUnroll, typename Lambda, bool ComputeNorm>
-struct loadAndComputeDist<kUnroll, Lambda, 1, uint8_t, uint32_t, ComputeNorm> {
-  Lambda compute_dist;
-  uint32_t& dist;
-  uint32_t& norm_query;
-  uint32_t& norm_data;
-
-  __device__ __forceinline__
-  loadAndComputeDist(uint32_t& dist, Lambda op, uint32_t& norm_query, uint32_t& norm_data)
-    : dist(dist), compute_dist(op), norm_query(norm_query), norm_data(norm_data)
-  {
-  }
-
-  __device__ __forceinline__ void runLoadShmemCompute(const uint8_t* const& data,
-                                                      const uint8_t* query_shared,
-                                                      int loadIndex,
-                                                      int shmemIndex)
-  {
-#pragma unroll
-    for (int j = 0; j < kUnroll; ++j) {
-      uint32_t encV      = data[loadIndex + j * kIndexGroupSize];
-      uint32_t queryRegs = query_shared[shmemIndex + j];
-      compute_dist(dist, queryRegs, encV);
-      if constexpr (ComputeNorm) {
-        norm_query += queryRegs * queryRegs;
-        norm_data += encV * encV;
-      }
-    }
-  }
-
-  __device__ __forceinline__ void runLoadShflAndCompute(const uint8_t*& data,
-                                                        const uint8_t* query,
-                                                        int baseLoadIndex,
-                                                        const int lane_id)
-  {
-    uint32_t queryReg    = query[baseLoadIndex + lane_id];
-    constexpr int veclen = 1;
-    constexpr int stride = kUnroll * veclen;
-
-#pragma unroll
-    for (int i = 0; i < raft::WarpSize / stride; ++i, data += stride * kIndexGroupSize) {
-#pragma unroll
-      for (int j = 0; j < kUnroll; ++j) {
-        uint32_t encV = data[lane_id + j * kIndexGroupSize];
-        uint32_t q    = raft::shfl(queryReg, i * kUnroll + j, raft::WarpSize);
-        compute_dist(dist, q, encV);
-        if constexpr (ComputeNorm) {
-          norm_query += q * q;
-          norm_data += encV * encV;
-        }
-      }
-    }
-  }
-
-  __device__ __forceinline__ void runLoadShflAndComputeRemainder(const uint8_t*& data,
-                                                                 const uint8_t* query,
-                                                                 const int lane_id,
-                                                                 const int dim,
-                                                                 const int dimBlocks)
-  {
-    constexpr int veclen = 1;
-    int loadDim          = dimBlocks + lane_id;
-    uint32_t queryReg    = loadDim < dim ? query[loadDim] : 0;
-    for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) {
-      uint32_t enc = data[lane_id];
-      uint32_t q   = raft::shfl(queryReg, d, raft::WarpSize);
-      compute_dist(dist, q, enc);
-      if constexpr (ComputeNorm) {
-        norm_query += q * q;
-        norm_data += enc * enc;
-      }
-    }
-  }
-};
-
-// This device function is for int8 veclens 4, 8 and 16
-template <int kUnroll, typename Lambda, int int8_veclen, bool ComputeNorm>
-struct loadAndComputeDist<kUnroll, Lambda, int8_veclen, int8_t, int32_t, ComputeNorm> {
-  Lambda compute_dist;
-  int32_t& dist;
-  int32_t& norm_query;
-  int32_t& norm_data;
-
-  __device__ __forceinline__
-  loadAndComputeDist(int32_t& dist, Lambda op, int32_t& norm_query, int32_t& norm_data)
-    : dist(dist), compute_dist(op), norm_query(norm_query), norm_data(norm_data)
-  {
-  }
-
-  __device__ __forceinline__ void runLoadShmemCompute(const int8_t* const& data,
-                                                      const int8_t* query_shared,
-                                                      int loadIndex,
-                                                      int shmemIndex)
-  {
-    constexpr int veclen_int = int8_veclen / 4;  // converting int8_t veclens to int
-
-#pragma unroll
-    for (int j = 0; j < kUnroll; ++j) {
-      int32_t encV[veclen_int];
-      raft::ldg(
-        encV,
-        reinterpret_cast<int32_t const*>(data) + (loadIndex + j * kIndexGroupSize) * veclen_int);
-      int32_t queryRegs[veclen_int];
-      raft::lds(queryRegs,
-                reinterpret_cast<int32_t const*>(query_shared + shmemIndex) + j * veclen_int);
-#pragma unroll
-      for (int k = 0; k < veclen_int; k++) {
-        compute_dist(dist, queryRegs[k], encV[k]);
-        if constexpr (ComputeNorm) {
-          norm_query = raft::dp4a(queryRegs[k], queryRegs[k], norm_query);
-          norm_data  = raft::dp4a(encV[k], encV[k], norm_data);
-        }
-      }
-    }
-  }
-
-  __device__ __forceinline__ void runLoadShflAndCompute(const int8_t*& data,
-                                                        const int8_t* query,
-                                                        int baseLoadIndex,
-                                                        const int lane_id)
-  {
-    constexpr int veclen_int = int8_veclen / 4;  // converting int8_t veclens to int
-
-    int32_t queryReg =
-      (lane_id < 8) ? reinterpret_cast<int32_t const*>(query + baseLoadIndex)[lane_id] : 0;
-    constexpr int stride = kUnroll * int8_veclen;
-
-#pragma unroll
-    for (int i = 0; i < raft::WarpSize / stride; ++i, data += stride * kIndexGroupSize) {
-#pragma unroll
-      for (int j = 0; j < kUnroll; ++j) {
-        int32_t encV[veclen_int];
-        raft::ldg(
-          encV,
-          reinterpret_cast<int32_t const*>(data) + (lane_id + j * kIndexGroupSize) * veclen_int);
-        const int d = (i * kUnroll + j) * veclen_int;
-#pragma unroll
-        for (int k = 0; k < veclen_int; ++k) {
-          int32_t q = raft::shfl(queryReg, d + k, raft::WarpSize);
-          compute_dist(dist, q, encV[k]);
-          if constexpr (ComputeNorm) {
-            norm_query = raft::dp4a(q, q, norm_query);
-            norm_data  = raft::dp4a(encV[k], encV[k], norm_data);
-          }
-        }
-      }
-    }
-  }
-
-  __device__ __forceinline__ void runLoadShflAndComputeRemainder(
-    const int8_t*& data, const int8_t* query, const int lane_id, const int dim, const int dimBlocks)
-  {
-    constexpr int veclen_int = int8_veclen / 4;
-    const int loadDim        = dimBlocks + lane_id * 4;  // Here 4 is for 1 - int;
-    int32_t queryReg = loadDim < dim ? reinterpret_cast<int32_t const*>(query + loadDim)[0] : 0;
-    for (int d = 0; d < dim - dimBlocks; d += int8_veclen, data += kIndexGroupSize * int8_veclen) {
-      int32_t enc[veclen_int];
-      raft::ldg(enc, reinterpret_cast<int32_t const*>(data) + lane_id * veclen_int);
-#pragma unroll
-      for (int k = 0; k < veclen_int; k++) {
-        int32_t q = raft::shfl(queryReg, (d / 4) + k, raft::WarpSize);  // Here 4 is for 1 - int;
-        compute_dist(dist, q, enc[k]);
-        if constexpr (ComputeNorm) {
-          norm_query = raft::dp4a(q, q, norm_query);
-          norm_data  = raft::dp4a(enc[k], enc[k], norm_data);
-        }
-      }
-    }
-  }
-};
-
-template <int kUnroll, typename Lambda, bool ComputeNorm>
-struct loadAndComputeDist<kUnroll, Lambda, 2, int8_t, int32_t, ComputeNorm> {
-  Lambda compute_dist;
-  int32_t& dist;
-  int32_t& norm_query;
-  int32_t& norm_data;
-  __device__ __forceinline__
-  loadAndComputeDist(int32_t& dist, Lambda op, int32_t& norm_query, int32_t& norm_data)
-    : dist(dist), compute_dist(op), norm_query(norm_query), norm_data(norm_data)
-  {
-  }
-  __device__ __forceinline__ void runLoadShmemCompute(const int8_t* const& data,
-                                                      const int8_t* query_shared,
-                                                      int loadIndex,
-                                                      int shmemIndex)
-  {
-#pragma unroll
-    for (int j = 0; j < kUnroll; ++j) {
-      int32_t encV      = reinterpret_cast<uint16_t const*>(data)[loadIndex + j * kIndexGroupSize];
-      int32_t queryRegs = reinterpret_cast<uint16_t const*>(query_shared + shmemIndex)[j];
-      compute_dist(dist, queryRegs, encV);
-      if constexpr (ComputeNorm) {
-        norm_query = raft::dp4a(queryRegs, queryRegs, norm_query);
-        norm_data  = raft::dp4a(encV, encV, norm_data);
-      }
-    }
-  }
-
-  __device__ __forceinline__ void runLoadShflAndCompute(const int8_t*& data,
-                                                        const int8_t* query,
-                                                        int baseLoadIndex,
-                                                        const int lane_id)
-  {
-    int32_t queryReg =
-      (lane_id < 16) ? reinterpret_cast<uint16_t const*>(query + baseLoadIndex)[lane_id] : 0;
-    constexpr int veclen = 2;
-    constexpr int stride = kUnroll * veclen;
-
-#pragma unroll
-    for (int i = 0; i < raft::WarpSize / stride; ++i, data += stride * kIndexGroupSize) {
-#pragma unroll
-      for (int j = 0; j < kUnroll; ++j) {
-        int32_t encV = reinterpret_cast<uint16_t const*>(data)[lane_id + j * kIndexGroupSize];
-        int32_t q    = raft::shfl(queryReg, i * kUnroll + j, raft::WarpSize);
-        compute_dist(dist, q, encV);
-        if constexpr (ComputeNorm) {
-          norm_query = raft::dp4a(queryReg, queryReg, norm_query);
-          norm_data  = raft::dp4a(encV, encV, norm_data);
-        }
-      }
-    }
-  }
-
-  __device__ __forceinline__ void runLoadShflAndComputeRemainder(
-    const int8_t*& data, const int8_t* query, const int lane_id, const int dim, const int dimBlocks)
-  {
-    constexpr int veclen = 2;
-    int loadDim          = dimBlocks + lane_id * veclen;
-    int32_t queryReg = loadDim < dim ? reinterpret_cast<uint16_t const*>(query + loadDim)[0] : 0;
-    for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) {
-      int32_t enc = reinterpret_cast<uint16_t const*>(data + lane_id * veclen)[0];
-      int32_t q   = raft::shfl(queryReg, d / veclen, raft::WarpSize);
-      compute_dist(dist, q, enc);
-      if constexpr (ComputeNorm) {
-        norm_query = raft::dp4a(q, q, norm_query);
-        norm_data  = raft::dp4a(enc, enc, norm_data);
-      }
-    }
-  }
-};
-
-template <int kUnroll, typename Lambda, bool ComputeNorm>
-struct loadAndComputeDist<kUnroll, Lambda, 1, int8_t, int32_t, ComputeNorm> {
-  Lambda compute_dist;
-  int32_t& dist;
-  int32_t& norm_query;
-  int32_t& norm_data;
-  __device__ __forceinline__
-  loadAndComputeDist(int32_t& dist, Lambda op, int32_t& norm_query, int32_t& norm_data)
-    : dist(dist), compute_dist(op), norm_query(norm_query), norm_data(norm_data)
-  {
-  }
-
-  __device__ __forceinline__ void runLoadShmemCompute(const int8_t* const& data,
-                                                      const int8_t* query_shared,
-                                                      int loadIndex,
-                                                      int shmemIndex)
-  {
-#pragma unroll
-    for (int j = 0; j < kUnroll; ++j) {
-      compute_dist(dist, query_shared[shmemIndex + j], data[loadIndex + j * kIndexGroupSize]);
-      if constexpr (ComputeNorm) {
-        norm_query += int32_t{query_shared[shmemIndex + j]} * int32_t{query_shared[shmemIndex + j]};
-        norm_data += int32_t{data[loadIndex + j * kIndexGroupSize]} *
-                     int32_t{data[loadIndex + j * kIndexGroupSize]};
-      }
-    }
-  }
-
-  __device__ __forceinline__ void runLoadShflAndCompute(const int8_t*& data,
-                                                        const int8_t* query,
-                                                        int baseLoadIndex,
-                                                        const int lane_id)
-  {
-    constexpr int veclen = 1;
-    constexpr int stride = kUnroll * veclen;
-    int32_t queryReg     = query[baseLoadIndex + lane_id];
-
-#pragma unroll
-    for (int i = 0; i < raft::WarpSize / stride; ++i, data += stride * kIndexGroupSize) {
-#pragma unroll
-      for (int j = 0; j < kUnroll; ++j) {
-        int32_t q = raft::shfl(queryReg, i * kUnroll + j, raft::WarpSize);
-        compute_dist(dist, q, data[lane_id + j * kIndexGroupSize]);
-        if constexpr (ComputeNorm) {
-          norm_query += q * q;
-          norm_data += data[lane_id + j * kIndexGroupSize] * data[lane_id + j * kIndexGroupSize];
-        }
-      }
-    }
-  }
-  __device__ __forceinline__ void runLoadShflAndComputeRemainder(
-    const int8_t*& data, const int8_t* query, const int lane_id, const int dim, const int dimBlocks)
-  {
-    constexpr int veclen = 1;
-    const int loadDim    = dimBlocks + lane_id;
-    int32_t queryReg     = loadDim < dim ? query[loadDim] : 0;
-    for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) {
-      int32_t q = raft::shfl(queryReg, d, raft::WarpSize);
-      compute_dist(dist, q, data[lane_id]);
-      if constexpr (ComputeNorm) {
-        norm_query += q * q;
-        norm_data += int32_t{data[lane_id]} * int32_t{data[lane_id]};
-      }
-    }
-  }
-};
-
-// switch to dummy blocksort when Capacity is 0 this explicit dummy is chosen
-// to support access to warpsort constants like ::queue_t::kDummy
-template <int Capacity, bool Ascending, typename T, typename IdxT>
-struct flat_block_sort {
-  using type = raft::matrix::detail::select::warpsort::block_sort<
-    raft::matrix::detail::select::warpsort::warp_sort_filtered,
-    Capacity,
-    Ascending,
-    T,
-    IdxT>;
-};
-
-template <typename T, bool Ascending, typename IdxT>
-struct flat_block_sort<0, Ascending, T, IdxT>
-  : ivf::detail::dummy_block_sort_t<T, IdxT, Ascending> {
-  using type = ivf::detail::dummy_block_sort_t<T, IdxT, Ascending>;
-};
-
-template <int Capacity, bool Ascending, typename T, typename IdxT>
-using block_sort_t = typename flat_block_sort<Capacity, Ascending, T, IdxT>::type;
-
-/**
- * Scan clusters for nearest neighbors of the query vectors.
- * See `ivfflat_interleaved_scan` for more information.
- *
- * The clusters are stored in the interleaved index format described in ivf_flat_types.hpp.
- * For each query vector, a set of clusters is probed: the distance to each vector in the cluster is
- * calculated, and the top-k nearest neighbors are selected.
- *
- * @param compute_dist distance function
- * @param query_smem_elems number of dimensions of the query vector to fit in a shared memory of a
- * block; this number must be a multiple of `WarpSize * Veclen`.
- * @param[in] query a pointer to all queries in a row-major contiguous format [gridDim.y, dim]
- * @param[in] coarse_index a pointer to the cluster indices to search through [n_probes]
- * @param[in] list_indices index<T, IdxT>.indices
- * @param[in] list_data index<T, IdxT>.data
- * @param[in] list_sizes index<T, IdxT>.list_sizes
- * @param[in] list_offsets index<T, IdxT>.list_offsets
- * @param n_probes
- * @param k
- * @param dim
- * @param sample_filter
- * @param[out] neighbors
- * @param[out] distances
- */
-template <int Capacity,
-          int Veclen,
-          bool Ascending,
-          bool ComputeNorm,
-          typename T,
-          typename AccT,
-          typename IdxT,
-          typename IvfSampleFilterT,
-          typename Lambda,
-          typename PostLambda>
-RAFT_KERNEL __launch_bounds__(kThreadsPerBlock)
-  interleaved_scan_kernel(Lambda compute_dist,
-                          PostLambda post_process,
-                          const uint32_t query_smem_elems,
-                          const T* query,
-                          const uint32_t* coarse_index,
-                          const T* const* list_data_ptrs,
-                          const uint32_t* list_sizes,
-                          const uint32_t queries_offset,
-                          const uint32_t n_probes,
-                          const uint32_t k,
-                          const uint32_t max_samples,
-                          const uint32_t* chunk_indices,
-                          const uint32_t dim,
-                          IvfSampleFilterT sample_filter,
-                          uint32_t* neighbors,
-                          float* distances)
-{
-  extern __shared__ __align__(256) uint8_t interleaved_scan_kernel_smem[];
-  constexpr bool kManageLocalTopK = Capacity > 0;
-  // Using shared memory for the (part of the) query;
-  // This allows to save on global memory bandwidth when reading index and query
-  // data at the same time.
-  // Its size is `query_smem_elems`.
-  T* query_shared = reinterpret_cast<T*>(interleaved_scan_kernel_smem);
-  // Make the query input and output point to this block's shared query
-  {
-    const int query_id = blockIdx.y;
-    query += query_id * dim;
-    if constexpr (kManageLocalTopK) {
-      neighbors += query_id * k * gridDim.x + blockIdx.x * k;
-      distances += query_id * k * gridDim.x + blockIdx.x * k;
-    } else {
-      distances += query_id * uint64_t(max_samples);
-    }
-    chunk_indices += (n_probes * query_id);
-    coarse_index += query_id * n_probes;
-  }
-
-  // Copy a part of the query into shared memory for faster processing
-  copy_vectorized(query_shared, query, std::min(dim, query_smem_elems));
-  __syncthreads();
-
-  using local_topk_t = block_sort_t<Capacity, Ascending, float, uint32_t>;
-  local_topk_t queue(k);
-  {
-    using align_warp  = raft::Pow2<raft::WarpSize>;
-    const int lane_id = align_warp::mod(threadIdx.x);
-
-    // How many full warps needed to compute the distance (without remainder)
-    const uint32_t full_warps_along_dim = align_warp::roundDown(dim);
-
-    const uint32_t shm_assisted_dim =
-      (dim > query_smem_elems) ? query_smem_elems : full_warps_along_dim;
-
-    // Every CUDA block scans one cluster at a time.
-    for (int probe_id = blockIdx.x; probe_id < n_probes; probe_id += gridDim.x) {
-      const uint32_t list_id = coarse_index[probe_id];  // The id of cluster(list)
-
-      // The number of vectors in each cluster(list); [nlist]
-      const uint32_t list_length = list_sizes[list_id];
-
-      // The number of interleaved groups to be processed
-      const uint32_t num_groups =
-        align_warp::div(list_length + align_warp::Mask);  // ceildiv by power of 2
-
-      uint32_t sample_offset = 0;
-      if (probe_id > 0) { sample_offset = chunk_indices[probe_id - 1]; }
-      assert(list_length == chunk_indices[probe_id] - sample_offset);
-      assert(sample_offset + list_length <= max_samples);
-
-      constexpr int kUnroll        = raft::WarpSize / Veclen;
-      constexpr uint32_t kNumWarps = kThreadsPerBlock / raft::WarpSize;
-      // Every warp reads WarpSize vectors and computes the distances to them.
-      // Then, the distances and corresponding ids are distributed among the threads,
-      // and each thread adds one (id, dist) pair to the filtering queue.
-      for (uint32_t group_id = align_warp::div(threadIdx.x); group_id < num_groups;
-           group_id += kNumWarps) {
-        AccT dist         = 0;
-        AccT norm_query   = 0;
-        AccT norm_dataset = 0;
-        // This is where this warp begins reading data (start position of an interleaved group)
-        const T* data = list_data_ptrs[list_id] + (group_id * kIndexGroupSize) * dim;
-
-        // This is the vector a given lane/thread handles
-        const uint32_t vec_id = group_id * raft::WarpSize + lane_id;
-        const bool valid =
-          vec_id < list_length && sample_filter(queries_offset + blockIdx.y, list_id, vec_id);
-
-        if (valid) {
-          // Process first shm_assisted_dim dimensions (always using shared memory)
-          loadAndComputeDist<kUnroll, decltype(compute_dist), Veclen, T, AccT, ComputeNorm> lc(
-            dist, compute_dist, norm_query, norm_dataset);
-          for (int pos = 0; pos < shm_assisted_dim;
-               pos += raft::WarpSize, data += kIndexGroupSize * raft::WarpSize) {
-            lc.runLoadShmemCompute(data, query_shared, lane_id, pos);
-          }
-
-          if (dim > query_smem_elems) {
-            // The default path - using shfl ops - for dimensions beyond query_smem_elems
-            loadAndComputeDist<kUnroll, decltype(compute_dist), Veclen, T, AccT, ComputeNorm> lc(
-              dist, compute_dist, norm_query, norm_dataset);
-            for (int pos = shm_assisted_dim; pos < full_warps_along_dim; pos += raft::WarpSize) {
-              lc.runLoadShflAndCompute(data, query, pos, lane_id);
-            }
-            lc.runLoadShflAndComputeRemainder(data, query, lane_id, dim, full_warps_along_dim);
-          } else {
-            // when  shm_assisted_dim == full_warps_along_dim < dim
-            loadAndComputeDist<1, decltype(compute_dist), Veclen, T, AccT, ComputeNorm> lc(
-              dist, compute_dist, norm_query, norm_dataset);
-            for (int pos = full_warps_along_dim; pos < dim;
-                 pos += Veclen, data += kIndexGroupSize * Veclen) {
-              lc.runLoadShmemCompute(data, query_shared, lane_id, pos);
-            }
-          }
-        }
-
-        // Enqueue one element per thread
-        float val = valid ? static_cast<float>(dist) : local_topk_t::queue_t::kDummy;
-
-        if constexpr (ComputeNorm) {
-          if (valid)
-            val = val / (raft::sqrt(static_cast<float>(norm_query)) *
-                         raft::sqrt(static_cast<float>(norm_dataset)));
-        }
-        if constexpr (kManageLocalTopK) {
-          queue.add(val, sample_offset + vec_id);
-        } else {
-          if (vec_id < list_length) distances[sample_offset + vec_id] = val;
-        }
-      }
-
-      // fill up unused slots for current query
-      if constexpr (!kManageLocalTopK) {
-        if (probe_id + 1 == n_probes) {
-          for (uint32_t i = threadIdx.x + sample_offset + list_length; i < max_samples;
-               i += blockDim.x) {
-            distances[i] = local_topk_t::queue_t::kDummy;
-          }
-        }
-      }
-    }
-  }
-
-  // finalize and store selected neighbours
-  if constexpr (kManageLocalTopK) {
-    __syncthreads();
-    queue.done(interleaved_scan_kernel_smem);
-    queue.store(distances, neighbors, post_process);
-  }
-}
-
-/**
- *  Configure the gridDim.x to maximize GPU occupancy, but reduce the output size
- */
-template <typename T>
-uint32_t configure_launch_x(uint32_t numQueries, uint32_t n_probes, int32_t sMemSize, T func)
-{
-  int dev_id;
-  RAFT_CUDA_TRY(cudaGetDevice(&dev_id));
-  int num_sms;
-  RAFT_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id));
-  int num_blocks_per_sm = 0;
-  RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-    &num_blocks_per_sm, func, kThreadsPerBlock, sMemSize));
-
-  size_t min_grid_size = num_sms * num_blocks_per_sm;
-  size_t min_grid_x    = raft::ceildiv<size_t>(min_grid_size, numQueries);
-  return min_grid_x > n_probes ? n_probes : static_cast<uint32_t>(min_grid_x);
-}
-
-template <int Capacity,
-          int Veclen,
-          bool Ascending,
-          bool ComputeNorm,
-          typename T,
-          typename AccT,
-          typename IdxT,
-          typename IvfSampleFilterT,
-          typename Lambda,
-          typename PostLambda>
-void launch_kernel(Lambda lambda,
-                   PostLambda post_process,
-                   const index<T, IdxT>& index,
-                   const T* queries,
-                   const uint32_t* coarse_index,
-                   const uint32_t num_queries,
-                   const uint32_t queries_offset,
-                   const uint32_t n_probes,
-                   const uint32_t k,
-                   const uint32_t max_samples,
-                   const uint32_t* chunk_indices,
-                   IvfSampleFilterT sample_filter,
-                   uint32_t* neighbors,
-                   float* distances,
-                   uint32_t& grid_dim_x,
-                   rmm::cuda_stream_view stream)
-{
-  RAFT_EXPECTS(Veclen == index.veclen(),
-               "Configured Veclen does not match the index interleaving pattern.");
-  constexpr auto kKernel   = interleaved_scan_kernel<Capacity,
-                                                     Veclen,
-                                                     Ascending,
-                                                     ComputeNorm,
-                                                     T,
-                                                     AccT,
-                                                     IdxT,
-                                                     IvfSampleFilterT,
-                                                     Lambda,
-                                                     PostLambda>;
-  const int max_query_smem = 16384;
-  int query_smem_elems     = std::min<int>(max_query_smem / sizeof(T),
-                                       raft::Pow2<Veclen * raft::WarpSize>::roundUp(index.dim()));
-  int smem_size            = query_smem_elems * sizeof(T);
-
-  if constexpr (Capacity > 0) {
-    constexpr int kSubwarpSize = std::min<int>(Capacity, raft::WarpSize);
-    auto block_merge_mem =
-      raft::matrix::detail::select::warpsort::calc_smem_size_for_block_wide<float, IdxT>(
-        kThreadsPerBlock / kSubwarpSize, k);
-    smem_size += std::max<int>(smem_size, block_merge_mem);
-  }
-
-  // power-of-two less than cuda limit (for better addr alignment)
-  constexpr uint32_t kMaxGridY = 32768;
-
-  if (grid_dim_x == 0) {
-    grid_dim_x = configure_launch_x(std::min(kMaxGridY, num_queries), n_probes, smem_size, kKernel);
-    return;
-  }
-
-  for (uint32_t query_offset = 0; query_offset < num_queries; query_offset += kMaxGridY) {
-    uint32_t grid_dim_y = std::min<uint32_t>(kMaxGridY, num_queries - query_offset);
-    dim3 grid_dim(grid_dim_x, grid_dim_y, 1);
-    dim3 block_dim(kThreadsPerBlock);
-    RAFT_LOG_TRACE(
-      "Launching the ivf-flat interleaved_scan_kernel (%d, %d, 1) x (%d, 1, 1), n_probes = %d, "
-      "smem_size = %d",
-      grid_dim.x,
-      grid_dim.y,
-      block_dim.x,
-      n_probes,
-      smem_size);
-    kKernel<<<grid_dim, block_dim, smem_size, stream>>>(lambda,
-                                                        post_process,
-                                                        query_smem_elems,
-                                                        queries,
-                                                        coarse_index,
-                                                        index.data_ptrs().data_handle(),
-                                                        index.list_sizes().data_handle(),
-                                                        queries_offset + query_offset,
-                                                        n_probes,
-                                                        k,
-                                                        max_samples,
-                                                        chunk_indices,
-                                                        index.dim(),
-                                                        sample_filter,
-                                                        neighbors,
-                                                        distances);
-    queries += grid_dim_y * index.dim();
-    if constexpr (Capacity > 0) {
-      neighbors += grid_dim_y * grid_dim_x * k;
-      distances += grid_dim_y * grid_dim_x * k;
-    } else {
-      distances += grid_dim_y * max_samples;
-    }
-    chunk_indices += grid_dim_y * n_probes;
-    coarse_index += grid_dim_y * n_probes;
-  }
-}
-
-template <int Veclen, typename T, typename AccT>
-struct euclidean_dist {
-  __device__ __forceinline__ void operator()(AccT& acc, AccT x, AccT y)
-  {
-    const auto diff = x - y;
-    acc += diff * diff;
-  }
-};
-
-template <int Veclen>
-struct euclidean_dist<Veclen, uint8_t, uint32_t> {
-  __device__ __forceinline__ void operator()(uint32_t& acc, uint32_t x, uint32_t y)
-  {
-    if constexpr (Veclen > 1) {
-      const auto diff = __vabsdiffu4(x, y);
-      acc             = raft::dp4a(diff, diff, acc);
-    } else {
-      const auto diff = __usad(x, y, 0u);
-      acc += diff * diff;
-    }
-  }
-};
-
-template <int Veclen>
-struct euclidean_dist<Veclen, int8_t, int32_t> {
-  __device__ __forceinline__ void operator()(int32_t& acc, int32_t x, int32_t y)
-  {
-    if constexpr (Veclen > 1) {
-      // Note that we enforce here that the unsigned version of dp4a is used, because the difference
-      // between two int8 numbers can be greater than 127 and therefore represented as a negative
-      // number in int8. Casting from int8 to int32 would yield incorrect results, while casting
-      // from uint8 to uint32 is correct.
-      const auto diff = __vabsdiffs4(x, y);
-      acc             = raft::dp4a(diff, diff, static_cast<uint32_t>(acc));
-    } else {
-      const auto diff = x - y;
-      acc += diff * diff;
-    }
-  }
-};
-
-template <int Veclen, typename T, typename AccT>
-struct inner_prod_dist {
-  __device__ __forceinline__ void operator()(AccT& acc, AccT x, AccT y)
-  {
-    if constexpr (Veclen > 1 && (std::is_same_v<T, int8_t> || std::is_same_v<T, uint8_t>)) {
-      acc = raft::dp4a(x, y, acc);
-    } else {
-      acc += x * y;
-    }
-  }
-};
-
-/** Select the distance computation function and forward the rest of the arguments. */
-template <int Capacity,
-          int Veclen,
-          bool Ascending,
-          typename T,
-          typename AccT,
-          typename IdxT,
-          typename IvfSampleFilterT,
-          typename... Args>
-void launch_with_fixed_consts(cuvs::distance::DistanceType metric, Args&&... args)
-{
-  switch (metric) {
-    case cuvs::distance::DistanceType::L2Expanded:
-    case cuvs::distance::DistanceType::L2Unexpanded:
-      return launch_kernel<Capacity,
-                           Veclen,
-                           Ascending,
-                           false,
-                           T,
-                           AccT,
-                           IdxT,
-                           IvfSampleFilterT,
-                           euclidean_dist<Veclen, T, AccT>,
-                           raft::identity_op>({}, {}, std::forward<Args>(args)...);
-    case cuvs::distance::DistanceType::L2SqrtExpanded:
-    case cuvs::distance::DistanceType::L2SqrtUnexpanded:
-      return launch_kernel<Capacity,
-                           Veclen,
-                           Ascending,
-                           false,
-                           T,
-                           AccT,
-                           IdxT,
-                           IvfSampleFilterT,
-                           euclidean_dist<Veclen, T, AccT>,
-                           raft::sqrt_op>({}, {}, std::forward<Args>(args)...);
-    case cuvs::distance::DistanceType::InnerProduct:
-      return launch_kernel<Capacity,
-                           Veclen,
-                           Ascending,
-                           false,
-                           T,
-                           AccT,
-                           IdxT,
-                           IvfSampleFilterT,
-                           inner_prod_dist<Veclen, T, AccT>,
-                           raft::identity_op>({}, {}, std::forward<Args>(args)...);
-    case cuvs::distance::DistanceType::CosineExpanded:
-      // NB: "Ascending" is reversed because the post-processing step is done after that sort
-      return launch_kernel<Capacity,
-                           Veclen,
-                           !Ascending,
-                           true,
-                           T,
-                           AccT,
-                           IdxT,
-                           IvfSampleFilterT,
-                           inner_prod_dist<Veclen, T, AccT>>(
-        {},
-        raft::compose_op(raft::add_const_op<float>{1.0f}, raft::mul_const_op<float>{-1.0f}),
-        std::forward<Args>(args)...);  // NB: update the description of `knn::ivf_flat::build` when
-                                       // adding here a new metric.
-    default: RAFT_FAIL("The chosen distance metric is not supported (%d)", int(metric));
-  }
-}
-
-/**
- * Lift the `capacity` and `veclen` parameters to the template level,
- * forward the rest of the arguments unmodified to `launch_interleaved_scan_kernel`.
- */
-template <typename T,
-          typename AccT,
-          typename IdxT,
-          typename IvfSampleFilterT,
-          int Capacity = raft::matrix::detail::select::warpsort::kMaxCapacity,
-          int Veclen   = std::max<int>(1, 16 / sizeof(T))>
-struct select_interleaved_scan_kernel {
-  /**
-   * Recursively reduce the `Capacity` and `Veclen` parameters until they match the
-   * corresponding runtime arguments.
-   * By default, this recursive process starts with maximum possible values of the
-   * two parameters and ends with both values equal to 1.
-   */
-  template <typename... Args>
-  static inline void run(int k_max, int veclen, bool select_min, Args&&... args)
-  {
-    if constexpr (Capacity > 0) {
-      if (k_max == 0 || k_max > Capacity) {
-        return select_interleaved_scan_kernel<T, AccT, IdxT, IvfSampleFilterT, 0, Veclen>::run(
-          k_max, veclen, select_min, std::forward<Args>(args)...);
-      }
-    }
-    if constexpr (Capacity > 1) {
-      if (k_max * 2 <= Capacity) {
-        return select_interleaved_scan_kernel<T,
-                                              AccT,
-                                              IdxT,
-                                              IvfSampleFilterT,
-                                              Capacity / 2,
-                                              Veclen>::run(k_max,
-                                                           veclen,
-                                                           select_min,
-                                                           std::forward<Args>(args)...);
-      }
-    }
-    if constexpr (Veclen > 1) {
-      if (veclen % Veclen != 0) {
-        return select_interleaved_scan_kernel<T, AccT, IdxT, IvfSampleFilterT, Capacity, 1>::run(
-          k_max, 1, select_min, std::forward<Args>(args)...);
-      }
-    }
-    // NB: this is the limitation of the warpsort structures that use a huge number of
-    //     registers (used in the main kernel here).
-    RAFT_EXPECTS(Capacity == 0 || k_max == Capacity,
-                 "Capacity must be either 0 or a power-of-two not bigger than the maximum "
-                 "allowed size matrix::detail::select::warpsort::kMaxCapacity (%d).",
-                 raft::matrix::detail::select::warpsort::kMaxCapacity);
-    RAFT_EXPECTS(
-      veclen == Veclen,
-      "Veclen must be power-of-two not bigger than the maximum allowed size for this data type.");
-    if (select_min) {
-      launch_with_fixed_consts<Capacity, Veclen, true, T, AccT, IdxT, IvfSampleFilterT>(
-        std::forward<Args>(args)...);
-    } else {
-      launch_with_fixed_consts<Capacity, Veclen, false, T, AccT, IdxT, IvfSampleFilterT>(
-        std::forward<Args>(args)...);
-    }
-  }
-};
-
-/**
- * @brief Configure and launch an appropriate template instance of the interleaved scan kernel.
- *
- * @tparam T value type
- * @tparam AccT accumulated type
- * @tparam IdxT type of the indices
- *
- * @param index previously built ivf-flat index
- * @param[in] queries device pointer to the query vectors [batch_size, dim]
- * @param[in] coarse_query_results device pointer to the cluster (list) ids [batch_size, n_probes]
- * @param n_queries batch size
- * @param[in] queries_offset
- *   An offset of the current query batch. It is used for feeding sample_filter with the
- *   correct query index.
- * @param metric type of the measured distance
- * @param n_probes number of nearest clusters to query
- * @param k number of nearest neighbors.
- *            NB: the maximum value of `k` is limited statically by `kMaxCapacity`.
- * @param select_min whether to select nearest (true) or furthest (false) points w.r.t. the given
- * metric.
- * @param[out] neighbors device pointer to the result indices for each query and cluster
- * [batch_size, grid_dim_x, k]
- * @param[out] distances device pointer to the result distances for each query and cluster
- * [batch_size, grid_dim_x, k]
- * @param[inout] grid_dim_x number of blocks launched across all n_probes clusters;
- *               (one block processes one or more probes, hence: 1 <= grid_dim_x <= n_probes)
- * @param stream
- * @param sample_filter
- *   A filter that selects samples for a given query. Use an instance of none_sample_filter to
- *   provide a green light for every sample.
- */
-template <typename T, typename AccT, typename IdxT, typename IvfSampleFilterT>
-void ivfflat_interleaved_scan(const index<T, IdxT>& index,
-                              const T* queries,
-                              const uint32_t* coarse_query_results,
-                              const uint32_t n_queries,
-                              const uint32_t queries_offset,
-                              const cuvs::distance::DistanceType metric,
-                              const uint32_t n_probes,
-                              const uint32_t k,
-                              const uint32_t max_samples,
-                              const uint32_t* chunk_indices,
-                              const bool select_min,
-                              IvfSampleFilterT sample_filter,
-                              uint32_t* neighbors,
-                              float* distances,
-                              uint32_t& grid_dim_x,
-                              rmm::cuda_stream_view stream)
-{
-  const int capacity = raft::bound_by_power_of_two(k);
-
-  auto filter_adapter = cuvs::neighbors::filtering::ivf_to_sample_filter(
-    index.inds_ptrs().data_handle(), sample_filter);
-  select_interleaved_scan_kernel<T, AccT, IdxT, decltype(filter_adapter)>::run(capacity,
-                                                                               index.veclen(),
-                                                                               select_min,
-                                                                               metric,
-                                                                               index,
-                                                                               queries,
-                                                                               coarse_query_results,
-                                                                               n_queries,
-                                                                               queries_offset,
-                                                                               n_probes,
-                                                                               k,
-                                                                               max_samples,
-                                                                               chunk_indices,
-                                                                               filter_adapter,
-                                                                               neighbors,
-                                                                               distances,
-                                                                               grid_dim_x,
-                                                                               stream);
-}
-
-}  // namespace cuvs::neighbors::ivf_flat::detail
diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan_explicit_inst.cuh b/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan_explicit_inst.cuh
index 81833a63b1..25e7eda686 100644
--- a/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan_explicit_inst.cuh
+++ b/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan_explicit_inst.cuh
@@ -6,11 +6,7 @@
 #pragma once
 
 #include "../detail/ann_utils.cuh"
-#ifdef CUVS_ENABLE_JIT_LTO
 #include "ivf_flat_interleaved_scan_jit.cuh"
-#else
-#include "ivf_flat_interleaved_scan.cuh"
-#endif
 #include <cstdint>
 #include <cuvs/neighbors/common.hpp>
 #include <cuvs/neighbors/ivf_flat.hpp>

From fe675259a40ddfe254ad8da3322a925b863f3ecc Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Tue, 24 Mar 2026 18:03:32 +0000
Subject: [PATCH 21/21] Use libnvjitlink run exports

---
 conda/recipes/libcuvs/recipe.yaml | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/conda/recipes/libcuvs/recipe.yaml b/conda/recipes/libcuvs/recipe.yaml
index 71b2ec0b4d..0e1b5451bc 100644
--- a/conda/recipes/libcuvs/recipe.yaml
+++ b/conda/recipes/libcuvs/recipe.yaml
@@ -129,7 +129,6 @@ outputs:
         - libcurand
         - libcusolver
         - libcusparse
-        - libnvjitlink
       ignore_run_exports:
         by_name:
           - cuda-cudart
@@ -143,7 +142,6 @@ outputs:
           - librmm
           - mkl
           - nccl
-          - libnvjitlink
     about:
       homepage: ${{ load_from_file("python/libcuvs/pyproject.toml").project.urls.Homepage }}
       license: ${{ load_from_file("python/libcuvs/pyproject.toml").project.license }}
@@ -191,7 +189,6 @@ outputs:
         - libcurand
         - libcusolver
         - libcusparse
-        - libnvjitlink
       ignore_run_exports:
         by_name:
           - cuda-cudart
@@ -205,7 +202,6 @@ outputs:
           - librmm
           - mkl
           - nccl
-          - libnvjitlink
     about:
       homepage: ${{ load_from_file("python/libcuvs/pyproject.toml").project.urls.Homepage }}
       license: ${{ load_from_file("python/libcuvs/pyproject.toml").project.license }}
@@ -251,7 +247,6 @@ outputs:
         - libcurand
         - libcusolver
         - libcusparse
-        - libnvjitlink
       ignore_run_exports:
         by_name:
           - cuda-cudart
@@ -262,7 +257,6 @@ outputs:
           - libcurand
           - libcusolver
           - libcusparse
-          - libnvjitlink
           - librmm
           - mkl
           - nccl
@@ -409,7 +403,6 @@ outputs:
         - libcurand
         - libcusolver
         - libcusparse
-        - libnvjitlink
       ignore_run_exports:
         by_name:
           - cuda-cudart
@@ -420,7 +413,6 @@ outputs:
           - libcurand
           - libcusolver
           - libcusparse
-          - libnvjitlink
           - librmm
           - mkl
           - nccl