Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
fcffe8a
Always build with JIT+LTO
KyleFromNVIDIA Mar 16, 2026
6c91f9d
Use the driver API instead
KyleFromNVIDIA Mar 16, 2026
e858407
Conda recipe
KyleFromNVIDIA Mar 16, 2026
1972a74
deps
KyleFromNVIDIA Mar 16, 2026
4503307
PRIVATE
KyleFromNVIDIA Mar 16, 2026
a42ede0
auditwheel
KyleFromNVIDIA Mar 16, 2026
e26519f
Conda recipe
KyleFromNVIDIA Mar 17, 2026
697d1d0
Merge branch 'release/26.04' into jit-lto-cuda-12
KyleFromNVIDIA Mar 17, 2026
3269055
Revert "Conda recipe"
KyleFromNVIDIA Mar 17, 2026
07c50e6
COMPILE_ONLY
KyleFromNVIDIA Mar 17, 2026
788fd34
PUBLIC
KyleFromNVIDIA Mar 17, 2026
e16b88f
Revert "Use the driver API instead"
KyleFromNVIDIA Mar 17, 2026
96e9162
Remove driver dep
KyleFromNVIDIA Mar 17, 2026
8027d97
Default to static linking of libcudart (#1627)
bdice Mar 16, 2026
56229e8
Opt out of rmm's cudart dependency
KyleFromNVIDIA Mar 18, 2026
0a0540a
Make rmm interface dependency COMPILE_ONLY
KyleFromNVIDIA Mar 18, 2026
f379ad4
Merge branch 'main' into jit-lto-cuda-12
KyleFromNVIDIA Mar 18, 2026
38c9e9d
Merge branch 'main' into jit-lto-cuda-12
KyleFromNVIDIA Mar 19, 2026
17c5cd7
Push
KyleFromNVIDIA Mar 19, 2026
af0a04e
Merge branch 'main' into cudart-static
KyleFromNVIDIA Mar 19, 2026
8c771a5
Merge branch 'cudart-static' into jit-lto-cuda-12
KyleFromNVIDIA Mar 23, 2026
b6560be
Debugging
KyleFromNVIDIA Mar 23, 2026
84ddcf9
Downgrade to compute 7.0 for CUDA 12
KyleFromNVIDIA Mar 24, 2026
b08a35d
Merge branch 'main' into jit-lto-cuda-12
KyleFromNVIDIA Mar 24, 2026
a8493a3
Remove JIT_LTO_COMPILATION variable
KyleFromNVIDIA Mar 24, 2026
997ab66
Remove CUVS_ENABLE_JIT_LTO preprocessor definition
KyleFromNVIDIA Mar 24, 2026
fe67525
Use libnvjitlink run exports
KyleFromNVIDIA Mar 24, 2026
cb61d86
Merge branch 'main' into jit-lto-cuda-12
KyleFromNVIDIA Mar 24, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions conda/environments/all_cuda-129_arch-aarch64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ dependencies:
- libcurand-dev
- libcusolver-dev
- libcusparse-dev
- libnvjitlink-dev
- librmm==26.6.*,>=0.0.0a0
- make
- nccl>=2.19
Expand Down
1 change: 1 addition & 0 deletions conda/environments/all_cuda-129_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ dependencies:
- libcurand-dev
- libcusolver-dev
- libcusparse-dev
- libnvjitlink-dev
- librmm==26.6.*,>=0.0.0a0
- make
- nccl>=2.19
Expand Down
1 change: 1 addition & 0 deletions conda/environments/bench_ann_cuda-129_arch-aarch64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ dependencies:
- libcusolver-dev
- libcusparse-dev
- libcuvs==26.6.*,>=0.0.0a0
- libnvjitlink-dev
- librmm==26.6.*,>=0.0.0a0
- matplotlib-base>=3.9
- nccl>=2.19
Expand Down
1 change: 1 addition & 0 deletions conda/environments/bench_ann_cuda-129_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ dependencies:
- libcusolver-dev
- libcusparse-dev
- libcuvs==26.6.*,>=0.0.0a0
- libnvjitlink-dev
- librmm==26.6.*,>=0.0.0a0
- matplotlib-base>=3.9
- mkl-devel=2023
Expand Down
1 change: 1 addition & 0 deletions conda/environments/go_cuda-129_arch-aarch64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ dependencies:
- libcusolver-dev
- libcusparse-dev
- libcuvs==26.6.*,>=0.0.0a0
- libnvjitlink-dev
- libraft==26.6.*,>=0.0.0a0
- nccl>=2.19
- ninja
Expand Down
1 change: 1 addition & 0 deletions conda/environments/go_cuda-129_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ dependencies:
- libcusolver-dev
- libcusparse-dev
- libcuvs==26.6.*,>=0.0.0a0
- libnvjitlink-dev
- libraft==26.6.*,>=0.0.0a0
- nccl>=2.19
- ninja
Expand Down
1 change: 1 addition & 0 deletions conda/environments/rust_cuda-129_arch-aarch64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ dependencies:
- libcusolver-dev
- libcusparse-dev
- libcuvs==26.6.*,>=0.0.0a0
- libnvjitlink-dev
- libraft==26.6.*,>=0.0.0a0
- make
- nccl>=2.19
Expand Down
1 change: 1 addition & 0 deletions conda/environments/rust_cuda-129_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ dependencies:
- libcusolver-dev
- libcusparse-dev
- libcuvs==26.6.*,>=0.0.0a0
- libnvjitlink-dev
- libraft==26.6.*,>=0.0.0a0
- make
- nccl>=2.19
Expand Down
20 changes: 5 additions & 15 deletions conda/recipes/libcuvs/recipe.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,7 @@ cache:
- ninja
- ${{ stdlib("c") }}
host:
- if: cuda_major == "13"
then:
- libnvjitlink-dev
- libnvjitlink-dev
- librmm =${{ minor_version }}
- libraft-headers =${{ minor_version }}
- nccl ${{ nccl_version }}
Expand Down Expand Up @@ -121,9 +119,7 @@ outputs:
- libcurand-dev
- libcusolver-dev
- libcusparse-dev
- if: cuda_major == "13"
then:
- libnvjitlink-dev
- libnvjitlink-dev
run:
- ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }}
- libraft-headers =${{ minor_version }}
Expand Down Expand Up @@ -182,9 +178,7 @@ outputs:
- libcurand-dev
- libcusolver-dev
- libcusparse-dev
- if: cuda_major == "13"
then:
- libnvjitlink-dev
- libnvjitlink-dev
run:
- ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }}
- ${{ pin_subpackage("libcuvs-headers", exact=True) }}
Expand Down Expand Up @@ -242,9 +236,7 @@ outputs:
- libcurand-dev
- libcusolver-dev
- libcusparse-dev
- if: cuda_major == "13"
then:
- libnvjitlink-dev
- libnvjitlink-dev
run:
- ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }}
- ${{ pin_subpackage("libcuvs-headers", exact=True) }}
Expand Down Expand Up @@ -401,9 +393,7 @@ outputs:
- libcurand-dev
- libcusolver-dev
- libcusparse-dev
- if: cuda_major == "13"
then:
- libnvjitlink-dev
- libnvjitlink-dev
run:
- ${{ pin_subpackage("libcuvs-headers", exact=True) }}
- ${{ pin_subpackage("libcuvs", exact=True) }}
Expand Down
192 changes: 88 additions & 104 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -355,97 +355,90 @@ if(NOT BUILD_CPU_ONLY)
)
endif()

set(JIT_LTO_TARGET_ARCHITECTURE "")
set(JIT_LTO_COMPILATION OFF)
set(jit_lto_files)
set(JIT_LTO_TARGET_ARCHITECTURE "70-real")
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
set(JIT_LTO_TARGET_ARCHITECTURE "75-real")
set(JIT_LTO_COMPILATION ON)
endif()

if(JIT_LTO_COMPILATION)
# Generate interleaved scan kernel files at build time
include(cmake/modules/generate_jit_lto_kernels.cmake)
# Generate interleaved scan kernel files at build time
include(cmake/modules/generate_jit_lto_kernels.cmake)

add_library(jit_lto_kernel_usage_requirements INTERFACE)
target_include_directories(
jit_lto_kernel_usage_requirements
INTERFACE "${CMAKE_CURRENT_SOURCE_DIR}/include" "${CMAKE_CURRENT_SOURCE_DIR}/src"
"${CMAKE_CURRENT_SOURCE_DIR}/../c/include"
)
target_compile_options(
jit_lto_kernel_usage_requirements INTERFACE "$<$<COMPILE_LANGUAGE:CXX>:${CUVS_CXX_FLAGS}>"
"$<$<COMPILE_LANGUAGE:CUDA>:${CUVS_CUDA_FLAGS}>"
)
target_compile_features(jit_lto_kernel_usage_requirements INTERFACE cuda_std_20)
target_link_libraries(
jit_lto_kernel_usage_requirements INTERFACE rmm::rmm raft::raft CCCL::CCCL
)

block(PROPAGATE interleaved_scan_files metric_files filter_files post_lambda_files)
set(CMAKE_CUDA_ARCHITECTURES ${JIT_LTO_TARGET_ARCHITECTURE})
generate_jit_lto_kernels(
interleaved_scan_files
NAME_FORMAT
"interleaved_scan_capacity_@capacity@_veclen_@veclen@_@ascending_descending@_@compute_norm_name@_data_@type_abbrev@_acc_@acc_abbrev@_idx_@idx_abbrev@"
MATRIX_JSON_FILE
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/interleaved_scan_matrix.json"
KERNEL_INPUT_FILE
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/interleaved_scan_kernel.cu.in"
EMBEDDED_INPUT_FILE
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/interleaved_scan_embedded.cpp.in"
OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/interleaved_scan"
KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
)
generate_jit_lto_kernels(
metric_files
NAME_FORMAT "metric_@metric_name@_veclen_@veclen@_data_@type_abbrev@_acc_@acc_abbrev@"
MATRIX_JSON_FILE
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/metric_matrix.json"
KERNEL_INPUT_FILE
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/metric_kernel.cu.in"
EMBEDDED_INPUT_FILE
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/metric_embedded.cpp.in"
OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/metric"
KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
)
generate_jit_lto_kernels(
filter_files
NAME_FORMAT "@filter_name@"
MATRIX_JSON_FILE
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/filter_matrix.json"
KERNEL_INPUT_FILE
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/filter_kernel.cu.in"
EMBEDDED_INPUT_FILE
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/filter_embedded.cpp.in"
OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/filter"
KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
)
generate_jit_lto_kernels(
post_lambda_files
NAME_FORMAT "@post_lambda_name@"
MATRIX_JSON_FILE
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/post_lambda_matrix.json"
KERNEL_INPUT_FILE
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/post_lambda_kernel.cu.in"
EMBEDDED_INPUT_FILE
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/post_lambda_embedded.cpp.in"
OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/post_lambda"
KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
)
endblock()

set(jit_lto_files
${interleaved_scan_files}
${metric_files}
${filter_files}
${post_lambda_files}
src/detail/jit_lto/AlgorithmLauncher.cpp
src/detail/jit_lto/AlgorithmPlanner.cpp
src/detail/jit_lto/FragmentEntry.cpp
src/detail/jit_lto/nvjitlink_checker.cpp
)
endif()
add_library(jit_lto_kernel_usage_requirements INTERFACE)
target_include_directories(
jit_lto_kernel_usage_requirements
INTERFACE "${CMAKE_CURRENT_SOURCE_DIR}/include" "${CMAKE_CURRENT_SOURCE_DIR}/src"
"${CMAKE_CURRENT_SOURCE_DIR}/../c/include"
)
target_compile_options(
jit_lto_kernel_usage_requirements INTERFACE "$<$<COMPILE_LANGUAGE:CXX>:${CUVS_CXX_FLAGS}>"
"$<$<COMPILE_LANGUAGE:CUDA>:${CUVS_CUDA_FLAGS}>"
)
target_compile_features(jit_lto_kernel_usage_requirements INTERFACE cuda_std_20)
target_link_libraries(jit_lto_kernel_usage_requirements INTERFACE rmm::rmm raft::raft CCCL::CCCL)

block(PROPAGATE interleaved_scan_files metric_files filter_files post_lambda_files)
set(CMAKE_CUDA_ARCHITECTURES ${JIT_LTO_TARGET_ARCHITECTURE})
generate_jit_lto_kernels(
interleaved_scan_files
NAME_FORMAT
"interleaved_scan_capacity_@capacity@_veclen_@veclen@_@ascending_descending@_@compute_norm_name@_data_@type_abbrev@_acc_@acc_abbrev@_idx_@idx_abbrev@"
MATRIX_JSON_FILE
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/interleaved_scan_matrix.json"
KERNEL_INPUT_FILE
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/interleaved_scan_kernel.cu.in"
EMBEDDED_INPUT_FILE
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/interleaved_scan_embedded.cpp.in"
OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/interleaved_scan"
KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
)
generate_jit_lto_kernels(
metric_files
NAME_FORMAT "metric_@metric_name@_veclen_@veclen@_data_@type_abbrev@_acc_@acc_abbrev@"
MATRIX_JSON_FILE
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/metric_matrix.json"
KERNEL_INPUT_FILE
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/metric_kernel.cu.in"
EMBEDDED_INPUT_FILE
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/metric_embedded.cpp.in"
OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/metric"
KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
)
generate_jit_lto_kernels(
filter_files
NAME_FORMAT "@filter_name@"
MATRIX_JSON_FILE
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/filter_matrix.json"
KERNEL_INPUT_FILE
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/filter_kernel.cu.in"
EMBEDDED_INPUT_FILE
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/filter_embedded.cpp.in"
OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/filter"
KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
)
generate_jit_lto_kernels(
post_lambda_files
NAME_FORMAT "@post_lambda_name@"
MATRIX_JSON_FILE
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/post_lambda_matrix.json"
KERNEL_INPUT_FILE
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/post_lambda_kernel.cu.in"
EMBEDDED_INPUT_FILE
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/post_lambda_embedded.cpp.in"
OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/post_lambda"
KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
)
endblock()

set(jit_lto_files
${interleaved_scan_files}
${metric_files}
${filter_files}
${post_lambda_files}
src/detail/jit_lto/AlgorithmLauncher.cpp
src/detail/jit_lto/AlgorithmPlanner.cpp
src/detail/jit_lto/FragmentEntry.cpp
src/detail/jit_lto/nvjitlink_checker.cpp
)

add_library(
cuvs_objs OBJECT
Expand Down Expand Up @@ -678,10 +671,8 @@ if(NOT BUILD_CPU_ONLY)
)

target_compile_definitions(
cuvs_objs
PRIVATE $<$<BOOL:${BUILD_CAGRA_HNSWLIB}>:CUVS_BUILD_CAGRA_HNSWLIB>
$<$<BOOL:${CUVS_NVTX}>:NVTX_ENABLED>
$<$<BOOL:${JIT_LTO_COMPILATION}>:CUVS_ENABLE_JIT_LTO>
cuvs_objs PRIVATE $<$<BOOL:${BUILD_CAGRA_HNSWLIB}>:CUVS_BUILD_CAGRA_HNSWLIB>
$<$<BOOL:${CUVS_NVTX}>:NVTX_ENABLED>
)

target_link_libraries(
Expand Down Expand Up @@ -756,10 +747,8 @@ if(NOT BUILD_CPU_ONLY)
"$<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CONFIG:Debug>>:${CUVS_DEBUG_CUDA_FLAGS}>"
)
target_compile_definitions(
cuvs
PUBLIC $<$<BOOL:${BUILD_CAGRA_HNSWLIB}>:CUVS_BUILD_CAGRA_HNSWLIB>
$<$<BOOL:${CUVS_NVTX}>:NVTX_ENABLED>
$<$<BOOL:${JIT_LTO_COMPILATION}>:CUVS_ENABLE_JIT_LTO>
cuvs PUBLIC $<$<BOOL:${BUILD_CAGRA_HNSWLIB}>:CUVS_BUILD_CAGRA_HNSWLIB>
$<$<BOOL:${CUVS_NVTX}>:NVTX_ENABLED>
)

target_link_libraries(
Expand All @@ -771,11 +760,8 @@ if(NOT BUILD_CPU_ONLY)
$<BUILD_LOCAL_INTERFACE:$<TARGET_NAME_IF_EXISTS:NCCL::NCCL>>
$<BUILD_LOCAL_INTERFACE:$<TARGET_NAME_IF_EXISTS:hnswlib::hnswlib>>
$<$<BOOL:${CUVS_NVTX}>:CUDA::nvtx3>
PRIVATE rmm::rmm
$<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
$<COMPILE_ONLY:nvidia::cutlass::cutlass>
$<COMPILE_ONLY:cuco::cuco>
$<$<BOOL:${JIT_LTO_COMPILATION}>:CUDA::nvJitLink>
PRIVATE rmm::rmm $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
$<COMPILE_ONLY:nvidia::cutlass::cutlass> $<COMPILE_ONLY:cuco::cuco> CUDA::nvJitLink
)
set_property(TARGET cuvs PROPERTY NO_CUDART_DEP ON)

Expand Down Expand Up @@ -814,10 +800,8 @@ SECTIONS

target_compile_options(cuvs_static PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${CUVS_CXX_FLAGS}>")
target_compile_definitions(
cuvs_static
PUBLIC $<$<BOOL:${BUILD_CAGRA_HNSWLIB}>:CUVS_BUILD_CAGRA_HNSWLIB>
$<$<BOOL:${CUVS_NVTX}>:NVTX_ENABLED>
$<$<BOOL:${JIT_LTO_COMPILATION}>:CUVS_ENABLE_JIT_LTO>
cuvs_static PUBLIC $<$<BOOL:${BUILD_CAGRA_HNSWLIB}>:CUVS_BUILD_CAGRA_HNSWLIB>
$<$<BOOL:${CUVS_NVTX}>:NVTX_ENABLED>
)

target_include_directories(cuvs_static INTERFACE "$<INSTALL_INTERFACE:include>")
Expand All @@ -835,7 +819,7 @@ SECTIONS
$<BUILD_LOCAL_INTERFACE:$<TARGET_NAME_IF_EXISTS:hnswlib::hnswlib>> # header only
PRIVATE rmm::rmm
$<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
$<$<BOOL:${JIT_LTO_COMPILATION}>:CUDA::nvJitLink>
CUDA::nvJitLink
$<$<BOOL:${CUVS_NVTX}>:CUDA::nvtx3>
$<COMPILE_ONLY:nvidia::cutlass::cutlass>
$<COMPILE_ONLY:cuco::cuco>
Expand Down
Loading
Loading