From 49e3a2286eea5d18b2664ffe8b9f72689ff06e18 Mon Sep 17 00:00:00 2001 From: Ishwar Bhati Date: Mon, 15 Dec 2025 14:09:59 -0800 Subject: [PATCH 1/4] To enable iterator in IVF, add support for single search --- include/svs/index/ivf/dynamic_ivf.h | 66 ++++++++++- include/svs/index/ivf/extensions.h | 89 ++++++++++++++ include/svs/index/ivf/index.h | 95 ++++++++++++++- tests/CMakeLists.txt | 1 + tests/svs/index/ivf/dynamic_ivf.cpp | 149 ++++++++++++++++++++++++ tests/svs/index/ivf/index.cpp | 172 ++++++++++++++++++++++++++++ 6 files changed, 566 insertions(+), 6 deletions(-) create mode 100644 tests/svs/index/ivf/index.cpp diff --git a/include/svs/index/ivf/dynamic_ivf.h b/include/svs/index/ivf/dynamic_ivf.h index 966fc3b12..e2267ae3e 100644 --- a/include/svs/index/ivf/dynamic_ivf.h +++ b/include/svs/index/ivf/dynamic_ivf.h @@ -79,6 +79,14 @@ class DynamicIVFIndex { using InterQueryThreadPool = threads::ThreadPoolHandle; using IntraQueryThreadPool = threads::DefaultThreadPool; + // Reuse scratchspace types from static IVF + using buffer_centroids_type = SortedBuffer; + using buffer_leaves_type = std::vector>; + using inner_scratch_type = + svs::tag_t::result_t; + using scratchspace_type = + ivf::IVFScratchspace; + private: // Core IVF components (same structure as static IVF) centroids_type centroids_; @@ -98,7 +106,7 @@ class DynamicIVFIndex { // Threading infrastructure (same as static IVF) InterQueryThreadPool inter_query_threadpool_; const size_t intra_query_thread_count_; - std::vector intra_query_threadpools_; + mutable std::vector intra_query_threadpools_; // Search infrastructure (same as static IVF) std::vector> matmul_results_; @@ -337,6 +345,60 @@ class DynamicIVFIndex { ); } + /// @brief Return scratch space resources for external threading + /// @param sp Search parameters to configure the scratchspace + /// @param num_neighbors Number of neighbors to return (default: 10) + scratchspace_type + scratchspace(const search_parameters_type& sp, size_t 
num_neighbors = 10) const { + size_t buffer_leaves_size = + static_cast(sp.k_reorder_ * static_cast(num_neighbors)); + return scratchspace_type{ + create_centroid_buffer(sp.n_probes_), + create_leaf_buffers(buffer_leaves_size), + extensions::per_thread_batch_search_setup(clusters_[0].data_, distance_)}; + } + + /// @brief Return scratch space resources for external threading with default parameters + scratchspace_type scratchspace() const { return scratchspace(search_parameters_); } + + /// @brief Perform a nearest neighbor search for a single query using provided scratch + /// space + /// + /// Operations performed: + /// * Search centroids to find n_probes nearest clusters + /// * Search within selected clusters to find k nearest neighbors + /// + /// Results will be present in the scratch.buffer_leaves[0] data structure. + /// The caller is responsible for extracting and processing results. + /// Results will contain internal IDs - use translate_to_external() to convert to + /// external IDs. + /// + /// **Note**: It is the caller's responsibility to ensure that the scratch space has + /// been initialized properly to return the requested number of neighbors. 
+ /// + template + void search(const Query& query, scratchspace_type& scratch) const { + // Wrapper lambdas that drop query_idx and tid parameters + auto search_centroids_fn = [&](const auto& q, auto& buf) { + search_centroids_closure()(q, buf, 0); + }; + auto search_leaves_fn = + [&](const auto& q, auto& dist, const auto& buf_cent, auto& buf_leaves) { + search_leaves_closure()(q, dist, buf_cent, buf_leaves, 0); + }; + + extensions::single_search( + clusters_[0].data_, + *this, + scratch.buffer_centroids, + scratch.buffer_leaves, + scratch.scratch, + query, + search_centroids_fn, + search_leaves_fn + ); + } + /// @brief Iterate over all external IDs template void on_ids(F&& f) const { for (size_t i = 0; i < status_.size(); ++i) { @@ -860,7 +922,7 @@ class DynamicIVFIndex { } /// @brief Create closure for searching clusters/leaves - auto search_leaves_closure() { + auto search_leaves_closure() const { return [this]( const auto& query, auto& distance, diff --git a/include/svs/index/ivf/extensions.h b/include/svs/index/ivf/extensions.h index 79785f2a4..a60260808 100644 --- a/include/svs/index/ivf/extensions.h +++ b/include/svs/index/ivf/extensions.h @@ -61,6 +61,95 @@ Distance svs_invoke( return threads::shallow_copy(distance); } +/// +/// @brief Customization point for single query search. 
+/// +struct IVFSingleSearchType { + template < + typename Data, + typename Cluster, + typename BufferCentroids, + typename BufferLeaves, + typename Scratch, + typename Query, + typename SearchCentroids, + typename SearchLeaves> + void operator()( + const Data& data, + const Cluster& cluster, + BufferCentroids& buffer_centroids, + BufferLeaves& buffer_leaves, + Scratch& scratch, + const Query& query, + const SearchCentroids& search_centroids, + const SearchLeaves& search_leaves + ) const { + svs::svs_invoke( + *this, + data, + cluster, + buffer_centroids, + buffer_leaves, + scratch, + query, + search_centroids, + search_leaves + ); + } +}; + +inline constexpr IVFSingleSearchType single_search{}; + +// Default implementation for single query search +template < + typename Data, + typename Cluster, + typename BufferCentroids, + typename BufferLeaves, + typename Distance, + typename Query, + typename SearchCentroids, + typename SearchLeaves> +void svs_invoke( + svs::tag_t, + const Data& SVS_UNUSED(data), + const Cluster& cluster, + BufferCentroids& buffer_centroids, + BufferLeaves& buffer_leaves, + Distance& distance, + const Query& query, + const SearchCentroids& search_centroids, + const SearchLeaves& search_leaves +) { + size_t n_inner_threads = buffer_leaves.size(); + size_t buffer_leaves_size = buffer_leaves[0].capacity(); + + // Search centroids to find nearest clusters + search_centroids(query, buffer_centroids); + + // Search within selected clusters + search_leaves(query, distance, buffer_centroids, buffer_leaves); + + // Accumulate results from intra-query threads into buffer_leaves[0] + for (size_t j = 1; j < n_inner_threads; ++j) { + for (size_t k = 0; k < buffer_leaves_size; ++k) { + buffer_leaves[0].insert(buffer_leaves[j][k]); + } + } + + // Sort buffer to get valid results in order + buffer_leaves[0].sort(); + + // Convert (cluster_id, local_id) to global_id + for (size_t j = 0; j < buffer_leaves_size; ++j) { + auto& neighbor = buffer_leaves[0][j]; + 
auto cluster_id = neighbor.id(); + auto local_id = neighbor.get_local_id(); + auto global_id = cluster.get_global_id(cluster_id, local_id); + neighbor.set_id(global_id); + } +} + /// /// @brief Customization point for working with a batch of threads. /// diff --git a/include/svs/index/ivf/index.h b/include/svs/index/ivf/index.h index 779b9e0f5..db9fd4ab5 100644 --- a/include/svs/index/ivf/index.h +++ b/include/svs/index/ivf/index.h @@ -41,6 +41,33 @@ namespace svs::index::ivf { // environments. constexpr size_t MAX_QUERY_BATCH_SIZE = 10000; +/// +/// @brief Search scratchspace used by the IVF index. +/// +/// These can be pre-allocated and passed to the index when performing externally +/// threaded searches to reduce allocations. +/// +/// **NOTE**: The members ``buffer_centroids``, ``buffer_leaves``, and ``scratch`` are part +/// of the public API for this class. Users are free to access and manipulate these objects. +/// However, doing so incorrectly can yield undefined-behavior. +/// +template +struct IVFScratchspace { + public: + // Members + BufferCentroids buffer_centroids; + BufferLeaves buffer_leaves; + Scratch scratch; + + // Constructors + IVFScratchspace( + BufferCentroids buffer_centroids_, BufferLeaves buffer_leaves_, Scratch scratch_ + ) + : buffer_centroids{std::move(buffer_centroids_)} + , buffer_leaves{std::move(buffer_leaves_)} + , scratch{std::move(scratch_)} {} +}; + /// @brief IVF (Inverted File) Index implementation for efficient similarity search /// /// This class implements an IVF index structure that partitions the search space using @@ -85,6 +112,14 @@ class IVFIndex { using InterQueryThreadPool = threads::ThreadPoolHandle; // For inter-query parallelism using IntraQueryThreadPool = threads::DefaultThreadPool; // For intra-query parallelism + // Scratchspace type for external threading + using buffer_centroids_type = SortedBuffer>; + using buffer_leaves_type = std::vector>>; + using inner_scratch_type = + svs::tag_t::result_t; + using 
scratchspace_type = + IVFScratchspace; + /// @brief Construct a new IVF Index /// /// @param centroids Collection of centroids for space partitioning @@ -210,12 +245,64 @@ class IVFIndex { ); } + /// @brief Return scratch space resources for external threading + /// @param sp Search parameters to configure the scratchspace + /// @param num_neighbors Number of neighbors to return (default: 10) + scratchspace_type + scratchspace(const search_parameters_type& sp, size_t num_neighbors = 10) const { + size_t buffer_leaves_size = + static_cast(sp.k_reorder_ * static_cast(num_neighbors)); + return scratchspace_type{ + create_centroid_buffer(sp.n_probes_), + create_leaf_buffers(buffer_leaves_size), + extensions::per_thread_batch_search_setup(cluster0_, distance_)}; + } + + /// @brief Return scratch space resources for external threading with default parameters + scratchspace_type scratchspace() const { return scratchspace(get_search_parameters()); } + + /// @brief Perform a nearest neighbor search for a single query using provided scratch + /// space + /// + /// Operations performed: + /// * Search centroids to find n_probes nearest clusters + /// * Search within selected clusters to find k nearest neighbors + /// + /// Results will be present in the scratch.buffer_leaves[0] data structure. + /// The caller is responsible for extracting and processing results. + /// + /// **Note**: It is the caller's responsibility to ensure that the scratch space has + /// been initialized properly to return the requested number of neighbors. 
+ /// + template + void search(const Query& query, scratchspace_type& scratch) const { + // Wrapper lambdas that drop query_idx and tid parameters + auto search_centroids_fn = [&](const auto& q, auto& buf) { + search_centroids_closure()(q, buf, 0); + }; + auto search_leaves_fn = + [&](const auto& q, auto& dist, const auto& buf_cent, auto& buf_leaves) { + search_leaves_closure()(q, dist, buf_cent, buf_leaves, 0); + }; + + extensions::single_search( + cluster0_, + cluster_, + scratch.buffer_centroids, + scratch.buffer_leaves, + scratch.scratch, + query, + search_centroids_fn, + search_leaves_fn + ); + } + ///// Search Implementation ///// /// @brief Search closure for centroid distance computation /// @return Function object handling initial centroid search phase (inter-query /// parallel) - auto search_centroids_closure() { + auto search_centroids_closure() const { return [&](const auto& query, auto& buffer, size_t id) { search_centroids( query, @@ -231,7 +318,7 @@ class IVFIndex { /// @brief Search closure for cluster traversal /// @return Function object handling cluster exploration (intra-query parallel) - auto search_leaves_closure() { + auto search_leaves_closure() const { return [&](const auto& query, auto& distance, const auto& buffer_centroids, @@ -329,11 +416,11 @@ class IVFIndex { ///// Threading Infrastructure ///// InterQueryThreadPool inter_query_threadpool_; // Handles parallelism across queries const size_t intra_query_thread_count_; // Number of threads per query processing - std::vector + mutable std::vector intra_query_threadpools_; // Per-query parallel cluster exploration ///// Search Data ///// - std::vector> matmul_results_; + mutable std::vector> matmul_results_; std::vector centroids_norm_; search_parameters_type search_parameters_; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index a023608de..0e7dbde59 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -208,6 +208,7 @@ if (SVS_EXPERIMENTAL_ENABLE_IVF) 
${TEST_DIR}/svs/index/ivf/kmeans.cpp ${TEST_DIR}/svs/index/ivf/hierarchical_kmeans.cpp ${TEST_DIR}/svs/index/ivf/common.cpp + ${TEST_DIR}/svs/index/ivf/index.cpp ${TEST_DIR}/svs/index/ivf/dynamic_ivf.cpp ) endif() diff --git a/tests/svs/index/ivf/dynamic_ivf.cpp b/tests/svs/index/ivf/dynamic_ivf.cpp index 690213b61..89c6308de 100644 --- a/tests/svs/index/ivf/dynamic_ivf.cpp +++ b/tests/svs/index/ivf/dynamic_ivf.cpp @@ -986,3 +986,152 @@ CATCH_TEST_CASE("Dynamic IVF Get Distance", "[index][ivf][dynamic_ivf]") { } } } + +CATCH_TEST_CASE("Dynamic IVF Single Search", "[ivf][dynamic_ivf][single_search]") { + namespace ivf = svs::index::ivf; + + // Load test data + auto data = svs::data::SimpleData::load(test_dataset::data_svs_file()); + auto queries = test_dataset::queries(); + + size_t num_threads = 2; + size_t num_inner_threads = 2; + auto distance = Distance(); + + // Build clustering + auto build_params = ivf::IVFBuildParameters(NUM_CLUSTERS, 10, false); + auto threadpool = svs::threads::SequentialThreadPool(); + auto clustering = + ivf::build_clustering(build_params, data, distance, threadpool, false); + + // Create external IDs + std::vector ids(data.size()); + std::iota(ids.begin(), ids.end(), 0); + + // Create dense clusters + auto centroids = clustering.centroids(); + using DataType = svs::data::SimpleData; + auto dense_clusters = ivf::DenseClusteredDataset( + clustering, data, threadpool, svs::lib::Allocator() + ); + + // Build dynamic IVF index + auto threadpool_for_index = svs::threads::as_threadpool(num_threads); + using IndexType = ivf::DynamicIVFIndex< + decltype(centroids), + decltype(dense_clusters), + Distance, + decltype(threadpool_for_index)>; + + auto index = IndexType( + std::move(centroids), + std::move(dense_clusters), + ids, + distance, + std::move(threadpool_for_index), + num_inner_threads + ); + + CATCH_SECTION("Test scratchspace creation") { + // Test scratchspace with custom parameters + auto search_params = ivf::IVFSearchParameters(); + 
search_params.n_probes_ = 5; + search_params.k_reorder_ = 100; + + auto scratch = index.scratchspace(search_params); + + // Verify scratchspace structure + CATCH_REQUIRE(scratch.buffer_centroids.capacity() == search_params.n_probes_); + CATCH_REQUIRE(scratch.buffer_leaves.size() == num_inner_threads); + + // Test default scratchspace + auto default_scratch = index.scratchspace(); + CATCH_REQUIRE(default_scratch.buffer_leaves.size() == num_inner_threads); + } + + CATCH_SECTION("Test single query search") { + size_t num_neighbors = NUM_NEIGHBORS; + + // Create scratchspace + auto search_params = ivf::IVFSearchParameters(); + search_params.n_probes_ = 5; + search_params.k_reorder_ = 100; + auto scratch = index.scratchspace(search_params); + + // Perform single search + auto query = queries.get_datum(0); + index.search(query, scratch); + + // Verify results - note these are internal IDs + auto& results_buffer = scratch.buffer_leaves[0]; + CATCH_REQUIRE(results_buffer.size() > 0); + CATCH_REQUIRE(results_buffer.size() >= num_neighbors); + + // Results should be sorted by distance + results_buffer.sort(); + for (size_t i = 1; i < results_buffer.size(); ++i) { + CATCH_REQUIRE(results_buffer[i].distance() >= results_buffer[i - 1].distance()); + } + } + + CATCH_SECTION("Test scratchspace reusability") { + auto search_params = ivf::IVFSearchParameters(); + search_params.n_probes_ = 5; + search_params.k_reorder_ = 100; + auto scratch = index.scratchspace(search_params); + + // Search with multiple queries using same scratchspace + for (size_t i = 0; i < std::min(5, queries.size()); ++i) { + auto query = queries.get_datum(i); + index.search(query, scratch); + + // Verify each search produces results + CATCH_REQUIRE(scratch.buffer_leaves[0].size() > 0); + } + } + + CATCH_SECTION("Compare single search with batch search") { + size_t num_neighbors = NUM_NEIGHBORS; + + auto search_params = ivf::IVFSearchParameters(); + search_params.n_probes_ = 5; + search_params.k_reorder_ = 100; 
+ + // Single search + auto scratch = index.scratchspace(search_params); + auto query = queries.get_datum(0); + index.search(query, scratch); + + // Extract results from single search (already sorted and ID-converted) + auto& single_results = scratch.buffer_leaves[0]; + std::vector single_ids; + for (size_t i = 0; i < num_neighbors && i < single_results.size(); ++i) { + single_ids.push_back(single_results[i].id()); + } + + // Convert internal IDs to external IDs for comparison + std::vector single_external_ids; + for (auto internal_id : single_ids) { + single_external_ids.push_back(index.translate_internal_id(internal_id)); + } + + // Batch search + auto batch_queries = svs::data::ConstSimpleDataView( + queries.data(), 1, queries.dimensions() + ); + auto batch_results = svs::QueryResult(1, num_neighbors); + index.search(batch_results.view(), batch_queries, search_params); + + // Extract results from batch search (external IDs) + std::vector batch_ids; + for (size_t i = 0; i < num_neighbors; ++i) { + batch_ids.push_back(batch_results.index(0, i)); + } + + // Results should match + CATCH_REQUIRE(single_external_ids.size() == batch_ids.size()); + for (size_t i = 0; i < num_neighbors; ++i) { + CATCH_REQUIRE(single_external_ids[i] == batch_ids[i]); + } + } +} diff --git a/tests/svs/index/ivf/index.cpp b/tests/svs/index/ivf/index.cpp new file mode 100644 index 000000000..39c264707 --- /dev/null +++ b/tests/svs/index/ivf/index.cpp @@ -0,0 +1,172 @@ +/* + * Copyright 2025 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// header under test +#include "svs/index/ivf/index.h" + +// tests +#include "tests/utils/test_dataset.h" + +// catch +#include "catch2/catch_test_macros.hpp" + +// svs +#include "svs/core/data.h" +#include "svs/core/distance.h" +#include "svs/index/ivf/clustering.h" +#include "svs/index/ivf/hierarchical_kmeans.h" + +// stl +#include + +CATCH_TEST_CASE("IVF Index Single Search", "[ivf][index][single_search]") { + namespace ivf = svs::index::ivf; + + // Load test data + auto data = svs::data::SimpleData::load(test_dataset::data_svs_file()); + auto queries = test_dataset::queries(); + + size_t num_clusters = 10; + size_t num_threads = 2; + size_t num_inner_threads = 2; + auto distance = svs::distance::DistanceL2(); + + // Build clustering + auto build_params = ivf::IVFBuildParameters(num_clusters, 10, false); + auto threadpool = svs::threads::SequentialThreadPool(); + auto clustering = ivf::build_clustering( + build_params, data, distance, threadpool, false + ); + + // Create clustered dataset + auto centroids = clustering.centroids(); + using Idx = uint32_t; + auto cluster = ivf::DenseClusteredDataset( + clustering, data, threadpool, svs::lib::Allocator() + ); + + // Build IVF index + auto threadpool_for_index = svs::threads::as_threadpool(num_threads); + using IndexType = ivf::IVFIndex< + decltype(centroids), + decltype(cluster), + decltype(distance), + decltype(threadpool_for_index)>; + + auto index = IndexType( + std::move(centroids), + std::move(cluster), + distance, + std::move(threadpool_for_index), + num_inner_threads + ); + + CATCH_SECTION("Test scratchspace creation") { + // Test scratchspace with custom parameters + auto search_params = ivf::IVFSearchParameters(); + search_params.n_probes_ = 5; + search_params.k_reorder_ = 100; + + auto scratch = index.scratchspace(search_params); + + // Verify scratchspace has correct structure + 
CATCH_REQUIRE(scratch.buffer_centroids.capacity() == search_params.n_probes_); + CATCH_REQUIRE(scratch.buffer_leaves.size() == num_inner_threads); + + // Test default scratchspace + auto default_scratch = index.scratchspace(); + CATCH_REQUIRE(default_scratch.buffer_leaves.size() == num_inner_threads); + } + + CATCH_SECTION("Test single query search") { + size_t num_neighbors = 10; + + // Create scratchspace + auto search_params = ivf::IVFSearchParameters(); + search_params.n_probes_ = 5; + search_params.k_reorder_ = 100; + auto scratch = index.scratchspace(search_params); + + // Perform single search + auto query = queries.get_datum(0); + index.search(query, scratch); + + // Verify results + auto& results_buffer = scratch.buffer_leaves[0]; + CATCH_REQUIRE(results_buffer.size() > 0); + CATCH_REQUIRE(results_buffer.size() >= num_neighbors); + + // Results should be sorted by distance + results_buffer.sort(); + for (size_t i = 1; i < results_buffer.size(); ++i) { + CATCH_REQUIRE(results_buffer[i].distance() >= results_buffer[i - 1].distance()); + } + } + + CATCH_SECTION("Test scratchspace reusability") { + auto search_params = ivf::IVFSearchParameters(); + search_params.n_probes_ = 5; + search_params.k_reorder_ = 100; + auto scratch = index.scratchspace(search_params); + + // Search with multiple queries using same scratchspace + for (size_t i = 0; i < std::min(5, queries.size()); ++i) { + auto query = queries.get_datum(i); + index.search(query, scratch); + + // Verify each search produces results + CATCH_REQUIRE(scratch.buffer_leaves[0].size() > 0); + } + } + + CATCH_SECTION("Compare single search with batch search") { + size_t num_neighbors = 10; + + auto search_params = ivf::IVFSearchParameters(); + search_params.n_probes_ = 5; + search_params.k_reorder_ = 100; + + // Single search + auto scratch = index.scratchspace(search_params); + auto query = queries.get_datum(0); + index.search(query, scratch); + + // Extract results from single search (already sorted and 
ID-converted) + auto& single_results = scratch.buffer_leaves[0]; + std::vector single_ids; + for (size_t i = 0; i < num_neighbors && i < single_results.size(); ++i) { + single_ids.push_back(single_results[i].id()); + } + + // Batch search + auto batch_queries = + svs::data::ConstSimpleDataView(queries.data(), 1, queries.dimensions()); + auto batch_results = svs::QueryResult(1, num_neighbors); + index.search(batch_results.view(), batch_queries, search_params); + + // Extract results from batch search + std::vector batch_ids; + for (size_t i = 0; i < num_neighbors; ++i) { + batch_ids.push_back(batch_results.index(0, i)); + } + + // Results should match + CATCH_REQUIRE(single_ids.size() == batch_ids.size()); + for (size_t i = 0; i < num_neighbors; ++i) { + CATCH_REQUIRE(single_ids[i] == batch_ids[i]); + } + } +} From 7d3bee2122328b07093883027d0dfd161f71d1e2 Mon Sep 17 00:00:00 2001 From: Ishwar Bhati Date: Mon, 15 Dec 2025 16:07:10 -0800 Subject: [PATCH 2/4] Initial batch iterator implementation --- include/svs/index/ivf/dynamic_ivf.h | 35 ++- include/svs/index/ivf/index.h | 50 ++- include/svs/index/ivf/iterator.h | 311 +++++++++++++++++++ tests/CMakeLists.txt | 1 + tests/svs/index/ivf/iterator.cpp | 454 ++++++++++++++++++++++++++++ 5 files changed, 845 insertions(+), 6 deletions(-) create mode 100644 include/svs/index/ivf/iterator.h create mode 100644 tests/svs/index/ivf/iterator.cpp diff --git a/include/svs/index/ivf/dynamic_ivf.h b/include/svs/index/ivf/dynamic_ivf.h index e2267ae3e..f6b8dbb09 100644 --- a/include/svs/index/ivf/dynamic_ivf.h +++ b/include/svs/index/ivf/dynamic_ivf.h @@ -35,6 +35,10 @@ namespace svs::index::ivf { +// Forward declaration of BatchIterator (already declared in index.h, but redeclaring for +// clarity) +template class BatchIterator; + /// /// Metadata tracking the state of a particular data index for DynamicIVFIndex. 
/// The following states have the given meaning for their corresponding slot: @@ -365,6 +369,7 @@ class DynamicIVFIndex { /// space /// /// Operations performed: + /// * Compute centroid distances for the single query /// * Search centroids to find n_probes nearest clusters /// * Search within selected clusters to find k nearest neighbors /// @@ -376,8 +381,14 @@ class DynamicIVFIndex { /// **Note**: It is the caller's responsibility to ensure that the scratch space has /// been initialized properly to return the requested number of neighbors. /// - template - void search(const Query& query, scratchspace_type& scratch) const { + template void search(const Query& query, scratchspace_type& scratch) { + // Compute centroid distances for the single query + // Create a 1-query view and compute matmul_results + auto query_view = data::ConstSimpleDataView(query.data(), 1, query.size()); + compute_centroid_distances( + query_view, centroids_, matmul_results_, inter_query_threadpool_ + ); + // Wrapper lambdas that drop query_idx and tid parameters auto search_centroids_fn = [&](const auto& q, auto& buf) { search_centroids_closure()(q, buf, 0); @@ -399,6 +410,26 @@ class DynamicIVFIndex { ); } + ///// Batch Iterator ///// + + /// @brief Create a batch iterator for retrieving neighbors in batches. + /// + /// The iterator allows incremental retrieval of neighbors, expanding the search + /// space on each call to `next()`. This is useful for applications that need + /// to process neighbors in batches or implement early termination. + /// + /// @tparam QueryType The element type of the query vector. + /// @param query The query vector as a span. + /// @param extra_search_buffer_capacity Additional buffer capacity for the search. + /// @return A BatchIterator for the given query. 
+ /// + template + auto make_batch_iterator( + std::span query, size_t extra_search_buffer_capacity = 0 + ) { + return BatchIterator(*this, query, extra_search_buffer_capacity); + } + /// @brief Iterate over all external IDs template void on_ids(F&& f) const { for (size_t i = 0; i < status_.size(); ++i) { diff --git a/include/svs/index/ivf/index.h b/include/svs/index/ivf/index.h index db9fd4ab5..61ba704d5 100644 --- a/include/svs/index/ivf/index.h +++ b/include/svs/index/ivf/index.h @@ -35,6 +35,9 @@ namespace svs::index::ivf { +// Forward declaration of BatchIterator +template class BatchIterator; + // The maximum batch size for queries is set to 10,000 to balance memory usage and // performance. This value was chosen based on empirical testing to avoid excessive memory // allocation while supporting large batch operations typical in high-throughput @@ -153,8 +156,20 @@ class IVFIndex { ///// Index Information ///// - /// @brief Get the number of centroids in the index - size_t size() const { return centroids_.size(); } + /// @brief Indicates whether internal IDs need translation to external IDs + static constexpr bool needs_id_translation = false; + + /// @brief Get the total number of vectors in the index + size_t size() const { + size_t total = 0; + for (size_t i = 0; i < centroids_.size(); ++i) { + total += cluster_.view_cluster(i).size(); + } + return total; + } + + /// @brief Get the number of clusters/centroids in the index + size_t num_clusters() const { return centroids_.size(); } /// @brief Get the dimensionality of the indexed vectors size_t dimensions() const { return centroids_.dimensions(); } @@ -265,6 +280,7 @@ class IVFIndex { /// space /// /// Operations performed: + /// * Compute centroid distances for the single query /// * Search centroids to find n_probes nearest clusters /// * Search within selected clusters to find k nearest neighbors /// @@ -274,8 +290,14 @@ class IVFIndex { /// **Note**: It is the caller's responsibility to ensure that the 
scratch space has /// been initialized properly to return the requested number of neighbors. /// - template - void search(const Query& query, scratchspace_type& scratch) const { + template void search(const Query& query, scratchspace_type& scratch) { + // Compute centroid distances for the single query + // Create a 1-query view and compute matmul_results + auto query_view = data::ConstSimpleDataView(query.data(), 1, query.size()); + compute_centroid_distances( + query_view, centroids_, matmul_results_, inter_query_threadpool_ + ); + // Wrapper lambdas that drop query_idx and tid parameters auto search_centroids_fn = [&](const auto& q, auto& buf) { search_centroids_closure()(q, buf, 0); @@ -297,6 +319,26 @@ class IVFIndex { ); } + ///// Batch Iterator ///// + + /// @brief Create a batch iterator for retrieving neighbors in batches. + /// + /// The iterator allows incremental retrieval of neighbors, expanding the search + /// space on each call to `next()`. This is useful for applications that need + /// to process neighbors in batches or implement early termination. + /// + /// @tparam QueryType The element type of the query vector. + /// @param query The query vector as a span. + /// @param extra_search_buffer_capacity Additional buffer capacity for the search. + /// @return A BatchIterator for the given query. + /// + template + auto make_batch_iterator( + std::span query, size_t extra_search_buffer_capacity = 0 + ) { + return BatchIterator(*this, query, extra_search_buffer_capacity); + } + ///// Search Implementation ///// /// @brief Search closure for centroid distance computation diff --git a/include/svs/index/ivf/iterator.h b/include/svs/index/ivf/iterator.h new file mode 100644 index 000000000..0ab1c5419 --- /dev/null +++ b/include/svs/index/ivf/iterator.h @@ -0,0 +1,311 @@ +/* + * Copyright 2025 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +// svs +#include "svs/index/ivf/extensions.h" +#include "svs/lib/misc.h" +#include "svs/lib/neighbor.h" +#include "svs/lib/scopeguard.h" + +// stl +#include +#include +#include + +namespace svs::index::ivf { + +namespace detail { +constexpr void checkdims(size_t query_size, size_t index_dims) { + if (query_size != index_dims) { + throw ANNEXCEPTION( + "Incompatible dimensions. Query has {} while the index expects {}.", + query_size, + index_dims + ); + } +} +} // namespace detail + +/// @brief A batch iterator for retrieving neighbors from the IVF index in batches. +/// +/// This iterator abstracts the process of retrieving neighbors in fixed-size batches +/// while maintaining internal state for efficient IVF traversal. Unlike Vamana's graph +/// traversal, IVF maintains state through centroid buffer capacity to expand the search +/// space in subsequent iterations. +/// +/// @tparam Index The IVF index type (IVFIndex or DynamicIVFIndex) +/// @tparam QueryType The element type of the query vector +template class BatchIterator { + public: + static_assert( + std::is_trivially_copyable_v, + "The batch iterator requires a trivial (no-throw copy constructible) type to " + "provide its exception guarantees" + ); + + // Public type aliases + using scratchspace_type = typename Index::scratchspace_type; + using internal_id_type = size_t; + using external_id_type = size_t; + + // The value type yielded by the iterator. 
+ using value_type = Neighbor; + + private: + // Private type aliases + using result_buffer_type = std::vector; + + /// @brief Copies results from the scratch buffer to the results buffer. + /// Ensures that only unique neighbors are added to the results buffer. + void copy_from_scratch(size_t batch_size) { + results_.clear(); + const auto& buffer = scratchspace_->buffer_leaves[0]; + for (size_t i = 0, imax = buffer.size(); i < imax; ++i) { + auto neighbor = buffer[i]; + auto internal_id = neighbor.id(); + auto result = yielded_.insert(internal_id); + if (result.second /* inserted */) { + // Rollback insertion into the yielded set if push_back throws. + auto guard = lib::make_dismissable_scope_guard([&]() noexcept { + yielded_.erase(result.first); + }); + results_.push_back(adapt(neighbor)); + guard.dismiss(); + } + + // Stop if the requested batch size is reached. + if (results_.size() == batch_size) { + break; + } + } + } + + /// @brief Initializes the scratchspace with the configured capacity. + void initialize_scratchspace() { + // Create scratchspace with current n_probes and buffer capacity + scratchspace_ = + parent_->scratchspace(search_params_, extra_search_buffer_capacity_); + } + + /// @brief Increments the search window by `batch_size` for the next iteration. + /// This expands n_probes to search more centroids and increases buffer capacity. 
+ void increment_search_params(size_t batch_size) { + // Increase n_probes to explore more clusters in subsequent iterations + // This is similar to how Vamana increases the search window + search_params_.n_probes_ = + std::min(search_params_.n_probes_ + batch_size, parent_->num_clusters()); + + // Increase buffer capacity to hold more results + extra_search_buffer_capacity_ += batch_size; + + // Reinitialize scratchspace with new parameters + initialize_scratchspace(); + } + + public: + using size_type = typename result_buffer_type::size_type; + using reference = value_type&; + using const_reference = const value_type&; + + /// Random-access iterator to `value_type` over the current batch of results. + using iterator = typename result_buffer_type::iterator; + /// Random-access iterator to `const value_type` over the current batch of results. + using const_iterator = typename result_buffer_type::const_iterator; + + /// @brief Constructs a batch iterator for the given query over the IVF index. + /// @param parent The IVF index to search. + /// @param query The query data. + /// @param search_params Initial search parameters. + /// @param extra_search_buffer_capacity Additional buffer capacity for the search. + BatchIterator( + Index& parent, + std::span query, + const typename Index::search_parameters_type& search_params, + size_t extra_search_buffer_capacity = 0 + ) + : parent_{&parent} + , query_{query.begin(), query.end()} + , search_params_{search_params} + , extra_search_buffer_capacity_{extra_search_buffer_capacity} { + detail::checkdims(query.size(), parent.dimensions()); + initialize_scratchspace(); + } + + /// @brief Constructs a batch iterator with default search parameters. + /// @param parent The IVF index to search. + /// @param query The query data. + /// @param extra_search_buffer_capacity Additional buffer capacity for the search. 
+ BatchIterator( + Index& parent, + std::span query, + size_t extra_search_buffer_capacity = 0 + ) + : BatchIterator( + parent, + query, + [&parent]() { + // Start with a reasonable initial n_probes for iteration + // Use 10% of clusters or at least 5, whichever gives more coverage + auto params = parent.get_search_parameters(); + params.n_probes_ = std::max( + params.n_probes_, + std::min( + parent.num_clusters(), + std::max(5, parent.num_clusters() / 10) + ) + ); + return params; + }(), + extra_search_buffer_capacity + ) {} + + /// @brief Updates the iterator with a new query. + /// Resets the internal state and restarts the search when `next(...)` is called. + void update(std::span newquery) { + detail::checkdims(newquery.size(), parent_->dimensions()); + assert(newquery.size() == query_.size()); + + std::copy(newquery.begin(), newquery.end(), query_.begin()); + + // Reset search parameters to initial values with reasonable n_probes + search_params_ = parent_->get_search_parameters(); + search_params_.n_probes_ = std::max( + search_params_.n_probes_, + std::min( + parent_->num_clusters(), std::max(5, parent_->num_clusters() / 10) + ) + ); + extra_search_buffer_capacity_ = 0; + initialize_scratchspace(); + restart_search_ = true; + iteration_ = 0; + yielded_.clear(); + results_.clear(); + is_exhausted_ = false; + } + + /// @brief Adapts an internal neighbor to an external neighbor. + /// For dynamic IVF, translates internal IDs to external IDs. + template svs::Neighbor adapt(N internal) const { + if constexpr (Index::needs_id_translation) { + return Neighbor{ + parent_->translate_internal_id(internal.id()), internal.distance()}; + } else { + return Neighbor{internal.id(), internal.distance()}; + } + } + + /// @brief Returns an iterator to the beginning of the results. + iterator begin() { return results_.begin(); } + /// @brief Returns an iterator to the end of the results. 
+ iterator end() { return results_.end(); } + /// @copydoc begin() + const_iterator begin() const { return results_.begin(); } + /// @copydoc end() + const_iterator end() const { return results_.end(); } + /// @copydoc begin() + const_iterator cbegin() const { return results_.cbegin(); } + /// @copydoc begin() + const_iterator cend() const { return results_.cend(); } + + /// @brief Returns a span over the current batch of neighbors. + /// The span is invalidated by calls to `next(...)`. + std::span contents() const { return lib::as_const_span(results_); } + + /// @brief Returns the number of buffered results. + size_t size() const { return results_.size(); } + + /// @brief Return the batch number corresponding to the current buffer. + size_t batch_number() const { return iteration_; } + + /// @brief Returns whether iterator can find more neighbors or not for the given query. + /// + /// The iterator is considered done when all the available nodes have been yielded, + /// when all centroids have been searched, or when the search can not find any more + /// neighbors. + bool done() const { + // The iterator is done when: + // 1. All vectors in the index have been yielded, or + // 2. The search has been exhausted (no new results after expanding search) + // Note: We don't consider n_probes >= num_clusters as done, because we can still + // yield results from previously searched clusters that weren't returned yet. + return (yielded_.size() == parent_->size() || is_exhausted_); + } + + /// @brief Forces the next iteration to restart the search from scratch. + void restart_next_search() { restart_search_ = true; } + + /// @brief Returns the search parameters used for the current batch. + typename Index::search_parameters_type parameters_for_current_iteration() const { + return search_params_; + } + + /// @brief Prepares the next batch of neighbors (up to ``batch_size``) from the index. + /// Handles exceptions gracefully and ensures iterator state consistency. 
+ void next( + size_t batch_size, + const lib::DefaultPredicate& SVS_UNUSED(cancel) = lib::Returns(lib::Const()) + ) { + if (done()) { + results_.clear(); + return; + } + + // Always increment search parameters before search to ensure buffer capacity + // On first call, this sets up the initial buffer; on subsequent calls, it expands + increment_search_params(batch_size); + + // Perform search using single_search with scratchspace + parent_->search(lib::as_const_span(query_), *scratchspace_); + + ++iteration_; + restart_search_ = false; + copy_from_scratch(batch_size); + + // If result is empty after calling next(), mark the iterator as exhausted. + if (results_.size() == 0 && batch_size > 0) { + is_exhausted_ = true; + } + } + + private: + Index* parent_; // The index being accessed. + std::vector query_; // Local buffer for the query. + std::optional scratchspace_; // Scratch space for search. + typename Index::search_parameters_type search_params_; // Current search parameters. + std::vector> results_{}; // Filtered results from search. + std::unordered_set yielded_{}; // Set of yielded neighbors. + size_t iteration_ = 0; // Current iteration number. + bool restart_search_ = true; // Whether the next search should restart from scratch. + size_t extra_search_buffer_capacity_ = 0; // Extra buffer capacity for the next search. + bool is_exhausted_ = false; // Whether the iterator is exhausted. 
+}; + +// Deduction Guides +template +BatchIterator(const Index*, std::span) -> BatchIterator; + +template +BatchIterator( + const Index*, + std::span, + const typename Index::search_parameters_type&, + size_t +) -> BatchIterator; + +} // namespace svs::index::ivf diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 0e7dbde59..2271a1c06 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -210,6 +210,7 @@ if (SVS_EXPERIMENTAL_ENABLE_IVF) ${TEST_DIR}/svs/index/ivf/common.cpp ${TEST_DIR}/svs/index/ivf/index.cpp ${TEST_DIR}/svs/index/ivf/dynamic_ivf.cpp + ${TEST_DIR}/svs/index/ivf/iterator.cpp ) endif() diff --git a/tests/svs/index/ivf/iterator.cpp b/tests/svs/index/ivf/iterator.cpp new file mode 100644 index 000000000..b8fcfdb96 --- /dev/null +++ b/tests/svs/index/ivf/iterator.cpp @@ -0,0 +1,454 @@ +/* + * Copyright 2025 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// header under test +#include "svs/index/ivf/iterator.h" + +// svs +#include "svs/core/data.h" +#include "svs/core/distance.h" +#include "svs/index/ivf/clustering.h" +#include "svs/index/ivf/dynamic_ivf.h" +#include "svs/index/ivf/index.h" + +// tests +#include "tests/utils/test_dataset.h" + +// catch2 +#include "catch2/catch_test_macros.hpp" + +// stl +#include +#include +#include +#include +#include + +namespace { + +const size_t QUERIES_TO_CHECK = 10; +const size_t NUM_CLUSTERS = 10; +const size_t N = 128; // dimensions + +// Common test routines for the static and dynamic indexes. +template >> +void check( + Index& index, + svs::data::ConstSimpleDataView queries, + svs::data::ConstSimpleDataView groundtruth, + IDChecker& checker +) { + const size_t num_neighbors = 100; + const auto batchsizes = std::vector{{10, 20, 25, 50, 100}}; + + CATCH_REQUIRE(index.size() > num_neighbors); + + auto id_to_distance = std::unordered_map(); + auto id_buffer = std::vector(); + + CATCH_REQUIRE(checker(id_to_distance)); + + auto from_iterator = std::unordered_set(); + for (size_t query_index = 0; query_index < QUERIES_TO_CHECK; ++query_index) { + auto query = queries.get_datum(query_index); + + // Set up search parameters for full search + auto search_params = svs::index::ivf::IVFSearchParameters(); + search_params.n_probes_ = NUM_CLUSTERS; // Search all clusters for accuracy + search_params.k_reorder_ = num_neighbors; + + // Perform a single, full-precision search to obtain reference results. 
+ auto scratch = index.scratchspace(search_params, num_neighbors); + index.search(query, scratch); + auto& buffer = scratch.buffer_leaves[0]; + buffer.sort(); + + id_to_distance.clear(); + id_buffer.clear(); + for (const auto& neighbor : buffer) { + size_t id = [&]() -> size_t { + if constexpr (Index::needs_id_translation) { + return index.translate_internal_id(neighbor.id()); + } else { + return neighbor.id(); + } + }(); + id_to_distance.insert({id, neighbor.distance()}); + id_buffer.push_back(id); + } + + // Ensure we have reasonable recall between. + CATCH_REQUIRE( + svs::lib::count_intersect(id_buffer, groundtruth.get_datum(query_index)) >= + 0.8 * num_neighbors + ); + + // Begin performing batch searches. + for (auto batchsize : batchsizes) { + CATCH_REQUIRE(num_neighbors % batchsize == 0); + size_t num_batches = num_neighbors / batchsize; + + auto iterator = index.make_batch_iterator(query); + CATCH_REQUIRE(iterator.size() == 0); + iterator.next(batchsize); + + from_iterator.clear(); + size_t similar_count = 0; + + // IDs returned from the most recent batch. + auto ids_returned_this_batch = std::vector(); + for (size_t batch = 0; batch < num_batches; ++batch) { + // Make sure the batch number is the same. + CATCH_REQUIRE(iterator.batch_number() == batch + 1); + ids_returned_this_batch.clear(); + for (auto i : iterator) { + auto id = i.id(); + // Make sure that this ID has not been returned yet. + CATCH_REQUIRE(!from_iterator.contains(id)); + auto itr = id_to_distance.find(id); + if (itr != id_to_distance.end()) { + // Make sure the returned distances match. + CATCH_REQUIRE(itr->second == i.distance()); + ++similar_count; + } + + // Insert the ID into the `from_iterator` container to detect for + // duplicates from future calls. + from_iterator.insert(id); + ids_returned_this_batch.push_back(id); + } + + // The number of IDs returned should equal the number of IDs reported + // by the iterator. 
+ CATCH_REQUIRE(ids_returned_this_batch.size() == iterator.size()); + CATCH_REQUIRE(ids_returned_this_batch.size() == batchsize); + + iterator.next(batchsize); + } + + // Make sure the expected number of neighbors has been obtained. + CATCH_REQUIRE(from_iterator.size() == num_neighbors); + + // Ensure that the results returned by the iterator are "substantively similar" + // to those returned from the full search. + CATCH_REQUIRE(similar_count >= 0.95 * num_neighbors); + } + + // Invoke the checker on the IDs returned from the iterator. + CATCH_REQUIRE(checker(from_iterator)); + } +} + +template +void check( + Index& index, + svs::data::ConstSimpleDataView queries, + svs::data::ConstSimpleDataView groundtruth +) { + auto checker = svs::lib::Returns>(); + check(index, queries, groundtruth, checker); +} + +struct DynamicChecker { + DynamicChecker(const std::unordered_set& valid_ids) + : valid_ids_{valid_ids} {} + + // Check whether `id` is valid or not. + bool check(size_t id) { + seen_.insert(id); + return valid_ids_.contains(id); + } + + template bool operator()(const std::unordered_map& ids) { + for (const auto& itr : ids) { + if (!check(itr.first)) { + return false; + } + } + return true; + } + + template bool operator()(const std::unordered_set& ids) { + for (auto itr : ids) { + if (!check(itr)) { + return false; + } + } + return true; + } + + void clear() { seen_.clear(); } + + // Valid IDs + const std::unordered_set& valid_ids_; + std::unordered_set seen_; +}; + +// Helper to build a static IVF index from test data +auto build_static_ivf_index() { + namespace ivf = svs::index::ivf; + + auto data = svs::data::SimpleData::load(test_dataset::data_svs_file()); + auto distance = svs::distance::DistanceL2(); + size_t num_threads = 2; + size_t num_inner_threads = 2; + + // Build clustering + auto build_params = ivf::IVFBuildParameters(NUM_CLUSTERS, 10, false); + auto threadpool = svs::threads::SequentialThreadPool(); + auto clustering = ivf::build_clustering( + 
build_params, data, distance, threadpool, false + ); + + // Create clustered dataset + auto centroids = clustering.centroids(); + using Idx = uint32_t; + auto cluster = ivf::DenseClusteredDataset( + clustering, data, threadpool, svs::lib::Allocator() + ); + + // Build IVF index + auto threadpool_for_index = svs::threads::as_threadpool(num_threads); + + return ivf::IVFIndex( + std::move(centroids), + std::move(cluster), + distance, + std::move(threadpool_for_index), + num_inner_threads + ); +} + +// Helper to build a dynamic IVF index from test data +auto build_dynamic_ivf_index() { + namespace ivf = svs::index::ivf; + using Eltype = float; + using DataType = svs::data::SimpleData; + using Idx = uint32_t; + using Distance = svs::distance::DistanceL2; + + auto data = DataType::load(test_dataset::data_svs_file()); + auto distance = Distance(); + size_t num_threads = 2; + size_t num_inner_threads = 2; + + // Generate IDs for all data points + std::vector initial_indices(data.size()); + std::iota(initial_indices.begin(), initial_indices.end(), 0); + + // Build clustering + auto build_params = ivf::IVFBuildParameters(NUM_CLUSTERS, 10, false); + auto threadpool = svs::threads::SequentialThreadPool(); + auto clustering = + ivf::build_clustering(build_params, data, distance, threadpool, false); + + // Create dynamic clustered dataset using DenseClusteredDataset + auto centroids = clustering.centroids(); + auto dense_clusters = ivf::DenseClusteredDataset( + clustering, data, threadpool, svs::lib::Allocator() + ); + + // Build Dynamic IVF index + auto threadpool_for_index = svs::threads::as_threadpool(num_threads); + using IndexType = ivf::DynamicIVFIndex< + decltype(centroids), + decltype(dense_clusters), + Distance, + decltype(threadpool_for_index)>; + + return IndexType( + std::move(centroids), + std::move(dense_clusters), + initial_indices, + Distance(), + std::move(threadpool_for_index), + num_inner_threads + ); +} + +} // namespace + +CATCH_TEST_CASE("IVF Iterator", 
"[ivf][iterator]") { + // This tests the general behavior of the iterator for correctness. + // It is not concerned with whether the returned neighbors are accurate. + auto queries = test_dataset::queries(); + auto gt = test_dataset::groundtruth_euclidean(); + + CATCH_SECTION("Static Index") { + auto index = build_static_ivf_index(); + check(index, queries.cview(), gt.cview()); + } + + CATCH_SECTION("Static Index - Update Query") { + auto index = build_static_ivf_index(); + + // Create an iterator with the first query + auto query0 = std::span(queries.get_datum(0)); + auto iterator = index.make_batch_iterator(query0); + + // Get first batch + iterator.next(10); + CATCH_REQUIRE(iterator.size() == 10); + CATCH_REQUIRE(iterator.batch_number() == 1); + + // Store results from first query + auto first_query_results = std::vector(); + for (auto n : iterator) { + first_query_results.push_back(n.id()); + } + + // Update to second query + auto query1 = std::span(queries.get_datum(1)); + iterator.update(query1); + + // Verify iterator is reset + CATCH_REQUIRE(iterator.size() == 0); + CATCH_REQUIRE(iterator.batch_number() == 0); + + // Get first batch of second query + iterator.next(10); + CATCH_REQUIRE(iterator.size() == 10); + CATCH_REQUIRE(iterator.batch_number() == 1); + + // The results should be different (assuming different queries) + auto second_query_results = std::vector(); + for (auto n : iterator) { + second_query_results.push_back(n.id()); + } + + // Results should be different (not necessarily completely) + // Just check that update() actually reset the iterator state + CATCH_REQUIRE(!iterator.done()); + } + + CATCH_SECTION("Static Index - Done Condition") { + auto index = build_static_ivf_index(); + + auto query = std::span(queries.get_datum(0)); + auto iterator = index.make_batch_iterator(query); + + // Initially not done + CATCH_REQUIRE(!iterator.done()); + + // Keep fetching until done + size_t total_fetched = 0; + while (!iterator.done() && total_fetched < 
index.size() + 100) { + iterator.next(10); + total_fetched += iterator.size(); + } + + // Should eventually be done + CATCH_REQUIRE(iterator.done()); + } + + CATCH_SECTION("Dynamic Index") { + auto index = build_dynamic_ivf_index(); + + std::unordered_set valid_ids; + for (size_t i = 0; i < index.size(); ++i) { + valid_ids.insert(i); + } + auto checker = DynamicChecker(valid_ids); + check(index, queries.cview(), gt.cview(), checker); + } + + CATCH_SECTION("Dynamic Index - Delete and Search") { + auto index = build_dynamic_ivf_index(); + auto original = svs::data::SimpleData::load(test_dataset::data_svs_file()); + + std::unordered_set valid_ids; + for (size_t i = 0; i < index.size(); ++i) { + valid_ids.insert(i); + } + auto checker = DynamicChecker(valid_ids); + + // Delete the best candidate for each of the first few queries + auto ids_to_delete = std::vector(); + for (size_t i = 0; i < std::min(5, QUERIES_TO_CHECK); ++i) { + auto nearest_neighbor = gt.get_datum(i).front(); + auto it = + std::find(ids_to_delete.begin(), ids_to_delete.end(), nearest_neighbor); + if (it == ids_to_delete.end()) { + ids_to_delete.push_back(nearest_neighbor); + CATCH_REQUIRE(valid_ids.erase(nearest_neighbor) == 1); + } + } + + fmt::print("Deleting {} entries\n", ids_to_delete.size()); + index.delete_entries(ids_to_delete); + checker.clear(); + check(index, queries.cview(), gt.cview(), checker); + + // Verify deleted IDs are not returned + for (auto id : ids_to_delete) { + CATCH_REQUIRE(!checker.seen_.contains(id)); + } + } + + CATCH_SECTION("Iterator Contents and Span") { + auto index = build_static_ivf_index(); + + auto query = std::span(queries.get_datum(0)); + auto iterator = index.make_batch_iterator(query); + + // Get a batch + iterator.next(20); + CATCH_REQUIRE(iterator.size() == 20); + + // Test contents() returns a valid span + auto contents = iterator.contents(); + CATCH_REQUIRE(contents.size() == 20); + + // Verify contents match iteration + size_t idx = 0; + for (auto n : 
iterator) { + CATCH_REQUIRE(n.id() == contents[idx].id()); + CATCH_REQUIRE(n.distance() == contents[idx].distance()); + ++idx; + } + } + + CATCH_SECTION("Restart Search") { + auto index = build_static_ivf_index(); + + auto query = std::span(queries.get_datum(0)); + auto iterator = index.make_batch_iterator(query); + + // Get first batch + iterator.next(10); + CATCH_REQUIRE(iterator.batch_number() == 1); + + auto first_results = std::vector(); + for (auto n : iterator) { + first_results.push_back(n.id()); + } + + // Force restart + iterator.restart_next_search(); + + // Get another batch + iterator.next(10); + CATCH_REQUIRE(iterator.batch_number() == 2); + + // After restart, the new batch should not duplicate any IDs from first batch + for (auto n : iterator) { + CATCH_REQUIRE( + std::find(first_results.begin(), first_results.end(), n.id()) == + first_results.end() + ); + } + } +} From 033fe6cf3d67407f2e85888dde8fb0317c34b8d6 Mon Sep 17 00:00:00 2001 From: Ishwar Bhati Date: Mon, 15 Dec 2025 16:33:59 -0800 Subject: [PATCH 3/4] Iterator at orchestrator level --- include/svs/orchestrators/dynamic_ivf.h | 22 ++ include/svs/orchestrators/ivf.h | 45 ++++ include/svs/orchestrators/ivf_iterator.h | 179 +++++++++++++++ tests/CMakeLists.txt | 1 + tests/integration/ivf/iterator.cpp | 269 +++++++++++++++++++++++ 5 files changed, 516 insertions(+) create mode 100644 include/svs/orchestrators/ivf_iterator.h create mode 100644 tests/integration/ivf/iterator.cpp diff --git a/include/svs/orchestrators/dynamic_ivf.h b/include/svs/orchestrators/dynamic_ivf.h index d0e15ff1a..de1322b41 100644 --- a/include/svs/orchestrators/dynamic_ivf.h +++ b/include/svs/orchestrators/dynamic_ivf.h @@ -216,6 +216,28 @@ class DynamicIVF : public manager::IndexManager { return impl_->get_distance(id, query_array); } + /// + /// @brief Return a new iterator (an instance of `svs::IVFIterator`) for the query. + /// + /// @tparam QueryType The element type of the query that will be given to the iterator. 
+ /// @tparam N The dimension of the query. + /// + /// @param query The query to use for the iterator. + /// @param extra_search_buffer_capacity An optional extra search buffer capacity. + /// For IVF, the default of 0 means the buffer will be sized based on the first + /// batch_size passed to next(). + /// + /// The returned iterator will maintain an internal copy of the query. + /// + template + svs::IVFIterator batch_iterator( + std::span query, size_t extra_search_buffer_capacity = 0 + ) { + return impl_->batch_iterator( + svs::AnonymousArray<1>(query.data(), query.size()), extra_search_buffer_capacity + ); + } + ///// Assembly - Assemble from clustering and data template < manager::QueryTypeDefinition QueryTypes, diff --git a/include/svs/orchestrators/ivf.h b/include/svs/orchestrators/ivf.h index f0f86f84b..b45ab7edb 100644 --- a/include/svs/orchestrators/ivf.h +++ b/include/svs/orchestrators/ivf.h @@ -17,6 +17,7 @@ #pragma once #include "svs/index/ivf/index.h" +#include "svs/orchestrators/ivf_iterator.h" #include "svs/orchestrators/manager.h" namespace svs { @@ -30,6 +31,11 @@ class IVFInterface { ///// Distance calculation virtual double get_distance(size_t id, const AnonymousArray<1>& query) const = 0; + + ///// Iterator + virtual IVFIterator batch_iterator( + svs::AnonymousArray<1> query, size_t extra_search_buffer_capacity = 0 + ) = 0; }; template @@ -72,6 +78,23 @@ class IVFImpl : public manager::ManagerImpl { } ); } + + ///// Iterator + IVFIterator batch_iterator( + svs::AnonymousArray<1> query, size_t extra_search_buffer_capacity = 0 + ) override { + // Match the query type. 
+ return svs::lib::match( + QueryTypes{}, + query.type(), + [&](svs::lib::Type SVS_UNUSED(type)) { + return IVFIterator{ + impl(), + std::span(svs::get(query), query.size(0)), + extra_search_buffer_capacity}; + } + ); + } }; ///// @@ -105,6 +128,28 @@ class IVF : public manager::IndexManager { return impl_->get_distance(id, query_array); } + /// + /// @brief Return a new iterator (an instance of `svs::IVFIterator`) for the query. + /// + /// @tparam QueryType The element type of the query that will be given to the iterator. + /// @tparam N The dimension of the query. + /// + /// @param query The query to use for the iterator. + /// @param extra_search_buffer_capacity An optional extra search buffer capacity. + /// For IVF, the default of 0 means the buffer will be sized based on the first + /// batch_size passed to next(). + /// + /// The returned iterator will maintain an internal copy of the query. + /// + template + svs::IVFIterator batch_iterator( + std::span query, size_t extra_search_buffer_capacity = 0 + ) { + return impl_->batch_iterator( + svs::AnonymousArray<1>(query.data(), query.size()), extra_search_buffer_capacity + ); + } + ///// Assembling template < manager::QueryTypeDefinition QueryTypes, diff --git a/include/svs/orchestrators/ivf_iterator.h b/include/svs/orchestrators/ivf_iterator.h new file mode 100644 index 000000000..24c2b8e65 --- /dev/null +++ b/include/svs/orchestrators/ivf_iterator.h @@ -0,0 +1,179 @@ +/* + * Copyright 2025 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +// svs-ivf +#include "svs/index/ivf/iterator.h" + +// stl +#include + +namespace svs { + +/// @brief Type-erased wrapper for the low-level IVF iterator. +class IVFIterator { + private: + struct Interface { + virtual svs::index::ivf::IVFSearchParameters + parameters_for_current_iteration() const = 0; + virtual svs::DataType query_type() const = 0; + virtual size_t batch_number() const = 0; + virtual size_t size() const = 0; + virtual std::span> results() const = 0; + virtual void restart_next_search() = 0; + virtual void next( + size_t batch_size, + const lib::DefaultPredicate& cancel = lib::Returns(lib::Const()) + ) = 0; + virtual bool done() const = 0; + virtual void update(svs::AnonymousArray<1> newquery) = 0; + + virtual ~Interface() = default; + }; + + template struct Implementation : Interface { + // For the type-erased implementation - require the schedule to be type-erased as + // well. 
+ using type = svs::index::ivf::BatchIterator; + + Implementation( + Index& index, + std::span query, + size_t extra_search_buffer_capacity = 0 + ) + : impl_{index, query, extra_search_buffer_capacity} {} + + svs::index::ivf::IVFSearchParameters + parameters_for_current_iteration() const override { + return impl_.parameters_for_current_iteration(); + } + + svs::DataType query_type() const override { return svs::datatype_v; } + + size_t batch_number() const override { return impl_.batch_number(); } + size_t size() const override { return impl_.size(); } + + std::span> results() const override { + return impl_.contents(); + } + + void restart_next_search() override { impl_.restart_next_search(); } + void next( + size_t batch_size, + const lib::DefaultPredicate& cancel = lib::Returns(lib::Const()) + ) override { + impl_.next(batch_size, cancel); + } + bool done() const override { return impl_.done(); } + + // Query Updates + void update(svs::AnonymousArray<1> newquery) override { + if (newquery.type() == svs::datatype_v) { + impl_.update( + std::span(get(newquery), newquery.size(0)) + ); + } + } + + // Member + type impl_; + }; + + std::unique_ptr impl_; + + public: + /// @brief Construct a new batch iterator for the query over the index. + /// + /// Argument ``extra_search_buffer_capacity`` is the extra search buffer capacity to use + /// for the search. For IVF, this determines the initial buffer size before the first + /// batch. The default of 0 means the buffer will be sized based on the first batch_size + /// passed to next(). + template + IVFIterator( + Index& parent, + std::span query, + size_t extra_search_buffer_capacity = 0 + ) + : impl_{std::make_unique>( + parent, query, extra_search_buffer_capacity + )} {} + + /// @brief Return the search parameters used for the current batch. 
+ [[nodiscard]] svs::index::ivf::IVFSearchParameters + parameters_for_current_iteration() const { + return impl_->parameters_for_current_iteration(); + } + + /// @brief Return the element type of the captured query. + [[nodiscard]] svs::DataType query_type() const { return impl_->query_type(); } + + /// @brief Return the current batch number. + [[nodiscard]] size_t batch_number() const { return impl_->batch_number(); } + + /// @brief Return the number of results for the current batch. + [[nodiscard]] size_t size() const { return impl_->size(); } + + /// @brief Return a span of the results for the current batch. + [[nodiscard]] std::span> results() const { + return impl_->results(); + } + + /// @brief Prepare a new batch of results. + /// + /// After calling this method, previous results will no longer be available. + /// This method invalidates previous values return by ``results()``. + /// @param batch_size The number of results to return in the next batch. + /// In some scenarios (like when all entries are returned or if search is + /// cancelled), results size can be lower than the ``batch_size``. + /// @param cancel A predicate called during the search to determine if the search should + /// be cancelled. + /// + void next( + size_t batch_size, + const lib::DefaultPredicate& cancel = lib::Returns(lib::Const()) + ) { + impl_->next(batch_size, cancel); + } + + /// @brief Signal that the next batch search should begin entirely from scratch. + /// + /// The iterator records some internal state to accelerate future calls to ``next()``. + /// This caching of results may yield slightly different results than beginning index + /// search completely over from the original entry points. + /// + /// Calling this method signals the iterator to abandon its cached state. + /// + /// This can be helpful for measuring performance and verifying recall values. 
+ void restart_next_search() const { impl_->restart_next_search(); } + + /// @brief Returns whether iterator can find more neighbors or not for the given query. + /// + /// The iterator is considered done when all the available nodes have been yielded or + /// when the search can not find any more neighbors. The transition from not done to + /// done will be triggered by a call to ``next()``. The contents of ``batch_number()`` + /// and ``parameters_for_current_iteration()`` will then remain unchanged by subsequent + /// invocations of ``next()``. + bool done() const { return impl_->done(); } + + /// @brief Update the iterator with a new query. + /// + template void update(std::span newquery) { + impl_->update(svs::AnonymousArray<1>{newquery.data(), newquery.size()}); + } +}; + +} // namespace svs diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 2271a1c06..95555eb15 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -179,6 +179,7 @@ if (SVS_EXPERIMENTAL_ENABLE_IVF) ${TEST_DIR}/integration/ivf/index_search.cpp ${TEST_DIR}/integration/ivf/scalar_search.cpp ${TEST_DIR}/integration/ivf/dynamic_scalar.cpp + ${TEST_DIR}/integration/ivf/iterator.cpp ) endif() diff --git a/tests/integration/ivf/iterator.cpp b/tests/integration/ivf/iterator.cpp new file mode 100644 index 000000000..b73d52323 --- /dev/null +++ b/tests/integration/ivf/iterator.cpp @@ -0,0 +1,269 @@ +/* + * Copyright 2025 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// svs +#include "svs/orchestrators/dynamic_ivf.h" +#include "svs/orchestrators/ivf.h" + +// catch2 +#include "catch2/catch_test_macros.hpp" + +// tests +#include "tests/utils/test_dataset.h" + +// stl +#include +#include + +namespace { + +const size_t NUM_CLUSTERS = 10; + +// Helper: build a static IVF index from the test dataset via the svs::IVF orchestrator +svs::IVF make_static_ivf_index() { + auto data = svs::data::SimpleData::load(test_dataset::data_svs_file()); + auto distance = svs::DistanceL2{}; + size_t num_threads = 2; + size_t intra_query_threads = 2; + + // Build clustering + auto build_params = svs::index::ivf::IVFBuildParameters(NUM_CLUSTERS, 10, false); + auto clustering = + svs::IVF::build_clustering(build_params, data, distance, num_threads); + + // Assemble the index + return svs::IVF::assemble_from_clustering( + std::move(clustering), data, distance, num_threads, intra_query_threads + ); +} + +// Helper: build a dynamic IVF index from the test dataset via the svs::DynamicIVF orchestrator +svs::DynamicIVF make_dynamic_ivf_index() { + auto data = svs::data::SimpleData::load(test_dataset::data_svs_file()); + auto distance = svs::DistanceL2{}; + size_t num_threads = 2; + size_t intra_query_threads = 2; + + // Generate IDs 0 .. data.size() - 1, one per data point + std::vector initial_ids(data.size()); + std::iota(initial_ids.begin(), initial_ids.end(), 0); + + // Build clustering + auto build_params = svs::index::ivf::IVFBuildParameters(NUM_CLUSTERS, 10, false); + auto clustering = + svs::IVF::build_clustering(build_params, data, distance, num_threads); + + // Assemble the dynamic index + return svs::DynamicIVF::assemble_from_clustering( + std::move(clustering), + std::move(data), + initial_ids, + distance, + num_threads, + intra_query_threads + ); +} + +void test_static_iterator() { + auto index = make_static_ivf_index(); + auto queries = svs::data::SimpleData::load(test_dataset::query_file()); + + // Set batch size 
+ size_t batchsize = 10; + + // Create a batch iterator over the index for the first query + auto query = queries.get_datum(0); + auto query_span = std::span(query.data(), query.size()); + auto itr = index.batch_iterator(query_span); + + // Ensure the iterator is initialized correctly. No search happens at this point. + CATCH_REQUIRE(itr.size() == 0); + CATCH_REQUIRE(itr.batch_number() == 0); + CATCH_REQUIRE(!itr.done()); + + // Get first batch + itr.next(batchsize); + CATCH_REQUIRE(itr.size() == batchsize); + CATCH_REQUIRE(itr.batch_number() == 1); + CATCH_REQUIRE(!itr.done()); + + // Get results and verify no duplicates within the first batch + std::unordered_set seen_ids; + auto results = itr.results(); + CATCH_REQUIRE(results.size() == batchsize); + for (const auto& neighbor : results) { + CATCH_REQUIRE(seen_ids.find(neighbor.id()) == seen_ids.end()); + seen_ids.insert(neighbor.id()); + } + + // Get second batch + itr.next(batchsize); + CATCH_REQUIRE(itr.size() == batchsize); + CATCH_REQUIRE(itr.batch_number() == 2); + + // Verify no duplicates across batches + results = itr.results(); + for (const auto& neighbor : results) { + CATCH_REQUIRE(seen_ids.find(neighbor.id()) == seen_ids.end()); + seen_ids.insert(neighbor.id()); + } + + // Continue until done; max_iterations bounds the loop so a stuck iterator cannot hang + size_t max_iterations = index.size() / batchsize + 10; + size_t iterations = 2; + while (!itr.done() && iterations < max_iterations) { + itr.next(batchsize); + for (const auto& neighbor : itr.results()) { + CATCH_REQUIRE(seen_ids.find(neighbor.id()) == seen_ids.end()); + seen_ids.insert(neighbor.id()); + } + ++iterations; + } + + // Should eventually be done + CATCH_REQUIRE(itr.done()); + + // Test update with new query + auto query2 = queries.get_datum(1); + auto query2_span = std::span(query2.data(), query2.size()); + itr.update(query2_span); + + // Iterator should be reset + CATCH_REQUIRE(itr.size() == 0); + CATCH_REQUIRE(itr.batch_number() == 0); + CATCH_REQUIRE(!itr.done()); + + // Should be able to get results for the new query + 
itr.next(batchsize); + CATCH_REQUIRE(itr.size() == batchsize); + CATCH_REQUIRE(itr.batch_number() == 1); +} + +void test_dynamic_iterator() { + auto index = make_dynamic_ivf_index(); + auto queries = svs::data::SimpleData::load(test_dataset::query_file()); + + // Set batch size + size_t batchsize = 10; + + // Create a batch iterator over the index for the first query + auto query = queries.get_datum(0); + auto query_span = std::span(query.data(), query.size()); + auto itr = index.batch_iterator(query_span); + + // Ensure the iterator is initialized correctly + CATCH_REQUIRE(itr.size() == 0); + CATCH_REQUIRE(itr.batch_number() == 0); + CATCH_REQUIRE(!itr.done()); + + // Get first batch + itr.next(batchsize); + CATCH_REQUIRE(itr.size() == batchsize); + CATCH_REQUIRE(itr.batch_number() == 1); + CATCH_REQUIRE(!itr.done()); + + // Verify all returned IDs are valid (exist in the index) + auto results = itr.results(); + for (const auto& neighbor : results) { + CATCH_REQUIRE(index.has_id(neighbor.id())); + } + + // Record the first-batch IDs, then get the second batch and verify no duplicates + std::unordered_set seen_ids; + for (const auto& neighbor : results) { + seen_ids.insert(neighbor.id()); + } + + itr.next(batchsize); + CATCH_REQUIRE(itr.size() == batchsize); + CATCH_REQUIRE(itr.batch_number() == 2); + + results = itr.results(); + for (const auto& neighbor : results) { + CATCH_REQUIRE(seen_ids.find(neighbor.id()) == seen_ids.end()); + CATCH_REQUIRE(index.has_id(neighbor.id())); + seen_ids.insert(neighbor.id()); + } +} + +void test_iterator_restart() { + auto index = make_static_ivf_index(); + auto queries = svs::data::SimpleData::load(test_dataset::query_file()); + + size_t batchsize = 10; + auto query = queries.get_datum(0); + auto query_span = std::span(query.data(), query.size()); + auto itr = index.batch_iterator(query_span); + + // Get first batch + itr.next(batchsize); + CATCH_REQUIRE(itr.batch_number() == 1); + + auto first_results = std::vector(); + for (const auto& neighbor : itr.results()) { 
+ first_results.push_back(neighbor.id()); + } + + // Signal restart so the next batch search does not reuse cached state + itr.restart_next_search(); + + // Get next batch + itr.next(batchsize); + CATCH_REQUIRE(itr.batch_number() == 2); + + // After restart, the new batch should not duplicate any IDs from the first batch + for (const auto& neighbor : itr.results()) { + CATCH_REQUIRE( + std::find(first_results.begin(), first_results.end(), neighbor.id()) == + first_results.end() + ); + } +} + +void test_iterator_extra_buffer_capacity() { + auto index = make_static_ivf_index(); + auto queries = svs::data::SimpleData::load(test_dataset::query_file()); + + auto query = queries.get_datum(0); + auto query_span = std::span(query.data(), query.size()); + + // Create iterator with custom extra buffer capacity + size_t extra_buffer = 50; + auto itr = index.batch_iterator(query_span, extra_buffer); + + CATCH_REQUIRE(itr.size() == 0); + CATCH_REQUIRE(itr.batch_number() == 0); + + // Get first batch + itr.next(20); + CATCH_REQUIRE(itr.size() == 20); + CATCH_REQUIRE(itr.batch_number() == 1); + CATCH_REQUIRE(!itr.done()); +} + +} // namespace + +CATCH_TEST_CASE("IVF Iterator Integration", "[integration][ivf][iterator]") { + CATCH_SECTION("Static IVF Iterator") { test_static_iterator(); } + + CATCH_SECTION("Dynamic IVF Iterator") { test_dynamic_iterator(); } + + CATCH_SECTION("Iterator Restart") { test_iterator_restart(); } + + CATCH_SECTION("Iterator Extra Buffer Capacity") { + test_iterator_extra_buffer_capacity(); + } +} From b9ea5cb337669b1e13b490d720aa537ab9590d39 Mon Sep 17 00:00:00 2001 From: Ishwar Bhati Date: Tue, 16 Dec 2025 12:59:38 -0800 Subject: [PATCH 4/4] update size to point to number of vectors not clusters --- bindings/python/tests/test_ivf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bindings/python/tests/test_ivf.py b/bindings/python/tests/test_ivf.py index 08968e607..f8c416f22 100644 --- a/bindings/python/tests/test_ivf.py +++ b/bindings/python/tests/test_ivf.py @@ -36,7 +36,7 @@ 
test_groundtruth_cosine, \ test_ivf_reference, \ test_ivf_clustering, \ - test_number_of_clusters, \ + test_number_of_vectors, \ test_dimensions, \ timed, \ get_test_set, \ @@ -167,7 +167,7 @@ def _test_basic_inner( test_get_distance(ivf, svs.DistanceType.L2, data) # Data interface - self.assertEqual(ivf.size, test_number_of_clusters) + self.assertEqual(ivf.size, test_number_of_vectors) # The dimensionality exposed by the index should always match the original # dataset dimensions.