From 688f06a3395f2110c7cbe3d0d5fe86e698572444 Mon Sep 17 00:00:00 2001 From: Michael Bautin Date: Fri, 26 Sep 2025 21:14:45 +0000 Subject: [PATCH 01/12] Chunked memory allocation --- CMakeLists.txt | 1 + hnswlib/hnswalg.h | 103 ++++++++++++++++----------------- hnswlib/hnswlib.h | 143 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 193 insertions(+), 54 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a34a67f2..bb94b431 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -200,6 +200,7 @@ if(HNSWLIB_EXAMPLES) if(ENABLE_ASAN OR ENABLE_UBSAN) add_cxx_flags(-DHNSWLIB_USE_PREFETCH=0) endif() + add_cxx_flags(-DHNSWLIB_USE_PREFETCH=0) # TODO(mbautin): remove add_cxx_flags(-Wall -Wextra -Wpedantic -Werror) diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index a04b2ed4..dbdccd94 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -35,6 +35,7 @@ class HierarchicalNSW : public AlgorithmInterface { size_t maxM0_{0}; size_t ef_construction_{0}; size_t ef_{ 0 }; + const size_t k_elements_per_chunk{10*1024}; double mult_{0.0}, revSize_{0.0}; int maxlevel_{0}; @@ -52,8 +53,8 @@ class HierarchicalNSW : public AlgorithmInterface { size_t size_links_level0_{0}; size_t offsetData_{0}, offsetLevel0_{0}, label_offset_{ 0 }; - char *data_level0_memory_{nullptr}; - char **linkLists_{nullptr}; + ChunkedArray data_level0_memory_; + ChunkedArray linkLists_; std::vector element_levels_; // keeps level of each element size_t data_size_{0}; @@ -128,10 +129,8 @@ class HierarchicalNSW : public AlgorithmInterface { label_offset_ = size_links_level0_ + data_size_; offsetLevel0_ = 0; - data_level0_memory_ = (char *) malloc(max_elements_ * size_data_per_element_); - if (data_level0_memory_ == nullptr) { - HNSWLIB_THROW_RUNTIME_ERROR("Not enough memory to allocate for level 0"); - } + data_level0_memory_ = ChunkedArray( + size_data_per_element_, k_elements_per_chunk, max_elements); cur_element_count = 0; @@ -141,11 +140,8 @@ class HierarchicalNSW : public AlgorithmInterface { enterpoint_node_ = -1; maxlevel_ = -1; - linkLists_ = (char **) malloc(sizeof(void *) * max_elements_); - if (linkLists_ == nullptr) { - HNSWLIB_THROW_RUNTIME_ERROR( - "Not enough memory: HierarchicalNSW failed to allocate linklists"); - } + linkLists_ = ChunkedArray( + sizeof(void *), k_elements_per_chunk, max_elements); size_links_per_element_ = maxM_ * sizeof(tableint) + sizeof(linklistsizeint); mult_ = 1 / log(1.0 * M_); @@ -157,17 +153,22 @@ class HierarchicalNSW : public AlgorithmInterface { clear(); } + char*& getLinkListPtrRef(tableint internal_id) { + return *reinterpret_cast(linkLists_[internal_id]); + } + + char* getLinkListPtr(tableint internal_id) const { + return *reinterpret_cast(linkLists_[internal_id]); + } + void clear() { - free(data_level0_memory_); - data_level0_memory_ = nullptr; + data_level0_memory_.clear(); for (tableint i = 0; i < cur_element_count; i++) { - if (element_levels_[i] > 0) - free(linkLists_[i]); - } - if (linkLists_) { - free(linkLists_); + if (element_levels_[i] > 0) { + free(getLinkListPtr(i)); + } } - linkLists_ = nullptr; + linkLists_.clear(); cur_element_count = 0; visited_list_pool_.reset(nullptr); } @@ -195,7 +196,7 @@ class HierarchicalNSW : public AlgorithmInterface { inline labeltype getExternalLabel(tableint internal_id) const { labeltype return_label; - memcpy(&return_label, (data_level0_memory_ + internal_id * size_data_per_element_ + label_offset_), sizeof(labeltype)); + memcpy(&return_label, data_level0_memory_[internal_id] + label_offset_, sizeof(labeltype)); 
return return_label; } @@ -212,17 +213,17 @@ class HierarchicalNSW : public AlgorithmInterface { inline void setExternalLabel(tableint internal_id, labeltype label) const { - memcpy((data_level0_memory_ + internal_id * size_data_per_element_ + label_offset_), &label, sizeof(labeltype)); + memcpy(data_level0_memory_[internal_id] + label_offset_, &label, sizeof(labeltype)); } inline labeltype *getExternalLabeLp(tableint internal_id) const { - return (labeltype *) (data_level0_memory_ + internal_id * size_data_per_element_ + label_offset_); + return (labeltype *) (data_level0_memory_[internal_id] + label_offset_); } inline char *getDataByInternalId(tableint internal_id) const { - return (data_level0_memory_ + internal_id * size_data_per_element_ + offsetData_); + return (data_level0_memory_[internal_id] + offsetData_); } @@ -399,7 +400,7 @@ class HierarchicalNSW : public AlgorithmInterface { #if HNSWLIB_USE_PREFETCH _mm_prefetch((char *) (visited_array + *(data + 1)), _MM_HINT_T0); _mm_prefetch((char *) (visited_array + *(data + 1) + 64), _MM_HINT_T0); - _mm_prefetch(data_level0_memory_ + (*(data + 1)) * size_data_per_element_ + offsetData_, _MM_HINT_T0); + _mm_prefetch(data_level0_memory_[*(data + 1)] + offsetData_, _MM_HINT_T0); _mm_prefetch((char *) (data + 2), _MM_HINT_T0); #endif #endif @@ -410,7 +411,7 @@ class HierarchicalNSW : public AlgorithmInterface { #ifdef USE_SSE #if HNSWLIB_USE_PREFETCH _mm_prefetch((char *) (visited_array + *(data + j + 1)), _MM_HINT_T0); - _mm_prefetch(data_level0_memory_ + (*(data + j + 1)) * size_data_per_element_ + offsetData_, + _mm_prefetch(data_level0_memory_[*(data + j + 1)] + offsetData_, _MM_HINT_T0); //////////// #endif #endif @@ -431,7 +432,7 @@ class HierarchicalNSW : public AlgorithmInterface { candidate_set.emplace(-dist, candidate_id); #ifdef USE_SSE #if HNSWLIB_USE_PREFETCH - _mm_prefetch(data_level0_memory_ + candidate_set.top().second * size_data_per_element_ + + _mm_prefetch(data_level0_memory_[candidate_set.top().second] + offsetLevel0_, /////////// _MM_HINT_T0); //////////////////////// #endif @@ -518,17 +519,18 @@ class HierarchicalNSW : public AlgorithmInterface { linklistsizeint *get_linklist0(tableint internal_id) const { - return (linklistsizeint *) (data_level0_memory_ + internal_id * size_data_per_element_ + offsetLevel0_); + return (linklistsizeint *) (data_level0_memory_[internal_id] + offsetLevel0_); } linklistsizeint *get_linklist0(tableint internal_id, char *data_level0_memory_) const { - return (linklistsizeint *) (data_level0_memory_ + internal_id * size_data_per_element_ + offsetLevel0_); + return (linklistsizeint *) (data_level0_memory_[internal_id] + offsetLevel0_); } linklistsizeint *get_linklist(tableint internal_id, int level) const { - return (linklistsizeint *) (linkLists_[internal_id] + (level - 1) * size_links_per_element_); + assert(level > 0); + return (linklistsizeint *) (getLinkListPtr(internal_id) + (level - 1) * size_links_per_element_); } @@ -681,16 +683,10 @@ class HierarchicalNSW : public AlgorithmInterface { std::vector(new_max_elements).swap(link_list_locks_); // Reallocate base layer - char * data_level0_memory_new = (char *) realloc(data_level0_memory_, new_max_elements * size_data_per_element_); - if (data_level0_memory_new == nullptr) - return Status("Not enough memory: resizeIndex failed to allocate base layer"); - data_level0_memory_ = data_level0_memory_new; + data_level0_memory_.resize(new_max_elements); // Reallocate all other layers - char ** linkLists_new = (char **) realloc(linkLists_, 
sizeof(void *) * new_max_elements); - if (linkLists_new == nullptr) - return Status("Not enough memory: resizeIndex failed to allocate other layers"); - linkLists_ = linkLists_new; + linkLists_.resize(new_max_elements); max_elements_ = new_max_elements; return OkStatus(); @@ -742,13 +738,13 @@ class HierarchicalNSW : public AlgorithmInterface { writeBinaryPOD(output, mult_); writeBinaryPOD(output, ef_construction_); - output.write(data_level0_memory_, cur_element_count * size_data_per_element_); + data_level0_memory_.writeToStream(output, cur_element_count); for (size_t i = 0; i < cur_element_count; i++) { unsigned int linkListSize = element_levels_[i] > 0 ? size_links_per_element_ * element_levels_[i] : 0; writeBinaryPOD(output, linkListSize); if (linkListSize) - output.write(linkLists_[i], linkListSize); + output.write(getLinkListPtrRef(i), linkListSize); } output.close(); return OkStatus(); @@ -823,10 +819,11 @@ class HierarchicalNSW : public AlgorithmInterface { input.seekg(pos, input.beg); - data_level0_memory_ = (char *) malloc(max_elements * size_data_per_element_); - if (data_level0_memory_ == nullptr) - return Status("Not enough memory: loadIndex failed to allocate level0"); - input.read(data_level0_memory_, cur_element_count * size_data_per_element_); + data_level0_memory_ = ChunkedArray( + size_data_per_element_, + k_elements_per_chunk, + max_elements); + data_level0_memory_.readFromStream(input, cur_element_count); size_links_per_element_ = maxM_ * sizeof(tableint) + sizeof(linklistsizeint); @@ -836,9 +833,7 @@ class HierarchicalNSW : public AlgorithmInterface { visited_list_pool_.reset(new VisitedListPool(1, max_elements)); - linkLists_ = (char **) malloc(sizeof(void *) * max_elements); - if (linkLists_ == nullptr) - return Status("Not enough memory: loadIndex failed to allocate linklists"); + linkLists_.resize(max_elements); element_levels_ = std::vector(max_elements); revSize_ = 1.0 / mult_; ef_ = 10; @@ -848,13 +843,13 @@ class HierarchicalNSW : public AlgorithmInterface { readBinaryPOD(input, linkListSize); if (linkListSize == 0) { element_levels_[i] = 0; - linkLists_[i] = nullptr; + getLinkListPtrRef(i) = nullptr; } else { element_levels_[i] = linkListSize / size_links_per_element_; - linkLists_[i] = (char *) malloc(linkListSize); - if (linkLists_[i] == nullptr) + getLinkListPtrRef(i) = (char *) malloc(linkListSize); + if (getLinkListPtrRef(i) == nullptr) return Status("Not enough memory: loadIndex failed to allocate linklist"); - input.read(linkLists_[i], linkListSize); + input.read(getLinkListPtrRef(i), linkListSize); } } @@ -1262,18 +1257,18 @@ class HierarchicalNSW : public AlgorithmInterface { tableint currObj = enterpoint_node_; tableint enterpoint_copy = enterpoint_node_; - memset(data_level0_memory_ + cur_c * size_data_per_element_ + offsetLevel0_, 0, size_data_per_element_); + memset(data_level0_memory_[cur_c] + offsetLevel0_, 0, size_data_per_element_); // Initialisation of the data and label memcpy(getExternalLabeLp(cur_c), &label, sizeof(labeltype)); memcpy(getDataByInternalId(cur_c), data_point, data_size_); if (curlevel) { - linkLists_[cur_c] = (char *) malloc(size_links_per_element_ * curlevel + 1); - if (linkLists_[cur_c] == nullptr) { + getLinkListPtrRef(cur_c) = (char *) malloc(size_links_per_element_ * curlevel + 1); + if (getLinkListPtrRef(cur_c) == nullptr) { return Status("Not enough memory: addPoint failed to allocate linklist"); } - memset(linkLists_[cur_c], 0, size_links_per_element_ * curlevel + 1); + memset(getLinkListPtrRef(cur_c), 0, 
size_links_per_element_ * curlevel + 1); } if ((signed)currObj != -1) { diff --git a/hnswlib/hnswlib.h b/hnswlib/hnswlib.h index 570e876b..c9ca2d30 100644 --- a/hnswlib/hnswlib.h +++ b/hnswlib/hnswlib.h @@ -26,6 +26,10 @@ #endif #endif +#include + +#include + #if defined(USE_AVX) || defined(USE_SSE) #ifdef _MSC_VER #include @@ -356,6 +360,145 @@ class AlgorithmInterface { } }; +namespace internal { + +struct FreeDeleter { + void operator()(void* ptr) const { + std::free(ptr); + } +}; + +using MallocUniqueCharArrayPtr = std::unique_ptr; + +// Allocates the given number of bytes as a special kind of a unique pointer. +// Does not initialize the memory. +MallocUniqueCharArrayPtr makeUniqueCharArray(size_t n_bytes) { + char* raw_ptr = static_cast(malloc(n_bytes)); + return MallocUniqueCharArrayPtr(raw_ptr); +} + +} // namespace internal + +class ChunkedArray { + public: + ChunkedArray() + : element_byte_size_(0), + elements_per_chunk_(0), + element_count_(0) { + } + + ChunkedArray(size_t element_byte_size, + size_t elements_per_chunk, + size_t element_count) : + element_byte_size_(element_byte_size), + elements_per_chunk_(elements_per_chunk), + element_count_(0) { + resize(element_count); + } + + ChunkedArray(const ChunkedArray& other) = delete; + ChunkedArray& operator=(const ChunkedArray& other) = delete; + + ChunkedArray(ChunkedArray&& other) noexcept { + swap(other); + } + + ChunkedArray& operator=(ChunkedArray&& other) noexcept { + if (this != &other) { + swap(other); + } + return *this; + } + + void swap(ChunkedArray& other) noexcept { + std::swap(element_byte_size_, other.element_byte_size_); + std::swap(elements_per_chunk_, other.elements_per_chunk_); + std::swap(element_count_, other.element_count_); + std::swap(chunks_, other.chunks_); + } + + ~ChunkedArray() { + } + + size_t getCapacity() const { + return element_count_; + } + + size_t getSizePerElement() const { + return element_byte_size_; + } + + size_t getSizePerChunk() const { + return elements_per_chunk_ * element_byte_size_; + } + + char* operator[](size_t i) const { + assert(i < getCapacity()); + if (i >= getCapacity()) return nullptr; + size_t chunk_index = i / elements_per_chunk_; + size_t index_in_chunk = i % elements_per_chunk_; + return chunks_[chunk_index].get() + element_byte_size_ * index_in_chunk; + } + + void clear() { + chunks_.clear(); + element_count_ = 0; + } + + void resize(size_t new_element_count) { + size_t chunk_count = getChunkCount(element_count_); + size_t new_chunk_count = getChunkCount(new_element_count); + + chunks_.resize(new_chunk_count); + for (size_t i = chunk_count; i < new_chunk_count; i++) { + chunks_[i] = internal::makeUniqueCharArray(getSizePerChunk()); + } + + element_count_ = new_element_count; + } + + void writeToStream(std::ostream& output, size_t num_elements_to_write) { + size_t num_chunks_to_write = getChunkCount(num_elements_to_write); + size_t last_chunk_bytes = + element_byte_size_ * (num_elements_to_write % elements_per_chunk_); + for (size_t i = 0; i < num_chunks_to_write; ++i) { + output.write( + chunks_[i].get(), + i + 1 == num_chunks_to_write ? 
last_chunk_bytes : getSizePerChunk()); + } + } + + void readFromStream(std::istream& input, size_t num_elements_to_read) { + assert(num_elements_to_read <= element_count_); + size_t num_chunks_to_read = getChunkCount(num_elements_to_read); + size_t last_chunk_bytes = + element_byte_size_ * (num_elements_to_read % elements_per_chunk_); + for (size_t i = 0; i < num_chunks_to_read; ++i) { + input.read( + chunks_[i].get(), + i + 1 == num_chunks_to_read ? last_chunk_bytes : getSizePerChunk()); + } + } + + std::deque::const_iterator begin_chunk() const { + return chunks_.begin(); + } + + std::deque::const_iterator end_chunk() const { + return chunks_.end(); + } + + private: + size_t getChunkCount(size_t element_count) const { + return (element_count + elements_per_chunk_ - 1) / elements_per_chunk_; + } + + size_t element_byte_size_; + size_t elements_per_chunk_; + size_t element_count_; + std::deque chunks_; +}; + } // namespace hnswlib #include "space_l2.h" From 5de0c432e74be7ef881ea65adacdf2142ae076a4 Mon Sep 17 00:00:00 2001 From: Michael Bautin Date: Sat, 27 Sep 2025 00:47:42 +0000 Subject: [PATCH 02/12] Turn prefetching back on --- CMakeLists.txt | 2 -- hnswlib/hnswalg.h | 87 +++++++++++++++++------------------------------ hnswlib/hnswlib.h | 42 +++++++++++++++-------- 3 files changed, 58 insertions(+), 73 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bb94b431..e3b8254f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -200,8 +200,6 @@ if(HNSWLIB_EXAMPLES) if(ENABLE_ASAN OR ENABLE_UBSAN) add_cxx_flags(-DHNSWLIB_USE_PREFETCH=0) endif() - add_cxx_flags(-DHNSWLIB_USE_PREFETCH=0) # TODO(mbautin): remove - add_cxx_flags(-Wall -Wextra -Wpedantic -Werror) # Unused functions in header files might still be used by other code diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index dbdccd94..2ea1fe1b 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -129,8 +129,10 @@ class HierarchicalNSW : public AlgorithmInterface { label_offset_ = size_links_level0_ + data_size_; offsetLevel0_ = 0; + // Allocate 64 more bytes for each chunk so we can safely prefetch a + // cache line beyond the chunk. 
data_level0_memory_ = ChunkedArray( - size_data_per_element_, k_elements_per_chunk, max_elements); + size_data_per_element_, k_elements_per_chunk, max_elements, 64); cur_element_count = 0; @@ -141,7 +143,7 @@ class HierarchicalNSW : public AlgorithmInterface { maxlevel_ = -1; linkLists_ = ChunkedArray( - sizeof(void *), k_elements_per_chunk, max_elements); + sizeof(void *), k_elements_per_chunk, max_elements, 0); size_links_per_element_ = maxM_ * sizeof(tableint) + sizeof(linklistsizeint); mult_ = 1 / log(1.0 * M_); @@ -226,7 +228,6 @@ class HierarchicalNSW : public AlgorithmInterface { return (data_level0_memory_[internal_id] + offsetData_); } - int getRandomLevel(double reverse_size) { std::uniform_real_distribution distribution(0.0, 1.0); double r = -log(distribution(level_generator_)) * reverse_size; @@ -286,24 +287,18 @@ class HierarchicalNSW : public AlgorithmInterface { } size_t size = getListCount((linklistsizeint*)data); tableint *datal = (tableint *) (data + 1); -#ifdef USE_SSE -#if HNSWLIB_USE_PREFETCH - _mm_prefetch((char *) (visited_array + *(data + 1)), _MM_HINT_T0); - _mm_prefetch((char *) (visited_array + *(data + 1) + 64), _MM_HINT_T0); - _mm_prefetch(getDataByInternalId(*datal), _MM_HINT_T0); - _mm_prefetch(getDataByInternalId(*(datal + 1)), _MM_HINT_T0); -#endif -#endif + HNSWLIB_MM_PREFETCH((char *) (visited_array + *(data + 1)), _MM_HINT_T0); + HNSWLIB_MM_PREFETCH((char *) (visited_array + *(data + 1) + 64), _MM_HINT_T0); + HNSWLIB_MM_PREFETCH(getDataByInternalId(*datal), _MM_HINT_T0); + HNSWLIB_MM_PREFETCH(getDataByInternalId(*(datal + 1)), _MM_HINT_T0); for (size_t j = 0; j < size; j++) { tableint candidate_id = *(datal + j); // if (candidate_id == 0) continue; -#ifdef USE_SSE -#if HNSWLIB_USE_PREFETCH - _mm_prefetch((char *) (visited_array + *(datal + j + 1)), _MM_HINT_T0); - _mm_prefetch(getDataByInternalId(*(datal + j + 1)), _MM_HINT_T0); -#endif -#endif + if (j + 1 < size) { + HNSWLIB_MM_PREFETCH((char *) (visited_array + *(datal + j + 1)), _MM_HINT_T0); + HNSWLIB_MM_PREFETCH(getDataByInternalId(*(datal + j + 1)), _MM_HINT_T0); + } if (visited_array[candidate_id] == visited_array_tag) continue; visited_array[candidate_id] = visited_array_tag; char *currObj1 = (getDataByInternalId(candidate_id)); @@ -311,11 +306,7 @@ class HierarchicalNSW : public AlgorithmInterface { dist_t dist1 = fstdistfunc_(data_point, currObj1, dist_func_param_); if (top_candidates.size() < ef_construction_ || lowerBound > dist1) { candidateSet.emplace(-dist1, candidate_id); -#ifdef USE_SSE -#if HNSWLIB_USE_PREFETCH - _mm_prefetch(getDataByInternalId(candidateSet.top().second), _MM_HINT_T0); -#endif -#endif + HNSWLIB_MM_PREFETCH(getDataByInternalId(candidateSet.top().second), _MM_HINT_T0); if (!isMarkedDeleted(candidate_id)) top_candidates.emplace(dist1, candidate_id); @@ -396,25 +387,18 @@ class HierarchicalNSW : public AlgorithmInterface { metric_distance_computations+=size; } -#ifdef USE_SSE -#if HNSWLIB_USE_PREFETCH - _mm_prefetch((char *) (visited_array + *(data + 1)), _MM_HINT_T0); - _mm_prefetch((char *) (visited_array + *(data + 1) + 64), _MM_HINT_T0); - _mm_prefetch(data_level0_memory_[*(data + 1)] + offsetData_, _MM_HINT_T0); - _mm_prefetch((char *) (data + 2), _MM_HINT_T0); -#endif -#endif + HNSWLIB_MM_PREFETCH((char *) (visited_array + *(data + 1)), _MM_HINT_T0); + HNSWLIB_MM_PREFETCH((char *) (visited_array + *(data + 1) + 64), _MM_HINT_T0); + HNSWLIB_MM_PREFETCH(data_level0_memory_[*(data + 1)] + offsetData_, _MM_HINT_T0); + HNSWLIB_MM_PREFETCH((char *) (data + 2), 
_MM_HINT_T0); for (size_t j = 1; j <= size; j++) { int candidate_id = *(data + j); -// if (candidate_id == 0) continue; -#ifdef USE_SSE -#if HNSWLIB_USE_PREFETCH - _mm_prefetch((char *) (visited_array + *(data + j + 1)), _MM_HINT_T0); - _mm_prefetch(data_level0_memory_[*(data + j + 1)] + offsetData_, - _MM_HINT_T0); //////////// -#endif -#endif + if (j < size) { + HNSWLIB_MM_PREFETCH((char *) (visited_array + *(data + j + 1)), _MM_HINT_T0); + HNSWLIB_MM_PREFETCH(data_level0_memory_[*(data + j + 1)] + offsetData_, + _MM_HINT_T0); + } if (!(visited_array[candidate_id] == visited_array_tag)) { visited_array[candidate_id] = visited_array_tag; @@ -430,13 +414,9 @@ class HierarchicalNSW : public AlgorithmInterface { if (flag_consider_candidate) { candidate_set.emplace(-dist, candidate_id); -#ifdef USE_SSE -#if HNSWLIB_USE_PREFETCH - _mm_prefetch(data_level0_memory_[candidate_set.top().second] + + HNSWLIB_MM_PREFETCH(data_level0_memory_[candidate_set.top().second] + offsetLevel0_, /////////// _MM_HINT_T0); //////////////////////// -#endif -#endif if (bare_bone_search || (!isMarkedDeleted(candidate_id) && ((!isIdAllowed) || (*isIdAllowed)(getExternalLabel(candidate_id))))) { @@ -822,7 +802,8 @@ class HierarchicalNSW : public AlgorithmInterface { data_level0_memory_ = ChunkedArray( size_data_per_element_, k_elements_per_chunk, - max_elements); + max_elements, + 64); data_level0_memory_.readFromStream(input, cur_element_count); size_links_per_element_ = maxM_ * sizeof(tableint) + sizeof(linklistsizeint); @@ -833,7 +814,9 @@ class HierarchicalNSW : public AlgorithmInterface { visited_list_pool_.reset(new VisitedListPool(1, max_elements)); - linkLists_.resize(max_elements); + linkLists_ = ChunkedArray( + sizeof(void *), k_elements_per_chunk, max_elements, 0); + element_levels_ = std::vector(max_elements); revSize_ = 1.0 / mult_; ef_ = 10; @@ -1126,17 +1109,9 @@ class HierarchicalNSW : public AlgorithmInterface { data = get_linklist_at_level(currObj, level); int size = getListCount(data); tableint *datal = (tableint *) (data + 1); -#ifdef USE_SSE -#if HNSWLIB_USE_PREFETCH - _mm_prefetch(getDataByInternalId(*datal), _MM_HINT_T0); -#endif -#endif + HNSWLIB_MM_PREFETCH(getDataByInternalId(*datal), _MM_HINT_T0); for (int i = 0; i < size; i++) { -#ifdef USE_SSE -#if HNSWLIB_USE_PREFETCH - _mm_prefetch(getDataByInternalId(*(datal + i + 1)), _MM_HINT_T0); -#endif -#endif + HNSWLIB_MM_PREFETCH(getDataByInternalId(*(datal + i + 1)), _MM_HINT_T0); tableint cand = datal[i]; dist_t d = fstdistfunc_(dataPoint, getDataByInternalId(cand), dist_func_param_); if (d < curdist) { @@ -1523,7 +1498,7 @@ class HierarchicalNSW : public AlgorithmInterface { if (isMarkedDeleted(internalId)) { unsigned char *ll_cur = ((unsigned char *)get_linklist0(internalId)) + 2; *ll_cur &= ~DELETE_MARK; - num_deleted_ -= 1; + num_deleted_ -= 1; if (allow_replace_deleted_) { std::unique_lock lock_deleted_elements(deleted_elements_lock); deleted_elements.erase(internalId); diff --git a/hnswlib/hnswlib.h b/hnswlib/hnswlib.h index c9ca2d30..8c0721e6 100644 --- a/hnswlib/hnswlib.h +++ b/hnswlib/hnswlib.h @@ -384,15 +384,18 @@ class ChunkedArray { ChunkedArray() : element_byte_size_(0), elements_per_chunk_(0), - element_count_(0) { + element_count_(0), + chunk_padding_bytes_(0) { } ChunkedArray(size_t element_byte_size, size_t elements_per_chunk, - size_t element_count) : + size_t element_count, + size_t chunk_padding_bytes) : element_byte_size_(element_byte_size), elements_per_chunk_(elements_per_chunk), - element_count_(0) { + 
element_count_(0), + chunk_padding_bytes_(chunk_padding_bytes) { resize(element_count); } @@ -415,6 +418,7 @@ class ChunkedArray { std::swap(elements_per_chunk_, other.elements_per_chunk_); std::swap(element_count_, other.element_count_); std::swap(chunks_, other.chunks_); + std::swap(chunk_padding_bytes_, other.chunk_padding_bytes_); } ~ChunkedArray() { @@ -435,9 +439,7 @@ class ChunkedArray { char* operator[](size_t i) const { assert(i < getCapacity()); if (i >= getCapacity()) return nullptr; - size_t chunk_index = i / elements_per_chunk_; - size_t index_in_chunk = i % elements_per_chunk_; - return chunks_[chunk_index].get() + element_byte_size_ * index_in_chunk; + return getElementNoRangeChecking(i); } void clear() { @@ -451,7 +453,8 @@ class ChunkedArray { chunks_.resize(new_chunk_count); for (size_t i = chunk_count; i < new_chunk_count; i++) { - chunks_[i] = internal::makeUniqueCharArray(getSizePerChunk()); + chunks_[i] = internal::makeUniqueCharArray( + getSizePerChunk() + chunk_padding_bytes_); } element_count_ = new_element_count; @@ -479,14 +482,6 @@ class ChunkedArray { i + 1 == num_chunks_to_read ? last_chunk_bytes : getSizePerChunk()); } } - - std::deque::const_iterator begin_chunk() const { - return chunks_.begin(); - } - - std::deque::const_iterator end_chunk() const { - return chunks_.end(); - } private: size_t getChunkCount(size_t element_count) const { @@ -497,10 +492,27 @@ class ChunkedArray { size_t elements_per_chunk_; size_t element_count_; std::deque chunks_; + size_t chunk_padding_bytes_; }; } // namespace hnswlib +#if defined(USE_SSE) && HNSWLIB_USE_PREFETCH +#if HNSWLIB_DEBUG_PREFETCH +// This mode is used to find prefetch statements causing range check errors in +// tests. We only print line numbers, which makes the output compact enough to +// catch range check errors in some tests. +#define HNSWLIB_MM_PREFETCH(address, hint) do { \ + std::cout << __LINE__ << " "; \ + _mm_prefetch(address, hint); \ +} while (0) +#else +#define HNSWLIB_MM_PREFETCH(address, hint) _mm_prefetch(address, hint) +#endif +#else +#define HNSWLIB_MM_PREFETCH(address, hint) +#endif + #include "space_l2.h" #include "space_ip.h" #include "stop_condition.h" From e23f6ca3d80a8fa056ed1dcb715376b95ea5e52c Mon Sep 17 00:00:00 2001 From: Michael Bautin Date: Sat, 27 Sep 2025 01:04:21 +0000 Subject: [PATCH 03/12] Fix compilation --- hnswlib/hnswlib.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hnswlib/hnswlib.h b/hnswlib/hnswlib.h index 8c0721e6..837ad66c 100644 --- a/hnswlib/hnswlib.h +++ b/hnswlib/hnswlib.h @@ -439,7 +439,9 @@ class ChunkedArray { char* operator[](size_t i) const { assert(i < getCapacity()); if (i >= getCapacity()) return nullptr; - return getElementNoRangeChecking(i); + size_t chunk_index = i / elements_per_chunk_; + size_t index_in_chunk = i % elements_per_chunk_; + return chunks_[chunk_index].get() + element_byte_size_ * index_in_chunk; } void clear() { From a070bc5dbdc615ae01cb14938e4c2bfa2d38de77 Mon Sep 17 00:00:00 2001 From: Michael Bautin Date: Sat, 27 Sep 2025 02:49:54 +0000 Subject: [PATCH 04/12] Fix another invalid prefetch. Better output on range check error. 
--- hnswlib/hnswalg.h | 4 +++- hnswlib/hnswlib.h | 6 ++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index 2ea1fe1b..3388a4a0 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -1111,7 +1111,9 @@ class HierarchicalNSW : public AlgorithmInterface { tableint *datal = (tableint *) (data + 1); HNSWLIB_MM_PREFETCH(getDataByInternalId(*datal), _MM_HINT_T0); for (int i = 0; i < size; i++) { - HNSWLIB_MM_PREFETCH(getDataByInternalId(*(datal + i + 1)), _MM_HINT_T0); + if (i + 1 < size) { + HNSWLIB_MM_PREFETCH(getDataByInternalId(*(datal + i + 1)), _MM_HINT_T0); + } tableint cand = datal[i]; dist_t d = fstdistfunc_(dataPoint, getDataByInternalId(cand), dist_func_param_); if (d < curdist) { diff --git a/hnswlib/hnswlib.h b/hnswlib/hnswlib.h index 837ad66c..16b1b636 100644 --- a/hnswlib/hnswlib.h +++ b/hnswlib/hnswlib.h @@ -437,7 +437,13 @@ class ChunkedArray { } char* operator[](size_t i) const { +#ifndef NDEBUG + if (i >= getCapacity()) { + HNSWERR << "Chunked array index out of range: i=" << i + << ", capacity=" << getCapacity() << std::endl; + } assert(i < getCapacity()); +#endif if (i >= getCapacity()) return nullptr; size_t chunk_index = i / elements_per_chunk_; size_t index_in_chunk = i % elements_per_chunk_; From 11bffd725b2e24d4c2e800da03430561feed4e20 Mon Sep 17 00:00:00 2001 From: Michael Bautin Date: Sat, 27 Sep 2025 03:59:31 +0000 Subject: [PATCH 05/12] Attempting to fix Python bindings --- hnswlib/hnswlib.h | 11 +++++++++++ python_bindings/bindings.cpp | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/hnswlib/hnswlib.h b/hnswlib/hnswlib.h index 16b1b636..da88153b 100644 --- a/hnswlib/hnswlib.h +++ b/hnswlib/hnswlib.h @@ -491,6 +491,17 @@ class ChunkedArray { } } + void copyTo(char* destination, size_t num_bytes) { + size_t chunk_index = 0; + size_t bytes_per_chunk = getSizePerChunk(); + while (num_bytes > 0) { + size_t cur_size = std::min(bytes_per_chunk, num_bytes); + memcpy(destination, chunks_[chunk_index].get(), cur_size); + num_bytes -= cur_size; + destination += cur_size; + } + } + private: size_t getChunkCount(size_t element_count) const { return (element_count + elements_per_chunk_ - 1) / elements_per_chunk_; diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index babf9741..2eb71d9f 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -383,7 +383,7 @@ class Index { memset(link_list_npy, 0, link_npy_size); - memcpy(data_level0_npy, appr_alg->data_level0_memory_, level0_npy_size); + appr_alg->data_level0_memory_.copyTo(data_level0_npy, level0_npy_size); memcpy(element_levels_npy, appr_alg->element_levels_.data(), appr_alg->element_levels_.size() * sizeof(int)); for (size_t i = 0; i < appr_alg->cur_element_count; i++) { From 17ef9d3c7777d6ca33308e41d3be0633fcbbb144 Mon Sep 17 00:00:00 2001 From: Michael Bautin Date: Sat, 27 Sep 2025 04:59:03 +0000 Subject: [PATCH 06/12] Another attempt to fix the Python bindings build --- hnswlib/hnswlib.h | 11 +++++++++++ python_bindings/bindings.cpp | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/hnswlib/hnswlib.h b/hnswlib/hnswlib.h index da88153b..ab3fbceb 100644 --- a/hnswlib/hnswlib.h +++ b/hnswlib/hnswlib.h @@ -502,6 +502,17 @@ class ChunkedArray { } } + void copyFrom(char* source, size_t num_bytes) { + size_t chunk_index = 0; + size_t bytes_per_chunk = getSizePerChunk(); + while (num_bytes > 0) { + size_t cur_size = std::min(bytes_per_chunk, num_bytes); + memcpy(chunks_[chunk_index].get(), 
source, cur_size); + num_bytes -= cur_size; + source += cur_size; + } + } + private: size_t getChunkCount(size_t element_count) const { return (element_count + elements_per_chunk_ - 1) / elements_per_chunk_; diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index 2eb71d9f..a666dff5 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -576,7 +576,7 @@ class Index { link_npy_size += linkListSize; } - memcpy(appr_alg->data_level0_memory_, data_level0_npy.data(), data_level0_npy.nbytes()); + appr_alg->data_level0_memory_.copyFrom(data_level0_npy.data(), data_level0_npy.nbytes()); for (size_t i = 0; i < appr_alg->max_elements_; i++) { size_t linkListSize = appr_alg->element_levels_[i] > 0 ? appr_alg->size_links_per_element_ * appr_alg->element_levels_[i] : 0; From 0912989924ed9247f6b38fd33334dc53e8bb3091 Mon Sep 17 00:00:00 2001 From: Michael Bautin Date: Sat, 27 Sep 2025 05:46:15 +0000 Subject: [PATCH 07/12] Fix copyFrom argument (should be const char*) --- hnswlib/hnswlib.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hnswlib/hnswlib.h b/hnswlib/hnswlib.h index ab3fbceb..1056e418 100644 --- a/hnswlib/hnswlib.h +++ b/hnswlib/hnswlib.h @@ -502,7 +502,7 @@ class ChunkedArray { } } - void copyFrom(char* source, size_t num_bytes) { + void copyFrom(const char* source, size_t num_bytes) { size_t chunk_index = 0; size_t bytes_per_chunk = getSizePerChunk(); while (num_bytes > 0) { From 55644e8a65cb573ca58553ccb1d8f9d98574ee21 Mon Sep 17 00:00:00 2001 From: Michael Bautin Date: Sat, 27 Sep 2025 08:16:08 +0000 Subject: [PATCH 08/12] Simplify managing chunked array of neighbor list pointers. Another attempt to fix Python bindings. --- hnswlib/hnswalg.h | 21 +++++++++++---------- python_bindings/bindings.cpp | 6 +++--- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index 3388a4a0..ff3de03c 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -155,8 +155,8 @@ class HierarchicalNSW : public AlgorithmInterface { clear(); } - char*& getLinkListPtrRef(tableint internal_id) { - return *reinterpret_cast(linkLists_[internal_id]); + void setLinkListPtr(tableint internal_id, char* data) { + *reinterpret_cast(linkLists_[internal_id]) = data; } char* getLinkListPtr(tableint internal_id) const { @@ -724,7 +724,7 @@ class HierarchicalNSW : public AlgorithmInterface { unsigned int linkListSize = element_levels_[i] > 0 ? 
size_links_per_element_ * element_levels_[i] : 0; writeBinaryPOD(output, linkListSize); if (linkListSize) - output.write(getLinkListPtrRef(i), linkListSize); + output.write(getLinkListPtr(i), linkListSize); } output.close(); return OkStatus(); @@ -826,13 +826,13 @@ class HierarchicalNSW : public AlgorithmInterface { readBinaryPOD(input, linkListSize); if (linkListSize == 0) { element_levels_[i] = 0; - getLinkListPtrRef(i) = nullptr; + setLinkListPtr(i, nullptr); } else { element_levels_[i] = linkListSize / size_links_per_element_; - getLinkListPtrRef(i) = (char *) malloc(linkListSize); - if (getLinkListPtrRef(i) == nullptr) + setLinkListPtr(i, (char *) malloc(linkListSize)); + if (getLinkListPtr(i) == nullptr) return Status("Not enough memory: loadIndex failed to allocate linklist"); - input.read(getLinkListPtrRef(i), linkListSize); + input.read(getLinkListPtr(i), linkListSize); } } @@ -1241,11 +1241,12 @@ class HierarchicalNSW : public AlgorithmInterface { memcpy(getDataByInternalId(cur_c), data_point, data_size_); if (curlevel) { - getLinkListPtrRef(cur_c) = (char *) malloc(size_links_per_element_ * curlevel + 1); - if (getLinkListPtrRef(cur_c) == nullptr) { + size_t link_list_num_bytes = size_links_per_element_ * curlevel + 1; + setLinkListPtr(cur_c, (char *) malloc(link_list_num_bytes)); + if (getLinkListPtr(cur_c) == nullptr) { return Status("Not enough memory: addPoint failed to allocate linklist"); } - memset(getLinkListPtrRef(cur_c), 0, size_links_per_element_ * curlevel + 1); + memset(getLinkListPtr(cur_c), 0, link_list_num_bytes); } if ((signed)currObj != -1) { diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index a666dff5..a30546d5 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -581,10 +581,10 @@ class Index { for (size_t i = 0; i < appr_alg->max_elements_; i++) { size_t linkListSize = appr_alg->element_levels_[i] > 0 ? appr_alg->size_links_per_element_ * appr_alg->element_levels_[i] : 0; if (linkListSize == 0) { - appr_alg->linkLists_[i] = nullptr; + appr_alg->setLinkListPtr(i, nullptr); } else { - appr_alg->linkLists_[i] = (char*)malloc(linkListSize); - if (appr_alg->linkLists_[i] == nullptr) + appr_alg->setLinkListPtr(i, (char*)malloc(linkListSize)); + if (appr_alg->getLinkListPtr(i) == nullptr) HNSWLIB_THROW_RUNTIME_ERROR("Not enough memory: loadIndex failed to allocate linklist"); memcpy(appr_alg->linkLists_[i], link_list_npy.data() + link_npy_offsets[i], linkListSize); From cc68c8fc4f3b649e0e016c25a70b17dfb20b3495 Mon Sep 17 00:00:00 2001 From: Michael Bautin Date: Mon, 29 Sep 2025 04:44:11 +0000 Subject: [PATCH 09/12] Clear level 0 memory when calling clear(). Fix HNSW pickling/unpickling in Python bindings. 
--- hnswlib/hnswalg.h | 1 + python_bindings/bindings.cpp | 11 ++++++----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index ff3de03c..1bf58458 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -170,6 +170,7 @@ class HierarchicalNSW : public AlgorithmInterface { free(getLinkListPtr(i)); } } + data_level0_memory_.clear(); linkLists_.clear(); cur_element_count = 0; visited_list_pool_.reset(nullptr); diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index a30546d5..d1900d90 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -387,9 +387,9 @@ class Index { memcpy(element_levels_npy, appr_alg->element_levels_.data(), appr_alg->element_levels_.size() * sizeof(int)); for (size_t i = 0; i < appr_alg->cur_element_count; i++) { - size_t linkListSize = appr_alg->element_levels_[i] > 0 ? appr_alg->size_links_per_element_ * appr_alg->element_levels_[i] : 0; + size_t linkListSize = appr_alg->size_links_per_element_ * appr_alg->element_levels_[i]; if (linkListSize) { - memcpy(link_list_npy + link_npy_offsets[i], appr_alg->linkLists_[i], linkListSize); + memcpy(link_list_npy + link_npy_offsets[i], appr_alg->getLinkListPtr(i), linkListSize); } } @@ -583,11 +583,12 @@ class Index { if (linkListSize == 0) { appr_alg->setLinkListPtr(i, nullptr); } else { - appr_alg->setLinkListPtr(i, (char*)malloc(linkListSize)); - if (appr_alg->getLinkListPtr(i) == nullptr) + char* linkListPtr = reinterpret_cast(malloc(linkListSize)); + if (linkListPtr == nullptr) HNSWLIB_THROW_RUNTIME_ERROR("Not enough memory: loadIndex failed to allocate linklist"); + appr_alg->setLinkListPtr(i, linkListPtr); - memcpy(appr_alg->linkLists_[i], link_list_npy.data() + link_npy_offsets[i], linkListSize); + memcpy(linkListPtr, link_list_npy.data() + link_npy_offsets[i], linkListSize); } } From 517f5eb5264afc02e4ea2d57dc59784461522930 Mon Sep 17 00:00:00 2001 From: Michael Bautin Date: Mon, 29 Sep 2025 07:08:33 +0000 Subject: [PATCH 10/12] Make the number of elements per chunk configurable --- hnswlib/hnswalg.h | 47 +++++++++++++++++++++++++++++------------------ hnswlib/hnswlib.h | 24 ++++++++++++++++++++++-- 2 files changed, 51 insertions(+), 20 deletions(-) diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index 1bf58458..e62a3a4b 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -19,12 +19,16 @@ typedef unsigned int tableint; constexpr tableint kInvalidInternalId = std::numeric_limits::max(); typedef unsigned int linklistsizeint; +static const size_t kCacheLineSize = 64; + template class HierarchicalNSW : public AlgorithmInterface { public: static const tableint MAX_LABEL_OPERATION_LOCKS = 65536; static const unsigned char DELETE_MARK = 0x01; + static const size_t kDefaultMaxElementsPerChunk = 10 * 1024; + size_t max_elements_{0}; mutable std::atomic cur_element_count{0}; // current number of elements size_t size_data_per_element_{0}; @@ -35,7 +39,7 @@ class HierarchicalNSW : public AlgorithmInterface { size_t maxM0_{0}; size_t ef_construction_{0}; size_t ef_{ 0 }; - const size_t k_elements_per_chunk{10*1024}; + size_t num_elements_per_chunk_{kDefaultMaxElementsPerChunk}; double mult_{0.0}, revSize_{0.0}; int maxlevel_{0}; @@ -53,8 +57,8 @@ class HierarchicalNSW : public AlgorithmInterface { size_t size_links_level0_{0}; size_t offsetData_{0}, offsetLevel0_{0}, label_offset_{ 0 }; - ChunkedArray data_level0_memory_; - ChunkedArray linkLists_; + ChunkedArray data_level0_memory_; + ChunkedArray linkLists_; std::vector 
element_levels_; // keeps level of each element size_t data_size_{0}; @@ -86,8 +90,10 @@ class HierarchicalNSW : public AlgorithmInterface { const std::string &location, bool nmslib = false, size_t max_elements = 0, - bool allow_replace_deleted = false) - : allow_replace_deleted_(allow_replace_deleted) { + bool allow_replace_deleted = false, + size_t num_elements_per_chunk = kDefaultMaxElementsPerChunk) + : allow_replace_deleted_(allow_replace_deleted), + num_elements_per_chunk_(num_elements_per_chunk) { loadIndex(location, s, max_elements); } @@ -98,11 +104,13 @@ class HierarchicalNSW : public AlgorithmInterface { size_t M = 16, size_t ef_construction = 200, size_t random_seed = 100, - bool allow_replace_deleted = false) + bool allow_replace_deleted = false, + size_t num_elements_per_chunk = kDefaultMaxElementsPerChunk) : label_op_locks_(MAX_LABEL_OPERATION_LOCKS), link_list_locks_(max_elements), element_levels_(max_elements), - allow_replace_deleted_(allow_replace_deleted) { + allow_replace_deleted_(allow_replace_deleted), + num_elements_per_chunk_(num_elements_per_chunk) { max_elements_ = max_elements; num_deleted_ = 0; data_size_ = s->get_data_size(); @@ -131,8 +139,9 @@ class HierarchicalNSW : public AlgorithmInterface { // Allocate 64 more bytes for each chunk so we can safely prefetch a // cache line beyond the chunk. - data_level0_memory_ = ChunkedArray( - size_data_per_element_, k_elements_per_chunk, max_elements, 64); + data_level0_memory_ = ChunkedArray( + size_data_per_element_, num_elements_per_chunk_, max_elements, + kCacheLineSize); cur_element_count = 0; @@ -142,8 +151,10 @@ class HierarchicalNSW : public AlgorithmInterface { enterpoint_node_ = -1; maxlevel_ = -1; - linkLists_ = ChunkedArray( - sizeof(void *), k_elements_per_chunk, max_elements, 0); + linkLists_ = ChunkedArray( + /* element_byte_size= */ sizeof(void *), + num_elements_per_chunk_, max_elements, + /* chunk_padding_bytes= */ 0); size_links_per_element_ = maxM_ * sizeof(tableint) + sizeof(linklistsizeint); mult_ = 1 / log(1.0 * M_); @@ -289,7 +300,7 @@ class HierarchicalNSW : public AlgorithmInterface { size_t size = getListCount((linklistsizeint*)data); tableint *datal = (tableint *) (data + 1); HNSWLIB_MM_PREFETCH((char *) (visited_array + *(data + 1)), _MM_HINT_T0); - HNSWLIB_MM_PREFETCH((char *) (visited_array + *(data + 1) + 64), _MM_HINT_T0); + HNSWLIB_MM_PREFETCH((char *) (visited_array + *(data + 1) + kCacheLineSize), _MM_HINT_T0); HNSWLIB_MM_PREFETCH(getDataByInternalId(*datal), _MM_HINT_T0); HNSWLIB_MM_PREFETCH(getDataByInternalId(*(datal + 1)), _MM_HINT_T0); @@ -389,7 +400,7 @@ class HierarchicalNSW : public AlgorithmInterface { } HNSWLIB_MM_PREFETCH((char *) (visited_array + *(data + 1)), _MM_HINT_T0); - HNSWLIB_MM_PREFETCH((char *) (visited_array + *(data + 1) + 64), _MM_HINT_T0); + HNSWLIB_MM_PREFETCH((char *) (visited_array + *(data + 1) + kCacheLineSize), _MM_HINT_T0); HNSWLIB_MM_PREFETCH(data_level0_memory_[*(data + 1)] + offsetData_, _MM_HINT_T0); HNSWLIB_MM_PREFETCH((char *) (data + 2), _MM_HINT_T0); @@ -800,11 +811,11 @@ class HierarchicalNSW : public AlgorithmInterface { input.seekg(pos, input.beg); - data_level0_memory_ = ChunkedArray( + data_level0_memory_ = ChunkedArray( size_data_per_element_, - k_elements_per_chunk, + num_elements_per_chunk_, max_elements, - 64); + kCacheLineSize); data_level0_memory_.readFromStream(input, cur_element_count); size_links_per_element_ = maxM_ * sizeof(tableint) + sizeof(linklistsizeint); @@ -815,8 +826,8 @@ class HierarchicalNSW : public 
AlgorithmInterface { visited_list_pool_.reset(new VisitedListPool(1, max_elements)); - linkLists_ = ChunkedArray( - sizeof(void *), k_elements_per_chunk, max_elements, 0); + linkLists_ = ChunkedArray( + sizeof(void *), num_elements_per_chunk_, max_elements, 0); element_levels_ = std::vector(max_elements); revSize_ = 1.0 / mult_; diff --git a/hnswlib/hnswlib.h b/hnswlib/hnswlib.h index 1056e418..f66f279e 100644 --- a/hnswlib/hnswlib.h +++ b/hnswlib/hnswlib.h @@ -29,6 +29,7 @@ #include #include +#include #if defined(USE_AVX) || defined(USE_SSE) #ifdef _MSC_VER @@ -379,8 +380,25 @@ MallocUniqueCharArrayPtr makeUniqueCharArray(size_t n_bytes) { } // namespace internal +// Manages a large, array-like data structure by allocating memory in smaller, +// fixed-size blocks called "chunks." This class provides a flat, array-like +// view over a large collection of elements without needing a single, massive +// contiguous memory allocation, which helps avoid memory fragmentation. +// +// It provides random access via `operator[]`, which internally maps an index +// to the correct chunk and the element's offset within it. The size of the +// elements and the number of elements per chunk are configured at construction. +// +// The class is non-copyable to prevent expensive deep copies but is movable for +// efficient transfers of ownership. The template parameter `ElementPointerType` +// specifies the pointer type used to access elements, e.g. `char*` if pointer +// arithmetics are required, or `void*` if the result would be immediately cast +// into another pointer type. +template class ChunkedArray { public: + static_assert(std::is_pointer::value, + "Template parameter ElementPointerType must be a pointer."); ChunkedArray() : element_byte_size_(0), elements_per_chunk_(0), @@ -436,7 +454,7 @@ class ChunkedArray { return elements_per_chunk_ * element_byte_size_; } - char* operator[](size_t i) const { + ElementPointerType operator[](size_t i) const { #ifndef NDEBUG if (i >= getCapacity()) { HNSWERR << "Chunked array index out of range: i=" << i @@ -447,7 +465,9 @@ class ChunkedArray { if (i >= getCapacity()) return nullptr; size_t chunk_index = i / elements_per_chunk_; size_t index_in_chunk = i % elements_per_chunk_; - return chunks_[chunk_index].get() + element_byte_size_ * index_in_chunk; + return reinterpret_cast( + chunks_[chunk_index].get() + element_byte_size_ * index_in_chunk + ); } void clear() { From 12fc750b4c36d558e8b13375726be28991a55472 Mon Sep 17 00:00:00 2001 From: Michael Bautin Date: Mon, 29 Sep 2025 16:07:42 +0000 Subject: [PATCH 11/12] Make field declaration order consistent with initialization order to fix build --- hnswlib/hnswalg.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index e62a3a4b..0fcdb8d0 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -39,7 +39,6 @@ class HierarchicalNSW : public AlgorithmInterface { size_t maxM0_{0}; size_t ef_construction_{0}; size_t ef_{ 0 }; - size_t num_elements_per_chunk_{kDefaultMaxElementsPerChunk}; double mult_{0.0}, revSize_{0.0}; int maxlevel_{0}; @@ -80,6 +79,7 @@ class HierarchicalNSW : public AlgorithmInterface { std::mutex deleted_elements_lock; // lock for deleted_elements std::unordered_set deleted_elements; // contains internal ids of deleted elements + size_t num_elements_per_chunk_{kDefaultMaxElementsPerChunk}; HierarchicalNSW(SpaceInterface *s) { } From c32cc21ee5ef0797c9888bcc6519eba6304bcb4f Mon Sep 17 00:00:00 2001 From: Michael Bautin Date: Thu, 2 Oct 2025 
16:33:39 +0000 Subject: [PATCH 12/12] Revert unintended indentation --- hnswlib/hnswalg.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index 0fcdb8d0..6c62498f 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -1513,7 +1513,7 @@ class HierarchicalNSW : public AlgorithmInterface { if (isMarkedDeleted(internalId)) { unsigned char *ll_cur = ((unsigned char *)get_linklist0(internalId)) + 2; *ll_cur &= ~DELETE_MARK; - num_deleted_ -= 1; + num_deleted_ -= 1; if (allow_replace_deleted_) { std::unique_lock lock_deleted_elements(deleted_elements_lock); deleted_elements.erase(internalId);
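
A minimal usage sketch of the ChunkedArray introduced by this series (for reference only; it is not part of any patch above). It assumes the API as of "Make the number of elements per chunk configurable": the ChunkedArray<ElementPointerType> template described in its doc comment, the four-argument constructor (element_byte_size, elements_per_chunk, element_count, chunk_padding_bytes), operator[], resize() and clear(). The include path, record size, chunk size and element counts below are illustrative, not taken from the patches.

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iostream>

#include "hnswlib/hnswlib.h"

int main() {
    constexpr std::size_t kRecordBytes = 128;        // bytes per fixed-size element
    constexpr std::size_t kElementsPerChunk = 1024;  // elements allocated per chunk
    constexpr std::size_t kInitialElements = 4000;   // ceil(4000 / 1024) = 4 chunks

    // Level-0-style storage: elements are accessed as char*, and each chunk gets
    // 64 extra padding bytes so a prefetch one cache line past the last element
    // in a chunk stays inside the allocation.
    hnswlib::ChunkedArray<char*> records(
        kRecordBytes, kElementsPerChunk, kInitialElements,
        /* chunk_padding_bytes= */ 64);

    // operator[] maps the flat index to (chunk, offset within chunk).
    std::uint32_t value = 42;
    std::memcpy(records[3], &value, sizeof(value));

    // Growing only appends new chunks; already-allocated chunks keep their
    // addresses, unlike realloc() on a single contiguous block.
    records.resize(kInitialElements * 2);

    std::uint32_t read_back = 0;
    std::memcpy(&read_back, records[3], sizeof(read_back));
    std::cout << "element 3 holds " << read_back << "\n";

    records.clear();  // frees every chunk and resets the element count to zero
    return 0;
}

The per-chunk deque of malloc'ed buffers is what lets resizeIndex() and loadIndex() grow or populate the index without one huge contiguous reallocation, at the cost of a division and modulo per element access.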