diff --git a/src/ailego/buffer/buffer_pool.cc b/src/ailego/buffer/buffer_pool.cc deleted file mode 100644 index 38f73f628..000000000 --- a/src/ailego/buffer/buffer_pool.cc +++ /dev/null @@ -1,328 +0,0 @@ -#include -#include - -#if defined(_MSC_VER) -#ifndef NOMINMAX -#define NOMINMAX -#endif -#include -static ssize_t zvec_pread(int fd, void *buf, size_t count, size_t offset) { - HANDLE handle = reinterpret_cast(_get_osfhandle(fd)); - if (handle == INVALID_HANDLE_VALUE) return -1; - OVERLAPPED ov = {}; - ov.Offset = static_cast(offset & 0xFFFFFFFF); - ov.OffsetHigh = static_cast(offset >> 32); - DWORD bytes_read = 0; - if (!ReadFile(handle, buf, static_cast(count), &bytes_read, &ov)) { - return -1; - } - return static_cast(bytes_read); -} -#endif - -namespace zvec { -namespace ailego { - -int LRUCache::init(size_t block_size) { - block_size_ = block_size; - for (size_t i = 0; i < CATCH_QUEUE_NUM; i++) { - queues_.push_back(ConcurrentQueue(block_size)); - } - return 0; -} - -bool LRUCache::evict_single_block(BlockType &item) { - bool found = false; - for (size_t i = 0; i < CATCH_QUEUE_NUM; i++) { - found = queues_[i].try_dequeue(item); - if (found) { - break; - } - } - return found; -} - -bool LRUCache::add_single_block(const LPMap *lp_map, const BlockType &block, - int block_type) { - bool ok = queues_[block_type].enqueue(block); - if (!ok) { - LOG_ERROR("enqueue failed."); - return false; - } - evict_queue_insertions_.fetch_add(1, std::memory_order_relaxed); - if (evict_queue_insertions_ % block_size_ == 0) { - this->clear_dead_node(lp_map); - } - return true; -} - -void LRUCache::clear_dead_node(const LPMap *lp_map) { - for (size_t i = 0; i < CATCH_QUEUE_NUM; i++) { - size_t clear_size = block_size_ * 2; - if (queues_[i].size_approx() < clear_size * 4) { - continue; - } - size_t clear_count = 0; - ConcurrentQueue tmp(block_size_); - BlockType item; - while (queues_[i].try_dequeue(item) && (clear_count++ < clear_size)) { - if (!lp_map->isDeadBlock(item)) { - if 
(!tmp.enqueue(item)) { - LOG_ERROR("enqueue failed."); - } - } - } - while (tmp.try_dequeue(item)) { - if (!lp_map->isDeadBlock(item)) { - if (!queues_[i].enqueue(item)) { - LOG_ERROR("enqueue failed."); - } - } - } - } -} - -void LPMap::init(size_t entry_num) { - if (entries_) { - delete[] entries_; - } - entry_num_ = entry_num; - entries_ = new Entry[entry_num_]; - for (size_t i = 0; i < entry_num_; i++) { - entries_[i].ref_count.store(std::numeric_limits::min()); - entries_[i].load_count.store(0); - entries_[i].buffer = nullptr; - } - cache_.init(entry_num * 4); -} - -char *LPMap::acquire_block(block_id_t block_id, bool lru_mode) { - assert(block_id < entry_num_); - Entry &entry = entries_[block_id]; - if (!lru_mode) { - return entry.buffer; - } - while (true) { - int current_count = entry.ref_count.load(std::memory_order_acquire); - if (current_count < 0) { - return nullptr; - } - if (entry.ref_count.compare_exchange_weak(current_count, current_count + 1, - std::memory_order_acq_rel, - std::memory_order_acquire)) { - if (current_count == 0) { - entry.load_count.fetch_add(1, std::memory_order_relaxed); - } - return entry.buffer; - } - } -} - -void LPMap::release_block(block_id_t block_id) { - assert(block_id < entry_num_); - Entry &entry = entries_[block_id]; - - if (entry.ref_count.fetch_sub(1, std::memory_order_release) == 1) { - std::atomic_thread_fence(std::memory_order_acquire); - LRUCache::BlockType block; - block.first = block_id; - block.second = entry.load_count.load(); - cache_.add_single_block(this, block, 0); - } -} - -char *LPMap::evict_block(block_id_t block_id) { - assert(block_id < entry_num_); - Entry &entry = entries_[block_id]; - int expected = 0; - if (entry.ref_count.compare_exchange_strong( - expected, std::numeric_limits::min())) { - char *buffer = entry.buffer; - entry.buffer = nullptr; - return buffer; - } else { - return nullptr; - } -} - -char *LPMap::set_block_acquired(block_id_t block_id, char *buffer) { - assert(block_id < 
entry_num_); - Entry &entry = entries_[block_id]; - while (true) { - int current_count = entry.ref_count.load(std::memory_order_relaxed); - if (current_count >= 0) { - if (entry.ref_count.compare_exchange_weak( - current_count, current_count + 1, std::memory_order_acq_rel, - std::memory_order_acquire)) { - return entry.buffer; - } - } else { - if (entry.ref_count.compare_exchange_weak(current_count, 1, - std::memory_order_acq_rel, - std::memory_order_acquire)) { - entry.buffer = buffer; - entry.load_count.fetch_add(1, std::memory_order_relaxed); - return entry.buffer; - } - } - } -} - -void LPMap::recycle(moodycamel::ConcurrentQueue &free_buffers) { - LRUCache::BlockType block; - do { - bool ok = cache_.evict_single_block(block); - if (!ok) { - return; - } - } while (isDeadBlock(block)); - char *buffer = evict_block(block.first); - if (buffer) { - if (!free_buffers.enqueue(buffer)) { - LOG_ERROR("recycle buffer enqueue failed."); - ailego_free(buffer); - } - } -} - -VecBufferPool::VecBufferPool(const std::string &filename) { -#if defined(_MSC_VER) - fd_ = _open(filename.c_str(), O_RDONLY | _O_BINARY); -#else - fd_ = open(filename.c_str(), O_RDONLY); -#endif - if (fd_ < 0) { - throw std::runtime_error("Failed to open file: " + filename); - } -#if defined(_MSC_VER) - struct _stat64 st; - if (_fstat64(fd_, &st) < 0) { - _close(fd_); -#else - struct stat st; - if (fstat(fd_, &st) < 0) { - ::close(fd_); -#endif - throw std::runtime_error("Failed to stat file: " + filename); - } - file_size_ = st.st_size; -} - -int VecBufferPool::init(size_t pool_capacity, size_t block_size, - size_t segment_count) { - if (block_size == 0) { - LOG_ERROR("block_size must not be 0"); - return -1; - } - pool_capacity_ = pool_capacity; - size_t buffer_num = pool_capacity_ / block_size + 10; - size_t block_num = segment_count + 10; - lp_map_.init(block_num); - mutex_vec_.reserve(block_num); - for (int i = 0; i < block_num; i++) { - mutex_vec_.emplace_back(std::make_unique()); - } - for 
(size_t i = 0; i < buffer_num; i++) { - char *buffer = (char *)ailego_malloc(block_size); - if (buffer != nullptr) { - if (!free_buffers_.enqueue(buffer)) { - LOG_ERROR("recycle buffer enqueue failed."); - ailego_free(buffer); - return -1; - } - } else { - LOG_ERROR("aligned_alloc %zu(size: %zu) failed", i, block_size); - return -1; - } - } - LOG_DEBUG("Buffer pool num: %zu, entry num: %zu", buffer_num, - lp_map_.entry_num()); - no_lru_mode_ = false; - if (lp_map_.entry_num() <= buffer_num) { - no_lru_mode_ = true; - } - return 0; -} - -VecBufferPoolHandle VecBufferPool::get_handle() { - return VecBufferPoolHandle(*this); -} - -char *VecBufferPool::acquire_buffer(block_id_t block_id, size_t offset, - size_t size, int retry) { - char *buffer = lp_map_.acquire_block(block_id, !no_lru_mode()); - if (buffer) { - return buffer; - } - std::lock_guard lock(*mutex_vec_[block_id]); - buffer = lp_map_.acquire_block(block_id, !no_lru_mode()); - if (buffer) { - return buffer; - } - { - bool found = free_buffers_.try_dequeue(buffer); - if (!found && !no_lru_mode_) { - for (int i = 0; i < retry; i++) { - lp_map_.recycle(free_buffers_); - found = free_buffers_.try_dequeue(buffer); - if (found) { - break; - } - } - } - if (!found) { - LOG_ERROR("Buffer pool failed to get free buffer"); - return nullptr; - } - } - -#if defined(_MSC_VER) - ssize_t read_bytes = zvec_pread(fd_, buffer, size, offset); -#else - ssize_t read_bytes = pread(fd_, buffer, size, offset); -#endif - if (read_bytes != static_cast(size)) { - LOG_ERROR("Buffer pool failed to read file at offset: %zu", offset); - free_buffers_.enqueue(buffer); - return nullptr; - } - return lp_map_.set_block_acquired(block_id, buffer); -} - -int VecBufferPool::get_meta(size_t offset, size_t length, char *buffer) { -#if defined(_MSC_VER) - ssize_t read_bytes = zvec_pread(fd_, buffer, length, offset); -#else - ssize_t read_bytes = pread(fd_, buffer, length, offset); -#endif - if (read_bytes != static_cast(length)) { - 
LOG_ERROR("Buffer pool failed to read file at offset: %zu", offset); - return -1; - } - return 0; -} - -char *VecBufferPoolHandle::get_block(size_t offset, size_t size, - size_t block_id) { - char *buffer = pool_.acquire_buffer(block_id, offset, size, 5); - return buffer; -} - -int VecBufferPoolHandle::get_meta(size_t offset, size_t length, char *buffer) { - return pool_.get_meta(offset, length, buffer); -} - -void VecBufferPoolHandle::release_one(block_id_t block_id) { - if (!pool_.no_lru_mode()) { - pool_.lp_map_.release_block(block_id); - } -} - -void VecBufferPoolHandle::acquire_one(block_id_t block_id) { - if (!pool_.no_lru_mode()) { - pool_.lp_map_.acquire_block(block_id, true); - } -} - -} // namespace ailego -} // namespace zvec \ No newline at end of file diff --git a/src/ailego/buffer/lru_cache.cc b/src/ailego/buffer/lru_cache.cc new file mode 100644 index 000000000..86489e750 --- /dev/null +++ b/src/ailego/buffer/lru_cache.cc @@ -0,0 +1,194 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include + +namespace zvec { +namespace ailego { + +int LRUCache::init() { + evict_batch_size_ = 512; + for (size_t i = 0; i < CACHE_QUEUE_NUM; i++) { + evict_queues_.push_back(ConcurrentQueue(evict_batch_size_ * 200)); + } + return 0; +} + +bool LRUCache::evict_single_block(BlockType &item) { + bool found = false; + for (size_t i = 0; i < CACHE_QUEUE_NUM; i++) { + found = evict_queues_[i].try_dequeue(item); + if (found) { + break; + } + } + return found; +} + +bool LRUCache::is_valid_and_alive(const BlockType &item) { + std::shared_lock lock(valid_page_tables_mutex_); + if (valid_page_tables_.find(item.page_table) == valid_page_tables_.end()) { + return false; + } + // is_dead_block accesses entries_ under the same shared lock, so the + // VectorPageTable destructor (which holds the unique lock via set_invalid) + // cannot free entries_ while this check is in progress. + return !item.page_table->is_dead_block(item); +} + +bool LRUCache::evict_block(BlockType &item) { + bool ok = false; + do { + ok = evict_single_block(item); + if (!ok) { + return false; + } + if (item.page_table == nullptr) { + if (!ParquetBufferPool::get_instance().is_dead_node(item)) { + break; + } else { + continue; + } + } + } while (!is_valid_and_alive(item)); + return ok; +} + +void LRUCache::recycle() { + BlockType item; + while (MemoryLimitPool::get_instance().is_full() && evict_block(item)) { + if (item.page_table) { + // Hold the shared lock across the eviction call to prevent + // use-after-free if the VectorPageTable is concurrently destroyed. 
+ std::shared_lock lock(valid_page_tables_mutex_); + if (valid_page_tables_.find(item.page_table) != + valid_page_tables_.end()) { + item.page_table->evict_block(item.vector_block.first); + } + } else { + ParquetBufferPool::get_instance().evict(item.parquet_buffer_block.first); + } + } +} + +bool LRUCache::add_single_block(const BlockType &block, int queue_index) { + bool ok = evict_queues_[queue_index].enqueue(block); + if (!ok) { + LOG_ERROR("enqueue failed."); + return false; + } + static thread_local int evict_queue_insertions = 0; + if (evict_queue_insertions++ > evict_batch_size_) { + this->clear_dead_node(); + evict_queue_insertions = 0; + } + return true; +} + +void LRUCache::clear_dead_node() { + for (size_t i = 0; i < CACHE_QUEUE_NUM; i++) { + size_t clear_size = evict_batch_size_; + if (evict_queues_[i].size_approx() < evict_batch_size_) { + continue; + } + if (evict_queues_[i].size_approx() > evict_batch_size_ * 8) { + clear_size *= 2; + } + size_t clear_count = 0; + BlockType item; + ConcurrentQueue live_blocks_queue(evict_batch_size_ * 200); + while (evict_queues_[i].try_dequeue(item) && (clear_count++ < clear_size)) { + if (item.page_table == nullptr) { + if (!ParquetBufferPool::get_instance().is_dead_node(item)) { + live_blocks_queue.enqueue(item); + } + } else if (is_valid_and_alive(item)) { + live_blocks_queue.enqueue(item); + } + } + while (live_blocks_queue.try_dequeue(item)) { + evict_queues_[i].enqueue(item); + } + } +} + +int MemoryLimitPool::init(size_t pool_size) { + pool_size_ = 0; + LRUCache::get_instance().recycle(); + pool_size_ = pool_size; + LOG_INFO("MemoryLimitPool initialized with pool size: %lu", pool_size_); + return 0; +} + +bool MemoryLimitPool::try_acquire_buffer(const size_t buffer_size, + char *&buffer) { + size_t expected, desired; + do { + expected = used_size_.load(); + if (expected >= pool_size_) { + return false; + } + desired = expected + buffer_size; + } while (!used_size_.compare_exchange_weak(expected, desired)); + 
buffer = (char *)ailego_malloc(buffer_size); + if (!buffer) { + used_size_.fetch_sub(buffer_size); + return false; + } + return true; +} + +void MemoryLimitPool::acquire_parquet(const size_t buffer_size) { + size_t expected, desired; + do { + expected = used_size_.load(); + desired = expected + buffer_size; + } while (!used_size_.compare_exchange_weak(expected, desired)); +} + +void MemoryLimitPool::release_buffer(char *buffer, const size_t buffer_size) { + size_t expected, desired; + do { + expected = used_size_.load(); + desired = expected - buffer_size; + assert(expected >= buffer_size); + } while (!used_size_.compare_exchange_weak(expected, desired)); + ailego_free(buffer); +} + +void MemoryLimitPool::release_parquet(const size_t buffer_size) { + size_t expected, desired; + do { + expected = used_size_.load(); + desired = expected - buffer_size; + assert(expected >= buffer_size); + } while (!used_size_.compare_exchange_weak(expected, desired)); +} + +bool MemoryLimitPool::is_full() { + return used_size_.load() >= pool_size_; +} + +bool MemoryLimitPool::is_hot_level1() { + return used_size_.load() >= pool_size_ * 3 / 5; +} + +bool MemoryLimitPool::is_hot_level2() { + return used_size_.load() >= pool_size_ * 4 / 5; +} + +} // namespace ailego +} // namespace zvec \ No newline at end of file diff --git a/src/ailego/buffer/parquet_hash_table.cc b/src/ailego/buffer/parquet_hash_table.cc new file mode 100644 index 000000000..e2f88cf52 --- /dev/null +++ b/src/ailego/buffer/parquet_hash_table.cc @@ -0,0 +1,256 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace zvec { +namespace ailego { + +ParquetBufferID::ParquetBufferID(std::string &filename, int column, + int row_group) + : filename(filename), column(column), row_group(row_group) { + struct stat file_stat; + if (stat(filename.c_str(), &file_stat) == 0) { + // file_stat.st_ino contains the inode number + // file_stat.st_dev contains the device ID + // Together they uniquely identify a file + file_id = file_stat.st_ino; + std::filesystem::path p(filename); + auto ftime = std::filesystem::last_write_time(p); + mtime = static_cast(ftime.time_since_epoch().count()); + } +} + +ParquetBufferContextHandle::ParquetBufferContextHandle( + const ParquetBufferContextHandle &handle_) + : buffer_id_(handle_.buffer_id_), arrow_(handle_.arrow_) { + if (arrow_) { + ParquetBufferPool::get_instance().acquire_one(buffer_id_); + } +} + +ParquetBufferContextHandle::~ParquetBufferContextHandle() { + if (arrow_) { + ParquetBufferPool::get_instance().release(buffer_id_); + } +} + +arrow::Status ParquetBufferPool::acquire(ParquetBufferID buffer_id, + ParquetBufferContext &context) { + // TODO: file handler and memory pool can be optimized + arrow::MemoryPool *mem_pool = arrow::default_memory_pool(); + + // Open file + std::shared_ptr input; + const auto &file_name = buffer_id.filename; + ARROW_ASSIGN_OR_RAISE(input, arrow::io::ReadableFile::Open(file_name)); + + // Open reader + std::unique_ptr reader; + ARROW_ASSIGN_OR_RAISE(reader, parquet::arrow::OpenFile(input, mem_pool)); + + // 
Perform read + int row_group = buffer_id.row_group; + int column = buffer_id.column; + auto s = reader->RowGroup(row_group)->Column(column)->Read(&context.arrow); + if (!s.ok()) { + LOG_ERROR("Failed to read parquet file[%s]", file_name.c_str()); + context.arrow = nullptr; + return s; + } + + context.size = 0; + context.arrow_refs.clear(); + // Compute the memory usage and hijack Arrow's buffers with our + // implementation + for (auto &array : context.arrow->chunks()) { + auto &buffers = array->data()->buffers; + for (size_t buf_idx = 0; buf_idx < buffers.size(); ++buf_idx) { + if (buffers[buf_idx] == nullptr) { + continue; + } + // Keep references to original buffers to prevent premature deletion + context.arrow_refs.emplace_back(buffers[buf_idx]); + context.size += buffers[buf_idx]->capacity(); + // Create hijacked buffer with custom deleter that notifies us when + // Arrow is finished with the buffer + std::shared_ptr hijacked_buffer( + buffers[buf_idx].get(), ArrowBufferDeleter(this, buffer_id)); + buffers[buf_idx] = hijacked_buffer; + } + } + + return arrow::Status::OK(); +} + +ParquetBufferContextHandle ParquetBufferPool::acquire_buffer( + ParquetBufferID buffer_id) { + std::shared_ptr arrow{nullptr}; + { + std::shared_lock lock(table_mutex_); + auto iter = table_.find(buffer_id); + if (iter != table_.end()) { + arrow = acquire(buffer_id); + if (arrow != nullptr) { + return ParquetBufferContextHandle(buffer_id, arrow); + } + } + } + { + bool found = !MemoryLimitPool::get_instance().is_full(); + if (!found) { + for (int i = 0; i < 5; i++) { + LRUCache::get_instance().recycle(); + found = !MemoryLimitPool::get_instance().is_full(); + if (found) { + break; + } + } + } + if (!found) { + LOG_ERROR("Failed to acquire parquet buffer"); + return ParquetBufferContextHandle(); + } + std::unique_lock lock(table_mutex_); + if (acquire(buffer_id, table_[buffer_id]).ok()) { + MemoryLimitPool::get_instance().acquire_parquet(table_[buffer_id].size); + arrow = 
set_block_acquired(buffer_id); + return ParquetBufferContextHandle(buffer_id, arrow); + } else { + LOG_ERROR("Failed to acquire parquet buffer"); + return ParquetBufferContextHandle(); + } + } +} + +std::shared_ptr ParquetBufferPool::set_block_acquired( + ParquetBufferID buffer_id) { + ParquetBufferContext &context = table_[buffer_id]; + while (true) { + int current_count = context.ref_count.load(std::memory_order_relaxed); + if (current_count >= 0) { + if (context.ref_count.compare_exchange_weak( + current_count, current_count + 1, std::memory_order_acq_rel, + std::memory_order_acquire)) { + return context.arrow; + } + } else { + if (context.ref_count.compare_exchange_weak(current_count, 1, + std::memory_order_acq_rel, + std::memory_order_acquire)) { + context.load_count.fetch_add(1, std::memory_order_relaxed); + return context.arrow; + } + } + } +} + +std::shared_ptr ParquetBufferPool::acquire( + ParquetBufferID buffer_id) { + auto iter = table_.find(buffer_id); + if (iter == table_.end()) { + return nullptr; + } + ParquetBufferContext &context = table_[buffer_id]; + while (true) { + int current_count = context.ref_count.load(std::memory_order_acquire); + if (current_count < 0) { + return nullptr; + } + if (context.ref_count.compare_exchange_weak( + current_count, current_count + 1, std::memory_order_acq_rel, + std::memory_order_acquire)) { + if (current_count == 0) { + context.load_count.fetch_add(1, std::memory_order_relaxed); + } + return context.arrow; + } + } + return nullptr; +} + +std::shared_ptr ParquetBufferPool::acquire_one( + ParquetBufferID buffer_id) { + std::shared_lock lock(table_mutex_); + auto iter = table_.find(buffer_id); + if (iter == table_.end()) { + return nullptr; + } + ParquetBufferContext &context = table_[buffer_id]; + while (true) { + int current_count = context.ref_count.load(std::memory_order_acquire); + if (current_count < 0) { + return nullptr; + } + if (context.ref_count.compare_exchange_weak( + current_count, current_count + 1, 
std::memory_order_acq_rel, + std::memory_order_acquire)) { + if (current_count == 0) { + context.load_count.fetch_add(1, std::memory_order_relaxed); + } + return context.arrow; + } + } +} + +void ParquetBufferPool::release(ParquetBufferID buffer_id) { + std::shared_lock lock(table_mutex_); + auto iter = table_.find(buffer_id); + if (iter == table_.end()) { + return; + } + ParquetBufferContext &context = table_[buffer_id]; + if (context.ref_count.fetch_sub(1, std::memory_order_release) == 1) { + std::atomic_thread_fence(std::memory_order_acquire); + LRUCache::BlockType block; + block.parquet_buffer_block.first = buffer_id; + block.parquet_buffer_block.second = context.load_count.load(); + LRUCache::get_instance().add_single_block(block, 0); + } +} + +void ParquetBufferPool::evict(ParquetBufferID buffer_id) { + std::unique_lock lock(table_mutex_); + auto iter = table_.find(buffer_id); + if (iter == table_.end()) { + return; + } + ParquetBufferContext &context = table_[buffer_id]; + int expected = 0; + if (context.ref_count.compare_exchange_strong( + expected, std::numeric_limits::min())) { + MemoryLimitPool::get_instance().release_parquet(context.size); + context.arrow = nullptr; + context.arrow_refs.clear(); + } +} + +bool ParquetBufferPool::is_dead_node(LRUCache::BlockType &block) { + std::shared_lock lock(table_mutex_); + auto iter = table_.find(block.parquet_buffer_block.first); + if (iter == table_.end()) { + return true; + } + return iter->second.load_count.load() != block.parquet_buffer_block.second; +} + +} // namespace ailego +} // namespace zvec \ No newline at end of file diff --git a/src/ailego/buffer/vector_page_table.cc b/src/ailego/buffer/vector_page_table.cc new file mode 100644 index 000000000..bef47b194 --- /dev/null +++ b/src/ailego/buffer/vector_page_table.cc @@ -0,0 +1,299 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with 
the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#if !defined(_MSC_VER) +#include +#endif + +#if defined(_MSC_VER) +#ifndef NOMINMAX +#define NOMINMAX +#endif +#include +static ssize_t zvec_pread(int fd, void *buf, size_t count, size_t offset) { + HANDLE handle = reinterpret_cast(_get_osfhandle(fd)); + if (handle == INVALID_HANDLE_VALUE) return -1; + OVERLAPPED ov = {}; + ov.Offset = static_cast(offset & 0xFFFFFFFF); + ov.OffsetHigh = static_cast(offset >> 32); + DWORD bytes_read = 0; + if (!ReadFile(handle, buf, static_cast(count), &bytes_read, &ov)) { + return -1; + } + return static_cast(bytes_read); +} +#endif + +namespace zvec { +namespace ailego { + +void VectorPageTable::init(size_t entry_num) { + if (entries_) { + delete[] entries_; + } + entry_num_ = entry_num; + entries_ = new Entry[entry_num_]; + for (size_t i = 0; i < entry_num_; i++) { + entries_[i].ref_count.store(std::numeric_limits::min()); + entries_[i].load_count.store(0); + entries_[i].lru_version.store(0); + entries_[i].buffer = nullptr; + } +} + +char *VectorPageTable::acquire_block(block_id_t block_id) { + assert(block_id < entry_num_); + Entry &entry = entries_[block_id]; + while (true) { + int current_count = entry.ref_count.load(std::memory_order_acquire); + if (current_count < 0) { + return nullptr; + } + if (entry.ref_count.compare_exchange_weak(current_count, current_count + 1, + std::memory_order_acq_rel, + std::memory_order_acquire)) { + if (current_count == 0) { + entry.load_count.fetch_add(1, std::memory_order_relaxed); + } + return entry.buffer; + } + } +} + +void 
VectorPageTable::release_block(block_id_t block_id) { + assert(block_id < entry_num_); + Entry &entry = entries_[block_id]; + + if (entry.ref_count.fetch_sub(1, std::memory_order_release) == 1) { + std::atomic_thread_fence(std::memory_order_acquire); + if (MemoryLimitPool::get_instance().is_hot_level1()) { + LRUCache::BlockType block; + block.page_table = this; + block.vector_block.first = block_id; + version_t v = entry.load_count.load(std::memory_order_relaxed); + block.vector_block.second = v; + entry.lru_version.store(v, std::memory_order_relaxed); + LRUCache::get_instance().add_single_block(block, 0); + } else { + // Two separate relaxed loads: a concurrent acquire_block may increment + // load_count between the two reads, making the condition transiently + // false (missed enqueue). This is benign: the block will satisfy the + // condition again on the next release cycle, and hot_level1 pressure + // will add it to LRU directly regardless. + if (entry.lru_version.load(std::memory_order_relaxed) + 1 == + entry.load_count.load(std::memory_order_relaxed)) { + evict_cache_.enqueue(block_id); + } + } + } +} + +void VectorPageTable::evict_block(block_id_t block_id) { + assert(block_id < entry_num_); + Entry &entry = entries_[block_id]; + char *buffer = entry.buffer; + size_t size = entry.size; + int expected = 0; + if (entry.ref_count.compare_exchange_strong( + expected, std::numeric_limits::min())) { + if (buffer) { + MemoryLimitPool::get_instance().release_buffer(buffer, size); + } + } +} + +char *VectorPageTable::set_block_acquired(block_id_t block_id, char *buffer, + size_t size) { + assert(block_id < entry_num_); + Entry &entry = entries_[block_id]; + if (MemoryLimitPool::get_instance().is_hot_level2()) { + size_t evict_block_id = 0; + while (evict_cache_.try_dequeue(evict_block_id)) { + Entry &hot_entry = entries_[evict_block_id]; + if (hot_entry.ref_count.load() != 0) { + continue; + } + // Snapshot load_count once. 
We only need to advance lru_version to this + // snapshot version; chasing subsequent increments is unnecessary and can + // cause unbounded spinning under high concurrency. + // If the CAS fails, another thread has already advanced lru_version (to + // at least this version), so the block is already queued in LRU. + version_t desired = hot_entry.load_count.load(std::memory_order_relaxed); + version_t current = hot_entry.lru_version.load(std::memory_order_relaxed); + if (current != desired) { + if (hot_entry.lru_version.compare_exchange_strong( + current, desired, std::memory_order_acq_rel, + std::memory_order_acquire)) { + LRUCache::BlockType block; + block.page_table = this; + block.vector_block.first = evict_block_id; + block.vector_block.second = desired; + LRUCache::get_instance().add_single_block(block, 0); + } + } + } + } + while (true) { + int current_count = entry.ref_count.load(std::memory_order_relaxed); + if (current_count >= 0) { + // Defensive branch: in practice this path should never be reached. + // set_block_acquired() is always called under block_mutexes_[block_id], + // and the caller (acquire_buffer) re-checks acquire_block() inside the + // same lock before invoking this function. Therefore, if we get here, + // ref_count must still be negative (unloaded). This branch is retained + // as a safety net in case the locking contract is violated in the future, + // e.g. if set_block_acquired is called from an unlocked context. 
+ if (entry.ref_count.compare_exchange_weak( + current_count, current_count + 1, std::memory_order_acq_rel, + std::memory_order_acquire)) { + MemoryLimitPool::get_instance().release_buffer(buffer, size); + return entry.buffer; + } + } else { + entry.buffer = buffer; + entry.size = size; + entry.load_count.fetch_add(1, std::memory_order_relaxed); + entry.ref_count.store(1, std::memory_order_release); + return entry.buffer; + } + } +} + +VecBufferPool::VecBufferPool(const std::string &filename) { +#if defined(_MSC_VER) + fd_ = _open(filename.c_str(), O_RDONLY | _O_BINARY); +#else + fd_ = open(filename.c_str(), O_RDONLY); +#endif + if (fd_ < 0) { + throw std::runtime_error("Failed to open file: " + filename); + } +#if defined(_MSC_VER) + struct _stat64 st; + if (_fstat64(fd_, &st) < 0) { + _close(fd_); +#else + struct stat st; + if (fstat(fd_, &st) < 0) { + ::close(fd_); +#endif + throw std::runtime_error("Failed to stat file: " + filename); + } + file_size_ = st.st_size; +} + +int VecBufferPool::init(size_t /*pool_capacity*/, size_t block_size, + size_t segment_count) { + if (block_size == 0) { + LOG_ERROR("block_size must not be 0"); + return -1; + } + size_t block_num = segment_count + 10; + page_table_.init(block_num); + block_mutexes_.clear(); + block_mutexes_.reserve(block_num); + for (size_t i = 0; i < block_num; i++) { + block_mutexes_.emplace_back(std::make_unique()); + } + LOG_DEBUG("entry num: %zu", page_table_.entry_num()); + return 0; +} + +VecBufferPoolHandle VecBufferPool::get_handle() { + return VecBufferPoolHandle(*this); +} + +char *VecBufferPool::acquire_buffer(block_id_t block_id, size_t offset, + size_t size, int retry) { + assert(block_id < block_mutexes_.size()); + char *buffer = page_table_.acquire_block(block_id); + if (buffer) { + return buffer; + } + std::lock_guard lock(*block_mutexes_[block_id]); + buffer = page_table_.acquire_block(block_id); + if (buffer) { + return buffer; + } + { + bool found = + 
MemoryLimitPool::get_instance().try_acquire_buffer(size, buffer); + if (!found) { + for (int i = 0; i < retry; i++) { + LRUCache::get_instance().recycle(); + found = + MemoryLimitPool::get_instance().try_acquire_buffer(size, buffer); + if (found) { + break; + } + } + } + if (!found) { + LOG_ERROR("Buffer pool failed to get free buffer"); + return nullptr; + } + } + +#if defined(_MSC_VER) + ssize_t read_bytes = zvec_pread(fd_, buffer, size, offset); +#else + ssize_t read_bytes = pread(fd_, buffer, size, offset); +#endif + if (read_bytes != static_cast(size)) { + LOG_ERROR("Buffer pool failed to read file at offset: %zu", offset); + MemoryLimitPool::get_instance().release_buffer(buffer, size); + return nullptr; + } + return page_table_.set_block_acquired(block_id, buffer, size); +} + +int VecBufferPool::get_meta(size_t offset, size_t length, char *buffer) { +#if defined(_MSC_VER) + ssize_t read_bytes = zvec_pread(fd_, buffer, length, offset); +#else + ssize_t read_bytes = pread(fd_, buffer, length, offset); +#endif + if (read_bytes != static_cast(length)) { + LOG_ERROR("Buffer pool failed to read file at offset: %zu", offset); + return -1; + } + return 0; +} + +char *VecBufferPoolHandle::get_block(size_t offset, size_t size, + size_t block_id) { + char *buffer = pool_.acquire_buffer(block_id, offset, size, 5); + return buffer; +} + +int VecBufferPoolHandle::get_meta(size_t offset, size_t length, char *buffer) { + return pool_.get_meta(offset, length, buffer); +} + +void VecBufferPoolHandle::release_one(block_id_t block_id) { + pool_.page_table_.release_block(block_id); +} + +void VecBufferPoolHandle::acquire_one(block_id_t block_id) { + // The caller must guarantee the block is already loaded before calling + // acquire_one(). The return value of acquire_block() is intentionally + // ignored here, as a null return would indicate a contract violation. 
+ pool_.page_table_.acquire_block(block_id); +} + +} // namespace ailego +} // namespace zvec \ No newline at end of file diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc index a20a03160..da37e1d31 100644 --- a/src/core/utility/buffer_storage.cc +++ b/src/core/utility/buffer_storage.cc @@ -14,7 +14,7 @@ #include #include -#include +#include #include #include #include @@ -176,7 +176,8 @@ class BufferStorage : public IndexStorage { //! Initialize storage int init(const ailego::Params ¶ms) override { params.get(BUFFER_STORAGE_MEMORY_SIZE, &buffer_size_); - LOG_INFO("buffer size: %lu", buffer_size_); + LOG_INFO("buffer storage initialized"); + // LOG_DEBUG("buffer size: %lu", buffer_size_); return 0; } diff --git a/src/db/common/global_resource.cc b/src/db/common/global_resource.cc index 2f4ad1ca7..d0baf38c3 100644 --- a/src/db/common/global_resource.cc +++ b/src/db/common/global_resource.cc @@ -14,6 +14,7 @@ #include "db/common/global_resource.h" #include #include +#include #include namespace zvec { @@ -25,8 +26,8 @@ void GlobalResource::initialize() { new ailego::ThreadPool(GlobalConfig::Instance().query_thread_count())); this->optimize_thread_pool_.reset(new ailego::ThreadPool( GlobalConfig::Instance().optimize_thread_count())); - ailego::BufferManager::Instance().init( - GlobalConfig::Instance().memory_limit_bytes(), 1); + zvec::ailego::MemoryLimitPool::get_instance().init( + GlobalConfig::Instance().memory_limit_bytes()); }); } diff --git a/src/db/index/segment/segment.cc b/src/db/index/segment/segment.cc index 821d236e3..34894d18d 100644 --- a/src/db/index/segment/segment.cc +++ b/src/db/index/segment/segment.cc @@ -3415,8 +3415,8 @@ Status SegmentImpl::alter_column(const std::string &column_name, } if (!options_.enable_mmap_) { - ailego::BufferManager::Instance().init( - GlobalConfig::Instance().memory_limit_bytes(), 1); + zvec::ailego::MemoryLimitPool::get_instance().init( + GlobalConfig::Instance().memory_limit_bytes()); } 
// delete single column store file @@ -3510,8 +3510,8 @@ Status SegmentImpl::drop_column(const std::string &column_name) { } if (!options_.enable_mmap_) { - ailego::BufferManager::Instance().init( - GlobalConfig::Instance().memory_limit_bytes(), 1); + zvec::ailego::MemoryLimitPool::get_instance().init( + GlobalConfig::Instance().memory_limit_bytes()); } // delete single column store file diff --git a/src/db/index/storage/bufferpool_forward_store.cc b/src/db/index/storage/bufferpool_forward_store.cc index a8cbaee3f..4d2b2f6e2 100644 --- a/src/db/index/storage/bufferpool_forward_store.cc +++ b/src/db/index/storage/bufferpool_forward_store.cc @@ -22,6 +22,7 @@ #include #include #include +#include #include #include "db/index/storage/store_helper.h" #include "lazy_record_batch_reader.h" @@ -192,10 +193,11 @@ TablePtr BufferPoolForwardStore::fetch(const std::vector &columns, for (const auto &[rg_id, pairs] : rg_to_local) { for (size_t i = 0; i < col_indices.size(); ++i) { int col_idx = col_indices[i]; - auto buffer_id = ailego::BufferID::ParquetID(file_path_, col_idx, rg_id); - auto buffer_handle = buf_mgr.acquire(buffer_id); - auto col_chunked_array = buffer_handle.pin_parquet_data(); - + auto buffer_id = ailego::ParquetBufferID(file_path_, col_idx, rg_id); + auto buffer_handle = + ailego::ParquetBufferPool::get_instance().acquire_buffer(buffer_id); + std::shared_ptr col_chunked_array = + buffer_handle.data(); if (!col_chunked_array) { LOG_ERROR( "Failed to pin parquet data for file: %s, column: %d, row_group: " @@ -318,9 +320,11 @@ ExecBatchPtr BufferPoolForwardStore::fetch( auto &buf_mgr = ailego::BufferManager::Instance(); for (size_t i = 0; i < col_indices.size(); ++i) { int col_idx = col_indices[i]; - auto buffer_id = ailego::BufferID::ParquetID(file_path_, col_idx, rg_id); - auto buffer_handle = buf_mgr.acquire(buffer_id); - auto col_chunked_array = buffer_handle.pin_parquet_data(); + auto buffer_id = ailego::ParquetBufferID(file_path_, col_idx, rg_id); + auto 
buffer_handle = + ailego::ParquetBufferPool::get_instance().acquire_buffer(buffer_id); + std::shared_ptr col_chunked_array = + buffer_handle.data(); if (!col_chunked_array) { LOG_ERROR( diff --git a/src/db/index/storage/lazy_record_batch_reader.h b/src/db/index/storage/lazy_record_batch_reader.h index c9e124c5c..422708ed9 100644 --- a/src/db/index/storage/lazy_record_batch_reader.h +++ b/src/db/index/storage/lazy_record_batch_reader.h @@ -17,6 +17,7 @@ #include #include #include +#include #include "db/common/constants.h" @@ -128,10 +129,11 @@ class ParquetRecordBatchReader : public arrow::RecordBatchReader { if (with_cache_) { auto &buf_mgr = ailego::BufferManager::Instance(); for (size_t col_idx = 0; col_idx < col_indices_.size(); ++col_idx) { - auto buffer_id = ailego::BufferID::ParquetID( - file_path_, col_indices_[col_idx], rg_id); - auto buffer_handle = buf_mgr.acquire(buffer_id); - auto col_chunked_array = buffer_handle.pin_parquet_data(); + auto buffer_id = + ailego::ParquetBufferID(file_path_, col_indices_[col_idx], rg_id); + auto buffer_handle = + ailego::ParquetBufferPool::get_instance().acquire_buffer(buffer_id); + std::shared_ptr col_chunked_array = + buffer_handle.data(); if (col_chunked_array) { std::shared_ptr concat; auto concat_result = arrow::Concatenate(col_chunked_array->chunks(), diff --git a/src/include/zvec/ailego/buffer/buffer_pool.h b/src/include/zvec/ailego/buffer/buffer_pool.h deleted file mode 100644 index 69a01b2fc..000000000 --- a/src/include/zvec/ailego/buffer/buffer_pool.h +++ /dev/null @@ -1,173 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "concurrentqueue.h" - -#if defined(_MSC_VER) -#include -#endif - -namespace zvec { -namespace ailego { - -using block_id_t = size_t; -using version_t = size_t; - -class LPMap; - -class LRUCache { - public: - typedef std::pair BlockType; - typedef 
moodycamel::ConcurrentQueue ConcurrentQueue; - - int init(size_t block_size); - - bool evict_single_block(BlockType &item); - - bool add_single_block(const LPMap *lp_map, const BlockType &block, - int block_type); - - void clear_dead_node(const LPMap *lp_map); - - private: - constexpr static size_t CATCH_QUEUE_NUM = 3; - size_t block_size_{0}; - std::vector queues_; - alignas(64) std::atomic evict_queue_insertions_{0}; -}; - -class LPMap { - struct Entry { - alignas(64) std::atomic ref_count; - alignas(64) std::atomic load_count; - char *buffer; - }; - - public: - LPMap() : entry_num_(0), entries_(nullptr) {} - ~LPMap() { - delete[] entries_; - } - - void init(size_t entry_num); - - char *acquire_block(block_id_t block_id, bool lru_mode); - - void release_block(block_id_t block_id); - - char *evict_block(block_id_t block_id); - - char *set_block_acquired(block_id_t block_id, char *buffer); - - void recycle(moodycamel::ConcurrentQueue &free_buffers); - - size_t entry_num() const { - return entry_num_; - } - - inline bool isDeadBlock(LRUCache::BlockType block) const { - Entry &entry = entries_[block.first]; - return block.second != entry.load_count.load(); - } - - private: - size_t entry_num_{0}; - Entry *entries_{nullptr}; - LRUCache cache_; -}; - -class VecBufferPoolHandle; - -class VecBufferPool { - public: - typedef std::shared_ptr Pointer; - - VecBufferPool(const std::string &filename); - ~VecBufferPool() { - // Free all buffers in the free list - char *buf = nullptr; - while (free_buffers_.try_dequeue(buf)) { - ailego_free(buf); - } - // Free any buffers still pinned in the map - for (size_t i = 0; i < lp_map_.entry_num(); ++i) { - char *b = lp_map_.evict_block(i); - if (b) ailego_free(b); - } -#if defined(_MSC_VER) - _close(fd_); -#else - close(fd_); -#endif - } - - int init(size_t pool_capacity, size_t block_size, size_t segment_count); - - VecBufferPoolHandle get_handle(); - - char *acquire_buffer(block_id_t block_id, size_t offset, size_t size, - int retry 
= 0); - - int get_meta(size_t offset, size_t length, char *buffer); - - size_t file_size() const { - return file_size_; - } - - bool no_lru_mode() { - return no_lru_mode_; - } - - private: - int fd_; - size_t file_size_; - size_t pool_capacity_; - bool no_lru_mode_; - - public: - LPMap lp_map_; - - private: - std::vector> mutex_vec_; - moodycamel::ConcurrentQueue free_buffers_; -}; - -class VecBufferPoolHandle { - public: - VecBufferPoolHandle(VecBufferPool &pool) : pool_(pool) {} - VecBufferPoolHandle(VecBufferPoolHandle &&other) : pool_(other.pool_) {} - - ~VecBufferPoolHandle() = default; - - typedef std::shared_ptr Pointer; - - char *get_block(size_t offset, size_t size, size_t block_id); - - int get_meta(size_t offset, size_t length, char *buffer); - - void release_one(block_id_t block_id); - - void acquire_one(block_id_t block_id); - - private: - VecBufferPool &pool_; -}; - -} // namespace ailego -} // namespace zvec \ No newline at end of file diff --git a/src/include/zvec/ailego/buffer/lru_cache.h b/src/include/zvec/ailego/buffer/lru_cache.h new file mode 100644 index 000000000..68c6d3d16 --- /dev/null +++ b/src/include/zvec/ailego/buffer/lru_cache.h @@ -0,0 +1,162 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "concurrentqueue.h" + +#if defined(_MSC_VER) +#include +#endif + +namespace zvec { +namespace ailego { + +class VectorPageTable; + +using block_id_t = size_t; +using version_t = size_t; + +struct ParquetBufferID { + std::string filename; + int column; + int row_group; + uint64_t file_id; + long mtime; + ParquetBufferID() {} + ParquetBufferID(std::string &filename, int column, int row_group); +}; + +class LRUCache { + public: + struct BlockType { + // TODO: page_table & vector_block + std::pair vector_block; + std::pair parquet_buffer_block; + VectorPageTable *page_table{nullptr}; + }; + typedef moodycamel::ConcurrentQueue ConcurrentQueue; + + static LRUCache &get_instance() { + static LRUCache instance; + return instance; + } + LRUCache(const LRUCache &) = delete; + LRUCache &operator=(const LRUCache &) = delete; + LRUCache(LRUCache &&) = delete; + LRUCache &operator=(LRUCache &&) = delete; + + int init(); + + bool evict_single_block(BlockType &item); + + bool evict_block(BlockType &item); + + bool add_single_block(const BlockType &block, int queue_index); + + void clear_dead_node(); + + bool is_valid(VectorPageTable *page_table) { + std::shared_lock lock(valid_page_tables_mutex_); + return valid_page_tables_.find(page_table) != valid_page_tables_.end(); + } + + void set_valid(VectorPageTable *page_table) { + std::unique_lock lock(valid_page_tables_mutex_); + valid_page_tables_.insert(page_table); + } + + void set_invalid(VectorPageTable *page_table) { + std::unique_lock lock(valid_page_tables_mutex_); + valid_page_tables_.erase(page_table); + } + + // Atomically checks under the shared lock that the page table is still valid + // AND the block version has not been superseded, preventing TOCTOU races + // when a VectorPageTable is 
concurrently destroyed. + bool is_valid_and_alive(const BlockType &item); + + void recycle(); + + private: + LRUCache() { + init(); + } + + private: + constexpr static size_t CACHE_QUEUE_NUM = 3; + size_t evict_batch_size_{0}; + std::vector evict_queues_; + std::unordered_set valid_page_tables_; + std::shared_mutex valid_page_tables_mutex_; +}; + +class MemoryLimitPool { + public: + static MemoryLimitPool &get_instance() { + static MemoryLimitPool instance; + return instance; + } + MemoryLimitPool(const MemoryLimitPool &) = delete; + MemoryLimitPool &operator=(const MemoryLimitPool &) = delete; + MemoryLimitPool(MemoryLimitPool &&) = delete; + MemoryLimitPool &operator=(MemoryLimitPool &&) = delete; + + int init(size_t pool_size); + + bool try_acquire_buffer(const size_t buffer_size, char *&buffer); + + void acquire_parquet(const size_t buffer_size); + + void release_buffer(char *buffer, const size_t buffer_size); + + void release_parquet(const size_t buffer_size); + + bool is_full(); + + bool is_hot_level1(); + + bool is_hot_level2(); + + private: + MemoryLimitPool() = default; + + private: + size_t pool_size_{0}; + std::atomic used_size_{0}; +}; + +} // namespace ailego +} // namespace zvec \ No newline at end of file diff --git a/src/include/zvec/ailego/buffer/parquet_hash_table.h b/src/include/zvec/ailego/buffer/parquet_hash_table.h new file mode 100644 index 000000000..c734d76b1 --- /dev/null +++ b/src/include/zvec/ailego/buffer/parquet_hash_table.h @@ -0,0 +1,166 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "lru_cache.h" + +namespace arrow { +class ChunkedArray; +class Array; +class DataType; +class Scalar; +template +class Result; +class Status; +class Buffer; +} // namespace arrow + +namespace zvec { +namespace ailego { + +using block_id_t = size_t; +using version_t = size_t; + +class LRUCache; + +struct IDHash { + size_t operator()(const ParquetBufferID &buffer_id) const { + size_t hash = std::hash{}(1); + hash = hash ^ (std::hash{}(buffer_id.file_id)); + hash = hash * 31 + std::hash{}(buffer_id.column); + hash = hash * 31 + std::hash{}(buffer_id.row_group); + return hash; + } +}; + +struct IDEqual { + bool operator()(const ParquetBufferID &a, const ParquetBufferID &b) const { + if (a.filename != b.filename) { + return false; + } + if (a.file_id != b.file_id) { + return false; + } + if (a.mtime != b.mtime) { + return false; + } + return a.column == b.column && a.row_group == b.row_group; + } +}; + +struct ParquetBufferContext { + // A shared pointer to the buffers allocated for arrow parquet data + std::shared_ptr arrow{nullptr}; + + // Guard original arrow buffers to prevent premature deletion + std::vector> arrow_refs{}; + + size_t size; + alignas(64) std::atomic ref_count{std::numeric_limits::min()}; + alignas(64) std::atomic load_count{0}; +}; + +class ParquetBufferContextHandle { + public: + ParquetBufferContextHandle() {} + ParquetBufferContextHandle(ParquetBufferID &buffer_id, + std::shared_ptr arrow) + : buffer_id_(buffer_id), arrow_(arrow) {} + ParquetBufferContextHandle(const ParquetBufferContextHandle &handle_); + ParquetBufferContextHandle(ParquetBufferContextHandle &&handle_) + : buffer_id_(std::move(handle_.buffer_id_)), + arrow_(std::move(handle_.arrow_)) {} + + ~ParquetBufferContextHandle(); + + 
std::shared_ptr data() { + return arrow_; + } + + private: + ParquetBufferID buffer_id_; + std::shared_ptr arrow_{nullptr}; +}; + +class ParquetBufferPool { + public: + typedef std::shared_ptr Pointer; + + struct ArrowBufferDeleter { + explicit ArrowBufferDeleter(ParquetBufferPool *c, ParquetBufferID i) + : pool(c), id(i) {} + ParquetBufferPool *pool; + ParquetBufferID id; + // Only reduces the reference count but does not actually release the + // buffer, since the buffer memory is managed by the BufferManager. + void operator()(arrow::Buffer *) { + return; + } + }; + + using Table = std::unordered_map; + + arrow::Status acquire(ParquetBufferID buffer_id, + ParquetBufferContext &context); + + ParquetBufferContextHandle acquire_buffer(ParquetBufferID buffer_id); + + std::shared_ptr set_block_acquired( + ParquetBufferID buffer_id); + + std::shared_ptr acquire(ParquetBufferID buffer_id); + + std::shared_ptr acquire_one(ParquetBufferID buffer_id); + + void release(ParquetBufferID buffer_id); + + void evict(ParquetBufferID buffer_id); + + bool is_dead_node(LRUCache::BlockType &block); + + static ParquetBufferPool &get_instance() { + static ParquetBufferPool instance; + return instance; + } + + ParquetBufferPool(const ParquetBufferPool &) = delete; + ParquetBufferPool &operator=(const ParquetBufferPool &) = delete; + ParquetBufferPool(ParquetBufferPool &&) = delete; + ParquetBufferPool &operator=(ParquetBufferPool &&) = delete; + + private: + ParquetBufferPool() = default; + + private: + Table table_; + std::shared_mutex table_mutex_; +}; + +} // namespace ailego +} // namespace zvec \ No newline at end of file diff --git a/src/include/zvec/ailego/buffer/vector_page_table.h b/src/include/zvec/ailego/buffer/vector_page_table.h new file mode 100644 index 000000000..f0c592334 --- /dev/null +++ b/src/include/zvec/ailego/buffer/vector_page_table.h @@ -0,0 +1,171 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the 
"License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "concurrentqueue.h" +#include "lru_cache.h" + +#if defined(_MSC_VER) +#include +#endif + +namespace zvec { +namespace ailego { + +using block_id_t = size_t; +using version_t = size_t; + +class VectorPageTable { + struct Entry { + alignas(64) std::atomic ref_count; + alignas(64) std::atomic load_count; + alignas(64) std::atomic lru_version; + char *buffer; + size_t size; + }; + + public: + VectorPageTable() : entry_num_(0), entries_(nullptr) { + LRUCache::get_instance().set_valid(this); + } + ~VectorPageTable() { + LRUCache::get_instance().set_invalid(this); + delete[] entries_; + } + + VectorPageTable(const VectorPageTable &) = delete; + VectorPageTable &operator=(const VectorPageTable &) = delete; + VectorPageTable(VectorPageTable &&) = delete; + VectorPageTable &operator=(VectorPageTable &&) = delete; + + void init(size_t entry_num); + + char *acquire_block(block_id_t block_id); + + void release_block(block_id_t block_id); + + void evict_block(block_id_t block_id); + + char *set_block_acquired(block_id_t block_id, char *buffer, size_t size); + + size_t entry_num() const { + return entry_num_; + } + + // Returns true if the block has no active references (ref_count <= 0). + // Used by VecBufferPool destructor to assert all handles are released. 
+ bool is_released(block_id_t block_id) const { + assert(block_id < entry_num_); + return entries_[block_id].ref_count.load(std::memory_order_relaxed) <= 0; + } + + inline bool is_dead_block(LRUCache::BlockType block) const { + Entry &entry = entries_[block.vector_block.first]; + return block.vector_block.second != entry.load_count.load(); + } + + private: + size_t entry_num_{0}; + Entry *entries_{nullptr}; + moodycamel::ConcurrentQueue evict_cache_; +}; + +class VecBufferPoolHandle; + +class VecBufferPool { + public: + typedef std::shared_ptr Pointer; + + VecBufferPool(const std::string &filename); + ~VecBufferPool() { + for (size_t i = 0; i < page_table_.entry_num(); ++i) { + // A positive ref_count means a VecBufferPoolHandle is still alive, + // which is a contract violation: all handles must be destroyed before + // the pool itself is destroyed. + assert(page_table_.is_released(i)); + page_table_.evict_block(i); + } +#if defined(_MSC_VER) + _close(fd_); +#else + close(fd_); +#endif + } + + int init(size_t pool_capacity, size_t block_size, size_t segment_count); + + VecBufferPoolHandle get_handle(); + + char *acquire_buffer(block_id_t block_id, size_t offset, size_t size, + int retry = 0); + + int get_meta(size_t offset, size_t length, char *buffer); + + size_t file_size() const { + return file_size_; + } + + private: + int fd_; + size_t file_size_; + + public: + VectorPageTable page_table_; + + private: + std::vector> block_mutexes_; +}; + +class VecBufferPoolHandle { + public: + VecBufferPoolHandle(VecBufferPool &pool) : pool_(pool) {} + VecBufferPoolHandle(VecBufferPoolHandle &&other) : pool_(other.pool_) {} + + ~VecBufferPoolHandle() = default; + + typedef std::shared_ptr Pointer; + + char *get_block(size_t offset, size_t size, size_t block_id); + + int get_meta(size_t offset, size_t length, char *buffer); + + void release_one(block_id_t block_id); + + void acquire_one(block_id_t block_id); + + private: + VecBufferPool &pool_; +}; + +} // namespace ailego 
+} // namespace zvec \ No newline at end of file diff --git a/src/include/zvec/ailego/container/heap.h b/src/include/zvec/ailego/container/heap.h index fce03674d..33f4cb410 100644 --- a/src/include/zvec/ailego/container/heap.h +++ b/src/include/zvec/ailego/container/heap.h @@ -91,6 +91,9 @@ class Heap : public TBase { //! Pop the front element void pop(void) { + if (TBase::empty()) { + return; + } if (TBase::size() > 1) { auto last = TBase::end() - 1; this->replace_heap(TBase::begin(), last, std::move(*last)); diff --git a/src/include/zvec/core/framework/index_storage.h b/src/include/zvec/core/framework/index_storage.h index 8273004a3..677838ca8 100644 --- a/src/include/zvec/core/framework/index_storage.h +++ b/src/include/zvec/core/framework/index_storage.h @@ -14,7 +14,7 @@ #pragma once -#include +#include #include #include #include diff --git a/tests/core/algorithm/flat/flat_streamer_buffer_test.cc b/tests/core/algorithm/flat/flat_streamer_buffer_test.cc index 396e57616..d74c277e6 100644 --- a/tests/core/algorithm/flat/flat_streamer_buffer_test.cc +++ b/tests/core/algorithm/flat/flat_streamer_buffer_test.cc @@ -47,6 +47,7 @@ void FlatStreamerTest::TearDown(void) { } TEST_F(FlatStreamerTest, TestLinearSearch) { + MemoryLimitPool::get_instance().init(2 * 1024UL * 1024UL * 1024UL); IndexStreamer::Pointer write_streamer = IndexFactory::CreateStreamer("FlatStreamer"); ASSERT_TRUE(write_streamer != nullptr); @@ -168,6 +169,7 @@ TEST_F(FlatStreamerTest, TestLinearSearch) { } TEST_F(FlatStreamerTest, TestLinearSearchWithLRU) { + MemoryLimitPool::get_instance().init(100 * 1024UL * 1024UL); constexpr size_t static dim = 1600; IndexStreamer::Pointer write_streamer = IndexFactory::CreateStreamer("FlatStreamer"); @@ -187,7 +189,7 @@ TEST_F(FlatStreamerTest, TestLinearSearchWithLRU) { auto ctx = write_streamer->create_context(); ASSERT_TRUE(!!ctx); - size_t cnt = 1000000UL; + size_t cnt = 50000UL; IndexQueryMeta qmeta(IndexMeta::DT_FP32, dim); for (size_t i = 0; i < cnt; i++) 
{ NumericalVector vec(dim); diff --git a/tests/core/algorithm/flat/flat_streamer_buffer_time_test.cc b/tests/core/algorithm/flat/flat_streamer_buffer_time_test.cc index a76d5c573..a3c006320 100644 --- a/tests/core/algorithm/flat/flat_streamer_buffer_time_test.cc +++ b/tests/core/algorithm/flat/flat_streamer_buffer_time_test.cc @@ -137,6 +137,7 @@ TEST_F(FlatStreamerTest, TestLinearSearchMMap) { } TEST_F(FlatStreamerTest, TestLinearSearchBuffer) { + MemoryLimitPool::get_instance().init(2 * 1024UL * 1024UL * 1024UL); IndexStreamer::Pointer write_streamer = IndexFactory::CreateStreamer("FlatStreamer"); ASSERT_TRUE(write_streamer != nullptr); diff --git a/tests/core/algorithm/hnsw/hnsw_streamer_buffer_test.cc b/tests/core/algorithm/hnsw/hnsw_streamer_buffer_test.cc index 6f111a4bf..30f9d7cbb 100644 --- a/tests/core/algorithm/hnsw/hnsw_streamer_buffer_test.cc +++ b/tests/core/algorithm/hnsw/hnsw_streamer_buffer_test.cc @@ -48,6 +48,7 @@ void HnswStreamerTest::TearDown(void) { } TEST_F(HnswStreamerTest, TestHnswSearch) { + MemoryLimitPool::get_instance().init(2 * 1024UL * 1024UL * 1024UL); IndexStreamer::Pointer write_streamer = IndexFactory::CreateStreamer("HnswStreamer"); ASSERT_TRUE(write_streamer != nullptr); diff --git a/tests/core/interface/index_interface_test.cc b/tests/core/interface/index_interface_test.cc index 4d1aefd0b..a4c4abc5d 100644 --- a/tests/core/interface/index_interface_test.cc +++ b/tests/core/interface/index_interface_test.cc @@ -27,6 +27,7 @@ #include "zvec/core/interface/index_factory.h" #include "zvec/core/interface/index_param.h" #include "zvec/core/interface/index_param_builders.h" +#include #if defined(__GNUC__) || defined(__GNUG__) #pragma GCC diagnostic push @@ -155,6 +156,7 @@ TEST(IndexInterface, General) { } TEST(IndexInterface, BufferGeneral) { + zvec::ailego::MemoryLimitPool::get_instance().init(100 * 1024 * 1024); constexpr uint32_t kDimension = 64; const std::string index_name{"test.index"}; @@ -261,7 +263,7 @@ TEST(IndexInterface, 
BufferGeneral) { .with_fetch_vector(true) .with_ef_search(20) .build()); - zvec::ailego::BufferManager::Instance().cleanup(); + // zvec::ailego::BufferManager::Instance().cleanup(); } diff --git a/tests/db/index/column/vector_column_indexer_test.cc b/tests/db/index/column/vector_column_indexer_test.cc index cbaf2d502..b16c5cea1 100644 --- a/tests/db/index/column/vector_column_indexer_test.cc +++ b/tests/db/index/column/vector_column_indexer_test.cc @@ -17,6 +17,7 @@ #include #include #include +#include #include "db/index/column/vector_column/vector_column_params.h" #include "tests/test_util.h" #include "zvec/ailego/utility/float_helper.h" @@ -2136,6 +2137,7 @@ TEST(VectorColumnIndexerTest, Failure) { // Test case 10: use_mmap = false { + zvec::ailego::MemoryLimitPool::get_instance().init(10 * 1024UL * 1024UL); auto indexer = std::make_shared( index_file_path, FieldSchema("test", DataType::VECTOR_FP32, 3, false, diff --git a/tests/db/index/segment/segment_test.cc b/tests/db/index/segment/segment_test.cc index 9530b8cf1..a3267fd9e 100644 --- a/tests/db/index/segment/segment_test.cc +++ b/tests/db/index/segment/segment_test.cc @@ -38,6 +38,7 @@ #include "db/index/storage/wal/wal_file.h" #include "utils/utils.h" #include "zvec/db/options.h" +#include using namespace zvec; @@ -49,7 +50,7 @@ class SegmentTest : public testing::TestWithParam { FileHelper::RemoveDirectory(col_path); FileHelper::CreateDirectory(col_path); - ailego::BufferManager::Instance().init(MIN_MEMORY_LIMIT_BYTES, 1); + zvec::ailego::MemoryLimitPool::get_instance().init(MIN_MEMORY_LIMIT_BYTES); std::string idmap_path = FileHelper::MakeFilePath(col_path, FileID::ID_FILE, 0); diff --git a/tests/db/index/storage/bufferpool_store_test.cc b/tests/db/index/storage/bufferpool_store_test.cc index 9d4ba1881..3ea9024c1 100644 --- a/tests/db/index/storage/bufferpool_store_test.cc +++ b/tests/db/index/storage/bufferpool_store_test.cc @@ -34,7 +34,7 @@ class BufferPoolStoreTest : public testing::Test { std::cout << 
"err: " << s.message() << std::endl; exit(1); } - ailego::BufferManager::Instance().init(10 * 1024 * 1024, 1); + zvec::ailego::MemoryLimitPool::get_instance().init(10 * 1024 * 1024); } void TearDown() override {