From 3221ce01648dee7e9cdfb1bbfd02ebc966b700a6 Mon Sep 17 00:00:00 2001 From: Proteet Paul Date: Sun, 23 Nov 2025 18:18:57 -0600 Subject: [PATCH 01/20] Initial commit --- src/common/Cargo.toml | 3 + src/common/src/lib.rs | 1 + src/common/src/memory/arena.rs | 121 +++++++++ src/common/src/memory/global_pool.rs | 0 src/common/src/memory/mod.rs | 6 + src/common/src/memory/page.rs | 133 ++++++++++ src/common/src/memory/pool.rs | 188 ++++++++++++++ src/common/src/memory/segment.rs | 108 ++++++++ src/common/src/memory/tcache.rs | 370 +++++++++++++++++++++++++++ 9 files changed, 930 insertions(+) create mode 100644 src/common/src/memory/arena.rs create mode 100644 src/common/src/memory/global_pool.rs create mode 100644 src/common/src/memory/mod.rs create mode 100644 src/common/src/memory/page.rs create mode 100644 src/common/src/memory/pool.rs create mode 100644 src/common/src/memory/segment.rs create mode 100644 src/common/src/memory/tcache.rs diff --git a/src/common/Cargo.toml b/src/common/Cargo.toml index 0b593883..a3844bef 100644 --- a/src/common/Cargo.toml +++ b/src/common/Cargo.toml @@ -13,6 +13,9 @@ async-trait = { workspace = true } bytes = { workspace = true } chrono = "0.4.42" futures = { workspace = true } +io-uring = "0.7.11" +libc = "0.2.177" +log.workspace = true object_store = { workspace = true } prost = { workspace = true } serde = { workspace = true } diff --git a/src/common/src/lib.rs b/src/common/src/lib.rs index 05c5d68e..9d9682df 100644 --- a/src/common/src/lib.rs +++ b/src/common/src/lib.rs @@ -5,3 +5,4 @@ pub mod mock_store; pub mod rpc; pub mod utils; pub use io_mode::IoMode; +pub mod memory; diff --git a/src/common/src/memory/arena.rs b/src/common/src/memory/arena.rs new file mode 100644 index 00000000..bd9321aa --- /dev/null +++ b/src/common/src/memory/arena.rs @@ -0,0 +1,121 @@ +use std::{io, os::raw::c_void, ptr::{null, null_mut}}; + +use io_uring::IoUring; + +use crate::memory::{ + page::Slice, + segment::{SEGMENT_SIZE, SEGMENT_SIZE_BITS, 
Segment}, tcache::MIN_SIZE_FROM_PAGES, +}; + +const FIXED_BUFFERS_PER_SEGMENT: usize = SEGMENT_SIZE / MIN_SIZE_FROM_PAGES; + +pub struct Arena { + size: usize, + slices: Vec, + used_bitmap: Vec, + start_ptr: *mut u8, + buffers_registered: bool, +} + +unsafe impl Send for Arena {} +unsafe impl Sync for Arena {} + +impl Arena { + pub fn new(capacity: usize) -> Arena { + let mem_start = Self::allocate_memory_from_os(capacity); + assert_ne!(mem_start, null_mut()); + let mem_end = mem_start.wrapping_add(capacity); + let ptr_aligned = (mem_start as usize >> SEGMENT_SIZE_BITS) << SEGMENT_SIZE_BITS; + let mut slice_start = ptr_aligned; + if ptr_aligned != (mem_start as usize) { + slice_start = ptr_aligned + SEGMENT_SIZE; + } + let mut slices = Vec::new(); + while slice_start < mem_end as usize { + slices.push(Slice { + ptr: slice_start as *mut u8, + size: SEGMENT_SIZE, + }); + slice_start += SEGMENT_SIZE; + } + let mut used_bitmap = Vec::new(); + used_bitmap.resize(slices.len(), 0); + + Arena { + size: capacity, + slices: slices, + used_bitmap: used_bitmap, + start_ptr: mem_start, + buffers_registered: false, + } + } + + fn allocate_memory_from_os(capacity: usize) -> *mut u8 { + let prot = libc::PROT_READ | libc::PROT_WRITE; + let flags = libc::MAP_ANONYMOUS | libc::MAP_PRIVATE; + unsafe { libc::mmap64(null_mut(), capacity, prot, flags, -1, 0) as *mut u8 } + } + + pub fn allocate_segment(self: &mut Self, size: usize) -> Option<*mut Segment> { + let num_slices = (size + SEGMENT_SIZE - 1) / SEGMENT_SIZE; + let mut contiguous = 0; + let mut result: i32 = -1; + + for index in 0..self.used_bitmap.len() { + let bit = self.used_bitmap[index]; + if bit == 0 { + contiguous += 1; + if contiguous == num_slices { + result = (index + 1 - contiguous) as i32; + break; + } + } else { + contiguous = 0; + } + } + if result == -1 { + return None; + } + for i in 0..contiguous { + self.used_bitmap[result as usize + i] = 1; + } + let combined_slice = Slice { + ptr: self.slices[result as 
usize].ptr, + size: num_slices * SEGMENT_SIZE, + }; + let start_buffer_id = if self.buffers_registered { + Some(result as usize * FIXED_BUFFERS_PER_SEGMENT) + } else { + None + }; + Some(Segment::new_from_slice(combined_slice, start_buffer_id)) + } + + pub(crate) fn retire_segment(self: &mut Self, segment: *mut Segment) { + debug_assert!((self.slices[0].ptr as usize) <= segment as usize); + let segment_idx = (segment as usize - self.slices[0].ptr as usize) / SEGMENT_SIZE; + self.used_bitmap[segment_idx] = 0; + } + + pub(crate) fn register_buffers_with_ring(self: &mut Self, ring: &IoUring) -> io::Result<()> { + let num_buffers = self.size / MIN_SIZE_FROM_PAGES; + let mut buffers = Vec::::new(); + buffers.reserve(num_buffers); + let mut base_ptr = self.start_ptr; + for _i in 0..num_buffers { + buffers.push(libc::iovec {iov_base: base_ptr as *mut std::ffi::c_void, iov_len: MIN_SIZE_FROM_PAGES}); + base_ptr = unsafe { base_ptr.add(MIN_SIZE_FROM_PAGES) }; + } + unsafe { + ring.submitter().register_buffers(&buffers) + } + } +} + +impl Drop for Arena { + fn drop(self: &mut Self) { + unsafe { + libc::munmap(self.start_ptr as *mut c_void, self.size); + } + } +} \ No newline at end of file diff --git a/src/common/src/memory/global_pool.rs b/src/common/src/memory/global_pool.rs new file mode 100644 index 00000000..e69de29b diff --git a/src/common/src/memory/mod.rs b/src/common/src/memory/mod.rs new file mode 100644 index 00000000..75406a8e --- /dev/null +++ b/src/common/src/memory/mod.rs @@ -0,0 +1,6 @@ +pub mod page; +pub mod pool; +mod segment; +mod arena; +mod global_pool; +mod tcache; \ No newline at end of file diff --git a/src/common/src/memory/page.rs b/src/common/src/memory/page.rs new file mode 100644 index 00000000..2e13b7ff --- /dev/null +++ b/src/common/src/memory/page.rs @@ -0,0 +1,133 @@ +use std::{collections::VecDeque, ptr::null_mut, sync::{Mutex, atomic::{AtomicUsize, Ordering}}}; + +use crate::memory::tcache::MIN_SIZE_FROM_PAGES; + +#[derive(Clone, Copy)] 
+pub struct Block { + ptr: *mut u8, +} + +pub const PAGE_SIZE: usize = 256<<10; + +pub struct Page { + pub(crate) block_size: usize, // Size of objects that are being allocated to this page + // TODO(): Remove dependency on dynamically allocated memory + free_list: Mutex>, + pub(crate) used: AtomicUsize, + // local_free_list: VecDeque, + // thread_free_list: VecDeque, + pub(crate) capacity: usize, + pub(crate) slice_count: usize, // No. of pages in the slice containing this page + pub(crate) slice_offset: usize, // Offset of this page from the start of this slice + pub(crate) page_start: *mut u8, + pub(crate) start_buffer_id: Option, +} + +impl Page { + pub fn from_slice(slice: Slice, start_buffer_id: Option) -> Page { + // let mut start_ptr = slice.ptr; + let free_list = VecDeque::::new(); + Page { + block_size: 0usize, + free_list: Mutex::new(free_list), + used: AtomicUsize::new(0), + // local_free_list: VecDeque::new(), + // thread_free_list: VecDeque::new(), + capacity: slice.size, + slice_count: 1, + slice_offset: 0, + page_start: slice.ptr, + start_buffer_id + } + } + + pub fn set_block_size(self: &mut Self, block_size: usize) { + self.block_size = block_size; + let mut offset: usize = 0; + let mut guard = self.free_list.lock().unwrap(); + while offset < self.capacity { + let ptr = unsafe { self.page_start.add(offset) }; + guard.push_back(Block {ptr}); + offset += self.block_size; + } + } + + /** + * Returns (block, buffer id) pair + */ + #[inline] + pub fn get_free_block(self: &mut Self) -> (*mut u8, Option) { + let mut guard = self.free_list.lock().unwrap(); + let block = guard.pop_front(); + if block.is_none() { + return (null_mut(), None); + } + self.used.fetch_add(1usize, Ordering::Relaxed); + let ptr = block.unwrap().ptr; + if self.start_buffer_id.is_none() { + return (ptr, None) + } + + let buffer_offset = (ptr as usize - self.page_start as usize) / MIN_SIZE_FROM_PAGES; + (ptr, Some(self.start_buffer_id.unwrap() + buffer_offset)) + } + + #[inline] + 
pub fn is_full(self: &Self) -> bool { + let guard = self.free_list.lock().unwrap(); + guard.is_empty() + } + + #[inline] + pub fn get_size(self: &Self) -> usize { + self.capacity + } + + #[inline] + pub fn free(self: &mut Self, ptr: *mut u8) { + let blk = Block {ptr}; + let mut guard = self.free_list.lock().unwrap(); + guard.push_back(blk); + self.used.fetch_sub(1usize, Ordering::Relaxed); + } +} + +pub struct Slice { + pub ptr: *mut u8, + pub size: usize, +} + +impl Slice { + pub fn split(self: Self) -> (Slice, Slice) { + let new_size = self.size >> 1; + let slice1 = Slice { + ptr: self.ptr, + size: new_size, + }; + let slice2 = Slice { + ptr: self.ptr.wrapping_add(new_size), + size: new_size, + }; + (slice1, slice2) + } +} + +// pub struct PageQueue { +// page: *mut Page, +// next: *mut PageQueue, +// } + +// impl PageQueue { +// pub fn new() -> PageQueue { +// PageQueue { page: null_mut(), next: null_mut() } +// } + +// pub(crate) fn get_page(self: &mut Self) -> Option<*mut Page> { +// if self.page.is_null() { +// return None; +// } +// let result = self.page; +// self = *self.next; +// Some(result) +// } +// } \ No newline at end of file diff --git a/src/common/src/memory/pool.rs b/src/common/src/memory/pool.rs new file mode 100644 index 00000000..caa20a78 --- /dev/null +++ b/src/common/src/memory/pool.rs @@ -0,0 +1,188 @@ +extern crate io_uring; + +use std::sync::{Arc, Mutex, OnceLock, atomic::Ordering}; + +use futures::io; +use io_uring::IoUring; + +use crate::memory::{arena::Arena, segment::Segment, tcache::{TCache, TCacheStats}}; + +static FIXED_BUFFER_POOL: OnceLock = OnceLock::new(); + +pub struct FixedBufferPool { + local_caches: Vec>, + arena: Arc>, +} + +impl FixedBufferPool { + fn new(capacity_mb: usize) -> FixedBufferPool { + let num_cpus = std::thread::available_parallelism().unwrap(); + let arena = Self::allocate_arena(capacity_mb << 20); + let mut local_caches = Vec::>::new(); + for i in 0..num_cpus.get() { + 
local_caches.push(Mutex::new(TCache::new(arena.clone(), i))); + } + FixedBufferPool { local_caches, arena } + } + + pub fn allocate_arena(capacity: usize) -> Arc> { + Arc::new(Mutex::new(Arena::new(capacity))) + } + + pub fn init(capacity_mb: usize) { + FIXED_BUFFER_POOL.get_or_init(|| FixedBufferPool::new(capacity_mb)); + } + + fn get_thread_local_cache() -> &'static Mutex { + let cpu = unsafe { libc::sched_getcpu() }; + &FIXED_BUFFER_POOL.get().unwrap().local_caches[cpu as usize] + } + + pub fn malloc(size: usize) -> (*mut u8, Option) { + let local_cache = Self::get_thread_local_cache(); + local_cache.lock().unwrap().allocate(size) + } + + pub fn free(ptr: *mut u8) { + let segment_ptr = Segment::get_segment_from_ptr(ptr); + let page_ptr = unsafe { (*segment_ptr).get_page_from_ptr(ptr) }; + unsafe { + (*page_ptr).free(ptr); + } + // If page is local and unused after free, return it to segment + let thread_id = unsafe { (*segment_ptr).thread_id }; + let cur_cpu = unsafe { libc::sched_getcpu() as usize }; + if cur_cpu == thread_id { + let should_free_page = unsafe { (*page_ptr).used.load(Ordering::Relaxed) == 0 }; + if should_free_page { + let local_cache = Self::get_thread_local_cache(); + let mut guard = local_cache.lock().unwrap(); + guard.retire_page(page_ptr); + } + } + } + + pub fn register_buffers_with_ring(ring: &IoUring) -> io::Result<()> { + let pool = FIXED_BUFFER_POOL.get().unwrap(); + let mut arena_guard = pool.arena.lock().unwrap(); + arena_guard.register_buffers_with_ring(ring) + } + + pub(crate) fn get_stats(cpu: usize) -> TCacheStats { + let pool = FIXED_BUFFER_POOL.get().unwrap(); + let tcache = pool.local_caches[cpu].lock().unwrap(); + tcache.get_stats() + } +} + +impl Drop for FixedBufferPool { + fn drop(self: &mut Self) { + println!("Drop called"); + let arena = self.arena.lock().unwrap(); + drop(arena); + } +} + + +mod tests { + use std::ptr::null_mut; + + use crate::memory::pool::FixedBufferPool; + + #[test] + fn test_basic_alloc_and_free() { 
+ FixedBufferPool::init(128); + + let buffer_lengths = [4096, 4096, 4096 * 4]; // 2 different size classes + let mut ptrs = Vec::<*mut u8>::new(); + for len in buffer_lengths { + let (ptr, fixed_buffer) = FixedBufferPool::malloc(len); + assert_ne!(ptr, null_mut()); + assert_eq!(fixed_buffer, None); + // 4096 byte alignment is necessary for direct IO + assert_eq!(ptr as usize % 4096, 0); + + let buffer = unsafe { std::slice::from_raw_parts_mut(ptr, len) }; + buffer[0] = 1; + buffer[len-1] = 1; + ptrs.push(ptr); + } + + for ptr in ptrs { + FixedBufferPool::free(ptr); + } + + let cur_cpu = unsafe { libc::sched_getcpu() as usize }; + let stats = FixedBufferPool::get_stats(cur_cpu); + + assert_eq!(stats.allocations_from_arena, 1); + assert_eq!(stats.fast_allocations, 1); + assert_eq!(stats.pages_retired, 2); + assert_eq!(stats.segments_retired, 1); + } + + #[test] + fn test_free_from_different_thread() { + FixedBufferPool::init(128); + + let buffer_lengths = [4096, 4096 * 4]; + let mut buffers = Vec::<&mut [u8]>::new(); + for len in buffer_lengths { + let (ptr, fixed_buffer) = FixedBufferPool::malloc(len); + assert_ne!(ptr, null_mut()); + assert_eq!(fixed_buffer, None); + // 4096 byte alignment is necessary for direct IO + assert_eq!(ptr as usize % 4096, 0); + + let buffer = unsafe { std::slice::from_raw_parts_mut(ptr, len) }; + buffer[0] = 1; + buffer[len-1] = 1; + buffers.push(buffer); + } + + std::thread::spawn(move || { + for buffer in buffers { + let ptr = buffer.as_mut_ptr(); + FixedBufferPool::free(ptr); + } + }); + + let cur_cpu = unsafe { libc::sched_getcpu() as usize }; + let stats = FixedBufferPool::get_stats(cur_cpu); + assert_eq!(stats.allocations_from_arena, 1); + assert_eq!(stats.allocations_from_segment, 1); + assert_eq!(stats.fast_allocations, 0); + assert_eq!(stats.pages_retired, 0); + assert_eq!(stats.segments_retired, 0); + } + + #[test] + fn test_large_alloc_and_free() { + FixedBufferPool::init(128); + let len = 1024 * 1024; // 1 MB + let (ptr, 
fixed_buffer) = FixedBufferPool::malloc(len); + assert_ne!(ptr, null_mut()); + assert_eq!(fixed_buffer, None); + // 4096 byte alignment is necessary for direct IO + assert_eq!(ptr as usize % 4096, 0); + let buffer = unsafe { std::slice::from_raw_parts_mut(ptr, len) }; + buffer[0] = 1; + buffer[len-1] = 1; + FixedBufferPool::free(ptr); + + let cur_cpu = unsafe { libc::sched_getcpu() as usize }; + let stats = FixedBufferPool::get_stats(cur_cpu); + + assert_eq!(stats.allocations_from_arena, 1); + assert_eq!(stats.pages_retired, 1); + assert_eq!(stats.segments_retired, 1); + } + + #[test] + fn test_very_large_alloc_fails() { + FixedBufferPool::init(128); + let len = 32 * 1024 * 1024; // 32 MB + let (ptr, fixed_buffer) = FixedBufferPool::malloc(len); + assert_eq!(ptr, null_mut()); + } +} \ No newline at end of file diff --git a/src/common/src/memory/segment.rs b/src/common/src/memory/segment.rs new file mode 100644 index 00000000..37224038 --- /dev/null +++ b/src/common/src/memory/segment.rs @@ -0,0 +1,108 @@ +use std::ptr::null_mut; + +use crate::memory::{page::{PAGE_SIZE, Page, Slice}, tcache::MIN_SIZE_FROM_PAGES}; + +pub const SEGMENT_SIZE: usize = 32 * 1024 * 1024; +pub const SEGMENT_SIZE_BITS: usize = SEGMENT_SIZE.ilog2() as usize; + +pub const PAGES_PER_SEGMENT: usize = SEGMENT_SIZE / PAGE_SIZE; +const FIXED_BUFFERS_PER_PAGE: usize = PAGE_SIZE / MIN_SIZE_FROM_PAGES; + +pub struct Segment { + pub(crate) allocated: usize, + pub(crate) num_slices: usize, + pub(crate) pages: [Page; PAGES_PER_SEGMENT - 1], + pub(crate) thread_id: usize, + pub(crate) start_buffer_id: Option, +} + +impl Segment { + pub fn new_from_slice(slice: Slice, start_buffer_id: Option) -> *mut Segment { + // First sizeof(Segment) bytes should hold the Segment object + let segment_ptr = slice.ptr as *mut Segment; + let mut start_ptr = unsafe { slice.ptr.add(PAGE_SIZE) }; + + unsafe { + (*segment_ptr).allocated = 0; + (*segment_ptr).num_slices = PAGES_PER_SEGMENT - 1; + for i in 
0..(*segment_ptr).num_slices { + let page_start_buffer_id = if start_buffer_id.is_some() { + Some(start_buffer_id.unwrap() + i * FIXED_BUFFERS_PER_PAGE) + } else { + start_buffer_id + }; + (*segment_ptr).pages[i] = Page::from_slice(Slice {ptr: start_ptr, size: PAGE_SIZE}, page_start_buffer_id); + start_ptr = start_ptr.wrapping_add(PAGE_SIZE); + } + (*segment_ptr).start_buffer_id = start_buffer_id; + } + segment_ptr + } + + #[inline] + pub fn full(self: &mut Self) -> bool { + self.allocated == self.num_slices + } + + // pub fn try_allocate_page(self: &mut Self, page_size: usize) -> Slice { + // let min_bin = page_size / PAGE_SIZE; + // for i in min_bin..NUM_SPANS { + // let slice_opt = self.spans[i].pop_front(); + // if slice_opt.is_none() { + // continue; + // } + // let mut slice = slice_opt.unwrap(); + // let mut j = i; + // while j > min_bin && slice.size >= 2 * page_size { + // // split slice + // let (slice1, slice2) = slice.split(); + // self.spans[i-1].push_back(slice2); + // slice = slice1; + // j -= 1; + // } + // self.allocated += slice.size; + // return slice; + // } + // // Allocate from arena + + // Slice {ptr: null_mut(), size: 0} + // } + + pub fn retire(self: &mut Self) -> () { + todo!() + } + + pub fn get_segment_from_ptr(ptr: *mut u8) -> *mut Segment { + let aligned_ptr = (ptr as usize >> SEGMENT_SIZE_BITS) << SEGMENT_SIZE_BITS; + aligned_ptr as *mut Segment + } + + pub fn get_page_from_ptr(self: &mut Self, ptr: *mut u8) -> *mut Page { + let base_page_ptr = self.pages[0].page_start; + debug_assert!(ptr >= base_page_ptr); + let index = unsafe { + ptr.sub(base_page_ptr as usize) as usize / PAGE_SIZE + }; + debug_assert!(index < PAGES_PER_SEGMENT - 1); + &mut self.pages[index] as *mut Page + } + + /** + * Split `page` into 2, with the first partition having `num_slices` slices + */ + pub fn split_page(self: &mut Self, page: *mut Page, num_slices: usize) -> *mut Page { + debug_assert_ne!(page, null_mut()); + let page_ref = unsafe {&mut (*page)}; + let 
base_page_ptr = self.pages[0].page_start; + debug_assert!(page_ref.page_start >= base_page_ptr); + let index = unsafe { + page_ref.page_start.sub(base_page_ptr as usize) as usize / PAGE_SIZE + }; + debug_assert!(index < PAGES_PER_SEGMENT - 1); + let next_slice = &mut self.pages[index + num_slices]; + next_slice.slice_offset = 0; + next_slice.slice_count = page_ref.slice_count - num_slices; + page_ref.slice_count = num_slices; + next_slice as *mut Page + } +} \ No newline at end of file diff --git a/src/common/src/memory/tcache.rs b/src/common/src/memory/tcache.rs new file mode 100644 index 00000000..af830f95 --- /dev/null +++ b/src/common/src/memory/tcache.rs @@ -0,0 +1,370 @@ +use std::{ + ptr::null_mut, + sync::{Arc, Mutex, atomic::Ordering}, +}; + +use crate::memory::{ + arena::Arena, + page::{PAGE_SIZE, Page}, + segment::{PAGES_PER_SEGMENT, SEGMENT_SIZE, Segment}, +}; + +const SIZE_CLASSES: &'static [usize] = &[ + 4 << 10, + 8 << 10, + 16 << 10, + 32 << 10, + 64 << 10, + 128 << 10, + 256 << 10, +]; + +const NUM_SIZE_CLASSES: usize = SIZE_CLASSES.len(); + +pub(crate) const MIN_SIZE_FROM_PAGES: usize = SIZE_CLASSES[0]; +const MAX_SIZE_FROM_PAGES: usize = SIZE_CLASSES[NUM_SIZE_CLASSES - 1]; + +const SEGMENT_BINS: usize = 8; // (SEGMENT_SIZE/PAGE_SIZE).log2() + 1 + +#[derive(Default, Clone)] +pub(crate) struct TCacheStats { + // Allocation stats + pub(crate) total_allocations: usize, + pub(crate) total_segments_allocated: usize, + pub(crate) fast_allocations: usize, // Allocations from self.free_pages + pub(crate) allocations_from_pages: usize, // Allocations from self.used_pages + pub(crate) allocations_from_segment: usize, + pub(crate) allocations_from_arena: usize, + + // Deallocation stats + pub(crate) pages_retired: usize, + pub(crate) segments_retired: usize, + // TODO(): Add more stats such as number of local frees and frees from another thread +} + +impl TCacheStats { + pub(crate) fn new() -> TCacheStats { + TCacheStats::default() + } + + #[allow(unused)] + 
pub(crate) fn print(self: &Self) { + println!("Total allocations: {}", self.total_allocations); + println!("Fast allocations: {}", self.fast_allocations); + println!("Allocations from pages: {}", self.allocations_from_pages); + println!("Allocations from segment: {}", self.allocations_from_segment); + println!("Allocations from arena: {}", self.allocations_from_arena); + println!("Pages retired: {}", self.pages_retired); + println!("Segments retired: {}", self.segments_retired); + } +} + +pub(crate) struct TCache { + free_pages: [*mut Page; NUM_SIZE_CLASSES], + used_pages: [Vec<*mut Page>; NUM_SIZE_CLASSES], + // TODO: Use a linked list for O(1) deletion + spans: [Vec<*mut Page>; SEGMENT_BINS], + arena: Arc>, + thread_id: usize, + stats: TCacheStats, +} + +unsafe impl Send for TCache {} +unsafe impl Sync for TCache {} + +impl TCache { + pub(crate) fn new(arena: Arc>, thread_id: usize) -> TCache { + TCache { + free_pages: [const { null_mut() }; NUM_SIZE_CLASSES], + used_pages: [const { Vec::<*mut Page>::new() }; NUM_SIZE_CLASSES], + spans: [const { Vec::<*mut Page>::new() }; SEGMENT_BINS], + arena: arena.clone(), + thread_id, + stats: TCacheStats::new(), + } + } + + #[inline] + fn get_size_class(size: usize) -> usize { + if size <= MIN_SIZE_FROM_PAGES { + return 0; + } + (size.next_power_of_two() / MIN_SIZE_FROM_PAGES).trailing_zeros() as usize + } + + // #[inline] + // fn get_span_idx_from_size(size: usize) -> usize { + // ((size + PAGE_SIZE - 1) / PAGE_SIZE) + // .next_power_of_two() + // .trailing_zeros() as usize + // } + + /** + * Get the smallest bin which could contiguous runs of at least `slice_count` pages + */ + #[inline] + fn get_span_idx_from_slice_count(slice_count: usize) -> usize { + slice_count.next_power_of_two().trailing_zeros() as usize + } + + /** + * Get the smallest bin holding continuous runs of at least `slice_count` pages + */ + #[inline] + fn get_smallest_bin_for_slice_count(slice_count: usize) -> usize { + (slice_count + 
1).next_power_of_two().trailing_zeros() as usize - 1usize + } + + // TODO(): Use per-page pointers to speed up the removal + fn remove_slice_from_span(self: &mut Self, slice: &mut Page) -> bool { + let span_idx = Self::get_span_idx_from_slice_count(slice.slice_count); + for i in 0..self.spans[span_idx].len() { + if self.spans[span_idx][i] == slice { + self.spans[span_idx].remove(i); + break; + } + } + return true; + } + + fn coalesce_slices(self: &mut Self, left_slice: &mut Page, right_slice: &mut Page) { + left_slice.slice_offset = 0; + right_slice.slice_offset = left_slice.slice_count; + left_slice.slice_count += right_slice.slice_count; + } + + fn retire_segment(self: &mut Self, segment: *mut Segment) { + self.stats.segments_retired += 1; + let pages = unsafe { &mut (*segment).pages }; + let mut slice_idx: usize = 0; + while slice_idx < PAGES_PER_SEGMENT - 1 { + self.remove_slice_from_span(&mut pages[slice_idx]); + slice_idx += pages[slice_idx].slice_count; + } + let mut guard = self.arena.lock().unwrap(); + guard.retire_segment(segment); + } + + fn remove_page_from_used_queue(self: &mut Self, page_ptr: *mut Page) { + let size_class = Self::get_size_class(unsafe { (*page_ptr).block_size }); + if size_class >= NUM_SIZE_CLASSES { + return + } + for i in 0..self.used_pages[size_class].len() { + if self.used_pages[size_class][i] == page_ptr { + self.used_pages[size_class].remove(i); + return + } + } + } + + fn remove_page_from_free_queue(self: &mut Self, page_ptr: *mut Page) { + let size_class = Self::get_size_class(unsafe { (*page_ptr).block_size }); + if size_class < NUM_SIZE_CLASSES && self.free_pages[size_class] == page_ptr { + self.free_pages[size_class] = null_mut(); + } + } + + pub(crate) fn retire_page(self: &mut Self, page: *mut Page) { + self.stats.pages_retired += 1; + self.remove_page_from_used_queue(page); + self.remove_page_from_free_queue(page); + let page_ref = unsafe { &mut (*page) }; + page_ref.block_size = 0; + + let segment_ptr = 
Segment::get_segment_from_ptr(page as *mut u8); + let segment = unsafe { &mut *segment_ptr }; + segment.allocated -= page_ref.slice_count; + if segment.allocated == 0 { + // Return segment to arena + self.retire_segment(segment_ptr); + return; + } + + let next_slice = page.wrapping_add(page_ref.slice_count); + if next_slice < (&mut segment.pages[PAGES_PER_SEGMENT - 2]) as *mut Page { + let next_slice_ref = unsafe { &mut (*next_slice) }; + if next_slice_ref.block_size == 0 { + // Page is not in use, remove it + self.remove_slice_from_span(next_slice_ref); + self.coalesce_slices(page_ref, unsafe { &mut (*next_slice) }); + } + } + + let mut prev_slice = page.wrapping_sub(1); + if prev_slice >= (&mut segment.pages[0]) as *mut Page { + prev_slice = prev_slice.wrapping_sub(unsafe { (*prev_slice).slice_offset }); + let prev_slice_ref = unsafe { &mut (*prev_slice) }; + if prev_slice_ref.block_size == 0 { + // Merge with the previous slice + self.remove_slice_from_span(prev_slice_ref); + self.coalesce_slices(prev_slice_ref, page_ref); + let span_idx = Self::get_span_idx_from_slice_count(prev_slice_ref.slice_count); + self.spans[span_idx].push(prev_slice); + } + } + + let span_idx = Self::get_span_idx_from_slice_count(page_ref.slice_count); + self.spans[span_idx].push(page); + } + + fn cleanup_pages(self: &mut Self) { + for i in 0..self.used_pages.len() { + for page_idx in 0..self.used_pages[i].len() { + let page = self.used_pages[i][page_idx]; + unsafe { + if (*page).used.load(Ordering::Relaxed) == 0 { + self.retire_page(page); + self.used_pages[i].remove(page_idx); + } + } + } + } + } + + fn find_page_from_used(self: &mut Self, bin: usize) -> (*mut u8, Option) { + for i in 0..self.used_pages[bin].len() { + unsafe { + if (*self.used_pages[bin][i]).is_full() { + continue; + } + let page = self.used_pages[bin].remove(i); + let (block, buffer_id) = (*page).get_free_block(); + self.free_pages[bin] = page; + return (block, buffer_id) + } + } + (null_mut(), None) + } + + fn 
find_page_from_spans(self: &mut Self, num_slices_required: usize, block_size: usize) -> *mut Page { + debug_assert!(block_size >= MIN_SIZE_FROM_PAGES); + let min_bin = Self::get_span_idx_from_slice_count(num_slices_required); + println!("Min bin: {min_bin}"); + for i in min_bin..SEGMENT_BINS { + let bin = &mut self.spans[i]; + if bin.is_empty() { + continue; + } + let slice = bin.pop().unwrap(); + let num_slices_original = unsafe { (*slice).slice_count }; + println!("Slice count original: {num_slices_original}"); + println!("Slice count required: {num_slices_required}"); + + if num_slices_original > num_slices_required { + // split slice + let segment = Segment::get_segment_from_ptr(slice as *mut u8); + let next_slice = unsafe { (*segment).split_page(slice, num_slices_required) }; + let bin = Self::get_span_idx_from_slice_count(num_slices_original - num_slices_required); + unsafe { + (*segment).allocated += num_slices_required; + } + self.spans[bin].push(next_slice); + } + unsafe { + (*slice).set_block_size(block_size); + } + return slice; + } + null_mut() + } + + fn add_segment_to_spans(self: &mut Self, segment: *mut Segment) { + let segment_ref = unsafe { &mut (*segment) }; + let slice_count = segment_ref.num_slices; + let span_idx = Self::get_span_idx_from_slice_count(slice_count); + let page = &mut segment_ref.pages[0]; + page.slice_count = slice_count; + page.slice_offset = 0; + self.spans[span_idx].push(page as *mut Page); + } + + fn allocate_segment_from_arena(self: &mut Self, thread_id: usize) -> bool { + self.stats.total_segments_allocated += 1; + let segment_opt = { + let mut guard = self.arena.lock().unwrap(); + guard.allocate_segment(SEGMENT_SIZE) + }; + if segment_opt.is_none() { + return false; + } + unsafe { + (*segment_opt.unwrap()).thread_id = thread_id; + } + + self.add_segment_to_spans(segment_opt.unwrap()); + true + } + + pub(crate) fn allocate(self: &mut Self, size: usize) -> (*mut u8, Option) { + self.stats.total_allocations += 1; + if size > 
MAX_SIZE_FROM_PAGES { + // Directly get page from segment + let num_pages = (size + PAGE_SIZE - 1) / PAGE_SIZE; + let block_size = num_pages * PAGE_SIZE; + let mut free_page = self.find_page_from_spans(num_pages, block_size); + if free_page != null_mut() { + self.stats.allocations_from_segment += 1; + let (free_block, buffer_id) = unsafe { (*free_page).get_free_block() }; + return (free_block, buffer_id); + } + self.stats.allocations_from_arena += 1; + let res = self.allocate_segment_from_arena(self.thread_id); + if !res { + return (null_mut(), None); + } + free_page = self.find_page_from_spans(num_pages, block_size); + debug_assert_eq!(block_size, unsafe { (*free_page).block_size }); + assert_ne!(free_page, null_mut()); + let (free_block, buffer_id) = unsafe { (*free_page).get_free_block() }; + return (free_block, buffer_id) + } + let size_class = Self::get_size_class(size); + debug_assert!(size_class < NUM_SIZE_CLASSES); + + let block_size = SIZE_CLASSES[size_class]; + let mut free_page = self.free_pages[size_class]; + if !free_page.is_null() { + // allocate from free page + let page = free_page.clone(); + unsafe { + if (*page).is_full() { + self.used_pages[size_class].push(page); + self.free_pages[size_class] = null_mut(); + } else { + self.stats.fast_allocations += 1; + let (ptr, buffer_id) = (*page).get_free_block(); + return (ptr, buffer_id) + } + } + } + let (block, buffer_id) = self.find_page_from_used(size_class); + if !block.is_null() { + self.stats.allocations_from_pages += 1; + return (block, buffer_id); + } + free_page = self.find_page_from_spans(1, block_size); + if free_page != null_mut() { + self.stats.allocations_from_segment += 1; + let (free_block, buffer_id) = unsafe { (*free_page).get_free_block() }; + self.free_pages[size_class] = free_page; + return (free_block, buffer_id); + } + // No space available in segments, allocate a new one + self.stats.allocations_from_arena += 1; + let res = self.allocate_segment_from_arena(self.thread_id); + if 
!res { + return (null_mut(), None); + } + free_page = self.find_page_from_spans(1, size); + assert_ne!(free_page, null_mut()); + let (free_block, buffer_id) = unsafe { (*free_page).get_free_block() }; + self.free_pages[size_class] = free_page; + return (free_block, buffer_id) + } + + #[allow(unused)] + pub(crate) fn get_stats(self: &Self) -> TCacheStats { + self.stats.clone() + } +} \ No newline at end of file From 22dfee8595e9573c51db2b64bc7fdd1e04807a84 Mon Sep 17 00:00:00 2001 From: Proteet Paul Date: Sun, 23 Nov 2025 18:49:10 -0600 Subject: [PATCH 02/20] Fix bin computation for spans --- src/common/src/memory/pool.rs | 24 +++++++++++++- src/common/src/memory/tcache.rs | 56 +++++++++++++++------------------ 2 files changed, 49 insertions(+), 31 deletions(-) diff --git a/src/common/src/memory/pool.rs b/src/common/src/memory/pool.rs index caa20a78..46bd49d7 100644 --- a/src/common/src/memory/pool.rs +++ b/src/common/src/memory/pool.rs @@ -178,11 +178,33 @@ mod tests { assert_eq!(stats.segments_retired, 1); } + #[test] + fn test_large_alloc_and_free2() { + FixedBufferPool::init(128); + let len = 3 * 1024 * 1024; // 1 MB + let (ptr, fixed_buffer) = FixedBufferPool::malloc(len); + assert_ne!(ptr, null_mut()); + assert_eq!(fixed_buffer, None); + // 4096 byte alignment is necessary for direct IO + assert_eq!(ptr as usize % 4096, 0); + let buffer = unsafe { std::slice::from_raw_parts_mut(ptr, len) }; + buffer[0] = 1; + buffer[len-1] = 1; + FixedBufferPool::free(ptr); + + let cur_cpu = unsafe { libc::sched_getcpu() as usize }; + let stats = FixedBufferPool::get_stats(cur_cpu); + + assert_eq!(stats.allocations_from_arena, 1); + assert_eq!(stats.pages_retired, 1); + assert_eq!(stats.segments_retired, 1); + } + #[test] fn test_very_large_alloc_fails() { FixedBufferPool::init(128); let len = 32 * 1024 * 1024; // 32 MB - let (ptr, fixed_buffer) = FixedBufferPool::malloc(len); + let (ptr, _fixed_buffer) = FixedBufferPool::malloc(len); assert_eq!(ptr, null_mut()); } } \ No 
newline at end of file diff --git a/src/common/src/memory/tcache.rs b/src/common/src/memory/tcache.rs index af830f95..178903d1 100644 --- a/src/common/src/memory/tcache.rs +++ b/src/common/src/memory/tcache.rs @@ -24,7 +24,7 @@ const NUM_SIZE_CLASSES: usize = SIZE_CLASSES.len(); pub(crate) const MIN_SIZE_FROM_PAGES: usize = SIZE_CLASSES[0]; const MAX_SIZE_FROM_PAGES: usize = SIZE_CLASSES[NUM_SIZE_CLASSES - 1]; -const SEGMENT_BINS: usize = 8; // (SEGMENT_SIZE/PAGE_SIZE).log2() + 1 +const SEGMENT_BINS: usize = 7; // (SEGMENT_SIZE/PAGE_SIZE).log2() + 1 #[derive(Default, Clone)] pub(crate) struct TCacheStats { @@ -100,18 +100,10 @@ impl TCache { // } /** - * Get the smallest bin which could contiguous runs of at least `slice_count` pages + * Get the smallest bin which can hold contiguous runs of `slice_count` pages */ #[inline] fn get_span_idx_from_slice_count(slice_count: usize) -> usize { - slice_count.next_power_of_two().trailing_zeros() as usize - } - - /** - * Get the smallest bin holding continuous runs of at least `slice_count` pages - */ - #[inline] - fn get_smallest_bin_for_slice_count(slice_count: usize) -> usize { (slice_count + 1).next_power_of_two().trailing_zeros() as usize - 1usize } @@ -240,31 +232,32 @@ impl TCache { fn find_page_from_spans(self: &mut Self, num_slices_required: usize, block_size: usize) -> *mut Page { debug_assert!(block_size >= MIN_SIZE_FROM_PAGES); let min_bin = Self::get_span_idx_from_slice_count(num_slices_required); - println!("Min bin: {min_bin}"); for i in min_bin..SEGMENT_BINS { let bin = &mut self.spans[i]; - if bin.is_empty() { - continue; - } - let slice = bin.pop().unwrap(); - let num_slices_original = unsafe { (*slice).slice_count }; - println!("Slice count original: {num_slices_original}"); - println!("Slice count required: {num_slices_required}"); - - if num_slices_original > num_slices_required { - // split slice - let segment = Segment::get_segment_from_ptr(slice as *mut u8); - let next_slice = unsafe { 
(*segment).split_page(slice, num_slices_required) }; - let bin = Self::get_span_idx_from_slice_count(num_slices_original - num_slices_required); + for j in 0..bin.len() { + let slice = bin[j]; + let num_slices_original = unsafe { (*slice).slice_count }; + debug_assert!(num_slices_original >= 1 << i); + if num_slices_original < num_slices_required { + continue; + } + bin.remove(j); + if num_slices_original > num_slices_required { + // split slice + let segment = Segment::get_segment_from_ptr(slice as *mut u8); + let next_slice = unsafe { (*segment).split_page(slice, num_slices_required) }; + let bin = Self::get_span_idx_from_slice_count(num_slices_original - num_slices_required); + unsafe { + (*segment).allocated += num_slices_required; + } + self.spans[bin].push(next_slice); + } unsafe { - (*segment).allocated += num_slices_required; + (*slice).set_block_size(block_size); } - self.spans[bin].push(next_slice); - } - unsafe { - (*slice).set_block_size(block_size); + return slice; + } - return slice; } null_mut() } @@ -314,6 +307,9 @@ impl TCache { return (null_mut(), None); } free_page = self.find_page_from_spans(num_pages, block_size); + if free_page == null_mut() { + return (null_mut(), None) + } debug_assert_eq!(block_size, unsafe { (*free_page).block_size }); assert_ne!(free_page, null_mut()); let (free_block, buffer_id) = unsafe { (*free_page).get_free_block() }; From ff98c601ce869ce3b1242f534687ee5351837f84 Mon Sep 17 00:00:00 2001 From: Proteet Paul Date: Mon, 24 Nov 2025 01:51:00 -0600 Subject: [PATCH 03/20] Basic uring case working --- src/common/Cargo.toml | 2 ++ src/common/src/memory/arena.rs | 24 ++++++++----- src/common/src/memory/pool.rs | 59 +++++++++++++++++++++++++++++++- src/common/src/memory/segment.rs | 3 +- 4 files changed, 78 insertions(+), 10 deletions(-) diff --git a/src/common/Cargo.toml b/src/common/Cargo.toml index a3844bef..f93cad81 100644 --- a/src/common/Cargo.toml +++ b/src/common/Cargo.toml @@ -18,7 +18,9 @@ libc = "0.2.177" 
log.workspace = true object_store = { workspace = true } prost = { workspace = true } +rand = "0.9.2" serde = { workspace = true } +tempfile.workspace = true thiserror = "2.0.17" tokio = { workspace = true } url = { workspace = true } diff --git a/src/common/src/memory/arena.rs b/src/common/src/memory/arena.rs index bd9321aa..4e06c47b 100644 --- a/src/common/src/memory/arena.rs +++ b/src/common/src/memory/arena.rs @@ -13,7 +13,12 @@ pub struct Arena { size: usize, slices: Vec, used_bitmap: Vec, - start_ptr: *mut u8, + /** + * Segments need to be aligned to 32MB boundaries. Hence the first segment's starting address + * could be different from the starting address of the allocated memory + */ + aligned_start_ptr: *mut u8, + actual_start_ptr: *mut u8, buffers_registered: bool, } @@ -31,7 +36,7 @@ impl Arena { slice_start = ptr_aligned + SEGMENT_SIZE; } let mut slices = Vec::new(); - while slice_start < mem_end as usize { + while slice_start + SEGMENT_SIZE <= mem_end as usize { slices.push(Slice { ptr: slice_start as *mut u8, size: SEGMENT_SIZE, @@ -45,7 +50,8 @@ impl Arena { size: capacity, slices: slices, used_bitmap: used_bitmap, - start_ptr: mem_start, + aligned_start_ptr: ptr_aligned as *mut u8, + actual_start_ptr: mem_start, buffers_registered: false, } } @@ -101,21 +107,23 @@ impl Arena { let num_buffers = self.size / MIN_SIZE_FROM_PAGES; let mut buffers = Vec::::new(); buffers.reserve(num_buffers); - let mut base_ptr = self.start_ptr; + let mut base_ptr = self.aligned_start_ptr; for _i in 0..num_buffers { buffers.push(libc::iovec {iov_base: base_ptr as *mut std::ffi::c_void, iov_len: MIN_SIZE_FROM_PAGES}); - base_ptr = unsafe { base_ptr.add(MIN_SIZE_FROM_PAGES) }; + base_ptr = base_ptr.wrapping_add(MIN_SIZE_FROM_PAGES); } - unsafe { + let res = unsafe { ring.submitter().register_buffers(&buffers) - } + }; + self.buffers_registered = res.is_ok(); + res } } impl Drop for Arena { fn drop(self: &mut Self) { unsafe { - libc::munmap(self.start_ptr as *mut c_void, 
self.size); + libc::munmap(self.actual_start_ptr as *mut c_void, self.size); } } } \ No newline at end of file diff --git a/src/common/src/memory/pool.rs b/src/common/src/memory/pool.rs index 46bd49d7..65edc951 100644 --- a/src/common/src/memory/pool.rs +++ b/src/common/src/memory/pool.rs @@ -85,7 +85,11 @@ impl Drop for FixedBufferPool { mod tests { - use std::ptr::null_mut; + use std::{io::Write, os::fd::AsRawFd, ptr::{null, null_mut}}; + + use io_uring::{IoUring, cqueue, opcode, squeue}; + use libc::rlimit; + use rand::RngCore as _; use crate::memory::pool::FixedBufferPool; @@ -207,4 +211,57 @@ mod tests { let (ptr, _fixed_buffer) = FixedBufferPool::malloc(len); assert_eq!(ptr, null_mut()); } + + #[test] + fn test_with_uring_basic() { + let mut rlimit = libc::rlimit{ + rlim_cur: 0, + rlim_max: 0, + }; + unsafe { libc::getrlimit(libc::RLIMIT_MEMLOCK, &mut rlimit); } + assert!(64 * 1024 <= rlimit.rlim_max, "rlimit.MEMLOCK should be at least 64 MB to test the fixed-buffer pool. Current rlimit is: {} KB", rlimit.rlim_max); + FixedBufferPool::init(64); + + let mut ring = IoUring::::builder().build(32).unwrap(); + let res = FixedBufferPool::register_buffers_with_ring(&ring); + assert!(res.is_ok()); + + const LEN: usize = 4096; + let mut file = tempfile::tempfile().unwrap(); + let (ptr, id) = FixedBufferPool::malloc(LEN); + assert_ne!(ptr, null_mut()); + assert!(id.is_some()); + + let mut random_bytes = [0u8; LEN]; + let mut rng = rand::rng(); + rng.fill_bytes(&mut random_bytes); + let mut res = file.write(&random_bytes); + assert!(res.is_ok(), "Failed to write to temp file"); + + { + let sqe = opcode::ReadFixed::new( + io_uring::types::Fd(file.as_raw_fd()), + ptr, + LEN as u32, + id.unwrap() as u16) + .offset(0).build(); + let mut sq = ring.submission(); + let res = unsafe { sq.push(&sqe) }; + assert!(res.is_ok(), "Failed to submit to io uring"); + sq.sync(); + } + res = ring.submit_and_wait(1); + assert!(res.is_ok(), "Failed to submit"); + + { + let mut cq = 
ring.completion(); + let cqe = cq.next(); + assert!(cqe.is_some()); + assert_eq!(cqe.as_ref().unwrap().result(), LEN as i32, "{}", std::io::Error::from_raw_os_error(-cqe.unwrap().result())); + } + let buffer = unsafe { std::slice::from_raw_parts_mut(ptr, LEN) }; + assert_eq!(buffer, random_bytes); + } + + // Test with vector maybe? } \ No newline at end of file diff --git a/src/common/src/memory/segment.rs b/src/common/src/memory/segment.rs index 37224038..c957a893 100644 --- a/src/common/src/memory/segment.rs +++ b/src/common/src/memory/segment.rs @@ -27,7 +27,8 @@ impl Segment { (*segment_ptr).num_slices = PAGES_PER_SEGMENT - 1; for i in 0..(*segment_ptr).num_slices { let page_start_buffer_id = if start_buffer_id.is_some() { - Some(start_buffer_id.unwrap() + i * FIXED_BUFFERS_PER_PAGE) + let offset = (start_ptr as usize - segment_ptr as usize) / MIN_SIZE_FROM_PAGES; + Some(start_buffer_id.unwrap() + offset) } else { start_buffer_id }; From a8f4e74d8e06a57e66a293edaa3fc11522df1a52 Mon Sep 17 00:00:00 2001 From: Proteet Paul Date: Sun, 30 Nov 2025 15:29:08 -0600 Subject: [PATCH 04/20] Add uring glue code --- src/common/src/memory/arena.rs | 24 ++-- src/common/src/memory/page.rs | 18 +-- src/common/src/memory/pool.rs | 211 +++++++++++++++++++++++++------ src/common/src/memory/segment.rs | 15 +-- src/common/src/memory/tcache.rs | 39 +++--- 5 files changed, 205 insertions(+), 102 deletions(-) diff --git a/src/common/src/memory/arena.rs b/src/common/src/memory/arena.rs index 4e06c47b..8258ba63 100644 --- a/src/common/src/memory/arena.rs +++ b/src/common/src/memory/arena.rs @@ -1,14 +1,11 @@ -use std::{io, os::raw::c_void, ptr::{null, null_mut}}; +use std::{io, os::raw::c_void, ptr::null_mut}; use io_uring::IoUring; use crate::memory::{ - page::Slice, - segment::{SEGMENT_SIZE, SEGMENT_SIZE_BITS, Segment}, tcache::MIN_SIZE_FROM_PAGES, + page::Slice, pool::{FIXED_BUFFER_BITS, FIXED_BUFFER_SIZE_BYTES}, segment::{SEGMENT_SIZE, SEGMENT_SIZE_BITS, Segment} }; -const 
FIXED_BUFFERS_PER_SEGMENT: usize = SEGMENT_SIZE / MIN_SIZE_FROM_PAGES; - pub struct Arena { size: usize, slices: Vec, @@ -89,12 +86,11 @@ impl Arena { ptr: self.slices[result as usize].ptr, size: num_slices * SEGMENT_SIZE, }; - let start_buffer_id = if self.buffers_registered { - Some(result as usize * FIXED_BUFFERS_PER_SEGMENT) - } else { - None - }; - Some(Segment::new_from_slice(combined_slice, start_buffer_id)) + Some(Segment::new_from_slice(combined_slice)) + } + + pub(crate) fn start_ptr(self: &Self) -> *mut u8 { + self.aligned_start_ptr } pub(crate) fn retire_segment(self: &mut Self, segment: *mut Segment) { @@ -104,13 +100,13 @@ impl Arena { } pub(crate) fn register_buffers_with_ring(self: &mut Self, ring: &IoUring) -> io::Result<()> { - let num_buffers = self.size / MIN_SIZE_FROM_PAGES; + let num_buffers = self.size >> FIXED_BUFFER_BITS; let mut buffers = Vec::::new(); buffers.reserve(num_buffers); let mut base_ptr = self.aligned_start_ptr; for _i in 0..num_buffers { - buffers.push(libc::iovec {iov_base: base_ptr as *mut std::ffi::c_void, iov_len: MIN_SIZE_FROM_PAGES}); - base_ptr = base_ptr.wrapping_add(MIN_SIZE_FROM_PAGES); + buffers.push(libc::iovec {iov_base: base_ptr as *mut std::ffi::c_void, iov_len: FIXED_BUFFER_SIZE_BYTES}); + base_ptr = base_ptr.wrapping_add(FIXED_BUFFER_SIZE_BYTES); } let res = unsafe { ring.submitter().register_buffers(&buffers) diff --git a/src/common/src/memory/page.rs b/src/common/src/memory/page.rs index 2e13b7ff..0dfebf01 100644 --- a/src/common/src/memory/page.rs +++ b/src/common/src/memory/page.rs @@ -1,7 +1,5 @@ use std::{collections::VecDeque, ptr::null_mut, sync::{Mutex, atomic::{AtomicUsize, Ordering}}}; -use crate::memory::tcache::MIN_SIZE_FROM_PAGES; - #[derive(Clone, Copy)] pub struct Block { ptr: *mut u8, @@ -20,11 +18,10 @@ pub struct Page { pub(crate) slice_count: usize, // No. 
of pages in the slice containing this page pub(crate) slice_offset: usize, // Offset of this page from the start of this slice pub(crate) page_start: *mut u8, - pub(crate) start_buffer_id: Option, } impl Page { - pub fn from_slice(slice: Slice, start_buffer_id: Option) -> Page { + pub fn from_slice(slice: Slice) -> Page { // let mut start_ptr = slice.ptr; let free_list = VecDeque::::new(); Page { @@ -37,7 +34,6 @@ impl Page { slice_count: 1, slice_offset: 0, page_start: slice.ptr, - start_buffer_id } } @@ -56,20 +52,14 @@ impl Page { * Returns (block, buffer id) pair */ #[inline] - pub fn get_free_block(self: &mut Self) -> (*mut u8, Option) { + pub fn get_free_block(self: &mut Self) -> *mut u8 { let mut guard = self.free_list.lock().unwrap(); let block = guard.pop_front(); if block.is_none() { - return (null_mut(), None); + return null_mut() } self.used.fetch_add(1usize, Ordering::Relaxed); - let ptr = block.unwrap().ptr; - if self.start_buffer_id.is_none() { - return (ptr, None) - } - - let buffer_offset = (ptr as usize - self.page_start as usize) / MIN_SIZE_FROM_PAGES; - (ptr, Some(self.start_buffer_id.unwrap() + buffer_offset)) + block.unwrap().ptr } #[inline] diff --git a/src/common/src/memory/pool.rs b/src/common/src/memory/pool.rs index 65edc951..63bd3fcc 100644 --- a/src/common/src/memory/pool.rs +++ b/src/common/src/memory/pool.rs @@ -1,6 +1,7 @@ extern crate io_uring; -use std::sync::{Arc, Mutex, OnceLock, atomic::Ordering}; +use core::slice; +use std::{cmp::min, sync::{Arc, Mutex, OnceLock, atomic::{AtomicBool, Ordering}}}; use futures::io; use io_uring::IoUring; @@ -9,20 +10,66 @@ use crate::memory::{arena::Arena, segment::Segment, tcache::{TCache, TCacheStats static FIXED_BUFFER_POOL: OnceLock = OnceLock::new(); +pub const FIXED_BUFFER_SIZE_BYTES: usize = 1 << 20; +pub const FIXED_BUFFER_BITS: u32 = FIXED_BUFFER_SIZE_BYTES.trailing_zeros(); + +pub struct FixedBuffer { + pub ptr: *mut u8, + pub buf_id: usize, + pub bytes: usize, +} + +pub struct 
FixedBufferAllocation { + pub ptr: *mut u8, + pub size: usize, +} + +unsafe impl Send for FixedBufferAllocation {} + +impl AsRef<[u8]> for FixedBufferAllocation { + fn as_ref(&self) -> &[u8] { + unsafe { slice::from_raw_parts(self.ptr, self.size) } + } +} + +impl Drop for FixedBufferAllocation { + fn drop(&mut self) { + FixedBufferPool::free(self.ptr); + } +} + pub struct FixedBufferPool { local_caches: Vec>, arena: Arc>, + start_ptr: *mut u8, + capacity: usize, + registered: AtomicBool, // Whether buffers have been registered } +unsafe impl Send for FixedBufferPool {} + +unsafe impl Sync for FixedBufferPool {} + impl FixedBufferPool { fn new(capacity_mb: usize) -> FixedBufferPool { let num_cpus = std::thread::available_parallelism().unwrap(); - let arena = Self::allocate_arena(capacity_mb << 20); + let capacity = capacity_mb << 20; + let arena = Self::allocate_arena(capacity.clone()); + let start_ptr = { + let guard = arena.try_lock().unwrap(); + guard.start_ptr() + }; let mut local_caches = Vec::>::new(); for i in 0..num_cpus.get() { local_caches.push(Mutex::new(TCache::new(arena.clone(), i))); } - FixedBufferPool { local_caches, arena } + FixedBufferPool { + local_caches, + arena, + start_ptr, + capacity, + registered: AtomicBool::new(false) + } } pub fn allocate_arena(capacity: usize) -> Arc> { @@ -38,12 +85,60 @@ impl FixedBufferPool { &FIXED_BUFFER_POOL.get().unwrap().local_caches[cpu as usize] } - pub fn malloc(size: usize) -> (*mut u8, Option) { + pub fn malloc(size: usize) -> *mut u8 { let local_cache = Self::get_thread_local_cache(); local_cache.lock().unwrap().allocate(size) } - pub fn free(ptr: *mut u8) { + pub fn register_buffers_with_ring(ring: &IoUring) -> io::Result<()> { + let pool = FIXED_BUFFER_POOL.get().unwrap(); + let mut arena_guard = pool.arena.lock().unwrap(); + let res = arena_guard.register_buffers_with_ring(ring); + if res.is_ok() { + pool.registered.store(true, Ordering::Relaxed); + } + res + } + + pub(crate) fn get_stats(cpu: usize) -> 
TCacheStats { + let pool = FIXED_BUFFER_POOL.get().unwrap(); + let tcache = pool.local_caches[cpu].lock().unwrap(); + tcache.get_stats() + } + + pub fn get_fixed_buffers(alloc: &FixedBufferAllocation) -> Vec { + let ptr = alloc.ptr; + let size = alloc.size; + let pool = FIXED_BUFFER_POOL.get().unwrap(); + debug_assert!(ptr >= pool.start_ptr && ptr < pool.start_ptr.wrapping_add(pool.capacity), + "Pointer doesn't lie within the arena"); + let mut remaining = size; + let mut vec = Vec::::new(); + let mut current = ptr.clone(); + let mut buffer_id = (current.wrapping_sub(pool.start_ptr as usize) as usize) >> FIXED_BUFFER_BITS; + while remaining > 0 { + let next_buffer_start = pool.start_ptr.wrapping_add((buffer_id + 1) << FIXED_BUFFER_BITS); + let bytes = min(remaining, next_buffer_start as usize - current as usize); + let fb = FixedBuffer { + ptr: current, + buf_id: buffer_id, + bytes: bytes, + }; + current = next_buffer_start; + vec.push(fb); + remaining -= bytes; + buffer_id += 1; + } + vec + } + + #[inline] + pub fn buffers_registered() -> bool { + let pool = FIXED_BUFFER_POOL.get().unwrap(); + pool.registered.load(Ordering::Relaxed) + } + + fn free(ptr: *mut u8) { let segment_ptr = Segment::get_segment_from_ptr(ptr); let page_ptr = unsafe { (*segment_ptr).get_page_from_ptr(ptr) }; unsafe { @@ -61,18 +156,6 @@ impl FixedBufferPool { } } } - - pub fn register_buffers_with_ring(ring: &IoUring) -> io::Result<()> { - let pool = FIXED_BUFFER_POOL.get().unwrap(); - let mut arena_guard = pool.arena.lock().unwrap(); - arena_guard.register_buffers_with_ring(ring) - } - - pub(crate) fn get_stats(cpu: usize) -> TCacheStats { - let pool = FIXED_BUFFER_POOL.get().unwrap(); - let tcache = pool.local_caches[cpu].lock().unwrap(); - tcache.get_stats() - } } impl Drop for FixedBufferPool { @@ -83,15 +166,15 @@ impl Drop for FixedBufferPool { } } - mod tests { use std::{io::Write, os::fd::AsRawFd, ptr::{null, null_mut}}; + use bytes::Bytes; use io_uring::{IoUring, cqueue, opcode, 
squeue}; use libc::rlimit; use rand::RngCore as _; - use crate::memory::pool::FixedBufferPool; + use crate::memory::pool::{FIXED_BUFFER_SIZE_BYTES, FixedBufferAllocation, FixedBufferPool}; #[test] fn test_basic_alloc_and_free() { @@ -100,9 +183,8 @@ mod tests { let buffer_lengths = [4096, 4096, 4096 * 4]; // 2 different size classes let mut ptrs = Vec::<*mut u8>::new(); for len in buffer_lengths { - let (ptr, fixed_buffer) = FixedBufferPool::malloc(len); + let ptr = FixedBufferPool::malloc(len); assert_ne!(ptr, null_mut()); - assert_eq!(fixed_buffer, None); // 4096 byte alignment is necessary for direct IO assert_eq!(ptr as usize % 4096, 0); @@ -125,6 +207,38 @@ mod tests { assert_eq!(stats.segments_retired, 1); } + #[test] + fn test_basic_alloc_and_free_bytes() { + FixedBufferPool::init(128); + + let buffer_lengths = [4096, 4096, 4096 * 4]; // 2 different size classes + // let mut ptrs = Vec::<*mut u8>::new(); + let mut bytes_vec = Vec::::new(); + for len in buffer_lengths { + let ptr = FixedBufferPool::malloc(len); + assert_ne!(ptr, null_mut()); + // 4096 byte alignment is necessary for direct IO + assert_eq!(ptr as usize % 4096, 0); + + let buffer = unsafe { std::slice::from_raw_parts_mut(ptr, len) }; + buffer[0] = 1; + buffer[len-1] = 1; + let alloc = FixedBufferAllocation {ptr: ptr, size: len}; + let bytes = Bytes::from_owner(alloc); + bytes_vec.push(bytes); + } + + drop(bytes_vec); + + let cur_cpu = unsafe { libc::sched_getcpu() as usize }; + let stats = FixedBufferPool::get_stats(cur_cpu); + + assert_eq!(stats.allocations_from_arena, 1); + assert_eq!(stats.fast_allocations, 1); + assert_eq!(stats.pages_retired, 2); + assert_eq!(stats.segments_retired, 1); + } + #[test] fn test_free_from_different_thread() { FixedBufferPool::init(128); @@ -132,9 +246,8 @@ mod tests { let buffer_lengths = [4096, 4096 * 4]; let mut buffers = Vec::<&mut [u8]>::new(); for len in buffer_lengths { - let (ptr, fixed_buffer) = FixedBufferPool::malloc(len); + let ptr = 
FixedBufferPool::malloc(len); assert_ne!(ptr, null_mut()); - assert_eq!(fixed_buffer, None); // 4096 byte alignment is necessary for direct IO assert_eq!(ptr as usize % 4096, 0); @@ -164,9 +277,8 @@ mod tests { fn test_large_alloc_and_free() { FixedBufferPool::init(128); let len = 1024 * 1024; // 1 MB - let (ptr, fixed_buffer) = FixedBufferPool::malloc(len); + let ptr = FixedBufferPool::malloc(len); assert_ne!(ptr, null_mut()); - assert_eq!(fixed_buffer, None); // 4096 byte alignment is necessary for direct IO assert_eq!(ptr as usize % 4096, 0); let buffer = unsafe { std::slice::from_raw_parts_mut(ptr, len) }; @@ -186,9 +298,8 @@ mod tests { fn test_large_alloc_and_free2() { FixedBufferPool::init(128); let len = 3 * 1024 * 1024; // 1 MB - let (ptr, fixed_buffer) = FixedBufferPool::malloc(len); + let ptr = FixedBufferPool::malloc(len); assert_ne!(ptr, null_mut()); - assert_eq!(fixed_buffer, None); // 4096 byte alignment is necessary for direct IO assert_eq!(ptr as usize % 4096, 0); let buffer = unsafe { std::slice::from_raw_parts_mut(ptr, len) }; @@ -208,7 +319,8 @@ mod tests { fn test_very_large_alloc_fails() { FixedBufferPool::init(128); let len = 32 * 1024 * 1024; // 32 MB - let (ptr, _fixed_buffer) = FixedBufferPool::malloc(len); + let ptr = FixedBufferPool::malloc(len); + assert_eq!(ptr, null_mut()); } @@ -226,42 +338,57 @@ mod tests { let res = FixedBufferPool::register_buffers_with_ring(&ring); assert!(res.is_ok()); - const LEN: usize = 4096; + const LEN: usize = 1 << 20; // 1 MB let mut file = tempfile::tempfile().unwrap(); - let (ptr, id) = FixedBufferPool::malloc(LEN); + let ptr = FixedBufferPool::malloc(LEN); assert_ne!(ptr, null_mut()); - assert!(id.is_some()); + let alloc = FixedBufferAllocation {ptr: ptr, size: LEN}; + let buffers = FixedBufferPool::get_fixed_buffers(&alloc); + assert!(buffers.len() <= (LEN / FIXED_BUFFER_SIZE_BYTES) + 1); + + let mut total = 0; + for fixed_buffer in buffers.iter().as_ref() { + total += fixed_buffer.bytes; + } + 
assert_eq!(total, LEN); let mut random_bytes = [0u8; LEN]; let mut rng = rand::rng(); rng.fill_bytes(&mut random_bytes); let mut res = file.write(&random_bytes); assert!(res.is_ok(), "Failed to write to temp file"); + assert_eq!(res.unwrap(), LEN, "Failed to write to temp file"); - { + let mut file_offset = 0; + for fixed_buffer in buffers.iter().as_ref() { let sqe = opcode::ReadFixed::new( io_uring::types::Fd(file.as_raw_fd()), - ptr, - LEN as u32, - id.unwrap() as u16) - .offset(0).build(); + fixed_buffer.ptr, + fixed_buffer.bytes as u32, + fixed_buffer.buf_id as u16) + .offset(file_offset).build(); + file_offset += fixed_buffer.bytes as u64; let mut sq = ring.submission(); let res = unsafe { sq.push(&sqe) }; assert!(res.is_ok(), "Failed to submit to io uring"); sq.sync(); } - res = ring.submit_and_wait(1); + + res = ring.submit_and_wait(buffers.len()); assert!(res.is_ok(), "Failed to submit"); + let mut total_bytes_read = 0; - { + for _i in 0..buffers.len() { let mut cq = ring.completion(); let cqe = cq.next(); assert!(cqe.is_some()); - assert_eq!(cqe.as_ref().unwrap().result(), LEN as i32, "{}", std::io::Error::from_raw_os_error(-cqe.unwrap().result())); + let res = cqe.as_ref().unwrap().result(); + assert!( res > 0, "Read failed: {}", std::io::Error::from_raw_os_error(-cqe.unwrap().result())); + total_bytes_read += res as usize; } - let buffer = unsafe { std::slice::from_raw_parts_mut(ptr, LEN) }; - assert_eq!(buffer, random_bytes); + assert_eq!(total_bytes_read, LEN, "Expected to read {} bytes, but read {}", LEN, total_bytes_read); + let buffer = Bytes::from_owner(alloc); + assert_eq!(buffer, &random_bytes[..]); } - // Test with vector maybe? 
} \ No newline at end of file diff --git a/src/common/src/memory/segment.rs b/src/common/src/memory/segment.rs index c957a893..cb4e198d 100644 --- a/src/common/src/memory/segment.rs +++ b/src/common/src/memory/segment.rs @@ -1,23 +1,21 @@ use std::ptr::null_mut; -use crate::memory::{page::{PAGE_SIZE, Page, Slice}, tcache::MIN_SIZE_FROM_PAGES}; +use crate::memory::{page::{PAGE_SIZE, Page, Slice}}; pub const SEGMENT_SIZE: usize = 32 * 1024 * 1024; pub const SEGMENT_SIZE_BITS: usize = SEGMENT_SIZE.ilog2() as usize; pub const PAGES_PER_SEGMENT: usize = SEGMENT_SIZE / PAGE_SIZE; -const FIXED_BUFFERS_PER_PAGE: usize = PAGE_SIZE / MIN_SIZE_FROM_PAGES; pub struct Segment { pub(crate) allocated: usize, pub(crate) num_slices: usize, pub(crate) pages: [Page; PAGES_PER_SEGMENT - 1], pub(crate) thread_id: usize, - pub(crate) start_buffer_id: Option, } impl Segment { - pub fn new_from_slice(slice: Slice, start_buffer_id: Option) -> *mut Segment { + pub fn new_from_slice(slice: Slice) -> *mut Segment { // First sizeof(Segment) bytes should hold the Segment object let segment_ptr = slice.ptr as *mut Segment; let mut start_ptr = unsafe { slice.ptr.add(PAGE_SIZE) }; @@ -26,16 +24,9 @@ impl Segment { (*segment_ptr).allocated = 0; (*segment_ptr).num_slices = PAGES_PER_SEGMENT - 1; for i in 0..(*segment_ptr).num_slices { - let page_start_buffer_id = if start_buffer_id.is_some() { - let offset = (start_ptr as usize - segment_ptr as usize) / MIN_SIZE_FROM_PAGES; - Some(start_buffer_id.unwrap() + offset) - } else { - start_buffer_id - }; - (*segment_ptr).pages[i] = Page::from_slice(Slice {ptr: start_ptr, size: PAGE_SIZE}, page_start_buffer_id); + (*segment_ptr).pages[i] = Page::from_slice(Slice {ptr: start_ptr, size: PAGE_SIZE}); start_ptr = start_ptr.wrapping_add(PAGE_SIZE); } - (*segment_ptr).start_buffer_id = start_buffer_id; } segment_ptr } diff --git a/src/common/src/memory/tcache.rs b/src/common/src/memory/tcache.rs index 178903d1..badc363b 100644 --- 
a/src/common/src/memory/tcache.rs +++ b/src/common/src/memory/tcache.rs @@ -214,19 +214,19 @@ impl TCache { } } - fn find_page_from_used(self: &mut Self, bin: usize) -> (*mut u8, Option) { + fn find_page_from_used(self: &mut Self, bin: usize) -> *mut u8 { for i in 0..self.used_pages[bin].len() { unsafe { if (*self.used_pages[bin][i]).is_full() { continue; } let page = self.used_pages[bin].remove(i); - let (block, buffer_id) = (*page).get_free_block(); + let block = (*page).get_free_block(); self.free_pages[bin] = page; - return (block, buffer_id) + return block } } - (null_mut(), None) + null_mut() } fn find_page_from_spans(self: &mut Self, num_slices_required: usize, block_size: usize) -> *mut Page { @@ -289,7 +289,7 @@ impl TCache { true } - pub(crate) fn allocate(self: &mut Self, size: usize) -> (*mut u8, Option) { + pub(crate) fn allocate(self: &mut Self, size: usize) -> *mut u8 { self.stats.total_allocations += 1; if size > MAX_SIZE_FROM_PAGES { // Directly get page from segment @@ -298,22 +298,22 @@ impl TCache { let mut free_page = self.find_page_from_spans(num_pages, block_size); if free_page != null_mut() { self.stats.allocations_from_segment += 1; - let (free_block, buffer_id) = unsafe { (*free_page).get_free_block() }; - return (free_block, buffer_id); + let free_block = unsafe { (*free_page).get_free_block() }; + return free_block } self.stats.allocations_from_arena += 1; let res = self.allocate_segment_from_arena(self.thread_id); if !res { - return (null_mut(), None); + return null_mut() } free_page = self.find_page_from_spans(num_pages, block_size); if free_page == null_mut() { - return (null_mut(), None) + return null_mut() } debug_assert_eq!(block_size, unsafe { (*free_page).block_size }); assert_ne!(free_page, null_mut()); - let (free_block, buffer_id) = unsafe { (*free_page).get_free_block() }; - return (free_block, buffer_id) + let free_block = unsafe { (*free_page).get_free_block() }; + return free_block } let size_class = 
Self::get_size_class(size); debug_assert!(size_class < NUM_SIZE_CLASSES); @@ -329,34 +329,33 @@ impl TCache { self.free_pages[size_class] = null_mut(); } else { self.stats.fast_allocations += 1; - let (ptr, buffer_id) = (*page).get_free_block(); - return (ptr, buffer_id) + return (*page).get_free_block() } } } - let (block, buffer_id) = self.find_page_from_used(size_class); + let block = self.find_page_from_used(size_class); if !block.is_null() { self.stats.allocations_from_pages += 1; - return (block, buffer_id); + return block } free_page = self.find_page_from_spans(1, block_size); if free_page != null_mut() { self.stats.allocations_from_segment += 1; - let (free_block, buffer_id) = unsafe { (*free_page).get_free_block() }; + let free_block = unsafe { (*free_page).get_free_block() }; self.free_pages[size_class] = free_page; - return (free_block, buffer_id); + return free_block; } // No space available in segments, allocate a new one self.stats.allocations_from_arena += 1; let res = self.allocate_segment_from_arena(self.thread_id); if !res { - return (null_mut(), None); + return null_mut() } free_page = self.find_page_from_spans(1, size); assert_ne!(free_page, null_mut()); - let (free_block, buffer_id) = unsafe { (*free_page).get_free_block() }; + let free_block = unsafe { (*free_page).get_free_block() }; self.free_pages[size_class] = free_page; - return (free_block, buffer_id) + return free_block } #[allow(unused)] From e1c1ec555d45f46f904267f5dbb8501ad34b10c0 Mon Sep 17 00:00:00 2001 From: Proteet Paul Date: Mon, 1 Dec 2025 00:34:09 -0600 Subject: [PATCH 05/20] Integrate fixed buffer pool into liquid cache --- benchmark/bench_server.rs | 4 + examples/example_server.rs | 1 + src/common/src/memory/pool.rs | 1 + src/local/src/lib.rs | 1 + src/parquet/bench/filter_pushdown.rs | 1 + src/parquet/src/cache/mod.rs | 4 +- src/parquet/src/cache/stats.rs | 1 + src/parquet/src/io/io_backend.rs | 6 +- .../src/io/io_uring/multi_async_uring.rs | 6 +- 
.../src/io/io_uring/multi_blocking_uring.rs | 6 +- src/parquet/src/io/io_uring/single_uring.rs | 6 +- src/parquet/src/io/io_uring/tasks.rs | 147 +++++++++++++++--- src/parquet/src/io/io_uring/tests.rs | 6 +- .../src/io/io_uring/thread_pool_uring.rs | 137 +++++++++++++--- src/parquet/src/io/mod.rs | 10 +- src/parquet/src/optimizers/lineage_opt.rs | 1 + src/parquet/src/optimizers/mod.rs | 1 + .../src/reader/runtime/liquid_cache_reader.rs | 1 + .../src/reader/runtime/liquid_stream.rs | 1 + src/server/src/lib.rs | 3 + src/server/src/service.rs | 3 + src/server/src/tests/mod.rs | 1 + 22 files changed, 290 insertions(+), 58 deletions(-) diff --git a/benchmark/bench_server.rs b/benchmark/bench_server.rs index ff71ed52..c4014481 100644 --- a/benchmark/bench_server.rs +++ b/benchmark/bench_server.rs @@ -51,6 +51,9 @@ struct CliArgs { /// IO mode, available options: uring, uring-direct, std-blocking, tokio, std-spawn-blocking #[arg(long = "io-mode", default_value = "uring-multi-async")] io_mode: IoMode, + + #[arg(long = "fixed-buffer-pool-size-mb", default_value = "0")] + fixed_buffer_pool_size_mb: usize, } #[tokio::main] @@ -80,6 +83,7 @@ async fn main() -> Result<(), Box> { Box::new(LiquidPolicy::new()), squeeze_policy, Some(args.io_mode), + args.fixed_buffer_pool_size_mb, )?; let liquid_cache_server = Arc::new(liquid_cache_server); diff --git a/examples/example_server.rs b/examples/example_server.rs index d214ffa8..05407ff2 100644 --- a/examples/example_server.rs +++ b/examples/example_server.rs @@ -15,6 +15,7 @@ async fn main() -> Result<(), Box> { Box::new(LruPolicy::new()), Box::new(TranscodeSqueezeEvict), Some(IoMode::default()), + 0, )?; let flight = FlightServiceServer::new(liquid_cache); diff --git a/src/common/src/memory/pool.rs b/src/common/src/memory/pool.rs index 63bd3fcc..8c8a8bb8 100644 --- a/src/common/src/memory/pool.rs +++ b/src/common/src/memory/pool.rs @@ -19,6 +19,7 @@ pub struct FixedBuffer { pub bytes: usize, } +#[derive(Debug)] pub struct 
FixedBufferAllocation { pub ptr: *mut u8, pub size: usize, diff --git a/src/local/src/lib.rs b/src/local/src/lib.rs index 75a3eed7..107016a4 100644 --- a/src/local/src/lib.rs +++ b/src/local/src/lib.rs @@ -153,6 +153,7 @@ impl LiquidCacheLocalBuilder { self.cache_policy, self.squeeze_policy, self.io_mode, + 0, ); let cache_ref = Arc::new(cache); diff --git a/src/parquet/bench/filter_pushdown.rs b/src/parquet/bench/filter_pushdown.rs index b9f62d67..43fa9d87 100644 --- a/src/parquet/bench/filter_pushdown.rs +++ b/src/parquet/bench/filter_pushdown.rs @@ -47,6 +47,7 @@ fn setup_cache(tmp_dir: &TempDir) -> Arc { Box::new(LiquidPolicy::new()), Box::new(TranscodeSqueezeEvict), IoMode::Uring, + 0, ); let field = Arc::new(Field::new("test_column", DataType::Int32, false)); let schema = Arc::new(Schema::new(vec![field.clone()])); diff --git a/src/parquet/src/cache/mod.rs b/src/parquet/src/cache/mod.rs index 461b1c4c..2d6658cc 100644 --- a/src/parquet/src/cache/mod.rs +++ b/src/parquet/src/cache/mod.rs @@ -247,9 +247,10 @@ impl LiquidCache { cache_policy: Box, squeeze_policy: Box, io_mode: IoMode, + fixed_buffer_pool_size_mb: usize, ) -> Self { assert!(batch_size.is_power_of_two()); - let io_context = Arc::new(ParquetIoContext::new(cache_dir.clone(), io_mode)); + let io_context = Arc::new(ParquetIoContext::new(cache_dir.clone(), io_mode, fixed_buffer_pool_size_mb)); let cache_storage_builder = CacheStorageBuilder::new() .with_batch_size(batch_size) .with_max_cache_bytes(max_cache_bytes) @@ -379,6 +380,7 @@ mod tests { Box::new(LiquidPolicy::new()), Box::new(TranscodeSqueezeEvict), IoMode::Uring, + 0, ); let file = cache.register_or_get_file("test".to_string(), schema); file.create_row_group(0) diff --git a/src/parquet/src/cache/stats.rs b/src/parquet/src/cache/stats.rs index 73ec502c..5e70ee40 100644 --- a/src/parquet/src/cache/stats.rs +++ b/src/parquet/src/cache/stats.rs @@ -189,6 +189,7 @@ mod tests { Box::new(LiquidPolicy::new()), Box::new(Evict), IoMode::Uring, + 0, ); 
let fields: Vec = (0..8) .map(|i| Field::new(format!("test_{i}"), DataType::Int32, false)) diff --git a/src/parquet/src/io/io_backend.rs b/src/parquet/src/io/io_backend.rs index f915ded3..d1be9b7c 100644 --- a/src/parquet/src/io/io_backend.rs +++ b/src/parquet/src/io/io_backend.rs @@ -16,7 +16,7 @@ pub(super) async fn read( IoMode::Uring => { #[cfg(target_os = "linux")] { - super::io_uring::thread_pool_uring::read(path, range, false).await + super::io_uring::thread_pool_uring::read(path, range, false,true).await } #[cfg(not(target_os = "linux"))] { @@ -38,7 +38,7 @@ pub(super) async fn read( IoMode::UringDirect => { #[cfg(target_os = "linux")] { - super::io_uring::thread_pool_uring::read(path, range, true).await + super::io_uring::thread_pool_uring::read(path, range, true, true).await } #[cfg(not(target_os = "linux"))] { @@ -82,7 +82,7 @@ pub(super) async fn write( IoMode::Uring | IoMode::UringDirect => { #[cfg(target_os = "linux")] { - super::io_uring::thread_pool_uring::write(path, &data).await + super::io_uring::thread_pool_uring::write(path, &data, false).await } #[cfg(not(target_os = "linux"))] { diff --git a/src/parquet/src/io/io_uring/multi_async_uring.rs b/src/parquet/src/io/io_uring/multi_async_uring.rs index e2c26ec4..1f499d2e 100644 --- a/src/parquet/src/io/io_uring/multi_async_uring.rs +++ b/src/parquet/src/io/io_uring/multi_async_uring.rs @@ -39,7 +39,7 @@ impl AsyncRing { fn submit_task(&mut self, task: &mut dyn IoTask) { { let mut sq = self.ring.submission(); - let entry = task.prepare_sqe().user_data(0); + let entry = task.prepare_sqe()[0].clone().user_data(0); unsafe { sq.push(&entry) .expect("failed to push entry to io-uring submission queue"); @@ -207,7 +207,7 @@ where } State::Pending { mut ring, mut task } => { if let Some(cqe) = ring.as_mut().take_completion() { - task.complete(&cqe); + task.complete(vec![&cqe]); return Poll::Ready(task); } this.state = State::Pending { ring, task }; @@ -264,6 +264,6 @@ pub(crate) async fn write(path: PathBuf, 
data: &Bytes) -> Result<(), std::io::Er .open(path) .expect("failed to create file"); - let write_task = FileWriteTask::build(data.clone(), file.as_raw_fd()); + let write_task = FileWriteTask::build(data.clone(), file.as_raw_fd(), false); submit_async_task(write_task).await.into_result() } diff --git a/src/parquet/src/io/io_uring/multi_blocking_uring.rs b/src/parquet/src/io/io_uring/multi_blocking_uring.rs index 86d2f6a5..955e7c93 100644 --- a/src/parquet/src/io/io_uring/multi_blocking_uring.rs +++ b/src/parquet/src/io/io_uring/multi_blocking_uring.rs @@ -29,7 +29,7 @@ impl BlockingRing { { { let mut sq = self.ring.submission(); - let entry = task.prepare_sqe().user_data(0); + let entry = task.prepare_sqe()[0].clone().user_data(0); unsafe { sq.push(&entry).expect("Failed to push to submission queue"); } @@ -44,7 +44,7 @@ impl BlockingRing { let cqe = cq .next() .ok_or_else(|| io::Error::other("io-uring completion queue empty"))?; - task.complete(&cqe); + task.complete(vec![&cqe]); } Ok(task) @@ -176,6 +176,6 @@ pub(crate) fn write(path: PathBuf, data: &Bytes) -> Result<(), std::io::Error> { .truncate(true) .write(true) .open(path)?; - let write_task = FileWriteTask::build(data.clone(), file.as_raw_fd()); + let write_task = FileWriteTask::build(data.clone(), file.as_raw_fd(), false); run_blocking_task(Box::new(write_task))?.into_result() } diff --git a/src/parquet/src/io/io_uring/single_uring.rs b/src/parquet/src/io/io_uring/single_uring.rs index 6f5cd55d..d7867947 100644 --- a/src/parquet/src/io/io_uring/single_uring.rs +++ b/src/parquet/src/io/io_uring/single_uring.rs @@ -161,7 +161,7 @@ impl SharedRingInner { fn submit_task(&mut self, task: &mut dyn IoTask, token: u16) { { let mut sq = self.ring.submission(); - let entry = task.prepare_sqe().user_data(token as u64); + let entry = task.prepare_sqe()[0].clone().user_data(token as u64); unsafe { sq.push(&entry) .expect("Failed to push entry to io-uring submission queue"); @@ -264,7 +264,7 @@ where match state { 
State::Pending { token, mut task } => { if let Some(cqe) = ring.take_completion(token) { - task.complete(&cqe); + task.complete(vec![&cqe]); return Poll::Ready(task); } // Not ready yet, restore state @@ -339,6 +339,6 @@ pub(crate) async fn write(path: PathBuf, data: &Bytes) -> Result<(), std::io::Er .open(path) .expect("failed to create file"); - let write_task = FileWriteTask::build(data.clone(), file.as_raw_fd()); + let write_task = FileWriteTask::build(data.clone(), file.as_raw_fd(), false); submit_async_task(write_task).await.into_result() } diff --git a/src/parquet/src/io/io_uring/tasks.rs b/src/parquet/src/io/io_uring/tasks.rs index 9f79bc5a..68efb295 100644 --- a/src/parquet/src/io/io_uring/tasks.rs +++ b/src/parquet/src/io/io_uring/tasks.rs @@ -12,16 +12,18 @@ use std::{ use bytes::Bytes; use io_uring::{cqueue, opcode, squeue}; +use liquid_cache_common::memory::pool::{FixedBufferAllocation, FixedBufferPool}; pub(crate) const BLOCK_ALIGN: usize = 4096; /// Represents an IO request to the uring worker thread. pub(crate) trait IoTask: Send + Any + std::fmt::Debug { /// Convert the request to an io-uring submission queue entry. - fn prepare_sqe(&mut self) -> squeue::Entry; + fn prepare_sqe(&mut self) -> Vec; + // TODO(): Can we pass completion queue entries on the stack? /// Record the outcome of the completion queue entry. - fn complete(&mut self, cqe: &cqueue::Entry); + fn complete(&mut self, cqe: Vec<&cqueue::Entry>); /// Convert the boxed task to a boxed `Any` so callers can recover the original type. 
fn into_any(self: Box) -> Box; @@ -67,7 +69,7 @@ impl FileOpenTask { impl IoTask for FileOpenTask { #[inline] - fn prepare_sqe(&mut self) -> squeue::Entry { + fn prepare_sqe(&mut self) -> Vec { let mut flags = libc::O_RDONLY | libc::O_CLOEXEC; if self.direct_io { flags |= libc::O_DIRECT; @@ -75,12 +77,13 @@ impl IoTask for FileOpenTask { let open_op = opcode::OpenAt::new(io_uring::types::Fd(libc::AT_FDCWD), self.path.as_ptr()) .flags(flags); - open_op.build() + vec![open_op.build()] } #[inline] - fn complete(&mut self, cqe: &cqueue::Entry) { - let result = cqe.result(); + fn complete(&mut self, cqe: Vec<&cqueue::Entry>) { + debug_assert_eq!(cqe.len(), 1, "Should receive a single completion for a file open task"); + let result = cqe[0].result(); if result < 0 { self.error = Some(std::io::Error::from_raw_os_error(-result)); } else { @@ -184,7 +187,7 @@ impl FileReadTask { impl IoTask for FileReadTask { #[inline] - fn prepare_sqe(&mut self) -> squeue::Entry { + fn prepare_sqe(&mut self) -> Vec { let num_bytes = (self.range.end - self.range.start) as usize; let (start_padding, end_padding) = self.padding(); let num_bytes_aligned = num_bytes + start_padding + end_padding; @@ -198,15 +201,119 @@ impl IoTask for FileReadTask { num_bytes_aligned as u32, ); - read_op + vec![read_op .offset(self.range.start - start_padding as u64) - .build() + .build()] } #[inline] - fn complete(&mut self, cqe: &cqueue::Entry) { - if cqe.result() < 0 { - self.error = Some(std::io::Error::from_raw_os_error(-cqe.result())); + fn complete(&mut self, cqe: Vec<&cqueue::Entry>) { + debug_assert_eq!(cqe.len(), 1, "Should receive a single completion for a FileRead task"); + let result = cqe[0].result(); + if result < 0 { + self.error = Some(std::io::Error::from_raw_os_error(-result)); + } + } + + fn into_any(self: Box) -> Box { + self + } +} + +#[derive(Debug)] +pub(crate) struct FixedFileReadTask { + fixed_buffer: FixedBufferAllocation, + file: RawFd, + range: Range, + direct_io: bool, + error: 
Option, +} + +impl FixedFileReadTask { + #[inline] + fn compute_padding(range: &Range, direct_io: bool) -> (usize, usize) { + if direct_io { + let start_padding = range.start as usize & (BLOCK_ALIGN - 1); + let end_mod = range.end as usize & (BLOCK_ALIGN - 1); + let end_padding = if end_mod == 0 { + 0 + } else { + BLOCK_ALIGN - end_mod + }; + (start_padding, end_padding) + } else { + (0, 0) + } + } + + #[inline] + fn padding(&self) -> (usize, usize) { + Self::compute_padding(&self.range, self.direct_io) + } + + pub(crate) fn build(range: Range, file: &fs::File, direct_io: bool) -> Result { + let (start_padding, end_padding) = Self::compute_padding(&range, direct_io); + let requested_bytes = (range.end - range.start) as usize; + let num_bytes_aligned = requested_bytes + start_padding + end_padding; + + // Fixed buffers are aligned to the block size. Don't worry about alignment here + let ptr = FixedBufferPool::malloc(num_bytes_aligned); + if ptr.is_null() { + return Err(std::io::Error::from(std::io::ErrorKind::OutOfMemory)); + } + let alloc = FixedBufferAllocation {ptr, size: num_bytes_aligned}; + + Ok(FixedFileReadTask { + fixed_buffer: alloc, + file: file.as_raw_fd(), + range, + direct_io, + error: None, + }) + } + + /// Return a bytes object holding the result of the read operation. 
+ #[inline] + pub(crate) fn into_result(self: Box) -> Result { + let mut this = self; + if let Some(err) = this.error.take() { + return Err(err); + } + + let (start_padding, _) = this.padding(); + let range_len = (this.range.end - this.range.start) as usize; + let data_end = start_padding + range_len; + let bytes = Bytes::from_owner(this.fixed_buffer); + + Ok(bytes.slice(start_padding..data_end)) + } +} + +impl IoTask for FixedFileReadTask { + #[inline] + fn prepare_sqe(&mut self) -> Vec { + let buffers = FixedBufferPool::get_fixed_buffers(&self.fixed_buffer); + let mut sqes = Vec::::new(); + let mut file_offset = 0; + for buffer in buffers { + let sqe = opcode::ReadFixed::new( + io_uring::types::Fd(self.file), + buffer.ptr, + buffer.bytes as u32, + buffer.buf_id as u16) + .offset(file_offset).build(); + file_offset += buffer.bytes as u64; + sqes.push(sqe); + } + sqes + } + + #[inline] + fn complete(&mut self, cqes: Vec<&cqueue::Entry>) { + for cqe in cqes.iter().as_ref() { + if cqe.result() < 0 { + self.error = Some(std::io::Error::from_raw_os_error(-cqe.result())); + } } } @@ -220,14 +327,16 @@ pub(crate) struct FileWriteTask { data: Bytes, fd: RawFd, error: Option, + use_fixed_buffers: bool, } impl FileWriteTask { - pub(crate) fn build(data: Bytes, fd: RawFd) -> FileWriteTask { + pub(crate) fn build(data: Bytes, fd: RawFd, use_fixed_buffers: bool) -> FileWriteTask { FileWriteTask { data, fd, error: None, + use_fixed_buffers, } } @@ -242,20 +351,22 @@ impl FileWriteTask { impl IoTask for FileWriteTask { #[inline] - fn prepare_sqe(&mut self) -> squeue::Entry { + fn prepare_sqe(&mut self) -> Vec { let write_op = opcode::Write::new( io_uring::types::Fd(self.fd), self.data.as_ptr(), self.data.len() as u32, ); - write_op.offset(0u64).build() + vec![write_op.offset(0u64).build()] } #[inline] - fn complete(&mut self, cqe: &cqueue::Entry) { - if cqe.result() < 0 { - self.error = Some(std::io::Error::from_raw_os_error(-cqe.result())); + fn complete(&mut self, cqes: 
Vec<&cqueue::Entry>) { + debug_assert_eq!(cqes.len(), 1, "Should receive a single completion for a FileRead task"); + let result = cqes[0].result(); + if result < 0 { + self.error = Some(std::io::Error::from_raw_os_error(-result)); } } diff --git a/src/parquet/src/io/io_uring/tests.rs b/src/parquet/src/io/io_uring/tests.rs index 94cb808d..ba40a0e7 100644 --- a/src/parquet/src/io/io_uring/tests.rs +++ b/src/parquet/src/io/io_uring/tests.rs @@ -55,7 +55,7 @@ impl BackendKind { } else { IoMode::Uring }; - initialize_uring_pool(mode); + initialize_uring_pool(mode, false); }); } BackendKind::MultiBlocking => { @@ -79,7 +79,7 @@ impl BackendKind { BackendKind::MultiBlocking => { async move { multi_blocking_uring::read(path, range, direct_io) }.boxed() } - BackendKind::ThreadPool => thread_pool_uring::read(path, range, direct_io).boxed(), + BackendKind::ThreadPool => thread_pool_uring::read(path, range, direct_io, true).boxed(), } } @@ -93,7 +93,7 @@ impl BackendKind { async move { multi_blocking_uring::write(path, &data) }.boxed() } BackendKind::ThreadPool => { - async move { thread_pool_uring::write(path, &data).await }.boxed() + async move { thread_pool_uring::write(path, &data, false).await }.boxed() } } } diff --git a/src/parquet/src/io/io_uring/thread_pool_uring.rs b/src/parquet/src/io/io_uring/thread_pool_uring.rs index b63cd2e1..71820018 100644 --- a/src/parquet/src/io/io_uring/thread_pool_uring.rs +++ b/src/parquet/src/io/io_uring/thread_pool_uring.rs @@ -16,9 +16,11 @@ use std::{ use bytes::Bytes; use io_uring::{IoUring, cqueue, squeue}; -use liquid_cache_common::IoMode; +use liquid_cache_common::{IoMode, memory::pool::FixedBufferPool}; use tokio::sync::oneshot; +use crate::io::io_uring::tasks::FixedFileReadTask; + use super::tasks::{FileOpenTask, FileReadTask, FileWriteTask, IoTask}; pub(crate) const URING_NUM_ENTRIES: u32 = 256; @@ -28,6 +30,8 @@ static ENABLED: AtomicBool = AtomicBool::new(true); struct Submission { task: Box, completion_tx: oneshot::Sender>, 
+ pending_completions: usize, // No. of pending completions. Will be populated later by the uring worker + completions: Vec, } impl Submission { @@ -35,15 +39,32 @@ impl Submission { Submission { task, completion_tx, + pending_completions: 0, + completions: Vec::new(), } } - fn send_back(mut self, cqe: &cqueue::Entry) { - self.task.complete(cqe); + fn send_back(mut self) { + self.task.complete(self.completions.iter().collect()); self.completion_tx .send(self.task) .expect("Failed to send task back to caller"); } + + #[inline] + fn set_completions(&mut self, count: usize) { + self.pending_completions = count; + } + + #[inline] + fn reduce_completions(&mut self) { + self.pending_completions -= 1; + } + + #[inline] + fn push_completion(&mut self, cqe: cqueue::Entry) { + self.completions.push(cqe); + } } struct JoinOnDropHandle(Option>); @@ -74,9 +95,9 @@ unsafe impl Sync for IoUringThreadpool {} static IO_URING_THREAD_POOL_INST: OnceLock = OnceLock::new(); -pub(crate) fn initialize_uring_pool(io_mode: IoMode) { +pub(crate) fn initialize_uring_pool(io_mode: IoMode, register_buffers: bool) { if matches!(io_mode, IoMode::Uring | IoMode::UringDirect) { - IO_URING_THREAD_POOL_INST.get_or_init(|| IoUringThreadpool::new(io_mode)); + IO_URING_THREAD_POOL_INST.get_or_init(|| IoUringThreadpool::new(io_mode, register_buffers)); } if matches!(io_mode, IoMode::UringBlocking) { super::multi_blocking_uring::initialize_blocking_rings(); @@ -84,7 +105,7 @@ pub(crate) fn initialize_uring_pool(io_mode: IoMode) { } impl IoUringThreadpool { - fn new(io_type: IoMode) -> IoUringThreadpool { + fn new(io_type: IoMode, register_buffers: bool) -> IoUringThreadpool { let (sender, receiver) = crossbeam_channel::unbounded::(); let builder = IoUring::::builder(); @@ -92,6 +113,13 @@ impl IoUringThreadpool { .build(URING_NUM_ENTRIES) .expect("Failed to build IoUring instance"); + if register_buffers { + let res = FixedBufferPool::register_buffers_with_ring(&ring); + if res.is_err() { + 
log::error!("Failed to register buffers with io-uring ring: {:?}", res); + } + } + let worker = thread::Builder::new() .name("lc-io-worker".to_string()) .spawn(move || { @@ -134,6 +162,12 @@ struct UringWorker { ring: IoUring, tokens: VecDeque, submitted_tasks: Vec>, + /** + * When using fixed buffers, a single task can produce multiple submission queue entries. + * It is possible that we aren't able to submit all of them at one go. Hold them in an + * intermediate queue in that case + */ + queued_entries: VecDeque, io_performed: AtomicUsize, } @@ -149,6 +183,7 @@ impl UringWorker { tokens, submitted_tasks: tasks, io_performed: AtomicUsize::new(0), + queued_entries: VecDeque::with_capacity(URING_NUM_ENTRIES as usize), } } @@ -158,26 +193,61 @@ impl UringWorker { break; } + self.drain_intermediate_queue(); self.drain_submissions(); self.poll_completions(); } } + fn drain_intermediate_queue(&mut self) { + let mut need_submit = false; + { + let sq = &mut self.ring.submission(); + while !sq.is_full() && !self.queued_entries.is_empty() { + let sqe = self.queued_entries.pop_front().unwrap(); + unsafe { + sq.push(&sqe).expect("Failed to push to submission queue"); + } + sq.sync(); + need_submit = true; + } + } + if need_submit { + self.ring.submit().expect("Failed to submit"); + } + } + #[inline(never)] fn drain_submissions(&mut self) { let mut need_submit = false; while !self.receiver.is_empty() && !self.tokens.is_empty() { - let mut submission = self.receiver.recv().unwrap(); + let sq = &mut self.ring.submission(); + sq.sync(); + if sq.is_full() { + // A single token might have multiple associated sqes. 
Free token doesn't always imply that we have free submission slots + break; + } + let token = self.tokens.pop_front().unwrap(); - { - let sq = &mut self.ring.submission(); - let task = submission.task.as_mut(); - let sqe = task.prepare_sqe().user_data(token as u64); - unsafe { - sq.push(&sqe).expect("Failed to push to submission queue"); + let mut submission = self.receiver.recv().unwrap(); + let task = submission.task.as_mut(); + let sqes = task.prepare_sqe(); + submission.set_completions(sqes.len()); + let mut tasks_submitted = 0; + + for sqe in sqes.iter().as_ref() { + let res = unsafe { + sq.push(&sqe) + }; + if res.is_err() { + break; } + tasks_submitted += 1; sq.sync(); } + for i in tasks_submitted..sqes.len() { + self.queued_entries.push_back(sqes[i].clone().user_data(token as u64)); + } self.submitted_tasks[token as usize] = Some(submission); need_submit = true; } @@ -194,12 +264,26 @@ impl UringWorker { match cq.next() { Some(cqe) => { let token = cqe.user_data() as usize; - let submission = self.submitted_tasks[token] - .take() - .expect("Task not found in submitted tasks"); - submission.send_back(&cqe); - self.tokens.push_back(token as u16); - self.io_performed.fetch_add(1, Ordering::Relaxed); + let pending_completions = self.submitted_tasks[token] + .as_ref() + .expect("Task not found in submitted tasks") + .pending_completions; + + if pending_completions == 1 { + let mut submission = self.submitted_tasks[token] + .take() + .expect("Task not found in submitted tasks"); + submission.push_completion(cqe); + submission.send_back(); + self.tokens.push_back(token as u16); + self.io_performed.fetch_add(1, Ordering::Relaxed); + } else { + let submission = self.submitted_tasks[token] + .as_mut() + .expect("Task not found in submitted tasks"); + submission.reduce_completions(); + submission.push_completion(cqe); + } } None => break, } @@ -286,6 +370,7 @@ pub(crate) async fn read( path: PathBuf, range: Option>, direct_io: bool, + use_fixed_buffers: bool, ) -> 
Result { let open_task = FileOpenTask::build(path, direct_io)?; let file = submit_async_task(open_task).await.into_result()?; @@ -297,11 +382,19 @@ pub(crate) async fn read( 0..len }; + if use_fixed_buffers { + let read_task = FixedFileReadTask::build(effective_range.clone(), &file, direct_io); + // Fall back to normal read if fixed buffers are not available + if read_task.is_ok() { + return submit_async_task(read_task.unwrap()).await.into_result() + } + log::error!("Failed to allocate fixed buffers for read. Falling back to normal read"); + } let read_task = FileReadTask::build(effective_range, file, direct_io); - submit_async_task(read_task).await.into_result() + return submit_async_task(read_task).await.into_result() } -pub(crate) async fn write(path: PathBuf, data: &Bytes) -> Result<(), std::io::Error> { +pub(crate) async fn write(path: PathBuf, data: &Bytes, use_fixed_buffers: bool) -> Result<(), std::io::Error> { let file = OpenOptions::new() .create(true) .truncate(true) @@ -309,6 +402,6 @@ pub(crate) async fn write(path: PathBuf, data: &Bytes) -> Result<(), std::io::Er .open(path) .expect("failed to create file"); - let write_task = FileWriteTask::build(data.clone(), file.as_raw_fd()); + let write_task = FileWriteTask::build(data.clone(), file.as_raw_fd(), use_fixed_buffers); submit_async_task(write_task).await.into_result() } diff --git a/src/parquet/src/io/mod.rs b/src/parquet/src/io/mod.rs index cf5d3f42..570db794 100644 --- a/src/parquet/src/io/mod.rs +++ b/src/parquet/src/io/mod.rs @@ -23,19 +23,25 @@ pub(crate) struct ParquetIoContext { } impl ParquetIoContext { - pub fn new(base_dir: PathBuf, io_mode: IoMode) -> Self { + pub fn new(base_dir: PathBuf, io_mode: IoMode, fixed_buffer_pool_size_mb: usize) -> Self { if matches!( io_mode, IoMode::UringDirect | IoMode::Uring | IoMode::UringBlocking ) { #[cfg(target_os = "linux")] { - crate::io::io_uring::initialize_uring_pool(io_mode); + use liquid_cache_common::memory::pool::FixedBufferPool; + if 
fixed_buffer_pool_size_mb > 0 { + FixedBufferPool::init(fixed_buffer_pool_size_mb); + } + crate::io::io_uring::initialize_uring_pool(io_mode, fixed_buffer_pool_size_mb > 0); } #[cfg(not(target_os = "linux"))] { panic!("io_mode {:?} is only supported on Linux", io_mode); } + } else if fixed_buffer_pool_size_mb > 0 { + panic!("Fixed buffers are only supported for UringDirect, Uring and UringBlocking"); } Self { diff --git a/src/parquet/src/optimizers/lineage_opt.rs b/src/parquet/src/optimizers/lineage_opt.rs index c0606926..79b80314 100644 --- a/src/parquet/src/optimizers/lineage_opt.rs +++ b/src/parquet/src/optimizers/lineage_opt.rs @@ -982,6 +982,7 @@ mod tests { Box::new(LiquidPolicy::new()), Box::new(TranscodeSqueezeEvict), IoMode::Uring, + 0, ))); let state = SessionStateBuilder::new() diff --git a/src/parquet/src/optimizers/mod.rs b/src/parquet/src/optimizers/mod.rs index b4be2d33..1908aaef 100644 --- a/src/parquet/src/optimizers/mod.rs +++ b/src/parquet/src/optimizers/mod.rs @@ -232,6 +232,7 @@ mod tests { Box::new(LiquidPolicy::new()), Box::new(TranscodeSqueezeEvict), IoMode::Uring, + 0, )); let rewritten = rewrite_data_source_plan(plan, &liquid_cache); diff --git a/src/parquet/src/reader/runtime/liquid_cache_reader.rs b/src/parquet/src/reader/runtime/liquid_cache_reader.rs index e6118757..246ec79d 100644 --- a/src/parquet/src/reader/runtime/liquid_cache_reader.rs +++ b/src/parquet/src/reader/runtime/liquid_cache_reader.rs @@ -297,6 +297,7 @@ mod tests { Box::new(LiquidPolicy::new()), Box::new(Evict), IoMode::Uring, + 0, ); let field = Arc::new(Field::new("col0", DataType::Int32, false)); let schema = Arc::new(Schema::new(vec![field.clone()])); diff --git a/src/parquet/src/reader/runtime/liquid_stream.rs b/src/parquet/src/reader/runtime/liquid_stream.rs index 2e633eb6..2218eee3 100644 --- a/src/parquet/src/reader/runtime/liquid_stream.rs +++ b/src/parquet/src/reader/runtime/liquid_stream.rs @@ -670,6 +670,7 @@ mod tests { Box::new(LiquidPolicy::new()), 
Box::new(Evict), IoMode::Uring, + 0, ); let file = cache.register_or_get_file("test.parquet".to_string(), schema); file.create_row_group(0) diff --git a/src/server/src/lib.rs b/src/server/src/lib.rs index bddbad77..44a23739 100644 --- a/src/server/src/lib.rs +++ b/src/server/src/lib.rs @@ -116,6 +116,7 @@ impl LiquidCacheService { Box::new(LiquidPolicy::new()), Box::new(TranscodeSqueezeEvict), None, + 0, ) } @@ -134,6 +135,7 @@ impl LiquidCacheService { cache_policy: Box, squeeze_policy: Box, io_mode: Option, + fixed_buffer_pool_size_mb: usize, ) -> anyhow::Result { let disk_cache_dir = match disk_cache_dir { Some(dir) => dir, @@ -155,6 +157,7 @@ impl LiquidCacheService { cache_policy, squeeze_policy, io_mode, + fixed_buffer_pool_size_mb, ), }) } diff --git a/src/server/src/service.rs b/src/server/src/service.rs index 67f240de..7992cd16 100644 --- a/src/server/src/service.rs +++ b/src/server/src/service.rs @@ -51,6 +51,7 @@ impl LiquidCacheServiceInner { cache_policy: Box, squeeze_policy: Box, io_mode: IoMode, + fixed_buffer_pool_size_mb: usize, ) -> Self { let batch_size = default_ctx.state().config().batch_size(); @@ -64,6 +65,7 @@ impl LiquidCacheServiceInner { cache_policy, squeeze_policy, io_mode, + fixed_buffer_pool_size_mb, )); Self { @@ -220,6 +222,7 @@ mod tests { Box::new(LiquidPolicy::new()), Box::new(TranscodeSqueezeEvict), IoMode::Uring, + 0, ); let url = Url::parse("file:///").unwrap(); server diff --git a/src/server/src/tests/mod.rs b/src/server/src/tests/mod.rs index e79c1d26..cf3faf7c 100644 --- a/src/server/src/tests/mod.rs +++ b/src/server/src/tests/mod.rs @@ -42,6 +42,7 @@ async fn run_sql( Box::new(LiquidPolicy::new()), squeeze_policy, IoMode::Uring, + 0, ); async fn get_result(service: &LiquidCacheServiceInner, sql: &str) -> String { let handle = Uuid::new_v4(); From a575b307a3bbd40d3bb6727efa541a31f19aa1fd Mon Sep 17 00:00:00 2001 From: proteet Date: Mon, 1 Dec 2025 02:15:56 -0700 Subject: [PATCH 06/20] Some fixes and debug statements --- 
src/common/src/memory/pool.rs | 19 +++-- src/common/src/memory/segment.rs | 72 ++++++++++++++++--- src/common/src/memory/tcache.rs | 20 +++--- .../src/io/io_uring/thread_pool_uring.rs | 7 +- 4 files changed, 91 insertions(+), 27 deletions(-) diff --git a/src/common/src/memory/pool.rs b/src/common/src/memory/pool.rs index 8c8a8bb8..cc8327ca 100644 --- a/src/common/src/memory/pool.rs +++ b/src/common/src/memory/pool.rs @@ -1,7 +1,7 @@ extern crate io_uring; use core::slice; -use std::{cmp::min, sync::{Arc, Mutex, OnceLock, atomic::{AtomicBool, Ordering}}}; +use std::{cmp::min, sync::{Arc, Mutex, OnceLock, atomic::{AtomicBool, AtomicU64, Ordering}}}; use futures::io; use io_uring::IoUring; @@ -45,6 +45,7 @@ pub struct FixedBufferPool { start_ptr: *mut u8, capacity: usize, registered: AtomicBool, // Whether buffers have been registered + foreign_free: AtomicU64, } unsafe impl Send for FixedBufferPool {} @@ -53,6 +54,7 @@ unsafe impl Sync for FixedBufferPool {} impl FixedBufferPool { fn new(capacity_mb: usize) -> FixedBufferPool { + log::info!("Initializing fixed buffer pool with capacity: {} MB", capacity_mb); let num_cpus = std::thread::available_parallelism().unwrap(); let capacity = capacity_mb << 20; let arena = Self::allocate_arena(capacity.clone()); @@ -69,7 +71,8 @@ impl FixedBufferPool { arena, start_ptr, capacity, - registered: AtomicBool::new(false) + registered: AtomicBool::new(false), + foreign_free: AtomicU64::new(0), } } @@ -88,7 +91,12 @@ impl FixedBufferPool { pub fn malloc(size: usize) -> *mut u8 { let local_cache = Self::get_thread_local_cache(); - local_cache.lock().unwrap().allocate(size) + let ptr = local_cache.lock().unwrap().allocate(size); + if ptr.is_null() { + let pool = FIXED_BUFFER_POOL.get().unwrap(); + log::info!("Foreign frees: {}", pool.foreign_free.load(Ordering::Relaxed)); + } + ptr } pub fn register_buffers_with_ring(ring: &IoUring) -> io::Result<()> { @@ -96,6 +104,7 @@ impl FixedBufferPool { let mut arena_guard = 
pool.arena.lock().unwrap(); let res = arena_guard.register_buffers_with_ring(ring); if res.is_ok() { + log::info!("Registered buffers with io-uring ring"); pool.registered.store(true, Ordering::Relaxed); } res @@ -155,13 +164,15 @@ impl FixedBufferPool { let mut guard = local_cache.lock().unwrap(); guard.retire_page(page_ptr); } + } else { + let pool = FIXED_BUFFER_POOL.get().unwrap(); + pool.foreign_free.fetch_add(1, Ordering::Relaxed); } } } impl Drop for FixedBufferPool { fn drop(self: &mut Self) { - println!("Drop called"); let arena = self.arena.lock().unwrap(); drop(arena); } diff --git a/src/common/src/memory/segment.rs b/src/common/src/memory/segment.rs index cb4e198d..70b239fb 100644 --- a/src/common/src/memory/segment.rs +++ b/src/common/src/memory/segment.rs @@ -5,12 +5,13 @@ use crate::memory::{page::{PAGE_SIZE, Page, Slice}}; pub const SEGMENT_SIZE: usize = 32 * 1024 * 1024; pub const SEGMENT_SIZE_BITS: usize = SEGMENT_SIZE.ilog2() as usize; -pub const PAGES_PER_SEGMENT: usize = SEGMENT_SIZE / PAGE_SIZE; +// The metadata is stored at the beginning of the slice. 
So we don't get the entirety of it for pages +pub const PAGES_PER_SEGMENT: usize = (SEGMENT_SIZE / PAGE_SIZE) - 1; pub struct Segment { pub(crate) allocated: usize, pub(crate) num_slices: usize, - pub(crate) pages: [Page; PAGES_PER_SEGMENT - 1], + pub(crate) pages: [Page; PAGES_PER_SEGMENT], pub(crate) thread_id: usize, } @@ -22,7 +23,7 @@ impl Segment { unsafe { (*segment_ptr).allocated = 0; - (*segment_ptr).num_slices = PAGES_PER_SEGMENT - 1; + (*segment_ptr).num_slices = PAGES_PER_SEGMENT; for i in 0..(*segment_ptr).num_slices { (*segment_ptr).pages[i] = Page::from_slice(Slice {ptr: start_ptr, size: PAGE_SIZE}); start_ptr = start_ptr.wrapping_add(PAGE_SIZE); @@ -60,8 +61,11 @@ impl Segment { // Slice {ptr: null_mut(), size: 0} // } - pub fn retire(self: &mut Self) -> () { - todo!() + pub fn reset(self: &mut Self) -> () { + for page in self.pages.iter_mut() { + page.slice_count = 0; + page.slice_offset = 0; + } } pub fn get_segment_from_ptr(ptr: *mut u8) -> *mut Segment { @@ -75,7 +79,7 @@ impl Segment { let index = unsafe { ptr.sub(base_page_ptr as usize) as usize / PAGE_SIZE }; - debug_assert!(index < PAGES_PER_SEGMENT - 1); + debug_assert!(index < PAGES_PER_SEGMENT); &mut self.pages[index] as *mut Page } @@ -90,11 +94,57 @@ impl Segment { let index = unsafe { page_ref.page_start.sub(base_page_ptr as usize) as usize / PAGE_SIZE }; - debug_assert!(index < PAGES_PER_SEGMENT - 1); - let next_slice = &mut self.pages[index + num_slices]; - next_slice.slice_offset = 0; - next_slice.slice_count = page_ref.slice_count - num_slices; + debug_assert!(index + num_slices < PAGES_PER_SEGMENT); + /* + * ASSUMPTION: Pointer to the beginning of the slice is passed in free(). + * We don't need to modify all the intermediate pages while splitting. Only update the following: + * - slice_offsets for last pages in each slice. + * - slice_count for the first pages in each slice. 
+ */ + let last_page_in_slice1 = &mut self.pages[index + num_slices - 1]; + last_page_in_slice1.slice_offset = num_slices - 1; + + let last_page_in_slice2 = &mut self.pages[index + page_ref.slice_count - 1]; + last_page_in_slice2.slice_offset = page_ref.slice_count - num_slices - 1; + + let slice2 = &mut self.pages[index + num_slices]; + slice2.slice_offset = 0; + slice2.slice_count = page_ref.slice_count - num_slices; page_ref.slice_count = num_slices; - next_slice as *mut Page + slice2 as *mut Page + } + + pub fn coalesce_slices(self: &mut Self, left_slice: &mut Page, right_slice: &mut Page) { + debug_assert!(left_slice.page_start >= self.pages[0].page_start && + left_slice.page_start <= self.pages[PAGES_PER_SEGMENT - 1].page_start); + debug_assert!(right_slice.page_start >= self.pages[0].page_start && + right_slice.page_start <= self.pages[PAGES_PER_SEGMENT - 1].page_start); + + let left_slice_idx = (left_slice.page_start as usize - self.pages[0].page_start as usize) / PAGE_SIZE; + + /* + * ASSUMPTION: Pointer to the beginning of the slice is passed in free(). + * We don't need to modify all the intermediate pages while splitting. Only update the following: + * - slice_count for the first pages in combined slice. + * - slice_offset for the last page in the combined slice. 
+ */ + left_slice.slice_offset = 0; + right_slice.slice_offset = left_slice.slice_count; + left_slice.slice_count += right_slice.slice_count; + + let last_page = &mut self.pages[left_slice_idx + left_slice.slice_count - 1]; + last_page.slice_offset = left_slice.slice_count - 1; + + } + + pub fn debug_print(self: &mut Self) { + log::info!("------Segment debug print--------"); + let mut idx = 0; + while idx < PAGES_PER_SEGMENT { + let page = &mut self.pages[idx]; + log::info!("Page {}: slice_count: {}, slice_offset: {}, block_size: {}", idx, page.slice_count, page.slice_offset, page.block_size); + idx += page.slice_count; + } + log::info!("------end--------"); } } \ No newline at end of file diff --git a/src/common/src/memory/tcache.rs b/src/common/src/memory/tcache.rs index badc363b..b173a6e5 100644 --- a/src/common/src/memory/tcache.rs +++ b/src/common/src/memory/tcache.rs @@ -22,9 +22,8 @@ const SIZE_CLASSES: &'static [usize] = &[ const NUM_SIZE_CLASSES: usize = SIZE_CLASSES.len(); pub(crate) const MIN_SIZE_FROM_PAGES: usize = SIZE_CLASSES[0]; -const MAX_SIZE_FROM_PAGES: usize = SIZE_CLASSES[NUM_SIZE_CLASSES - 1]; -const SEGMENT_BINS: usize = 7; // (SEGMENT_SIZE/PAGE_SIZE).log2() + 1 +const SEGMENT_BINS: usize = (SEGMENT_SIZE/PAGE_SIZE).ilog2() as usize + 1; #[derive(Default, Clone)] pub(crate) struct TCacheStats { @@ -126,13 +125,16 @@ impl TCache { } fn retire_segment(self: &mut Self, segment: *mut Segment) { + log::info!("Retiring segment from thread with id: {}", self.thread_id); + unsafe { (*segment).debug_print(); } self.stats.segments_retired += 1; let pages = unsafe { &mut (*segment).pages }; let mut slice_idx: usize = 0; - while slice_idx < PAGES_PER_SEGMENT - 1 { + while slice_idx < PAGES_PER_SEGMENT { self.remove_slice_from_span(&mut pages[slice_idx]); slice_idx += pages[slice_idx].slice_count; } + unsafe { (*segment).reset(); } let mut guard = self.arena.lock().unwrap(); guard.retire_segment(segment); } @@ -174,12 +176,12 @@ impl TCache { } let 
next_slice = page.wrapping_add(page_ref.slice_count); - if next_slice < (&mut segment.pages[PAGES_PER_SEGMENT - 2]) as *mut Page { + if next_slice <= (&mut segment.pages[PAGES_PER_SEGMENT - 1]) as *mut Page { let next_slice_ref = unsafe { &mut (*next_slice) }; if next_slice_ref.block_size == 0 { // Page is not in use, remove it self.remove_slice_from_span(next_slice_ref); - self.coalesce_slices(page_ref, unsafe { &mut (*next_slice) }); + segment.coalesce_slices(page_ref, unsafe { &mut (*next_slice) }); } } @@ -189,8 +191,10 @@ impl TCache { let prev_slice_ref = unsafe { &mut (*prev_slice) }; if prev_slice_ref.block_size == 0 { // Merge with the previous slice + log::info!("Slice count before merge: {}", prev_slice_ref.slice_count); self.remove_slice_from_span(prev_slice_ref); - self.coalesce_slices(prev_slice_ref, page_ref); + segment.coalesce_slices(prev_slice_ref, page_ref); + log::info!("Slice count after merge: {}", prev_slice_ref.slice_count); let span_idx = Self::get_span_idx_from_slice_count(prev_slice_ref.slice_count); self.spans[span_idx].push(prev_slice); } @@ -256,7 +260,6 @@ impl TCache { (*slice).set_block_size(block_size); } return slice; - } } null_mut() @@ -281,6 +284,7 @@ impl TCache { if segment_opt.is_none() { return false; } + log::info!("Allocating segment to thread with id: {}", thread_id); unsafe { (*segment_opt.unwrap()).thread_id = thread_id; } @@ -291,7 +295,7 @@ impl TCache { pub(crate) fn allocate(self: &mut Self, size: usize) -> *mut u8 { self.stats.total_allocations += 1; - if size > MAX_SIZE_FROM_PAGES { + if size > PAGE_SIZE { // Directly get page from segment let num_pages = (size + PAGE_SIZE - 1) / PAGE_SIZE; let block_size = num_pages * PAGE_SIZE; diff --git a/src/parquet/src/io/io_uring/thread_pool_uring.rs b/src/parquet/src/io/io_uring/thread_pool_uring.rs index 71820018..b03a4327 100644 --- a/src/parquet/src/io/io_uring/thread_pool_uring.rs +++ b/src/parquet/src/io/io_uring/thread_pool_uring.rs @@ -231,13 +231,13 @@ impl 
UringWorker { let token = self.tokens.pop_front().unwrap(); let mut submission = self.receiver.recv().unwrap(); let task = submission.task.as_mut(); - let sqes = task.prepare_sqe(); + let mut sqes = task.prepare_sqe(); submission.set_completions(sqes.len()); let mut tasks_submitted = 0; - for sqe in sqes.iter().as_ref() { + for sqe in sqes.iter_mut() { let res = unsafe { - sq.push(&sqe) + sq.push(&sqe.clone().user_data(token as u64)) }; if res.is_err() { break; @@ -388,7 +388,6 @@ pub(crate) async fn read( if read_task.is_ok() { return submit_async_task(read_task.unwrap()).await.into_result() } - log::error!("Failed to allocate fixed buffers for read. Falling back to normal read"); } let read_task = FileReadTask::build(effective_range, file, direct_io); return submit_async_task(read_task).await.into_result() From 1b67f1a565343fd5a88701ca483896a7024fddaf Mon Sep 17 00:00:00 2001 From: proteet Date: Fri, 2 Jan 2026 00:23:01 -0700 Subject: [PATCH 07/20] Fix slice_count inconsistencies --- src/common/src/memory/pool.rs | 23 ++++++- src/common/src/memory/segment.rs | 110 ++++++++++++++++--------------- src/common/src/memory/tcache.rs | 39 ++++++----- 3 files changed, 97 insertions(+), 75 deletions(-) diff --git a/src/common/src/memory/pool.rs b/src/common/src/memory/pool.rs index cc8327ca..dacd124c 100644 --- a/src/common/src/memory/pool.rs +++ b/src/common/src/memory/pool.rs @@ -6,7 +6,7 @@ use std::{cmp::min, sync::{Arc, Mutex, OnceLock, atomic::{AtomicBool, AtomicU64, use futures::io; use io_uring::IoUring; -use crate::memory::{arena::Arena, segment::Segment, tcache::{TCache, TCacheStats}}; +use crate::memory::{arena::Arena, page::PAGE_SIZE, segment::Segment, tcache::{TCache, TCacheStats}}; static FIXED_BUFFER_POOL: OnceLock = OnceLock::new(); @@ -403,4 +403,25 @@ mod tests { assert_eq!(buffer, &random_bytes[..]); } + #[test] + fn test_edge_case() { + FixedBufferPool::init(128); + let len = 4 * 1024; + let ptr1 = FixedBufferPool::malloc(len); + let ptr2 = 
FixedBufferPool::malloc(len << 1); + let ptr3 = FixedBufferPool::malloc(len << 2); + let ptr4 = FixedBufferPool::malloc(len << 4); + + FixedBufferPool::free(ptr1); + FixedBufferPool::free(ptr3); + FixedBufferPool::free(ptr2); + FixedBufferPool::free(ptr4); + let cur_cpu = unsafe { libc::sched_getcpu() as usize }; + let stats = FixedBufferPool::get_stats(cur_cpu); + + assert_eq!(stats.allocations_from_arena, 1); + assert_eq!(stats.pages_retired, 4); + assert_eq!(stats.segments_retired, 1); + // assert_eq + } } \ No newline at end of file diff --git a/src/common/src/memory/segment.rs b/src/common/src/memory/segment.rs index 70b239fb..e56e247d 100644 --- a/src/common/src/memory/segment.rs +++ b/src/common/src/memory/segment.rs @@ -37,33 +37,9 @@ impl Segment { self.allocated == self.num_slices } - // pub fn try_allocate_page(self: &mut Self, page_size: usize) -> Slice { - // let min_bin = page_size / PAGE_SIZE; - // for i in min_bin..NUM_SPANS { - // let slice_opt = self.spans[i].pop_front(); - // if slice_opt.is_none() { - // continue; - // } - // let mut slice = slice_opt.unwrap(); - // let mut j = i; - // while j > min_bin && slice.size >= 2 * page_size { - // // split slice - // let (slice1, slice2) = slice.split(); - // self.spans[i-1].push_back(slice2); - // slice = slice1; - // j -= 1; - // } - // self.allocated += slice.size; - // return slice; - // } - // // Allocate from arena - - // Slice {ptr: null_mut(), size: 0} - // } - pub fn reset(self: &mut Self) -> () { for page in self.pages.iter_mut() { - page.slice_count = 0; + page.slice_count = 1; page.slice_offset = 0; } } @@ -84,34 +60,54 @@ impl Segment { } /** - * Split `page` into 2, with the first partition having `num_slices` slices + * Split `page` into 2, with the first partition having `num_slices` pages. + * Returns a pointer to the first page of the second slice. 
*/ pub fn split_page(self: &mut Self, page: *mut Page, num_slices: usize) -> *mut Page { debug_assert_ne!(page, null_mut()); - let page_ref = unsafe {&mut (*page)}; - let base_page_ptr = self.pages[0].page_start; - debug_assert!(page_ref.page_start >= base_page_ptr); + let base_page_ptr = unsafe { (*page).page_start }; + let base_segment_page_ptr = self.pages[0].page_start; + debug_assert!(base_page_ptr >= base_segment_page_ptr); let index = unsafe { - page_ref.page_start.sub(base_page_ptr as usize) as usize / PAGE_SIZE + base_page_ptr.sub(base_segment_page_ptr as usize) as usize / PAGE_SIZE }; - debug_assert!(index + num_slices < PAGES_PER_SEGMENT); + + // Read original slice_count before modifying anything + let original_slice_count = unsafe { (*page).slice_count }; + debug_assert!(num_slices > 0 && num_slices < original_slice_count, + "num_slices: {}, slice_count: {}", num_slices, original_slice_count); + debug_assert!(index + original_slice_count <= PAGES_PER_SEGMENT); + // log::info!("[thread_id: {}, segment_id: {}] Splitting page with {} slices", self.thread_id, self.segment_id, original_slice_count); + /* - * ASSUMPTION: Pointer to the beginning of the slice is passed in free(). + * ASSUMPTION: Pointer to the beginning of the slice is passed in. * We don't need to modify all the intermediate pages while splitting. Only update the following: - * - slice_offsets for last pages in each slice. - * - slice_count for the first pages in each slice. + * - slice_offset for the first page of each slice (should be 0). + * - slice_offset for the last page of each slice. + * - slice_count for the first page of each slice. 
*/ - let last_page_in_slice1 = &mut self.pages[index + num_slices - 1]; - last_page_in_slice1.slice_offset = num_slices - 1; - - let last_page_in_slice2 = &mut self.pages[index + page_ref.slice_count - 1]; - last_page_in_slice2.slice_offset = page_ref.slice_count - num_slices - 1; - - let slice2 = &mut self.pages[index + num_slices]; - slice2.slice_offset = 0; - slice2.slice_count = page_ref.slice_count - num_slices; - page_ref.slice_count = num_slices; - slice2 as *mut Page + // Use raw pointers to avoid borrow checker issues with multiple mutable references + unsafe { + // Update slice1: the original slice becomes the first part + (*page).slice_offset = 0; + (*page).slice_count = num_slices; + + let pages_ptr = self.pages.as_mut_ptr(); + let last_page_in_slice1 = pages_ptr.add(index + num_slices - 1); + (*last_page_in_slice1).slice_offset = num_slices - 1; + + // Update slice2: the remaining pages become the second slice + let slice2_count = original_slice_count - num_slices; + let slice2 = pages_ptr.add(index + num_slices); + (*slice2).slice_offset = 0; + (*slice2).slice_count = slice2_count; + assert!((*slice2).block_size == 0, "block size: {}", (*slice2).block_size); + + let last_page_in_slice2 = pages_ptr.add(index + original_slice_count - 1); + (*last_page_in_slice2).slice_offset = slice2_count - 1; + + slice2 + } } pub fn coalesce_slices(self: &mut Self, left_slice: &mut Page, right_slice: &mut Page) { @@ -121,30 +117,36 @@ impl Segment { right_slice.page_start <= self.pages[PAGES_PER_SEGMENT - 1].page_start); let left_slice_idx = (left_slice.page_start as usize - self.pages[0].page_start as usize) / PAGE_SIZE; + let right_slice_idx = (right_slice.page_start as usize - self.pages[0].page_start as usize) / PAGE_SIZE; + debug_assert!(left_slice_idx + left_slice.slice_count == right_slice_idx, + "left slice count: {}, left slice idx: {}, right slice idx: {}, thread_id: {}", + left_slice.slice_count, left_slice_idx, right_slice_idx, self.thread_id); + 
debug_assert!(right_slice_idx + right_slice.slice_count <= PAGES_PER_SEGMENT); /* * ASSUMPTION: Pointer to the beginning of the slice is passed in free(). - * We don't need to modify all the intermediate pages while splitting. Only update the following: - * - slice_count for the first pages in combined slice. + * We don't need to modify all the intermediate pages while coalescing. Only update the following: + * - slice_count for the first page of the combined slice (left_slice). * - slice_offset for the last page in the combined slice. + * Note: right_slice becomes an intermediate page after merging, so we don't update its metadata. */ left_slice.slice_offset = 0; - right_slice.slice_offset = left_slice.slice_count; left_slice.slice_count += right_slice.slice_count; let last_page = &mut self.pages[left_slice_idx + left_slice.slice_count - 1]; last_page.slice_offset = left_slice.slice_count - 1; - } - pub fn debug_print(self: &mut Self) { - log::info!("------Segment debug print--------"); + pub fn check_valid_segment(self: &mut Self) { let mut idx = 0; while idx < PAGES_PER_SEGMENT { let page = &mut self.pages[idx]; - log::info!("Page {}: slice_count: {}, slice_offset: {}, block_size: {}", idx, page.slice_count, page.slice_offset, page.block_size); - idx += page.slice_count; + debug_assert!(page.slice_offset == 0 && idx + page.slice_count <= PAGES_PER_SEGMENT); + let slice_count = page.slice_count; + let last_page_in_slice = &mut self.pages[idx + slice_count - 1]; + debug_assert!(last_page_in_slice.slice_offset == slice_count - 1, + "slice count: {}, last page slice offset: {}, thread_id: {}", slice_count, last_page_in_slice.slice_offset, self.thread_id); + idx += slice_count; } - log::info!("------end--------"); } } \ No newline at end of file diff --git a/src/common/src/memory/tcache.rs b/src/common/src/memory/tcache.rs index b173a6e5..af7d50e2 100644 --- a/src/common/src/memory/tcache.rs +++ b/src/common/src/memory/tcache.rs @@ -118,15 +118,9 @@ impl TCache { 
return true; } - fn coalesce_slices(self: &mut Self, left_slice: &mut Page, right_slice: &mut Page) { - left_slice.slice_offset = 0; - right_slice.slice_offset = left_slice.slice_count; - left_slice.slice_count += right_slice.slice_count; - } - fn retire_segment(self: &mut Self, segment: *mut Segment) { - log::info!("Retiring segment from thread with id: {}", self.thread_id); - unsafe { (*segment).debug_print(); } + // log::info!("Retiring segment from thread with id: {}", self.thread_id); + unsafe { (*segment).check_valid_segment(); } self.stats.segments_retired += 1; let pages = unsafe { &mut (*segment).pages }; let mut slice_idx: usize = 0; @@ -134,7 +128,6 @@ impl TCache { self.remove_slice_from_span(&mut pages[slice_idx]); slice_idx += pages[slice_idx].slice_count; } - unsafe { (*segment).reset(); } let mut guard = self.arena.lock().unwrap(); guard.retire_segment(segment); } @@ -179,29 +172,30 @@ impl TCache { if next_slice <= (&mut segment.pages[PAGES_PER_SEGMENT - 1]) as *mut Page { let next_slice_ref = unsafe { &mut (*next_slice) }; if next_slice_ref.block_size == 0 { + // log::info!("[thread_id: {}, segment_id: {}] Merging released slice with next slice. 
Slice count of next slice: {}", self.thread_id, segment.segment_id, next_slice_ref.slice_count); // Page is not in use, remove it self.remove_slice_from_span(next_slice_ref); segment.coalesce_slices(page_ref, unsafe { &mut (*next_slice) }); } } - let mut prev_slice = page.wrapping_sub(1); - if prev_slice >= (&mut segment.pages[0]) as *mut Page { + if unsafe { page.offset_from(&mut segment.pages[0] as *mut Page) > 0 } { + let mut prev_slice = page.wrapping_sub(1); prev_slice = prev_slice.wrapping_sub(unsafe { (*prev_slice).slice_offset }); let prev_slice_ref = unsafe { &mut (*prev_slice) }; if prev_slice_ref.block_size == 0 { // Merge with the previous slice - log::info!("Slice count before merge: {}", prev_slice_ref.slice_count); + // log::info!("[thread_id: {}, segment_id: {}] Merging slice with previous slice. Slice count of previous slice: {}", self.thread_id, segment.segment_id, prev_slice_ref.slice_count); self.remove_slice_from_span(prev_slice_ref); segment.coalesce_slices(prev_slice_ref, page_ref); - log::info!("Slice count after merge: {}", prev_slice_ref.slice_count); let span_idx = Self::get_span_idx_from_slice_count(prev_slice_ref.slice_count); self.spans[span_idx].push(prev_slice); } + } else { + let span_idx = Self::get_span_idx_from_slice_count(page_ref.slice_count); + self.spans[span_idx].push(page); } - - let span_idx = Self::get_span_idx_from_slice_count(page_ref.slice_count); - self.spans[span_idx].push(page); + segment.check_valid_segment(); } fn cleanup_pages(self: &mut Self) { @@ -246,14 +240,16 @@ impl TCache { continue; } bin.remove(j); + let segment = Segment::get_segment_from_ptr(slice as *mut u8); + unsafe { + (*segment).allocated += num_slices_required; + } if num_slices_original > num_slices_required { // split slice - let segment = Segment::get_segment_from_ptr(slice as *mut u8); let next_slice = unsafe { (*segment).split_page(slice, num_slices_required) }; + debug_assert!(unsafe { (*slice).slice_count == num_slices_required}); + unsafe 
{ (*segment).check_valid_segment() } ; let bin = Self::get_span_idx_from_slice_count(num_slices_original - num_slices_required); - unsafe { - (*segment).allocated += num_slices_required; - } self.spans[bin].push(next_slice); } unsafe { @@ -273,6 +269,9 @@ impl TCache { page.slice_count = slice_count; page.slice_offset = 0; self.spans[span_idx].push(page as *mut Page); + + let last_page = &mut segment_ref.pages[PAGES_PER_SEGMENT - 1]; + last_page.slice_offset = PAGES_PER_SEGMENT - 1; } fn allocate_segment_from_arena(self: &mut Self, thread_id: usize) -> bool { From 4fde269ce2198d4c0e6945ec2c079120047ef611 Mon Sep 17 00:00:00 2001 From: proteet Date: Sat, 3 Jan 2026 00:47:03 -0700 Subject: [PATCH 08/20] Bug fixes This commit fixes the following bugs: - Sets file offset correctly in read task for fixed buffers - Sets correct block size in tcache - Clears free list of page before setting block size --- src/common/src/memory/page.rs | 4 +-- src/common/src/memory/pool.rs | 9 +++++-- src/common/src/memory/tcache.rs | 39 ++++++++++++++++++---------- src/parquet/src/io/io_uring/tasks.rs | 5 ++-- 4 files changed, 35 insertions(+), 22 deletions(-) diff --git a/src/common/src/memory/page.rs b/src/common/src/memory/page.rs index 0dfebf01..3172b887 100644 --- a/src/common/src/memory/page.rs +++ b/src/common/src/memory/page.rs @@ -41,6 +41,7 @@ impl Page { self.block_size = block_size; let mut offset: usize = 0; let mut guard = self.free_list.lock().unwrap(); + guard.clear(); while offset < self.capacity { let ptr = unsafe { self.page_start.add(offset) }; guard.push_back(Block {ptr}); @@ -48,9 +49,6 @@ impl Page { } } - /** - * Returns (block, buffer id) pair - */ #[inline] pub fn get_free_block(self: &mut Self) -> *mut u8 { let mut guard = self.free_list.lock().unwrap(); diff --git a/src/common/src/memory/pool.rs b/src/common/src/memory/pool.rs index dacd124c..bfc70694 100644 --- a/src/common/src/memory/pool.rs +++ b/src/common/src/memory/pool.rs @@ -13,6 +13,7 @@ static 
FIXED_BUFFER_POOL: OnceLock = OnceLock::new(); pub const FIXED_BUFFER_SIZE_BYTES: usize = 1 << 20; pub const FIXED_BUFFER_BITS: u32 = FIXED_BUFFER_SIZE_BYTES.trailing_zeros(); +#[derive(Debug)] pub struct FixedBuffer { pub ptr: *mut u8, pub buf_id: usize, @@ -90,8 +91,10 @@ impl FixedBufferPool { } pub fn malloc(size: usize) -> *mut u8 { + let cpu = unsafe { libc::sched_getcpu() }; let local_cache = Self::get_thread_local_cache(); let ptr = local_cache.lock().unwrap().allocate(size); + log::debug!("Allocated pointer: {:?}, size: {}, cpu: {}", ptr, size, cpu); if ptr.is_null() { let pool = FIXED_BUFFER_POOL.get().unwrap(); log::info!("Foreign frees: {}", pool.foreign_free.load(Ordering::Relaxed)); @@ -151,11 +154,12 @@ impl FixedBufferPool { fn free(ptr: *mut u8) { let segment_ptr = Segment::get_segment_from_ptr(ptr); let page_ptr = unsafe { (*segment_ptr).get_page_from_ptr(ptr) }; + let thread_id = unsafe { (*segment_ptr).thread_id }; + log::debug!("Freed pointer: {:?}, size: {}, owner thread id: {}", ptr, unsafe { (*page_ptr).block_size }, thread_id); unsafe { (*page_ptr).free(ptr); } - // If page is local and unused after free, return it to segment - let thread_id = unsafe { (*segment_ptr).thread_id }; + // If page is local and unused after free, return it to segment let cur_cpu = unsafe { libc::sched_getcpu() as usize }; if cur_cpu == thread_id { let should_free_page = unsafe { (*page_ptr).used.load(Ordering::Relaxed) == 0 }; @@ -165,6 +169,7 @@ impl FixedBufferPool { guard.retire_page(page_ptr); } } else { + log::debug!("Freeing from foreign thread"); let pool = FIXED_BUFFER_POOL.get().unwrap(); pool.foreign_free.fetch_add(1, Ordering::Relaxed); } diff --git a/src/common/src/memory/tcache.rs b/src/common/src/memory/tcache.rs index af7d50e2..4504ba3c 100644 --- a/src/common/src/memory/tcache.rs +++ b/src/common/src/memory/tcache.rs @@ -110,12 +110,14 @@ impl TCache { fn remove_slice_from_span(self: &mut Self, slice: &mut Page) -> bool { let span_idx = 
Self::get_span_idx_from_slice_count(slice.slice_count); for i in 0..self.spans[span_idx].len() { - if self.spans[span_idx][i] == slice { + let page_start = unsafe { (*self.spans[span_idx][i]).page_start }; + if page_start == slice.page_start { self.spans[span_idx].remove(i); - break; + return true; } } - return true; + log::info!("[thread_id: {}] Slice not found in span with index: {}, slice count: {}", self.thread_id, span_idx, slice.slice_count); + return false; } fn retire_segment(self: &mut Self, segment: *mut Segment) { @@ -125,7 +127,7 @@ impl TCache { let pages = unsafe { &mut (*segment).pages }; let mut slice_idx: usize = 0; while slice_idx < PAGES_PER_SEGMENT { - self.remove_slice_from_span(&mut pages[slice_idx]); + assert!(pages[slice_idx].block_size != 0 || self.remove_slice_from_span(&mut pages[slice_idx])); slice_idx += pages[slice_idx].slice_count; } let mut guard = self.arena.lock().unwrap(); @@ -153,11 +155,11 @@ impl TCache { } pub(crate) fn retire_page(self: &mut Self, page: *mut Page) { + assert!(unsafe { (*page).used.load(Ordering::Relaxed) == 0 }); self.stats.pages_retired += 1; self.remove_page_from_used_queue(page); self.remove_page_from_free_queue(page); let page_ref = unsafe { &mut (*page) }; - page_ref.block_size = 0; let segment_ptr = Segment::get_segment_from_ptr(page as *mut u8); let segment = unsafe { &mut *segment_ptr }; @@ -167,35 +169,42 @@ impl TCache { self.retire_segment(segment_ptr); return; } + page_ref.block_size = 0; let next_slice = page.wrapping_add(page_ref.slice_count); if next_slice <= (&mut segment.pages[PAGES_PER_SEGMENT - 1]) as *mut Page { let next_slice_ref = unsafe { &mut (*next_slice) }; if next_slice_ref.block_size == 0 { - // log::info!("[thread_id: {}, segment_id: {}] Merging released slice with next slice. Slice count of next slice: {}", self.thread_id, segment.segment_id, next_slice_ref.slice_count); + log::debug!("[thread_id: {}] Merging released slice with next slice. 
Slice count of next slice: {}", self.thread_id, next_slice_ref.slice_count); // Page is not in use, remove it - self.remove_slice_from_span(next_slice_ref); + assert!(self.remove_slice_from_span(next_slice_ref)); segment.coalesce_slices(page_ref, unsafe { &mut (*next_slice) }); } } + let mut merged_with_prev = false; + if unsafe { page.offset_from(&mut segment.pages[0] as *mut Page) > 0 } { let mut prev_slice = page.wrapping_sub(1); prev_slice = prev_slice.wrapping_sub(unsafe { (*prev_slice).slice_offset }); let prev_slice_ref = unsafe { &mut (*prev_slice) }; if prev_slice_ref.block_size == 0 { // Merge with the previous slice - // log::info!("[thread_id: {}, segment_id: {}] Merging slice with previous slice. Slice count of previous slice: {}", self.thread_id, segment.segment_id, prev_slice_ref.slice_count); - self.remove_slice_from_span(prev_slice_ref); + log::info!("[thread_id: {}] Merging slice with previous slice. Slice count of previous slice: {}", self.thread_id, prev_slice_ref.slice_count); + assert!(self.remove_slice_from_span(prev_slice_ref)); segment.coalesce_slices(prev_slice_ref, page_ref); let span_idx = Self::get_span_idx_from_slice_count(prev_slice_ref.slice_count); self.spans[span_idx].push(prev_slice); + log::debug!("[thread_id: {}] Added page with slice count {} to span with index: {}", self.thread_id, prev_slice_ref.slice_count, span_idx); + merged_with_prev = true; } - } else { + } + if !merged_with_prev { let span_idx = Self::get_span_idx_from_slice_count(page_ref.slice_count); self.spans[span_idx].push(page); + log::debug!("[thread_id: {}] Added page with slice count {} to span with index: {}", self.thread_id, page_ref.slice_count, span_idx); } - segment.check_valid_segment(); + segment.check_valid_segment(); } fn cleanup_pages(self: &mut Self) { @@ -251,8 +260,9 @@ impl TCache { unsafe { (*segment).check_valid_segment() } ; let bin = Self::get_span_idx_from_slice_count(num_slices_original - num_slices_required); 
self.spans[bin].push(next_slice); + log::debug!("[thread_id: {}] Added page with slice count {} to span with index: {}", self.thread_id, num_slices_original - num_slices_required, bin); } - unsafe { + unsafe { (*slice).set_block_size(block_size); } return slice; @@ -283,7 +293,7 @@ impl TCache { if segment_opt.is_none() { return false; } - log::info!("Allocating segment to thread with id: {}", thread_id); + // log::info!("Allocating segment to thread with id: {}", thread_id); unsafe { (*segment_opt.unwrap()).thread_id = thread_id; } @@ -324,6 +334,7 @@ impl TCache { let block_size = SIZE_CLASSES[size_class]; let mut free_page = self.free_pages[size_class]; if !free_page.is_null() { + debug_assert_eq!(unsafe {(*free_page).block_size}, block_size); // allocate from free page let page = free_page.clone(); unsafe { @@ -354,7 +365,7 @@ impl TCache { if !res { return null_mut() } - free_page = self.find_page_from_spans(1, size); + free_page = self.find_page_from_spans(1, block_size); assert_ne!(free_page, null_mut()); let free_block = unsafe { (*free_page).get_free_block() }; self.free_pages[size_class] = free_page; diff --git a/src/parquet/src/io/io_uring/tasks.rs b/src/parquet/src/io/io_uring/tasks.rs index 68efb295..3dc5d561 100644 --- a/src/parquet/src/io/io_uring/tasks.rs +++ b/src/parquet/src/io/io_uring/tasks.rs @@ -294,7 +294,8 @@ impl IoTask for FixedFileReadTask { fn prepare_sqe(&mut self) -> Vec { let buffers = FixedBufferPool::get_fixed_buffers(&self.fixed_buffer); let mut sqes = Vec::::new(); - let mut file_offset = 0; + let (start_padding, _) = self.padding(); + let mut file_offset = self.range.start - start_padding as u64; for buffer in buffers { let sqe = opcode::ReadFixed::new( io_uring::types::Fd(self.file), @@ -327,7 +328,6 @@ pub(crate) struct FileWriteTask { data: Bytes, fd: RawFd, error: Option, - use_fixed_buffers: bool, } impl FileWriteTask { @@ -336,7 +336,6 @@ impl FileWriteTask { data, fd, error: None, - use_fixed_buffers, } } From 
faec133843a3b28ae3b64e4a39a9f01c94d4b083 Mon Sep 17 00:00:00 2001 From: proteet Date: Fri, 9 Jan 2026 07:15:28 -0700 Subject: [PATCH 09/20] Sharded free list This commit implements a lock-free queue to hold frees from different threads. Freed blocks are cleaned up later, either through periodic cleanup operations or when under memory pressure. --- benchmark/in_process.rs | 9 ++- benchmark/src/inprocess_runner.rs | 9 +++ src/common/Cargo.toml | 1 + src/common/src/memory/page.rs | 67 ++++++++++------ src/common/src/memory/pool.rs | 32 ++++++-- src/common/src/memory/tcache.rs | 129 +++++++++++++++++++----------- src/local/src/lib.rs | 11 ++- 7 files changed, 179 insertions(+), 79 deletions(-) diff --git a/benchmark/in_process.rs b/benchmark/in_process.rs index 6c7b7b89..a620c1a8 100644 --- a/benchmark/in_process.rs +++ b/benchmark/in_process.rs @@ -4,7 +4,7 @@ use fastrace::prelude::*; use liquid_cache_benchmarks::{ BenchmarkManifest, InProcessBenchmarkMode, InProcessBenchmarkRunner, setup_observability, }; -use liquid_cache_common::IoMode; +use liquid_cache_common::{IoMode, memory::pool::FixedBufferPool}; use mimalloc::MiMalloc; use serde::Serialize; use std::path::PathBuf; @@ -66,6 +66,9 @@ struct InProcessBenchmark { /// IO mode, available options: uring, uring-direct, std-blocking, tokio, std-spawn-blocking #[arg(long = "io-mode", default_value = "uring-multi-async")] io_mode: IoMode, + + #[arg(long = "fixed-buffer-pool-size-mb", default_value = "0")] + fixed_buffer_pool_size_mb: usize, } impl InProcessBenchmark { @@ -83,7 +86,8 @@ impl InProcessBenchmark { .with_cache_dir(self.cache_dir.clone()) .with_query_filter(self.query_index) .with_io_mode(self.io_mode) - .with_output_dir(self.output_dir.clone()); + .with_output_dir(self.output_dir.clone()) + .with_fixed_buffer_pool_size_mb(self.fixed_buffer_pool_size_mb); runner.run(manifest, self, output).await?; Ok(()) } @@ -97,6 +101,7 @@ async fn main() -> Result<()> { let _guard = root.set_local_parent(); 
benchmark.run().await?; + FixedBufferPool::print_stats(); fastrace::flush(); Ok(()) } diff --git a/benchmark/src/inprocess_runner.rs b/benchmark/src/inprocess_runner.rs index 2891b331..d8fff0b6 100644 --- a/benchmark/src/inprocess_runner.rs +++ b/benchmark/src/inprocess_runner.rs @@ -108,6 +108,7 @@ pub struct InProcessBenchmarkRunner { pub cache_dir: Option, pub io_mode: IoMode, pub output_dir: Option, + pub fixed_buffer_pool_size_mb: usize, } impl Default for InProcessBenchmarkRunner { @@ -129,6 +130,7 @@ impl InProcessBenchmarkRunner { cache_dir: None, io_mode: IoMode::default(), output_dir: None, + fixed_buffer_pool_size_mb: 0, } } @@ -182,6 +184,11 @@ impl InProcessBenchmarkRunner { self } + pub fn with_fixed_buffer_pool_size_mb(mut self, fixed_buffer_pool_size_mb: usize) -> Self { + self.fixed_buffer_pool_size_mb = fixed_buffer_pool_size_mb; + self + } + #[fastrace::trace] async fn setup_context( &self, @@ -245,6 +252,7 @@ impl InProcessBenchmarkRunner { .with_cache_policy(Box::new(LiquidPolicy::new())) .with_squeeze_policy(Box::new(TranscodeSqueezeEvict)) .with_io_mode(self.io_mode) + .with_fixed_buffer_pool_size_mb(self.fixed_buffer_pool_size_mb) .build(session_config)?; (v.0, Some(v.1)) } @@ -255,6 +263,7 @@ impl InProcessBenchmarkRunner { .with_cache_policy(Box::new(LiquidPolicy::new())) .with_squeeze_policy(Box::new(TranscodeEvict)) .with_io_mode(self.io_mode) + .with_fixed_buffer_pool_size_mb(self.fixed_buffer_pool_size_mb) .build(session_config)?; (v.0, Some(v.1)) } diff --git a/src/common/Cargo.toml b/src/common/Cargo.toml index f93cad81..a75bfb3a 100644 --- a/src/common/Cargo.toml +++ b/src/common/Cargo.toml @@ -12,6 +12,7 @@ arrow-flight = { workspace = true } async-trait = { workspace = true } bytes = { workspace = true } chrono = "0.4.42" +crossbeam = "0.8.4" futures = { workspace = true } io-uring = "0.7.11" libc = "0.2.177" diff --git a/src/common/src/memory/page.rs b/src/common/src/memory/page.rs index 3172b887..b3565fe1 100644 --- 
a/src/common/src/memory/page.rs +++ b/src/common/src/memory/page.rs @@ -1,4 +1,6 @@ -use std::{collections::VecDeque, ptr::null_mut, sync::{Mutex, atomic::{AtomicUsize, Ordering}}}; +use std::{collections::VecDeque, ptr::null_mut}; + +use crate::memory::tcache::MIN_SIZE_FROM_PAGES; #[derive(Clone, Copy)] pub struct Block { @@ -10,10 +12,9 @@ pub const PAGE_SIZE: usize = 256<<10; pub struct Page { pub(crate) block_size: usize, // Size of objects that are being allocated to this page // TODO(): Remove dependency on dynamically allocated memory - free_list: Mutex>, - pub(crate) used: AtomicUsize, - // local_free_list: VecDeque, - // thread_free_list: VecDeque, + free_list: VecDeque, + pub(crate) used: usize, + pub(crate) thread_free_list: crossbeam::queue::ArrayQueue, pub(crate) capacity: usize, pub(crate) slice_count: usize, // No. of pages in the slice containing this page pub(crate) slice_offset: usize, // Offset of this page from the start of this slice @@ -22,14 +23,11 @@ pub struct Page { impl Page { pub fn from_slice(slice: Slice) -> Page { - // let mut start_ptr = slice.ptr; - let free_list = VecDeque::::new(); Page { block_size: 0usize, - free_list: Mutex::new(free_list), - used: AtomicUsize::new(0), - // local_free_list: VecDeque::new(), - // thread_free_list: VecDeque::new(), + free_list: VecDeque::::with_capacity(PAGE_SIZE/MIN_SIZE_FROM_PAGES), + used: 0, + thread_free_list: crossbeam::queue::ArrayQueue::new(PAGE_SIZE/MIN_SIZE_FROM_PAGES), capacity: slice.size, slice_count: 1, slice_offset: 0, @@ -40,43 +38,62 @@ impl Page { pub fn set_block_size(self: &mut Self, block_size: usize) { self.block_size = block_size; let mut offset: usize = 0; - let mut guard = self.free_list.lock().unwrap(); - guard.clear(); + self.free_list.clear(); while offset < self.capacity { let ptr = unsafe { self.page_start.add(offset) }; - guard.push_back(Block {ptr}); + self.free_list.push_back(Block {ptr}); offset += self.block_size; } } #[inline] pub fn get_free_block(self: &mut 
Self) -> *mut u8 { - let mut guard = self.free_list.lock().unwrap(); - let block = guard.pop_front(); + let block = self.free_list.pop_front(); if block.is_none() { return null_mut() } - self.used.fetch_add(1usize, Ordering::Relaxed); + self.used += 1; block.unwrap().ptr } - #[inline] + #[inline(always)] pub fn is_full(self: &Self) -> bool { - let guard = self.free_list.lock().unwrap(); - guard.is_empty() + self.free_list.is_empty() } - #[inline] + #[inline(always)] + pub fn is_unused(self: &Self) -> bool { + self.used == 0 + } + + #[inline(always)] pub fn get_size(self: &Self) -> usize { self.capacity } - #[inline] + /// Pointer freed on the same core + #[inline(always)] pub fn free(self: &mut Self, ptr: *mut u8) { + self.free_list.push_back(Block {ptr}); + self.used -= 1; + } + + /// Pointer freed on a different core + #[inline(always)] + pub(crate) fn foreign_free(self: &mut Self, ptr: *mut u8) { let blk = Block {ptr}; - let mut guard = self.free_list.lock().unwrap(); - guard.push_back(blk); - self.used.fetch_sub(1usize, Ordering::Relaxed); + let r = self.thread_free_list.push(blk); + debug_assert!(r.is_ok()); + } + + /// Collect pointers freed by other threads + #[inline] + pub(crate) fn collect_foreign_frees(self: &mut Self) { + while !self.thread_free_list.is_empty() { + let blk = self.thread_free_list.pop().unwrap(); + self.free_list.push_back(blk); + self.used -= 1; + } } } diff --git a/src/common/src/memory/pool.rs b/src/common/src/memory/pool.rs index bfc70694..b1d4e1ad 100644 --- a/src/common/src/memory/pool.rs +++ b/src/common/src/memory/pool.rs @@ -156,24 +156,46 @@ impl FixedBufferPool { let page_ptr = unsafe { (*segment_ptr).get_page_from_ptr(ptr) }; let thread_id = unsafe { (*segment_ptr).thread_id }; log::debug!("Freed pointer: {:?}, size: {}, owner thread id: {}", ptr, unsafe { (*page_ptr).block_size }, thread_id); - unsafe { - (*page_ptr).free(ptr); - } + // If page is local and unused after free, return it to segment let cur_cpu = unsafe { 
libc::sched_getcpu() as usize }; if cur_cpu == thread_id { - let should_free_page = unsafe { (*page_ptr).used.load(Ordering::Relaxed) == 0 }; + unsafe { + (*page_ptr).free(ptr); + } + let should_free_page = unsafe { (*page_ptr).is_unused() }; if should_free_page { let local_cache = Self::get_thread_local_cache(); let mut guard = local_cache.lock().unwrap(); guard.retire_page(page_ptr); } } else { - log::debug!("Freeing from foreign thread"); + unsafe { (*page_ptr).foreign_free(ptr); } let pool = FIXED_BUFFER_POOL.get().unwrap(); pool.foreign_free.fetch_add(1, Ordering::Relaxed); } } + + pub fn print_stats() { + if FIXED_BUFFER_POOL.get().is_none() { + return + } + let num_cpus = std::thread::available_parallelism().unwrap(); + let mut agg_stats = TCacheStats::new(); + for i in 0..num_cpus.get() { + let stats = Self::get_stats(i); + agg_stats.allocations_from_arena += stats.allocations_from_arena; + agg_stats.allocations_from_pages += stats.allocations_from_pages; + agg_stats.allocations_from_segment += stats.allocations_from_segment; + agg_stats.fast_allocations += stats.fast_allocations; + agg_stats.pages_retired += stats.pages_retired; + agg_stats.segments_retired += stats.segments_retired; + agg_stats.total_segments_allocated += stats.total_segments_allocated; + agg_stats.unsuccessful_allocations += stats.unsuccessful_allocations; + agg_stats.total_allocations += stats.total_allocations; + } + agg_stats.print(); + } } impl Drop for FixedBufferPool { diff --git a/src/common/src/memory/tcache.rs b/src/common/src/memory/tcache.rs index 4504ba3c..7d217d8b 100644 --- a/src/common/src/memory/tcache.rs +++ b/src/common/src/memory/tcache.rs @@ -1,6 +1,6 @@ use std::{ ptr::null_mut, - sync::{Arc, Mutex, atomic::Ordering}, + sync::{Arc, Mutex}, }; use crate::memory::{ @@ -29,6 +29,7 @@ const SEGMENT_BINS: usize = (SEGMENT_SIZE/PAGE_SIZE).ilog2() as usize + 1; pub(crate) struct TCacheStats { // Allocation stats pub(crate) total_allocations: usize, + pub(crate) 
unsuccessful_allocations: usize, pub(crate) total_segments_allocated: usize, pub(crate) fast_allocations: usize, // Allocations from self.free_pages pub(crate) allocations_from_pages: usize, // Allocations from self.used_pages @@ -49,6 +50,7 @@ impl TCacheStats { #[allow(unused)] pub(crate) fn print(self: &Self) { println!("Total allocations: {}", self.total_allocations); + println!("Unsuccessful allocations: {}", self.unsuccessful_allocations); println!("Fast allocations: {}", self.fast_allocations); println!("Allocations from pages: {}", self.allocations_from_pages); println!("Allocations from segment: {}", self.allocations_from_segment); @@ -60,7 +62,8 @@ impl TCacheStats { pub(crate) struct TCache { free_pages: [*mut Page; NUM_SIZE_CLASSES], - used_pages: [Vec<*mut Page>; NUM_SIZE_CLASSES], + // Last size class holds slices that serve large allocations (>256KB) + used_pages: [Vec<*mut Page>; NUM_SIZE_CLASSES + 1], // TODO: Use a linked list for O(1) deletion spans: [Vec<*mut Page>; SEGMENT_BINS], arena: Arc>, @@ -75,7 +78,7 @@ impl TCache { pub(crate) fn new(arena: Arc>, thread_id: usize) -> TCache { TCache { free_pages: [const { null_mut() }; NUM_SIZE_CLASSES], - used_pages: [const { Vec::<*mut Page>::new() }; NUM_SIZE_CLASSES], + used_pages: [const { Vec::<*mut Page>::new() }; NUM_SIZE_CLASSES + 1], spans: [const { Vec::<*mut Page>::new() }; SEGMENT_BINS], arena: arena.clone(), thread_id, @@ -91,13 +94,6 @@ impl TCache { (size.next_power_of_two() / MIN_SIZE_FROM_PAGES).trailing_zeros() as usize } - // #[inline] - // fn get_span_idx_from_size(size: usize) -> usize { - // ((size + PAGE_SIZE - 1) / PAGE_SIZE) - // .next_power_of_two() - // .trailing_zeros() as usize - // } - /** * Get the smallest bin which can hold contiguous runs of `slice_count` pages */ @@ -135,9 +131,9 @@ impl TCache { } fn remove_page_from_used_queue(self: &mut Self, page_ptr: *mut Page) { - let size_class = Self::get_size_class(unsafe { (*page_ptr).block_size }); + let mut size_class = 
Self::get_size_class(unsafe { (*page_ptr).block_size }); if size_class >= NUM_SIZE_CLASSES { - return + size_class = NUM_SIZE_CLASSES; } for i in 0..self.used_pages[size_class].len() { if self.used_pages[size_class][i] == page_ptr { @@ -155,7 +151,7 @@ impl TCache { } pub(crate) fn retire_page(self: &mut Self, page: *mut Page) { - assert!(unsafe { (*page).used.load(Ordering::Relaxed) == 0 }); + assert!(unsafe { (*page).is_unused() }); self.stats.pages_retired += 1; self.remove_page_from_used_queue(page); self.remove_page_from_free_queue(page); @@ -190,7 +186,7 @@ impl TCache { let prev_slice_ref = unsafe { &mut (*prev_slice) }; if prev_slice_ref.block_size == 0 { // Merge with the previous slice - log::info!("[thread_id: {}] Merging slice with previous slice. Slice count of previous slice: {}", self.thread_id, prev_slice_ref.slice_count); + log::debug!("[thread_id: {}] Merging slice with previous slice. Slice count of previous slice: {}", self.thread_id, prev_slice_ref.slice_count); assert!(self.remove_slice_from_span(prev_slice_ref)); segment.coalesce_slices(prev_slice_ref, page_ref); let span_idx = Self::get_span_idx_from_slice_count(prev_slice_ref.slice_count); @@ -208,13 +204,28 @@ impl TCache { } fn cleanup_pages(self: &mut Self) { + for i in 0..self.free_pages.len() { + let page = self.free_pages[i]; + if page != null_mut() { + unsafe { + (*page).collect_foreign_frees(); + if (*page).is_unused() { + self.retire_page(page); + self.free_pages[i] = null_mut(); + } + } + } + } for i in 0..self.used_pages.len() { - for page_idx in 0..self.used_pages[i].len() { + let mut page_idx = 0; + while page_idx < self.used_pages[i].len() { let page = self.used_pages[i][page_idx]; unsafe { - if (*page).used.load(Ordering::Relaxed) == 0 { + (*page).collect_foreign_frees(); + if (*page).is_unused() { self.retire_page(page); - self.used_pages[i].remove(page_idx); + } else { + page_idx += 1; } } } @@ -224,6 +235,7 @@ impl TCache { fn find_page_from_used(self: &mut Self, bin: 
usize) -> *mut u8 { for i in 0..self.used_pages[bin].len() { unsafe { + (*self.used_pages[bin][i]).collect_foreign_frees(); if (*self.used_pages[bin][i]).is_full() { continue; } @@ -262,7 +274,7 @@ impl TCache { self.spans[bin].push(next_slice); log::debug!("[thread_id: {}] Added page with slice count {} to span with index: {}", self.thread_id, num_slices_original - num_slices_required, bin); } - unsafe { + unsafe { (*slice).set_block_size(block_size); } return slice; @@ -302,32 +314,51 @@ impl TCache { true } - pub(crate) fn allocate(self: &mut Self, size: usize) -> *mut u8 { - self.stats.total_allocations += 1; - if size > PAGE_SIZE { - // Directly get page from segment - let num_pages = (size + PAGE_SIZE - 1) / PAGE_SIZE; - let block_size = num_pages * PAGE_SIZE; - let mut free_page = self.find_page_from_spans(num_pages, block_size); - if free_page != null_mut() { - self.stats.allocations_from_segment += 1; - let free_block = unsafe { (*free_page).get_free_block() }; - return free_block - } - self.stats.allocations_from_arena += 1; - let res = self.allocate_segment_from_arena(self.thread_id); - if !res { - return null_mut() - } - free_page = self.find_page_from_spans(num_pages, block_size); - if free_page == null_mut() { - return null_mut() - } - debug_assert_eq!(block_size, unsafe { (*free_page).block_size }); - assert_ne!(free_page, null_mut()); + fn allocate_large(self: &mut Self, size: usize) -> *mut u8 { + // Directly get page from segment + let num_pages = (size + PAGE_SIZE - 1) / PAGE_SIZE; + let block_size = num_pages * PAGE_SIZE; + let mut free_page = self.find_page_from_spans(num_pages, block_size); + if free_page != null_mut() { + self.stats.allocations_from_segment += 1; + let free_block = unsafe { (*free_page).get_free_block() }; + return free_block + } + self.cleanup_pages(); + // Retry after cleanup + free_page = self.find_page_from_spans(num_pages, block_size); + if free_page != null_mut() { + self.stats.allocations_from_segment += 1; + 
self.used_pages[NUM_SIZE_CLASSES].push(free_page); let free_block = unsafe { (*free_page).get_free_block() }; return free_block } + + let res = self.allocate_segment_from_arena(self.thread_id); + if !res { + return null_mut() + } + self.stats.allocations_from_arena += 1; + free_page = self.find_page_from_spans(num_pages, block_size); + if free_page == null_mut() { + self.stats.unsuccessful_allocations += 1; + return null_mut() + } + self.used_pages[NUM_SIZE_CLASSES].push(free_page); + assert_ne!(free_page, null_mut()); + let free_block = unsafe { (*free_page).get_free_block() }; + return free_block + } + + pub(crate) fn allocate(self: &mut Self, size: usize) -> *mut u8 { + self.stats.total_allocations = self.stats.total_allocations.wrapping_add(1); + if self.stats.total_allocations & 0x7f == 0 { + // Periodically cleanup pages + self.cleanup_pages(); + } + if size > PAGE_SIZE { + return self.allocate_large(size) + } let size_class = Self::get_size_class(size); debug_assert!(size_class < NUM_SIZE_CLASSES); @@ -338,12 +369,17 @@ impl TCache { // allocate from free page let page = free_page.clone(); unsafe { - if (*page).is_full() { - self.used_pages[size_class].push(page); - self.free_pages[size_class] = null_mut(); - } else { + if !(*page).is_full() { self.stats.fast_allocations += 1; return (*page).get_free_block() + } else { + // Try collecting frees from other threads and retrying + (*page).collect_foreign_frees(); + if !(*page).is_full() { + return (*page).get_free_block() + } + self.used_pages[size_class].push(page); + self.free_pages[size_class] = null_mut(); } } } @@ -360,11 +396,12 @@ impl TCache { return free_block; } // No space available in segments, allocate a new one - self.stats.allocations_from_arena += 1; let res = self.allocate_segment_from_arena(self.thread_id); if !res { + self.stats.unsuccessful_allocations += 1; return null_mut() } + self.stats.allocations_from_arena += 1; free_page = self.find_page_from_spans(1, block_size); 
assert_ne!(free_page, null_mut()); let free_block = unsafe { (*free_page).get_free_block() }; diff --git a/src/local/src/lib.rs b/src/local/src/lib.rs index 107016a4..2f7ec709 100644 --- a/src/local/src/lib.rs +++ b/src/local/src/lib.rs @@ -69,6 +69,8 @@ pub struct LiquidCacheLocalBuilder { span: fastrace::Span, io_mode: IoMode, + + fixed_buffer_pool_size_mb: usize, } impl Default for LiquidCacheLocalBuilder { @@ -81,6 +83,7 @@ impl Default for LiquidCacheLocalBuilder { squeeze_policy: Box::new(TranscodeSqueezeEvict), span: fastrace::Span::enter_with_local_parent("liquid_cache_local_builder"), io_mode: IoMode::StdBlocking, + fixed_buffer_pool_size_mb: 0, } } } @@ -133,6 +136,12 @@ impl LiquidCacheLocalBuilder { self } + /// Set size of fixed buffer pool + pub fn with_fixed_buffer_pool_size_mb(mut self, fixed_buffer_pool_size_mb: usize) -> Self { + self.fixed_buffer_pool_size_mb = fixed_buffer_pool_size_mb; + self + } + /// Build a SessionContext with liquid cache configured /// Returns the SessionContext and the liquid cache reference pub fn build(self, mut config: SessionConfig) -> Result<(SessionContext, LiquidCacheRef)> { @@ -153,7 +162,7 @@ impl LiquidCacheLocalBuilder { self.cache_policy, self.squeeze_policy, self.io_mode, - 0, + self.fixed_buffer_pool_size_mb, ); let cache_ref = Arc::new(cache); From f67f1e3a42905bd6c38f6f6d33519372d165bf43 Mon Sep 17 00:00:00 2001 From: proteet Date: Fri, 9 Jan 2026 13:50:54 -0700 Subject: [PATCH 10/20] Change page size to 64KB --- src/common/src/memory/page.rs | 7 +------ src/common/src/memory/pool.rs | 5 ++--- src/common/src/memory/segment.rs | 20 ++++++++++++-------- src/common/src/memory/tcache.rs | 2 -- 4 files changed, 15 insertions(+), 19 deletions(-) diff --git a/src/common/src/memory/page.rs b/src/common/src/memory/page.rs index b3565fe1..7f8f122d 100644 --- a/src/common/src/memory/page.rs +++ b/src/common/src/memory/page.rs @@ -7,7 +7,7 @@ pub struct Block { ptr: *mut u8, } -pub const PAGE_SIZE: usize = 256<<10; 
+pub const PAGE_SIZE: usize = 64<<10; // 64KB pub struct Page { pub(crate) block_size: usize, // Size of objects that are being allocated to this page @@ -66,11 +66,6 @@ impl Page { self.used == 0 } - #[inline(always)] - pub fn get_size(self: &Self) -> usize { - self.capacity - } - /// Pointer freed on the same core #[inline(always)] pub fn free(self: &mut Self, ptr: *mut u8) { diff --git a/src/common/src/memory/pool.rs b/src/common/src/memory/pool.rs index b1d4e1ad..25a98a5c 100644 --- a/src/common/src/memory/pool.rs +++ b/src/common/src/memory/pool.rs @@ -6,7 +6,7 @@ use std::{cmp::min, sync::{Arc, Mutex, OnceLock, atomic::{AtomicBool, AtomicU64, use futures::io; use io_uring::IoUring; -use crate::memory::{arena::Arena, page::PAGE_SIZE, segment::Segment, tcache::{TCache, TCacheStats}}; +use crate::memory::{arena::Arena, segment::Segment, tcache::{TCache, TCacheStats}}; static FIXED_BUFFER_POOL: OnceLock = OnceLock::new(); @@ -96,8 +96,7 @@ impl FixedBufferPool { let ptr = local_cache.lock().unwrap().allocate(size); log::debug!("Allocated pointer: {:?}, size: {}, cpu: {}", ptr, size, cpu); if ptr.is_null() { - let pool = FIXED_BUFFER_POOL.get().unwrap(); - log::info!("Foreign frees: {}", pool.foreign_free.load(Ordering::Relaxed)); + log::info!("Unsuccessful allocation of {} bytes", size); } ptr } diff --git a/src/common/src/memory/segment.rs b/src/common/src/memory/segment.rs index e56e247d..90b552f0 100644 --- a/src/common/src/memory/segment.rs +++ b/src/common/src/memory/segment.rs @@ -1,12 +1,12 @@ -use std::ptr::null_mut; +use std::ptr::{null_mut, write}; use crate::memory::{page::{PAGE_SIZE, Page, Slice}}; -pub const SEGMENT_SIZE: usize = 32 * 1024 * 1024; +pub const SEGMENT_SIZE: usize = 32 * 1024 * 1024; // 32 MB pub const SEGMENT_SIZE_BITS: usize = SEGMENT_SIZE.ilog2() as usize; // The metadata is stored at the beginning of the slice. 
So we don't get the entirety of it for pages -pub const PAGES_PER_SEGMENT: usize = (SEGMENT_SIZE / PAGE_SIZE) - 1; +pub const PAGES_PER_SEGMENT: usize = (SEGMENT_SIZE - 3 * size_of::()) / (PAGE_SIZE + size_of::()); pub struct Segment { pub(crate) allocated: usize, @@ -17,15 +17,19 @@ pub struct Segment { impl Segment { pub fn new_from_slice(slice: Slice) -> *mut Segment { - // First sizeof(Segment) bytes should hold the Segment object let segment_ptr = slice.ptr as *mut Segment; - let mut start_ptr = unsafe { slice.ptr.add(PAGE_SIZE) }; - + let segment_end_ptr = slice.ptr.wrapping_add(SEGMENT_SIZE); + let mut start_ptr = unsafe { segment_end_ptr.sub(PAGES_PER_SEGMENT * PAGE_SIZE) }; unsafe { + let pages_ptr = (*segment_ptr).pages.as_mut_ptr(); (*segment_ptr).allocated = 0; (*segment_ptr).num_slices = PAGES_PER_SEGMENT; - for i in 0..(*segment_ptr).num_slices { - (*segment_ptr).pages[i] = Page::from_slice(Slice {ptr: start_ptr, size: PAGE_SIZE}); + for i in 0..PAGES_PER_SEGMENT { + // Use ptr::write after dropping to initialize new Pages + write( + pages_ptr.add(i), + Page::from_slice(Slice {ptr: start_ptr, size: PAGE_SIZE}) + ); start_ptr = start_ptr.wrapping_add(PAGE_SIZE); } } diff --git a/src/common/src/memory/tcache.rs b/src/common/src/memory/tcache.rs index 7d217d8b..1c6ae55f 100644 --- a/src/common/src/memory/tcache.rs +++ b/src/common/src/memory/tcache.rs @@ -15,8 +15,6 @@ const SIZE_CLASSES: &'static [usize] = &[ 16 << 10, 32 << 10, 64 << 10, - 128 << 10, - 256 << 10, ]; const NUM_SIZE_CLASSES: usize = SIZE_CLASSES.len(); From 3f7d1d5e67d5c089f5789d280e729405f0bab8f4 Mon Sep 17 00:00:00 2001 From: proteet Date: Thu, 15 Jan 2026 23:59:48 -0700 Subject: [PATCH 11/20] Get io-uring working in polled mode --- src/parquet/src/io/io_backend.rs | 22 +++++--- .../src/io/io_uring/multi_async_uring.rs | 4 +- .../src/io/io_uring/multi_blocking_uring.rs | 4 +- src/parquet/src/io/io_uring/single_uring.rs | 4 +- src/parquet/src/io/io_uring/tasks.rs | 40 +++++++++----- 
src/parquet/src/io/io_uring/tests.rs | 8 +-- .../src/io/io_uring/thread_pool_uring.rs | 52 ++++++++++++------- src/storage/src/cache/core.rs | 6 +-- 8 files changed, 89 insertions(+), 51 deletions(-) diff --git a/src/parquet/src/io/io_backend.rs b/src/parquet/src/io/io_backend.rs index d1be9b7c..8e49282d 100644 --- a/src/parquet/src/io/io_backend.rs +++ b/src/parquet/src/io/io_backend.rs @@ -16,7 +16,7 @@ pub(super) async fn read( IoMode::Uring => { #[cfg(target_os = "linux")] { - super::io_uring::thread_pool_uring::read(path, range, false,true).await + super::io_uring::thread_pool_uring::read(path, range, false, false).await } #[cfg(not(target_os = "linux"))] { @@ -79,10 +79,20 @@ pub(super) async fn write( data: Bytes, ) -> Result<(), std::io::Error> { match io_mode { - IoMode::Uring | IoMode::UringDirect => { + IoMode::Uring => { #[cfg(target_os = "linux")] { - super::io_uring::thread_pool_uring::write(path, &data, false).await + super::io_uring::thread_pool_uring::write(path, &data, false, false).await + } + #[cfg(not(target_os = "linux"))] + { + panic!("io_uring modes are only supported on Linux"); + } + } + IoMode::UringDirect => { + #[cfg(target_os = "linux")] + { + super::io_uring::thread_pool_uring::write(path, &data, true, false).await } #[cfg(not(target_os = "linux"))] { @@ -92,7 +102,7 @@ pub(super) async fn write( IoMode::UringShared => { #[cfg(target_os = "linux")] { - super::io_uring::single_uring::write(path, &data).await + super::io_uring::single_uring::write(path, &data, false).await } #[cfg(not(target_os = "linux"))] { @@ -102,7 +112,7 @@ pub(super) async fn write( IoMode::UringBlocking => { #[cfg(target_os = "linux")] { - super::io_uring::multi_blocking_uring::write(path, &data) + super::io_uring::multi_blocking_uring::write(path, &data, false) } #[cfg(not(target_os = "linux"))] { @@ -112,7 +122,7 @@ pub(super) async fn write( IoMode::UringMultiAsync => { #[cfg(target_os = "linux")] { - super::io_uring::multi_async_uring::write(path, 
&data).await + super::io_uring::multi_async_uring::write(path, &data, false).await } #[cfg(not(target_os = "linux"))] { diff --git a/src/parquet/src/io/io_uring/multi_async_uring.rs b/src/parquet/src/io/io_uring/multi_async_uring.rs index 1f499d2e..89809a1f 100644 --- a/src/parquet/src/io/io_uring/multi_async_uring.rs +++ b/src/parquet/src/io/io_uring/multi_async_uring.rs @@ -256,7 +256,7 @@ pub(crate) async fn read( submit_async_task(read_task).await.into_result() } -pub(crate) async fn write(path: PathBuf, data: &Bytes) -> Result<(), std::io::Error> { +pub(crate) async fn write(path: PathBuf, data: &Bytes, direct_io: bool) -> Result<(), std::io::Error> { let file = OpenOptions::new() .create(true) .truncate(true) @@ -264,6 +264,6 @@ pub(crate) async fn write(path: PathBuf, data: &Bytes) -> Result<(), std::io::Er .open(path) .expect("failed to create file"); - let write_task = FileWriteTask::build(data.clone(), file.as_raw_fd(), false); + let write_task = FileWriteTask::build(data.clone(), file.as_raw_fd(), direct_io, false); submit_async_task(write_task).await.into_result() } diff --git a/src/parquet/src/io/io_uring/multi_blocking_uring.rs b/src/parquet/src/io/io_uring/multi_blocking_uring.rs index 955e7c93..1a677e3a 100644 --- a/src/parquet/src/io/io_uring/multi_blocking_uring.rs +++ b/src/parquet/src/io/io_uring/multi_blocking_uring.rs @@ -168,7 +168,7 @@ pub(crate) fn read( run_blocking_task(Box::new(read_task))?.into_result() } -pub(crate) fn write(path: PathBuf, data: &Bytes) -> Result<(), std::io::Error> { +pub(crate) fn write(path: PathBuf, data: &Bytes, direct_io: bool) -> Result<(), std::io::Error> { use std::fs::OpenOptions; let file = OpenOptions::new() @@ -176,6 +176,6 @@ pub(crate) fn write(path: PathBuf, data: &Bytes) -> Result<(), std::io::Error> { .truncate(true) .write(true) .open(path)?; - let write_task = FileWriteTask::build(data.clone(), file.as_raw_fd(), false); + let write_task = FileWriteTask::build(data.clone(), file.as_raw_fd(), 
direct_io, false); run_blocking_task(Box::new(write_task))?.into_result() } diff --git a/src/parquet/src/io/io_uring/single_uring.rs b/src/parquet/src/io/io_uring/single_uring.rs index d7867947..2f6c24f3 100644 --- a/src/parquet/src/io/io_uring/single_uring.rs +++ b/src/parquet/src/io/io_uring/single_uring.rs @@ -331,7 +331,7 @@ pub(crate) async fn read( submit_async_task(read_task).await.into_result() } -pub(crate) async fn write(path: PathBuf, data: &Bytes) -> Result<(), std::io::Error> { +pub(crate) async fn write(path: PathBuf, data: &Bytes, direct_io: bool) -> Result<(), std::io::Error> { let file = OpenOptions::new() .create(true) .truncate(true) @@ -339,6 +339,6 @@ pub(crate) async fn write(path: PathBuf, data: &Bytes) -> Result<(), std::io::Er .open(path) .expect("failed to create file"); - let write_task = FileWriteTask::build(data.clone(), file.as_raw_fd(), false); + let write_task = FileWriteTask::build(data.clone(), file.as_raw_fd(), direct_io, false); submit_async_task(write_task).await.into_result() } diff --git a/src/parquet/src/io/io_uring/tasks.rs b/src/parquet/src/io/io_uring/tasks.rs index 3dc5d561..35e0793b 100644 --- a/src/parquet/src/io/io_uring/tasks.rs +++ b/src/parquet/src/io/io_uring/tasks.rs @@ -1,13 +1,8 @@ use std::{ - any::Any, - ffi::CString, - fs, mem, - ops::Range, - os::{ + alloc::{Layout, alloc}, any::Any, error::Error, ffi::CString, fs, mem, ops::Range, os::{ fd::{AsRawFd, FromRawFd, RawFd}, unix::ffi::OsStringExt, - }, - path::PathBuf, + }, path::PathBuf }; use bytes::Bytes; @@ -325,16 +320,33 @@ impl IoTask for FixedFileReadTask { #[derive(Debug)] pub(crate) struct FileWriteTask { - data: Bytes, + data: *const u8, fd: RawFd, + size: usize, error: Option, } +unsafe impl Send for FileWriteTask {} + impl FileWriteTask { - pub(crate) fn build(data: Bytes, fd: RawFd, use_fixed_buffers: bool) -> FileWriteTask { + pub(crate) fn build(data: Bytes, fd: RawFd, direct_io: bool, use_fixed_buffers: bool) -> FileWriteTask { + let mut ptr = 
data.as_ptr(); + let bytes = data.len(); + let mut padding = 0; + if direct_io { + padding = (4096 - (data.len() & 4095)) & 4095; + let layout = Layout::from_size_align(data.len() + padding, 4096).expect("Failed to create layout"); + assert!((data.len() + padding) % 4096 == 0); + unsafe { + let new_ptr = alloc(layout); + std::ptr::copy_nonoverlapping(ptr, new_ptr, data.len()); + ptr = new_ptr; + } + } FileWriteTask { - data, + data: ptr, fd, + size: bytes + padding, error: None, } } @@ -353,8 +365,8 @@ impl IoTask for FileWriteTask { fn prepare_sqe(&mut self) -> Vec { let write_op = opcode::Write::new( io_uring::types::Fd(self.fd), - self.data.as_ptr(), - self.data.len() as u32, + self.data, + self.size as u32, ); vec![write_op.offset(0u64).build()] @@ -362,9 +374,9 @@ impl IoTask for FileWriteTask { #[inline] fn complete(&mut self, cqes: Vec<&cqueue::Entry>) { - debug_assert_eq!(cqes.len(), 1, "Should receive a single completion for a FileRead task"); + debug_assert_eq!(cqes.len(), 1, "Should receive a single completion for a FileWrite task"); let result = cqes[0].result(); - if result < 0 { + if result != self.size as i32 { self.error = Some(std::io::Error::from_raw_os_error(-result)); } } diff --git a/src/parquet/src/io/io_uring/tests.rs b/src/parquet/src/io/io_uring/tests.rs index ba40a0e7..570b5a1a 100644 --- a/src/parquet/src/io/io_uring/tests.rs +++ b/src/parquet/src/io/io_uring/tests.rs @@ -85,15 +85,15 @@ impl BackendKind { fn write_future(self, path: PathBuf, data: Bytes) -> IoFuture<()> { match self { - BackendKind::Shared => async move { single_uring::write(path, &data).await }.boxed(), + BackendKind::Shared => async move { single_uring::write(path, &data, false).await }.boxed(), BackendKind::MultiAsync => { - async move { multi_async_uring::write(path, &data).await }.boxed() + async move { multi_async_uring::write(path, &data, false).await }.boxed() } BackendKind::MultiBlocking => { - async move { multi_blocking_uring::write(path, &data) }.boxed() + 
async move { multi_blocking_uring::write(path, &data, false) }.boxed() } BackendKind::ThreadPool => { - async move { thread_pool_uring::write(path, &data, false).await }.boxed() + async move { thread_pool_uring::write(path, &data, false, false).await }.boxed() } } } diff --git a/src/parquet/src/io/io_uring/thread_pool_uring.rs b/src/parquet/src/io/io_uring/thread_pool_uring.rs index b03a4327..a5b8ba9e 100644 --- a/src/parquet/src/io/io_uring/thread_pool_uring.rs +++ b/src/parquet/src/io/io_uring/thread_pool_uring.rs @@ -1,17 +1,8 @@ use std::{ - collections::VecDeque, - fs::OpenOptions, - future::Future, - ops::Range, - os::fd::AsRawFd, - path::PathBuf, - pin::Pin, - sync::{ + collections::VecDeque, fs::OpenOptions, future::Future, io, ops::Range, os::{fd::AsRawFd, unix::fs::OpenOptionsExt}, path::PathBuf, pin::Pin, sync::{ OnceLock, atomic::{AtomicBool, AtomicUsize, Ordering}, - }, - task::{Context, Poll}, - thread, + }, task::{Context, Poll}, thread }; use bytes::Bytes; @@ -108,8 +99,10 @@ impl IoUringThreadpool { fn new(io_type: IoMode, register_buffers: bool) -> IoUringThreadpool { let (sender, receiver) = crossbeam_channel::unbounded::(); - let builder = IoUring::::builder(); + let mut builder = IoUring::::builder(); let ring = builder + .setup_iopoll() + // .setup_sqpoll(50000) .build(URING_NUM_ENTRIES) .expect("Failed to build IoUring instance"); @@ -251,8 +244,21 @@ impl UringWorker { self.submitted_tasks[token as usize] = Some(submission); need_submit = true; } - if need_submit { - self.ring.submit().expect("Failed to submit"); + let need_poll = self.tokens.len() < URING_NUM_ENTRIES as usize; + if need_submit || need_poll { + loop { + match self.ring.submit() { + Ok(_num_entries) => { + break; + } + Err(e) => { + if e.kind() == io::ErrorKind::Interrupted { + continue; + } + panic!("Failed to submit: {}", e.to_string()); + } + } + } } } @@ -372,8 +378,12 @@ pub(crate) async fn read( direct_io: bool, use_fixed_buffers: bool, ) -> Result { - let open_task = 
FileOpenTask::build(path, direct_io)?; - let file = submit_async_task(open_task).await.into_result()?; + // Perform open operations in a blocking manner as they are not compatible with a io_uring instance that uses polled mode IO + let file = OpenOptions::new() + .read(true) + .custom_flags(libc::O_DIRECT) + .open(path) + .expect("failed to create file"); let effective_range = if let Some(range) = range { range @@ -393,14 +403,20 @@ pub(crate) async fn read( return submit_async_task(read_task).await.into_result() } -pub(crate) async fn write(path: PathBuf, data: &Bytes, use_fixed_buffers: bool) -> Result<(), std::io::Error> { +pub(crate) async fn write( + path: PathBuf, + data: &Bytes, + direct_io: bool, + use_fixed_buffers: bool +) -> Result<(), std::io::Error> { let file = OpenOptions::new() .create(true) .truncate(true) .write(true) + .custom_flags(libc::O_DIRECT) .open(path) .expect("failed to create file"); - let write_task = FileWriteTask::build(data.clone(), file.as_raw_fd(), use_fixed_buffers); + let write_task = FileWriteTask::build(data.clone(), file.as_raw_fd(), direct_io, use_fixed_buffers); submit_async_task(write_task).await.into_result() } diff --git a/src/storage/src/cache/core.rs b/src/storage/src/cache/core.rs index 90e9d281..5da58c14 100644 --- a/src/storage/src/cache/core.rs +++ b/src/storage/src/cache/core.rs @@ -759,7 +759,7 @@ impl CacheStorage { return Some(arrow::array::new_empty_array(data_type)); } let path = self.io_context.liquid_path(entry_id); - let bytes = self.io_context.read(path, None).await.ok()?; + let bytes = self.io_context.read(path, None).await.unwrap(); let compressor_states = self.io_context.get_compressor(entry_id); let compressor = compressor_states.fsst_compressor(); let liquid = crate::liquid_array::ipc::read_from_bytes( @@ -770,7 +770,7 @@ impl CacheStorage { } None => { let path = self.io_context.liquid_path(entry_id); - let bytes = self.io_context.read(path, None).await.ok()?; + let bytes = 
self.io_context.read(path, None).await.unwrap(); let compressor_states = self.io_context.get_compressor(entry_id); let compressor = compressor_states.fsst_compressor(); let liquid = crate::liquid_array::ipc::read_from_bytes( @@ -834,7 +834,7 @@ impl CacheStorage { } CachedData::DiskArrow(_) => { let path = self.io_context.arrow_path(entry_id); - let bytes = self.io_context.read(path, None).await.ok()?; + let bytes = self.io_context.read(path, None).await.unwrap(); let cursor = std::io::Cursor::new(bytes.to_vec()); let mut reader = arrow::ipc::reader::StreamReader::try_new(cursor, None).ok()?; let batch = reader.next()?.ok()?; From 71066109339f106554c14b54fad29246ba055eba Mon Sep 17 00:00:00 2001 From: proteet Date: Sat, 17 Jan 2026 12:41:34 -0700 Subject: [PATCH 12/20] Batch syscalls --- .../src/io/io_uring/thread_pool_uring.rs | 27 ++++++++++++------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/src/parquet/src/io/io_uring/thread_pool_uring.rs b/src/parquet/src/io/io_uring/thread_pool_uring.rs index a5b8ba9e..097ce7bb 100644 --- a/src/parquet/src/io/io_uring/thread_pool_uring.rs +++ b/src/parquet/src/io/io_uring/thread_pool_uring.rs @@ -2,7 +2,7 @@ use std::{ collections::VecDeque, fs::OpenOptions, future::Future, io, ops::Range, os::{fd::AsRawFd, unix::fs::OpenOptionsExt}, path::PathBuf, pin::Pin, sync::{ OnceLock, atomic::{AtomicBool, AtomicUsize, Ordering}, - }, task::{Context, Poll}, thread + }, task::{Context, Poll}, thread, time::{Duration, SystemTime} }; use bytes::Bytes; @@ -16,6 +16,8 @@ use super::tasks::{FileOpenTask, FileReadTask, FileWriteTask, IoTask}; pub(crate) const URING_NUM_ENTRIES: u32 = 256; +const URING_BATCH_SIZE: u32 = 32; + static ENABLED: AtomicBool = AtomicBool::new(true); struct Submission { @@ -162,6 +164,9 @@ struct UringWorker { */ queued_entries: VecDeque, io_performed: AtomicUsize, + last_syscall: SystemTime, + // Number of entries that will be submitted upon calling io_uring_enter + queued_submissions: u32, } 
impl UringWorker { @@ -177,6 +182,8 @@ impl UringWorker { submitted_tasks: tasks, io_performed: AtomicUsize::new(0), queued_entries: VecDeque::with_capacity(URING_NUM_ENTRIES as usize), + last_syscall: SystemTime::now(), + queued_submissions: 0, } } @@ -193,7 +200,6 @@ impl UringWorker { } fn drain_intermediate_queue(&mut self) { - let mut need_submit = false; { let sq = &mut self.ring.submission(); while !sq.is_full() && !self.queued_entries.is_empty() { @@ -202,17 +208,13 @@ impl UringWorker { sq.push(&sqe).expect("Failed to push to submission queue"); } sq.sync(); - need_submit = true; + self.queued_submissions += 1; } } - if need_submit { - self.ring.submit().expect("Failed to submit"); - } } #[inline(never)] fn drain_submissions(&mut self) { - let mut need_submit = false; while !self.receiver.is_empty() && !self.tokens.is_empty() { let sq = &mut self.ring.submission(); sq.sync(); @@ -225,6 +227,7 @@ impl UringWorker { let mut submission = self.receiver.recv().unwrap(); let task = submission.task.as_mut(); let mut sqes = task.prepare_sqe(); + self.queued_submissions += sqes.len() as u32; submission.set_completions(sqes.len()); let mut tasks_submitted = 0; @@ -242,10 +245,12 @@ impl UringWorker { self.queued_entries.push_back(sqes[i].clone().user_data(token as u64)); } self.submitted_tasks[token as usize] = Some(submission); - need_submit = true; } - let need_poll = self.tokens.len() < URING_NUM_ENTRIES as usize; - if need_submit || need_poll { + // let need_poll = self.tokens.len() < URING_NUM_ENTRIES as u32; + let current_time = SystemTime::now(); + let time_from_last_submit = current_time.duration_since(self.last_syscall).expect("Failed to get duration"); + let need_syscall = self.queued_submissions >= URING_BATCH_SIZE || time_from_last_submit > Duration::from_micros(20); + if need_syscall { loop { match self.ring.submit() { Ok(_num_entries) => { @@ -259,6 +264,8 @@ impl UringWorker { } } } + self.last_syscall = SystemTime::now(); + self.queued_submissions = 
0; } } From 561c91aca652ba14f3134fc3862de4b265af4d76 Mon Sep 17 00:00:00 2001 From: proteet Date: Wed, 21 Jan 2026 02:13:59 -0700 Subject: [PATCH 13/20] Store spans as a linked list instead of a vector --- src/common/src/memory/page.rs | 5 +++ src/common/src/memory/pool.rs | 1 + src/common/src/memory/tcache.rs | 78 +++++++++++++++++++++++---------- 3 files changed, 62 insertions(+), 22 deletions(-) diff --git a/src/common/src/memory/page.rs b/src/common/src/memory/page.rs index 7f8f122d..30220c1b 100644 --- a/src/common/src/memory/page.rs +++ b/src/common/src/memory/page.rs @@ -19,6 +19,9 @@ pub struct Page { pub(crate) slice_count: usize, // No. of pages in the slice containing this page pub(crate) slice_offset: usize, // Offset of this page from the start of this slice pub(crate) page_start: *mut u8, + // Next and previous pages in the span which is a doubly-linked list + pub(crate) next_page: *mut Page, + pub(crate) previous_page: *mut Page, } impl Page { @@ -32,6 +35,8 @@ impl Page { slice_count: 1, slice_offset: 0, page_start: slice.ptr, + next_page: null_mut(), + previous_page: null_mut(), } } diff --git a/src/common/src/memory/pool.rs b/src/common/src/memory/pool.rs index 25a98a5c..4f193b2f 100644 --- a/src/common/src/memory/pool.rs +++ b/src/common/src/memory/pool.rs @@ -205,6 +205,7 @@ impl Drop for FixedBufferPool { } mod tests { + #[allow(unused_imports)] use std::{io::Write, os::fd::AsRawFd, ptr::{null, null_mut}}; use bytes::Bytes; diff --git a/src/common/src/memory/tcache.rs b/src/common/src/memory/tcache.rs index 1c6ae55f..4fa91ea7 100644 --- a/src/common/src/memory/tcache.rs +++ b/src/common/src/memory/tcache.rs @@ -58,12 +58,18 @@ impl TCacheStats { } } +#[derive(Copy, Clone)] +struct Span { + pub(crate) first: *mut Page, + pub(crate) last: *mut Page, +} + pub(crate) struct TCache { free_pages: [*mut Page; NUM_SIZE_CLASSES], // Last size class holds slices that serve large allocations (>256KB) used_pages: [Vec<*mut Page>; NUM_SIZE_CLASSES + 1], // 
TODO: Use a linked list for O(1) deletion - spans: [Vec<*mut Page>; SEGMENT_BINS], + spans: [Span; SEGMENT_BINS], arena: Arc>, thread_id: usize, stats: TCacheStats, @@ -77,7 +83,7 @@ impl TCache { TCache { free_pages: [const { null_mut() }; NUM_SIZE_CLASSES], used_pages: [const { Vec::<*mut Page>::new() }; NUM_SIZE_CLASSES + 1], - spans: [const { Vec::<*mut Page>::new() }; SEGMENT_BINS], + spans: [Span { first: null_mut(), last: null_mut() }; SEGMENT_BINS], arena: arena.clone(), thread_id, stats: TCacheStats::new(), @@ -100,18 +106,42 @@ impl TCache { (slice_count + 1).next_power_of_two().trailing_zeros() as usize - 1usize } - // TODO(): Use per-page pointers to speed up the removal - fn remove_slice_from_span(self: &mut Self, slice: &mut Page) -> bool { + fn add_slice_to_span(span: &mut Span, slice: &mut Page) { + if span.first == null_mut() { + debug_assert!(span.last == null_mut()); + span.first = slice as *mut Page; + span.last = slice as *mut Page; + return + } + debug_assert!(span.last != null_mut()); + unsafe { (*span.last).next_page = slice; } + slice.previous_page = span.last; + span.last = slice as *mut Page; + } + + fn remove_slice_from_span(self: &mut Self, slice: &mut Page) { let span_idx = Self::get_span_idx_from_slice_count(slice.slice_count); - for i in 0..self.spans[span_idx].len() { - let page_start = unsafe { (*self.spans[span_idx][i]).page_start }; - if page_start == slice.page_start { - self.spans[span_idx].remove(i); - return true; + let span = &mut self.spans[span_idx]; + if span.first == slice as *mut Page { + span.first = slice.next_page; + if slice.next_page != null_mut() { + unsafe { (*slice.next_page).previous_page = null_mut(); } + } else { + span.last = null_mut(); } + } else if span.last == slice as *mut Page { + span.last = slice.previous_page; + debug_assert!(slice.previous_page != null_mut()); + unsafe { (*span.last).next_page = null_mut(); } + } else { + debug_assert!(slice.previous_page != null_mut()); + 
debug_assert!(slice.next_page != null_mut()); + unsafe { (*slice.previous_page).next_page = slice.next_page; } + unsafe { (*slice.next_page).previous_page = slice.previous_page; } } - log::info!("[thread_id: {}] Slice not found in span with index: {}, slice count: {}", self.thread_id, span_idx, slice.slice_count); - return false; + + slice.next_page = null_mut(); + slice.previous_page = null_mut(); } fn retire_segment(self: &mut Self, segment: *mut Segment) { @@ -121,7 +151,9 @@ impl TCache { let pages = unsafe { &mut (*segment).pages }; let mut slice_idx: usize = 0; while slice_idx < PAGES_PER_SEGMENT { - assert!(pages[slice_idx].block_size != 0 || self.remove_slice_from_span(&mut pages[slice_idx])); + if pages[slice_idx].block_size == 0 { + self.remove_slice_from_span(&mut pages[slice_idx]); + } slice_idx += pages[slice_idx].slice_count; } let mut guard = self.arena.lock().unwrap(); @@ -171,7 +203,7 @@ impl TCache { if next_slice_ref.block_size == 0 { log::debug!("[thread_id: {}] Merging released slice with next slice. Slice count of next slice: {}", self.thread_id, next_slice_ref.slice_count); // Page is not in use, remove it - assert!(self.remove_slice_from_span(next_slice_ref)); + self.remove_slice_from_span(next_slice_ref); segment.coalesce_slices(page_ref, unsafe { &mut (*next_slice) }); } } @@ -185,17 +217,17 @@ impl TCache { if prev_slice_ref.block_size == 0 { // Merge with the previous slice log::debug!("[thread_id: {}] Merging slice with previous slice. 
Slice count of previous slice: {}", self.thread_id, prev_slice_ref.slice_count); - assert!(self.remove_slice_from_span(prev_slice_ref)); + self.remove_slice_from_span(prev_slice_ref); segment.coalesce_slices(prev_slice_ref, page_ref); let span_idx = Self::get_span_idx_from_slice_count(prev_slice_ref.slice_count); - self.spans[span_idx].push(prev_slice); + Self::add_slice_to_span(&mut self.spans[span_idx], prev_slice_ref); log::debug!("[thread_id: {}] Added page with slice count {} to span with index: {}", self.thread_id, prev_slice_ref.slice_count, span_idx); merged_with_prev = true; } } if !merged_with_prev { let span_idx = Self::get_span_idx_from_slice_count(page_ref.slice_count); - self.spans[span_idx].push(page); + Self::add_slice_to_span(&mut self.spans[span_idx], page_ref); log::debug!("[thread_id: {}] Added page with slice count {} to span with index: {}", self.thread_id, page_ref.slice_count, span_idx); } segment.check_valid_segment(); @@ -250,15 +282,17 @@ impl TCache { debug_assert!(block_size >= MIN_SIZE_FROM_PAGES); let min_bin = Self::get_span_idx_from_slice_count(num_slices_required); for i in min_bin..SEGMENT_BINS { - let bin = &mut self.spans[i]; - for j in 0..bin.len() { - let slice = bin[j]; + // let span = &mut self.spans[i]; + let mut slice = self.spans[i].first; + while slice != null_mut() { let num_slices_original = unsafe { (*slice).slice_count }; debug_assert!(num_slices_original >= 1 << i); if num_slices_original < num_slices_required { + unsafe { slice = (*slice).next_page; } continue; } - bin.remove(j); + self.remove_slice_from_span(unsafe { &mut *slice }); + let segment = Segment::get_segment_from_ptr(slice as *mut u8); unsafe { (*segment).allocated += num_slices_required; @@ -269,7 +303,7 @@ impl TCache { debug_assert!(unsafe { (*slice).slice_count == num_slices_required}); unsafe { (*segment).check_valid_segment() } ; let bin = Self::get_span_idx_from_slice_count(num_slices_original - num_slices_required); - 
self.spans[bin].push(next_slice); + Self::add_slice_to_span(&mut self.spans[bin], unsafe { &mut (*next_slice) } ); log::debug!("[thread_id: {}] Added page with slice count {} to span with index: {}", self.thread_id, num_slices_original - num_slices_required, bin); } unsafe { @@ -288,7 +322,7 @@ impl TCache { let page = &mut segment_ref.pages[0]; page.slice_count = slice_count; page.slice_offset = 0; - self.spans[span_idx].push(page as *mut Page); + Self::add_slice_to_span(&mut self.spans[span_idx], page); let last_page = &mut segment_ref.pages[PAGES_PER_SEGMENT - 1]; last_page.slice_offset = PAGES_PER_SEGMENT - 1; From bdf79000fbe715a67215dc1c145b802ecba7a050 Mon Sep 17 00:00:00 2001 From: Proteet Paul Date: Sat, 24 Jan 2026 10:09:56 -0600 Subject: [PATCH 14/20] Replace SystemTime with Instant --- src/parquet/src/io/io_uring/thread_pool_uring.rs | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/parquet/src/io/io_uring/thread_pool_uring.rs b/src/parquet/src/io/io_uring/thread_pool_uring.rs index 097ce7bb..5280eb58 100644 --- a/src/parquet/src/io/io_uring/thread_pool_uring.rs +++ b/src/parquet/src/io/io_uring/thread_pool_uring.rs @@ -2,7 +2,7 @@ use std::{ collections::VecDeque, fs::OpenOptions, future::Future, io, ops::Range, os::{fd::AsRawFd, unix::fs::OpenOptionsExt}, path::PathBuf, pin::Pin, sync::{ OnceLock, atomic::{AtomicBool, AtomicUsize, Ordering}, - }, task::{Context, Poll}, thread, time::{Duration, SystemTime} + }, task::{Context, Poll}, thread, time::{Duration, Instant} }; use bytes::Bytes; @@ -164,7 +164,7 @@ struct UringWorker { */ queued_entries: VecDeque, io_performed: AtomicUsize, - last_syscall: SystemTime, + last_syscall: Instant, // Number of entries that will be submitted upon calling io_uring_enter queued_submissions: u32, } @@ -182,7 +182,7 @@ impl UringWorker { submitted_tasks: tasks, io_performed: AtomicUsize::new(0), queued_entries: VecDeque::with_capacity(URING_NUM_ENTRIES as usize), - last_syscall: 
SystemTime::now(), + last_syscall: Instant::now(), queued_submissions: 0, } } @@ -247,8 +247,7 @@ impl UringWorker { self.submitted_tasks[token as usize] = Some(submission); } // let need_poll = self.tokens.len() < URING_NUM_ENTRIES as u32; - let current_time = SystemTime::now(); - let time_from_last_submit = current_time.duration_since(self.last_syscall).expect("Failed to get duration"); + let time_from_last_submit = self.last_syscall.elapsed(); let need_syscall = self.queued_submissions >= URING_BATCH_SIZE || time_from_last_submit > Duration::from_micros(20); if need_syscall { loop { @@ -264,7 +263,7 @@ impl UringWorker { } } } - self.last_syscall = SystemTime::now(); + self.last_syscall = Instant::now(); self.queued_submissions = 0; } } From 7716ccac1cc1aef357ad4be655832c3ed75d33a0 Mon Sep 17 00:00:00 2001 From: Proteet Paul Date: Sun, 25 Jan 2026 09:07:10 -0600 Subject: [PATCH 15/20] Add usdt tracepoint for threadpool uring --- src/parquet/Cargo.toml | 3 ++- .../src/io/io_uring/thread_pool_uring.rs | 27 +++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/src/parquet/Cargo.toml b/src/parquet/Cargo.toml index 3e42f1a7..5422f8f6 100644 --- a/src/parquet/Cargo.toml +++ b/src/parquet/Cargo.toml @@ -23,6 +23,8 @@ liquid-cache-storage = { workspace = true } fastrace = { workspace = true } async-trait = { workspace = true } parquet-variant-json = { workspace = true } +usdt = "0.6" +rand = "0.9.2" [target.'cfg(target_os = "linux")'.dependencies] io-uring = "0.7.10" @@ -33,7 +35,6 @@ crossbeam-queue = "0.3.11" [dev-dependencies] tempfile = "3.23.0" divan = "0.1" -rand = "0.9.2" shuttle = "0.8.1" tokio-test = "0.4" serde_json = { workspace = true } diff --git a/src/parquet/src/io/io_uring/thread_pool_uring.rs b/src/parquet/src/io/io_uring/thread_pool_uring.rs index 5280eb58..3cdabc31 100644 --- a/src/parquet/src/io/io_uring/thread_pool_uring.rs +++ b/src/parquet/src/io/io_uring/thread_pool_uring.rs @@ -11,6 +11,25 @@ use 
liquid_cache_common::{IoMode, memory::pool::FixedBufferPool}; use tokio::sync::oneshot; use crate::io::io_uring::tasks::FixedFileReadTask; +use rand::Rng; + +#[usdt::provider] +mod liquid_parquet { + fn io_submitted(id: u64) {} + fn io_completed(id: u64) {} +} + +static REGISTRATION_SUCCEEDED: OnceLock = OnceLock::new(); + +fn ensure_registered() -> bool { + *REGISTRATION_SUCCEEDED.get_or_init(|| match usdt::register_probes() { + Ok(()) => true, + Err(err) => { + log::debug!("failed to register USDT probes: {err}"); + false + } + }) +} use super::tasks::{FileOpenTask, FileReadTask, FileWriteTask, IoTask}; @@ -317,6 +336,7 @@ where T: IoTask + 'static, { state: UringState, + id: u64, } impl UringFuture @@ -326,6 +346,7 @@ where fn new(task: Box) -> UringFuture { UringFuture { state: UringState::Created(task), + id: rand::rng().random(), } } } @@ -344,6 +365,9 @@ where let pool = IO_URING_THREAD_POOL_INST .get() .expect("Uring threadpool not initialized"); + if ensure_registered() { + liquid_parquet::io_submitted!(|| self.id); + } let (tx, rx) = oneshot::channel::>(); let boxed_task: Box = task; pool.submit_task(boxed_task, tx); @@ -351,6 +375,9 @@ where } UringState::Submitted(mut receiver) => match Pin::new(&mut receiver).poll(cx) { Poll::Ready(Ok(task)) => { + if ensure_registered() { + liquid_parquet::io_completed!(|| self.id); + } let typed_task = task .into_any() .downcast::() From ee14daaf78541412dfe01da4d39e1c3eeb8ae079 Mon Sep 17 00:00:00 2001 From: Proteet Paul Date: Mon, 26 Jan 2026 09:30:21 -0600 Subject: [PATCH 16/20] Use DEFER_TASKRUN flag with io_uring --- .../src/io/io_uring/thread_pool_uring.rs | 50 +++++++++++-------- 1 file changed, 29 insertions(+), 21 deletions(-) diff --git a/src/parquet/src/io/io_uring/thread_pool_uring.rs b/src/parquet/src/io/io_uring/thread_pool_uring.rs index 3cdabc31..2319dace 100644 --- a/src/parquet/src/io/io_uring/thread_pool_uring.rs +++ b/src/parquet/src/io/io_uring/thread_pool_uring.rs @@ -6,7 +6,7 @@ use std::{ }; 
use bytes::Bytes; -use io_uring::{IoUring, cqueue, squeue}; +use io_uring::{EnterFlags, IoUring, cqueue, squeue}; use liquid_cache_common::{IoMode, memory::pool::FixedBufferPool}; use tokio::sync::oneshot; @@ -31,7 +31,7 @@ fn ensure_registered() -> bool { }) } -use super::tasks::{FileOpenTask, FileReadTask, FileWriteTask, IoTask}; +use super::tasks::{FileReadTask, FileWriteTask, IoTask}; pub(crate) const URING_NUM_ENTRIES: u32 = 256; @@ -120,24 +120,10 @@ impl IoUringThreadpool { fn new(io_type: IoMode, register_buffers: bool) -> IoUringThreadpool { let (sender, receiver) = crossbeam_channel::unbounded::(); - let mut builder = IoUring::::builder(); - let ring = builder - .setup_iopoll() - // .setup_sqpoll(50000) - .build(URING_NUM_ENTRIES) - .expect("Failed to build IoUring instance"); - - if register_buffers { - let res = FixedBufferPool::register_buffers_with_ring(&ring); - if res.is_err() { - log::error!("Failed to register buffers with io-uring ring: {:?}", res); - } - } - let worker = thread::Builder::new() .name("lc-io-worker".to_string()) .spawn(move || { - let mut uring_worker = UringWorker::new(receiver, ring); + let mut uring_worker = UringWorker::new(receiver, register_buffers); uring_worker.thread_loop(); }) .expect("Failed to spawn io-uring worker thread"); @@ -190,7 +176,23 @@ struct UringWorker { impl UringWorker { #[allow(clippy::new_ret_no_self)] - fn new(channel: crossbeam_channel::Receiver, ring: IoUring) -> UringWorker { + fn new(channel: crossbeam_channel::Receiver, register_buffers: bool) -> UringWorker { + let mut builder = IoUring::::builder(); + let ring = builder + .setup_single_issuer() // Only the worker thread will issue IO and poll completions + .setup_defer_taskrun() + // .setup_iopoll() + // .setup_sqpoll(50000) + .build(URING_NUM_ENTRIES) + .expect("Failed to build IoUring instance"); + + if register_buffers { + let res = FixedBufferPool::register_buffers_with_ring(&ring); + if res.is_err() { + log::error!("Failed to register 
buffers with io-uring ring: {:?}", res); + } + } + let tokens = (0..URING_NUM_ENTRIES as u16).collect(); let mut tasks = Vec::with_capacity(URING_NUM_ENTRIES as usize); tasks.resize_with(URING_NUM_ENTRIES as usize, || None); @@ -265,12 +267,18 @@ impl UringWorker { } self.submitted_tasks[token as usize] = Some(submission); } - // let need_poll = self.tokens.len() < URING_NUM_ENTRIES as u32; + // let need_poll = self.tokens.len() < URING_NUM_ENTRIES as usize; let time_from_last_submit = self.last_syscall.elapsed(); - let need_syscall = self.queued_submissions >= URING_BATCH_SIZE || time_from_last_submit > Duration::from_micros(20); + let is_batch_full = self.queued_submissions >= URING_BATCH_SIZE; + let need_syscall = is_batch_full || time_from_last_submit > Duration::from_micros(20); if need_syscall { + let mut flags = EnterFlags::empty(); + flags.insert(EnterFlags::GETEVENTS); loop { - match self.ring.submit() { + let res = unsafe { + self.ring.submitter().enter::(self.queued_submissions, 0, flags.bits(), None) + }; + match res { Ok(_num_entries) => { break; } From 3926f5bc28e705ea2d2e6ba48338b87b615c7cb7 Mon Sep 17 00:00:00 2001 From: Proteet Paul Date: Sat, 7 Feb 2026 10:33:55 -0600 Subject: [PATCH 17/20] Avoid dependency on dynamically allocated memory in struct Page --- src/common/src/memory/page.rs | 178 +++++++++++++++++++++++--------- src/common/src/memory/tcache.rs | 1 + 2 files changed, 129 insertions(+), 50 deletions(-) diff --git a/src/common/src/memory/page.rs b/src/common/src/memory/page.rs index 30220c1b..9fb13271 100644 --- a/src/common/src/memory/page.rs +++ b/src/common/src/memory/page.rs @@ -1,20 +1,123 @@ -use std::{collections::VecDeque, ptr::null_mut}; +use std::{ptr::null_mut, sync::atomic::{AtomicU8, Ordering}, u8}; + +use crossbeam::utils::CachePadded; use crate::memory::tcache::MIN_SIZE_FROM_PAGES; -#[derive(Clone, Copy)] -pub struct Block { - ptr: *mut u8, +pub const PAGE_SIZE: usize = 64<<10; // 64KB +const MAX_BLOCKS_PER_PAGE: usize = 
PAGE_SIZE/MIN_SIZE_FROM_PAGES; + +struct LocalFreeList { + head: u8, + tail: u8, + num_blocks: u8, + /** + * Stores the block indices within the page for a compact representation, rather than storing pointers. + * That is, if block index=i, it represents ith block from the start of the page. + */ + blocks: [u8; MAX_BLOCKS_PER_PAGE], } -pub const PAGE_SIZE: usize = 64<<10; // 64KB +impl LocalFreeList { + fn empty() -> LocalFreeList { + LocalFreeList { + head: 0, + tail: 0, + num_blocks: 0, + blocks: [0; MAX_BLOCKS_PER_PAGE], + } + } + + fn new(num_blocks: usize) -> LocalFreeList { + debug_assert!(num_blocks <= MAX_BLOCKS_PER_PAGE); + let mut blocks = [0u8; MAX_BLOCKS_PER_PAGE]; + for i in 0..num_blocks { + blocks[i] = i as u8; + } + LocalFreeList { head: 0, tail: num_blocks as u8 - 1, num_blocks: num_blocks as u8, blocks: blocks } + } + + fn push(&mut self, block: u8) { + debug_assert!(self.tail.wrapping_sub(self.head) < self.num_blocks); + self.tail = self.tail.wrapping_add(1); + self.blocks[self.tail as usize & (MAX_BLOCKS_PER_PAGE - 1)] = block; + } + + fn is_empty(&self) -> bool { + self.head == self.tail + } + + fn pop(&mut self) -> Option { + if self.head == self.tail { + return None + } + let ret = self.blocks[self.head as usize & (MAX_BLOCKS_PER_PAGE - 1)]; + self.head = self.head.wrapping_add(1); + Some(ret) + } +} + +struct MPSCQueue { + head: u8, + tail: CachePadded, + num_blocks: u8, + blocks: [u8; MAX_BLOCKS_PER_PAGE], +} + +impl MPSCQueue { + const HAZARD: u8 = u8::MAX; + + fn new(num_blocks: usize) -> MPSCQueue { + debug_assert!(num_blocks <= MAX_BLOCKS_PER_PAGE); + MPSCQueue { + head: 0, + num_blocks: num_blocks as u8, + tail: CachePadded::new(AtomicU8::new(0)), + blocks: [Self::HAZARD; MAX_BLOCKS_PER_PAGE], + } + } + + fn push(&mut self, block: u8) { + loop { + let cur_tail = self.tail.load(Ordering::Relaxed); + assert!(cur_tail.wrapping_sub(self.head) < self.num_blocks); + let new_tail = cur_tail.wrapping_add(1); + if 
self.tail.compare_exchange(cur_tail, new_tail, Ordering::Relaxed, Ordering::Relaxed).is_ok() { + unsafe { + std::ptr::write_volatile(&mut self.blocks[cur_tail as usize & (MAX_BLOCKS_PER_PAGE - 1)] as *mut u8, block); + } + return + } + } + } + + fn pop(&mut self) -> Option { + if self.head == self.tail.load(Ordering::Relaxed) { + return None + } + let idx = self.head as usize & (MAX_BLOCKS_PER_PAGE - 1); + loop { + let ret = unsafe { std::ptr::read_volatile(&self.blocks[idx] as *const u8) }; + /* + * The hazard value prevents the following race condition: + * The producer has reserved a slot, but before it can write to the slot, the consumer calls pop. + */ + if ret != Self::HAZARD { + unsafe { + std::ptr::write_volatile(&mut self.blocks[idx] as *mut u8, Self::HAZARD); + } + self.head = self.head.wrapping_add(1); + return Some(ret); + } + } + } +} pub struct Page { pub(crate) block_size: usize, // Size of objects that are being allocated to this page - // TODO(): Remove dependency on dynamically allocated memory - free_list: VecDeque, + free_list: LocalFreeList, pub(crate) used: usize, - pub(crate) thread_free_list: crossbeam::queue::ArrayQueue, + thread_free_list: MPSCQueue, pub(crate) capacity: usize, pub(crate) slice_count: usize, // No. 
of pages in the slice containing this page pub(crate) slice_offset: usize, // Offset of this page from the start of this slice @@ -28,9 +131,9 @@ impl Page { pub fn from_slice(slice: Slice) -> Page { Page { block_size: 0usize, - free_list: VecDeque::::with_capacity(PAGE_SIZE/MIN_SIZE_FROM_PAGES), + free_list: LocalFreeList::empty(), used: 0, - thread_free_list: crossbeam::queue::ArrayQueue::new(PAGE_SIZE/MIN_SIZE_FROM_PAGES), + thread_free_list: MPSCQueue::new(PAGE_SIZE/MIN_SIZE_FROM_PAGES), capacity: slice.size, slice_count: 1, slice_offset: 0, @@ -42,23 +145,19 @@ impl Page { pub fn set_block_size(self: &mut Self, block_size: usize) { self.block_size = block_size; - let mut offset: usize = 0; - self.free_list.clear(); - while offset < self.capacity { - let ptr = unsafe { self.page_start.add(offset) }; - self.free_list.push_back(Block {ptr}); - offset += self.block_size; - } + let num_blocks = self.capacity / block_size; + self.free_list = LocalFreeList::new(num_blocks); } #[inline] pub fn get_free_block(self: &mut Self) -> *mut u8 { - let block = self.free_list.pop_front(); - if block.is_none() { - return null_mut() - } + let block_idx = self.free_list.pop(); + let block_idx = match block_idx { + Some(i) => i, + None => return null_mut(), + }; self.used += 1; - block.unwrap().ptr + unsafe { self.page_start.add(block_idx as usize * self.block_size) } } #[inline(always)] @@ -74,24 +173,23 @@ impl Page { /// Pointer freed on the same core #[inline(always)] pub fn free(self: &mut Self, ptr: *mut u8) { - self.free_list.push_back(Block {ptr}); + let block_idx = (ptr as usize - self.page_start as usize) / self.block_size; + self.free_list.push(block_idx as u8); self.used -= 1; } /// Pointer freed on a different core #[inline(always)] pub(crate) fn foreign_free(self: &mut Self, ptr: *mut u8) { - let blk = Block {ptr}; - let r = self.thread_free_list.push(blk); - debug_assert!(r.is_ok()); + let blk_idx = unsafe {ptr.offset_from(self.page_start) as usize / 
self.block_size}; + self.thread_free_list.push(blk_idx as u8); } /// Collect pointers freed by other threads #[inline] pub(crate) fn collect_foreign_frees(self: &mut Self) { - while !self.thread_free_list.is_empty() { - let blk = self.thread_free_list.pop().unwrap(); - self.free_list.push_back(blk); + while let Some(blk) = self.thread_free_list.pop() { + self.free_list.push(blk as u8); self.used -= 1; } } @@ -115,24 +213,4 @@ impl Slice { }; (slice1, slice2) } -} - -// pub struct PageQueue { -// page: *mut Page, -// next: *mut PageQueue, -// } - -// impl PageQueue { -// pub fn new() -> PageQueue { -// PageQueue { page: null_mut(), next: null_mut() } -// } - -// pub(crate) fn get_page(self: &mut Self) -> Option<*mut Page> { -// if self.page.is_null() { -// return None; -// } -// let result = self.page; -// self = *self.next; -// Some(result) -// } -// } \ No newline at end of file +} \ No newline at end of file diff --git a/src/common/src/memory/tcache.rs b/src/common/src/memory/tcache.rs index 4fa91ea7..e79c64e6 100644 --- a/src/common/src/memory/tcache.rs +++ b/src/common/src/memory/tcache.rs @@ -301,6 +301,7 @@ impl TCache { // split slice let next_slice = unsafe { (*segment).split_page(slice, num_slices_required) }; debug_assert!(unsafe { (*slice).slice_count == num_slices_required}); + #[cfg(debug_assertions)] unsafe { (*segment).check_valid_segment() } ; let bin = Self::get_span_idx_from_slice_count(num_slices_original - num_slices_required); Self::add_slice_to_span(&mut self.spans[bin], unsafe { &mut (*next_slice) } ); From 5fe79f151a4f32180587b218475d4bead516b58a Mon Sep 17 00:00:00 2001 From: proteet Date: Sat, 7 Feb 2026 11:23:27 -0700 Subject: [PATCH 18/20] Fix --- src/common/src/memory/page.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/common/src/memory/page.rs b/src/common/src/memory/page.rs index 9fb13271..41162bff 100644 --- a/src/common/src/memory/page.rs +++ b/src/common/src/memory/page.rs @@ -34,13 +34,13 @@ impl 
LocalFreeList { for i in 0..num_blocks { blocks[i] = i as u8; } - LocalFreeList { head: 0, tail: num_blocks as u8 - 1, num_blocks: num_blocks as u8, blocks: blocks } + LocalFreeList { head: 0, tail: num_blocks as u8, num_blocks: num_blocks as u8, blocks: blocks } } fn push(&mut self, block: u8) { debug_assert!(self.tail.wrapping_sub(self.head) < self.num_blocks); - self.tail = self.tail.wrapping_add(1); self.blocks[self.tail as usize & (MAX_BLOCKS_PER_PAGE - 1)] = block; + self.tail = self.tail.wrapping_add(1); } fn is_empty(&self) -> bool { @@ -145,7 +145,7 @@ impl Page { pub fn set_block_size(self: &mut Self, block_size: usize) { self.block_size = block_size; - let num_blocks = self.capacity / block_size; + let num_blocks = (self.slice_count * PAGE_SIZE) / block_size; self.free_list = LocalFreeList::new(num_blocks); } From c8760273eec0fa0eef9571e1478eba53b97c5cb2 Mon Sep 17 00:00:00 2001 From: proteet Date: Mon, 9 Feb 2026 00:27:18 -0700 Subject: [PATCH 19/20] Fix memory tracking bug --- src/common/src/memory/tcache.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/common/src/memory/tcache.rs b/src/common/src/memory/tcache.rs index e79c64e6..4f3ca4cb 100644 --- a/src/common/src/memory/tcache.rs +++ b/src/common/src/memory/tcache.rs @@ -354,6 +354,7 @@ impl TCache { let mut free_page = self.find_page_from_spans(num_pages, block_size); if free_page != null_mut() { self.stats.allocations_from_segment += 1; + self.used_pages[NUM_SIZE_CLASSES].push(free_page); let free_block = unsafe { (*free_page).get_free_block() }; return free_block } @@ -369,6 +370,7 @@ impl TCache { let res = self.allocate_segment_from_arena(self.thread_id); if !res { + self.stats.unsuccessful_allocations += 1; return null_mut() } self.stats.allocations_from_arena += 1; @@ -378,8 +380,9 @@ impl TCache { return null_mut() } self.used_pages[NUM_SIZE_CLASSES].push(free_page); - assert_ne!(free_page, null_mut()); + debug_assert_ne!(free_page, null_mut()); let free_block = 
unsafe { (*free_page).get_free_block() }; + debug_assert_ne!(free_block, null_mut()); return free_block } @@ -425,6 +428,7 @@ impl TCache { if free_page != null_mut() { self.stats.allocations_from_segment += 1; let free_block = unsafe { (*free_page).get_free_block() }; + debug_assert_ne!(free_block, null_mut()); self.free_pages[size_class] = free_page; return free_block; } From 9ea1a089afd7a78ba0ba49b06d0e5e32069b75de Mon Sep 17 00:00:00 2001 From: Proteet Paul Date: Sun, 22 Feb 2026 10:26:05 -0600 Subject: [PATCH 20/20] Add bpftrace example in README --- dev/README.md | 18 +++++++++++++++++- src/common/src/memory/tcache.rs | 3 ++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/dev/README.md b/dev/README.md index 1991c206..bfecdf63 100644 --- a/dev/README.md +++ b/dev/README.md @@ -22,7 +22,6 @@ LiquidCache exports OpenTelemetry traces. Spin up a Jaeger v2 ```bash docker run \ --name jaeger \ - --replace \ -e COLLECTOR_OTLP_ENABLED=true \ -p 16686:16686 \ -p 4317:4317 \ @@ -30,6 +29,8 @@ docker run \ cr.jaegertracing.io/jaegertracing/jaeger:2.11.0 ``` +If a container named `jaeger` already exists, remove it first: `docker rm -f jaeger` (or `podman rm -f jaeger`). + This image contains the Jaeger v2 distribution. Port 16686 exposes the frontend UI at http://localhost:16686. 4317 and 4318 expose OTLP over gRPC and HTTP respectively. 
@@ -76,6 +77,21 @@ This will trace the execution of `iteration = 2` (`arg1 == 2`) and print the `io [512, 1K) 194 |@@@ | ``` +```bash +sudo bpftrace -e ' + usdt:./target/release/in_process:liquid_benchmark:iteration_start /arg1 == 2/ {@enable = 1;} + usdt:./target/release/in_process:liquid_benchmark:iteration_start /arg1 > 2/ {@enable = 0;} + usdt:./target/release/in_process:io_submitted /@enable/ { + @t[arg0] = nsecs; + } + usdt:./target/release/in_process:io_completed /@enable && @t[arg0]/ { + $us = (nsecs - @t[arg0]) / 1000; + @lat = hist($us); + delete(@t[arg0]); + } + ' +``` + If you're using blocking io mode, try this: ```bash sudo bpftrace -e ' diff --git a/src/common/src/memory/tcache.rs b/src/common/src/memory/tcache.rs index 4f3ca4cb..21e9020a 100644 --- a/src/common/src/memory/tcache.rs +++ b/src/common/src/memory/tcache.rs @@ -66,7 +66,8 @@ struct Span { pub(crate) struct TCache { free_pages: [*mut Page; NUM_SIZE_CLASSES], - // Last size class holds slices that serve large allocations (>256KB) + // TODO(): Make this a linked list + // Last size class holds slices that serve large allocations (>64KB) used_pages: [Vec<*mut Page>; NUM_SIZE_CLASSES + 1], // TODO: Use a linked list for O(1) deletion spans: [Span; SEGMENT_BINS],