From 5d5668644c95ca20c790b74ba85c338343259bad Mon Sep 17 00:00:00 2001 From: Rg Date: Thu, 10 Nov 2022 21:35:04 +0800 Subject: [PATCH 01/77] add spanich check --- Cargo.toml | 11 +- src/kv.rs | 226 +++++++++++++++++++++++- src/lib.rs | 16 +- src/manifest.rs | 38 ++++ src/options/mod.rs | 3 +- src/test_util.rs | 62 +++++++ src/types.rs | 127 +++++++++++++ src/value_log.rs | 393 ++++++++++++++++++++++++++++------------- src/value_log_tests.rs | 62 ++++++- src/y/mod.rs | 31 +++- 10 files changed, 829 insertions(+), 140 deletions(-) create mode 100644 src/manifest.rs create mode 100644 src/test_util.rs diff --git a/Cargo.toml b/Cargo.toml index f597052..9dc5cc5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,4 +30,13 @@ fmmap = { version = "0.3.2", features = ["tokio-async"] } parking_lot = "0.12.1" bitflags = "1.3.2" libc = "0.2.137" -log = "0.4.17" +log = { version = "0.4.17", features = ["kv_unstable", "kv_unstable_serde", "kv_unstable_sval"] } +async-channel = "1.7.1" +file-guard = "0.1.0" +fs2 = "0.4.3" +awaitgroup = "0.6.0" +[dev-dependencies] +chrono = "0.4.22" +env_logger = "0.9.1" +console_log = { version = "0.2.0", features = ["color"] } + diff --git a/src/kv.rs b/src/kv.rs index a8fd9de..467d177 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -1 +1,225 @@ -pub struct KV {} \ No newline at end of file +use crate::manifest::{open_or_create_manifest_file, Manifest}; +use crate::options::Options; +use crate::table::builder::Builder; +use crate::table::iterator::IteratorImpl; +use crate::types::{Channel, Closer}; +use crate::value_log::{Request, ValueLogCore, ValuePointer}; +use crate::y::{Encode, Result, ValueStruct}; +use crate::{Error, Node, SkipList}; +use fs2::FileExt; +use log::info; +use std::borrow::BorrowMut; +use std::fs::{create_dir_all, read_dir, File}; +use std::fs::{try_exists, OpenOptions}; +use std::io::Write; +use std::path::Path; +use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; +use std::sync::Arc; + +const _BADGER_PREFIX: &[u8; 8] = b"!badger!"; // Prefix for internal keys used by badger. +const _HEAD: &[u8; 11] = b"!bager!head"; // For Storing value offset for replay. + +struct Closers { + update_size: Closer, + compactors: Closer, + mem_table: Closer, + writes: Closer, + value_gc: Closer, +} + +struct FlushTask { + mt: Option, + vptr: ValuePointer, +} + +pub struct ArcKV(KV); + +pub struct KV { + pub opt: Options, + pub vlog: Option, + pub manifest: Manifest, + flush_chan: Channel, + // write_chan: Channel, + dir_lock_guard: File, + value_dir_guard: File, + closers: Closers, + mt: SkipList, + // Incremented in the non-concurrently accessed write loop. But also accessed outside. So + // we use an atomic op. 
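+    // (The write loop bumps it with fetch_add; flush_mem_table reads it back with an
+    // Acquire load — see new_cas_counter below.)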
+ last_used_cas_counter: AtomicU64, +} + +impl Drop for KV { + fn drop(&mut self) { + self.dir_lock_guard.unlock().unwrap(); + self.value_dir_guard.unlock().unwrap(); + self.closers.compactors.signal_and_wait(); + self.closers.mem_table.signal_and_wait(); + self.closers.writes.signal_and_wait(); + self.closers.update_size.signal_and_wait(); + } +} + +impl KV { + pub fn new(opt: Options) -> Result { + let mut _opt = opt.clone(); + _opt.max_batch_size = (15 * opt.max_table_size) / 100; + _opt.max_batch_count = opt.max_batch_size / Node::size() as u64; + create_dir_all(opt.dir.as_str())?; + create_dir_all(opt.value_dir.as_str())?; + // todo add directory lock + if !(opt.value_log_file_size <= 2 << 30 && opt.value_log_file_size >= 1 << 20) { + return Err(Error::ValueLogSize); + } + let (manifest_file, manifest) = open_or_create_manifest_file(opt.dir.as_str())?; + let dir_lock_guard = OpenOptions::new() + .write(true) + .append(true) + .create(true) + .open(Path::new(opt.dir.as_str()).join("dir_lock_guard.lock"))?; + dir_lock_guard.lock_exclusive()?; + let value_dir_guard = OpenOptions::new() + .write(true) + .append(true) + .create(true) + .open(Path::new(opt.value_dir.as_str()).join("value_dir_guard.lock"))?; + value_dir_guard.lock_exclusive()?; + let mut closers = Closers { + update_size: Closer::new(0), + compactors: Closer::new(0), + mem_table: Closer::new(0), + writes: Closer::new(0), + value_gc: Closer::new(0), + }; + // go out.updateSize(out.closers.updateSize) + let mut mt = SkipList::new(arena_size(&opt)); + let mut out = KV { + opt: opt.clone(), + vlog: None, + manifest, + flush_chan: Channel::new(1), + // write_chan: Channel::new(1), + dir_lock_guard, + value_dir_guard, + closers, + mt, + last_used_cas_counter: Default::default(), + }; + let mut vlog = ValueLogCore::default(); + vlog.open(&out, opt)?; + out.vlog = Some(vlog); + Ok(out) + } + + pub fn must_vlog(&self) -> &ValueLogCore { + self.vlog.as_ref().unwrap() + } + + pub fn must_mut_vlog(&mut self) -> &mut ValueLogCore { + self.vlog.as_mut().unwrap() + } +} + +impl KV { + fn write_requests(&self, reqs: &[Request]) -> Arc> { + if reqs.is_empty() { + return Arc::new(Ok(())); + } + let done = |res: Arc>| { + for req in reqs { + if res.is_err() { + // todo + *req.err.borrow_mut() = res.clone(); + } + let worker = req.wait_group.borrow_mut().take().unwrap(); + worker.done(); + } + }; + info!("write_requests called. Writing to value log"); + // CAS counter for all operations has to go onto value log. Otherwise, if it is just in + // memtable for a long time, and following CAS operations use that as a check, when + // replaying, we will think that these CAS operations should fail, when they are actually + // valid. + + // There is code (in flushMemtable) whose correctness depends on us generating CAS Counter + // values _before_ we modify s.vptr here. 
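+        // Reserve a contiguous block of counters for the whole batch before any entry
+        // is handed to the value log.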
+ for req in reqs { + let counter_base = self.new_cas_counter(req.entries.len() as u64); + for (idx, entry) in req.entries.iter().enumerate() { + entry.borrow_mut().cas_counter = counter_base + idx as u64; + } + } + + let ok = self.vlog.as_ref().unwrap().write(reqs); + if ok.is_err() { + let _ok = Arc::new(ok); + done(_ok.clone()); + return _ok.clone(); + } + + info!("Writing to memtable"); + let mut count = 0; + for req in reqs { + if req.entries.is_empty() { + continue; + } + count += req.entries.len(); + } + Arc::new(Ok(())) + } + + async fn flush_mem_table(&self, lc: &Closer) -> Result<()> { + while let Ok(task) = self.flush_chan.recv().await { + if task.mt.is_none() { + break; + } + + if task.vptr.is_zero() { + continue; + } + + info!("Storing offset: {:?}", task.vptr); + let mut offset = vec![0u8; ValuePointer::value_pointer_encoded_size()]; + task.vptr.enc(&mut offset).unwrap(); + // CAS counter is needed and is desirable -- it's the first value log entry + // we replay, so to speak, perhaps the only, and we use it to re-initialize + // the CAS counter. + // + // The write loop generates CAS counter values _before_ it sets vptr. It + // is crucial that we read the cas counter here _after_ reading vptr. That + // way, our value here is guaranteed to be >= the CASCounter values written + // before vptr (because they don't get replayed). + let value = ValueStruct { + meta: 0, + user_meta: 0, + cas_counter: self.last_used_cas_counter.load(Ordering::Acquire), + value: offset, + }; + // todo + task.mt.as_ref().unwrap().put(_HEAD, value); + } + + Ok(()) + } + + fn new_cas_counter(&self, how_many: u64) -> u64 { + self.last_used_cas_counter + .fetch_add(how_many, Ordering::Relaxed) + } +} + +fn write_level0_table(st: &SkipList, mut f: &File) -> Result<()> { + let cur = st.new_cursor(); + let mut builder = Builder::default(); + while let Some(_) = cur.next() { + let key = cur.key(); + let value = cur.value(); + builder.add(key, &value)?; + } + f.write_all(&builder.finish())?; + Ok(()) +} + +fn arena_size(opt: &Options) -> usize { + (opt.max_table_size + opt.max_batch_size + opt.max_batch_count * Node::size() as u64) as usize +} diff --git a/src/lib.rs b/src/lib.rs index e4ca19b..ed3acaa 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -8,22 +8,28 @@ #![feature(pattern)] #![feature(cell_leak)] #![feature(path_file_prefix)] +#![feature(fs_try_exists)] +#![feature(generic_associated_types)] extern crate core; use std::io; use std::mem::align_of; +mod event; +mod kv; +mod log_file; +mod manifest; mod options; mod skl; mod table; -mod y; -mod value_log; -mod event; mod types; -mod log_file; -mod kv; +mod value_log; mod value_log_tests; +mod y; + +#[cfg(test)] +mod test_util; pub use skl::{Arena, Node, SkipList}; pub use y::{Error, Result}; diff --git a/src/manifest.rs b/src/manifest.rs new file mode 100644 index 0000000..5e447c5 --- /dev/null +++ b/src/manifest.rs @@ -0,0 +1,38 @@ +use parking_lot::RwLock; +use std::fs::File; +use crate::Result; + +/// TableManifest contains information about a specific level +/// in the LSM tree. +#[derive(Default)] +pub struct TableManifest { + Level: u8, +} + +#[derive(Default)] +pub(crate) struct ManifestFile { + fp: Option, + directory: String, + // We make this configurable so that unit tests can hit rewrite() code quickly + deletions_rewrite_threshold: usize, + // Guards appends, which includes access to the manifest field. + append_lock: RwLock<()>, + + // Used to track the current state of the manifest, used when rewriting. 
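+    // (Readers and writers of this field must hold append_lock, per the note above.)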
+ manifest: Manifest, +} +/// Manifest represnts the contents of the MANIFEST file in a Badger store. +/// +/// The MANIFEST file describes the startup state of the db -- all LSM files and what level they're +/// at. +/// +/// It consists of a sequence of ManifestChangeSet objects. Each of these is treated atomically, +/// and contains a sequence of ManifestChange's (file creations/deletions) which we use to +/// reconstruct the manifest at startup. +#[derive(Default)] +pub struct Manifest {} + +// todo +pub(crate) fn open_or_create_manifest_file(dir: &str) -> Result<(ManifestFile, Manifest)>{ + Ok((ManifestFile::default(), Manifest::default())) +} diff --git a/src/options/mod.rs b/src/options/mod.rs index f73b609..1d7010c 100644 --- a/src/options/mod.rs +++ b/src/options/mod.rs @@ -82,6 +82,8 @@ impl Options { impl Default for Options { fn default() -> Self { Options { + dir: Box::new("".to_string()), + value_dir: Box::new("".to_string()), sync_writes: false, table_loading_mode: FileLoadingMode::LoadToRADM, max_table_size: 64 << 20, @@ -97,7 +99,6 @@ impl Default for Options { do_not_compact: false, max_batch_count: 0, max_batch_size: 0, - ..Default::default() } } } diff --git a/src/test_util.rs b/src/test_util.rs new file mode 100644 index 0000000..73660d3 --- /dev/null +++ b/src/test_util.rs @@ -0,0 +1,62 @@ +use log::{info, kv::source::as_map, kv::Source, Level}; +use rand::random; +use std::collections::HashMap; +use std::env::temp_dir; +use std::fs::create_dir_all; + +#[cfg(test)] +pub(crate) fn mock_log() { + use chrono::Local; + use env_logger::Env; + use log::kv::source::AsMap; + use log::kv::{Error, Key, ToKey, ToValue, Value}; + use serde::{Deserialize, Serialize}; + use std::io::Write; + + #[derive(Serialize, Deserialize)] + struct JsonLog { + level: log::Level, + ts: String, + module: String, + msg: String, + #[serde(skip_serializing_if = "HashMap::is_empty", flatten)] + kv: HashMap, + } + + let env = Env::default() + .filter_or("MY_LOG_LEVEL", "error") + .write_style_or("MY_LOG_STYLE", "always"); + let _ = env_logger::Builder::from_env(env) + .format(|buf, record| { + let mut l = JsonLog { + ts: Local::now().format("%Y-%m-%dT%H:%M:%S").to_string(), + module: record.file().unwrap_or("unknown").to_string() + + ":" + + &*record.line().unwrap_or(0).to_string(), + level: record.level(), + msg: record.args().to_string(), + kv: Default::default(), + }; + let kv: AsMap<&dyn Source> = as_map(record.key_values()); + if let Ok(kv) = serde_json::to_string(&kv) { + let h: HashMap = serde_json::from_str(&kv).unwrap(); + l.kv.extend(h.into_iter()); + } + writeln!(buf, "{}", serde_json::to_string(&l).unwrap()) + }) + .try_init(); + log::info!( is_ok = true; "start init log"); + // env_logger::try_init_from_env(env); +} + +#[cfg(test)] +pub(crate) fn mock_log_terminal() { + console_log::init_with_level(Level::Debug); +} + +pub fn random_tmp_dir() -> String { + let id = random::(); + let path = temp_dir().join(id.to_string()).join("badger"); + // create_dir_all(&path).unwrap(); + path.to_str().unwrap().to_string() +} diff --git a/src/types.rs b/src/types.rs index 78a8ce9..0407b14 100644 --- a/src/types.rs +++ b/src/types.rs @@ -1 +1,128 @@ use parking_lot::*; +use std::sync::atomic::{AtomicIsize, AtomicUsize, Ordering}; +use std::sync::Arc; +use std::time::Duration; + +use async_channel::{bounded, Receiver, RecvError, SendError, Sender, TryRecvError}; +use tokio::time::sleep; + +#[derive(Clone)] +pub(crate) struct Channel { + rx: Option>, + tx: Option>, +} + +impl Channel { + pub(crate) fn 
new(n: usize) -> Self { + let (tx, rx) = bounded(n); + Channel { + rx: Some(rx), + tx: Some(tx), + } + } + async fn try_send(&self, msg: T) -> Result<(), SendError> { + if let Some(tx) = &self.tx { + return tx.send(msg).await; + } + Ok(()) + } + + pub(crate) async fn try_recv(&self) -> Result { + if let Some(rx) = &self.rx { + return rx.try_recv(); + } + Err(TryRecvError::Empty) + } + + pub(crate) async fn recv(&self) -> Result { + let rx = self.rx.as_ref().unwrap(); + rx.recv().await + } + + pub(crate) async fn send(&self, msg: T) -> Result<(), SendError> { + let tx = self.tx.as_ref().unwrap(); + tx.send(msg).await + } + + pub(crate) fn tx(&self) -> Sender { + self.tx.as_ref().unwrap().clone() + } + + pub(crate) fn take_tx(&mut self) -> Option> { + self.tx.take() + } + + pub(crate) fn close(&self) { + if self.tx.is_none() { + return; + } + self.tx.as_ref().unwrap().close(); + } +} + +/// Holds the two things we need to close a routine and wait for it to finish: a chan +/// to tell the routine to shut down, and a wait_group with which to wait for it to finish shutting +/// down. +#[derive(Clone)] +pub(crate) struct Closer { + closed: Channel<()>, + wait: Arc, +} + +impl Closer { + pub(crate) fn new(initial: isize) -> Self { + let mut close = Closer { + closed: Channel::new(1), + wait: Arc::from(AtomicIsize::new(initial)), + }; + close + } + + pub(crate) fn add_running(&self, delta: isize) { + self.wait.fetch_add(delta, Ordering::Relaxed); + } + + pub(crate) fn signal(&self) { + self.closed.close(); + } + + // todo + pub(crate) fn has_been_closed(&self) -> Channel<()> { + self.closed.clone() + } + + pub(crate) fn done(&self) { + self.wait.fetch_sub(1, Ordering::Relaxed); + } + + pub(crate) async fn wait(&self) { + loop { + if self.wait.load(Ordering::Relaxed) <= 0 { + break; + } + println!("wait"); + sleep(Duration::from_millis(10)).await; + } + } + + pub(crate) async fn signal_and_wait(&self) { + self.signal(); + self.wait().await; + } +} + +#[test] +fn it_closer() { + let mut runtime = tokio::runtime::Runtime::new().unwrap(); + runtime.block_on(async { + let mut closer = Closer::new(1); + let c = closer.clone(); + tokio::spawn(async move { + sleep(Duration::from_millis(20000)).await; + println!("Hello Word1"); + c.done(); + }); + closer.signal_and_wait().await; + println!("Hello Word"); + }); +} diff --git a/src/value_log.rs b/src/value_log.rs index f4b63e4..d60b1ec 100644 --- a/src/value_log.rs +++ b/src/value_log.rs @@ -1,7 +1,9 @@ +use awaitgroup::{WaitGroup, Worker}; use bitflags::bitflags; use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use crc32fast::Hasher; use log::info; +use log::kv::Source; use memmap::MmapMut; use parking_lot::*; use rand::random; @@ -10,9 +12,10 @@ use std::cell::{Ref, RefCell, RefMut}; use std::collections::{HashMap, HashSet}; use std::fmt::Formatter; use std::fs::{read_dir, File, OpenOptions}; -use std::io::{Cursor, Read, Seek, SeekFrom, Write}; +use std::io::{BufWriter, Cursor, Read, Seek, SeekFrom, Write}; use std::marker::PhantomData; use std::mem::size_of; +use std::ops::{Deref, Index}; use std::path::Path; use std::process::id; use std::sync::atomic::{AtomicI32, AtomicU32, AtomicU64, Ordering}; @@ -181,7 +184,7 @@ impl fmt::Display for Entry { } } -#[derive(Debug, Default)] +#[derive(Debug, Clone, Default)] #[repr(C)] pub struct ValuePointer { pub(crate) fid: u32, @@ -202,11 +205,11 @@ impl ValuePointer { self.len < o.len } - fn is_zero(&self) -> bool { + pub(crate) fn is_zero(&self) -> bool { self.fid == 0 && self.offset == 0 && self.len == 0 
} - const fn value_pointer_encoded_size() -> usize { + pub(crate) const fn value_pointer_encoded_size() -> usize { size_of::() } } @@ -250,21 +253,44 @@ impl Into> for ValuePointer { } } -struct Request<'a> { - entries: &'a [Entry], +pub(crate) struct Request { + // Input values + pub(crate) entries: Vec>, + // Output Values and wait group stuff below + pub(crate) ptrs: RefCell>>, + pub(crate) wait_group: RefCell>, + pub(crate) err: RefCell>>, } pub struct ValueLogCore { dir_path: Box, max_fid: AtomicU32, - files: RwLock, + vlogs: Arc>>>>, + dirty_vlogs: Arc>>, num_active_iterators: AtomicI32, writable_log_offset: AtomicU32, + buf: RefCell>>, opt: Options, kv: *const KV, } -#[derive(Debug)] +impl Default for ValueLogCore { + fn default() -> Self { + ValueLogCore { + dir_path: Box::new("".to_string()), + max_fid: Default::default(), + vlogs: Arc::new(Default::default()), + dirty_vlogs: Arc::new(Default::default()), + num_active_iterators: Default::default(), + writable_log_offset: Default::default(), + buf: RefCell::new(BufWriter::new(vec![0u8; 0])), + opt: Default::default(), + kv: std::ptr::null(), + } + } +} + +#[derive(Debug, Default)] struct ValueFiles { files_to_be_deleted: HashSet, files_map: HashMap, @@ -272,7 +298,7 @@ struct ValueFiles { impl ValueLogCore { fn vlog_file_path(dir_path: &str, fid: u32) -> String { - let mut path = Path::new(dir_path).join(format!("{:6}", fid)); + let mut path = Path::new(dir_path).join(format!("{:06}.vlog", fid)); path.to_str().unwrap().to_string() } fn fpath(&self, fid: u32) -> String { @@ -296,7 +322,16 @@ impl ValueLogCore { Ok(log_file) } - fn open(&mut self, kv: &KV, opt: Options) -> Result<()> { + fn create_mmap_vlog_file(&self, fid: u32, offset: u64) -> Result { + let mut vlog_file = self.create_vlog_file(fid)?; + vlog_file.fd.as_mut().unwrap().set_len(offset)?; + vlog_file.fd.as_mut().unwrap().set_len(offset)?; + let mut _mmap = unsafe { MmapMut::map_mut(vlog_file.fd.as_ref().unwrap())? }; + vlog_file._mmap.replace(_mmap); + Ok(vlog_file) + } + + pub(crate) fn open(&mut self, kv: &KV, opt: Options) -> Result<()> { self.dir_path = opt.value_dir.clone(); self.opt = opt; let kv = kv as *const KV; @@ -312,19 +347,19 @@ impl ValueLogCore { pub fn close(&self) -> Result<()> { info!("Stopping garbage collection of values."); - let mut files = self.files.write(); - for file_log in files.files_map.iter_mut() { - file_log.1._mmap.as_mut().unwrap().flush()?; - if *file_log.0 == self.max_fid.load(Ordering::Acquire) { - file_log - .1 + let mut vlogs = self.vlogs.write(); + for vlog in vlogs.iter() { + vlog.1.write()._mmap.as_mut().unwrap().flush()?; + if *vlog.0 == self.max_fid.load(Ordering::Acquire) { + vlog.1 + .write() .fd .as_mut() .unwrap() .set_len(self.writable_log_offset.load(Ordering::Acquire) as u64)?; } } - files.files_map.clear(); + vlogs.clear(); Ok(()) } @@ -335,42 +370,43 @@ impl ValueLogCore { Err(err) => return Err(err.into()), } - /// add pid_lock + // add pid_lock let mut vlog_files = Self::get_data_files(&self.dir_path)?; let fids = Self::parse_file_ids(&mut vlog_files)?; let mut max_fid = 0; for fid in fids { - let mut log_file = LogFile { + let log_file = LogFile { _path: Box::new(self.fpath(fid as u32)), fd: None, fid: fid as u32, _mmap: None, sz: 0, }; - self.files.write().files_map.insert(fid as u32, log_file); + self.vlogs + .write() + .insert(fid as u32, Arc::new(RwLock::new(log_file))); if fid > max_fid { max_fid = fid; } } - self.max_fid.store(max_fid as u32, Ordering::Release); // Open all previous log files are read only. 
Open the last log file // as read write. - let mut files = self.files.write(); - for (fid, fp) in files.files_map.iter_mut() { + let mut vlogs = self.vlogs.write(); + for (fid, fp) in vlogs.iter() { if *fid == max_fid as u32 { let fpath = self.fpath(*fid as u32); let _fp = open_existing_synced_file(&fpath, self.opt.sync_writes)?; - fp.fd.replace(_fp); + fp.write().fd.replace(_fp); } else { - fp.open_read_only()?; + fp.write().open_read_only()?; } } - // If no files are found, the create a new file. - if files.files_map.is_empty() { + // If no files are found, creating a new file. + if vlogs.is_empty() { let log_file = self.create_vlog_file(0)?; - files.files_map.insert(0, log_file); + vlogs.insert(0, Arc::new(RwLock::new(log_file))); } Ok(()) } @@ -390,7 +426,7 @@ impl ValueLogCore { vp.offset, self.writable_log_offset.load(Ordering::Acquire) ) - .into()); + .into()); } self.read_value_bytes(vp, |buffer| { @@ -412,21 +448,10 @@ impl ValueLogCore { vp: &ValuePointer, mut f: impl FnMut(&Entry, &ValuePointer) -> Result, ) -> Result<()> { - let mut files = self.files.write(); - let fid_set = files - .files_map - .keys() - .map(|fid| *fid) - .collect::>(); - let mut fids = fid_set - .symmetric_difference(&files.files_to_be_deleted) - .map(|fid| *fid) - .collect::>(); - fids.sort(); - + let vlogs = self.pick_log_guard(); info!("Seeking at value pointer: {:?}", vp); let offset = vp.offset + vp.len; - for id in fids { + for id in vlogs.fids { if id < vp.fid { continue; } @@ -434,15 +459,20 @@ impl ValueLogCore { if id > vp.fid { of = 0; } - let log_file = files.files_map.get_mut(&id).unwrap(); - log_file.iterate(of, &mut f)?; + let log_file = vlogs.vlogs.get(&id).unwrap(); + log_file.write().iterate(of, &mut f)?; } // Seek to the end to start writing. - let mut last_file = files - .files_map - .get_mut(&self.max_fid.load(Ordering::Acquire)) + let last_file = vlogs + .vlogs + .get(&self.max_fid.load(Ordering::Acquire)) .unwrap(); - let last_offset = last_file.fd.as_mut().unwrap().seek(SeekFrom::End(0))?; + let last_offset = last_file + .write() + .fd + .as_mut() + .unwrap() + .seek(SeekFrom::End(0))?; self.writable_log_offset .store(last_offset as u32, Ordering::Release); Ok(()) @@ -457,11 +487,11 @@ impl ValueLogCore { if self.opt.sync_writes { return Ok(()); } - let cur_log_file = self - .files - .write() - .files_map - .get_mut(&self.max_fid.load(Ordering::Acquire)).unwrap(); + let cur_wt_vlog = self + .pick_log_guard() + .vlogs + .get(&self.max_fid.load(Ordering::Acquire)) + .unwrap(); // todo add sync directory meta // sync_dir_async() Ok(()) @@ -472,62 +502,106 @@ impl ValueLogCore { vp: &ValuePointer, mut consumer: impl FnMut(&[u8]) -> Result<()>, ) -> Result<()> { - let mut files = self.files.write(); - let log_file = files.files_map.get_mut(&vp.fid).unwrap(); - let buffer = log_file.read(vp)?; + let log_file = self.pick_log_by_vlog_id(&vp.fid); + let lf = log_file.read(); + let buffer = lf.read(vp)?; consumer(buffer) } // write is thread-unsafe by design and should not be called concurrently. 
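     // (Its only caller in this codebase is KV::write_requests, which is meant to run on
     // the single, non-concurrent write loop described in kv.rs.)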
- fn write(&self, req: &[Request]) -> Result<()> { todo!() } + pub(crate) fn write(&self, reqs: &[Request]) -> Result<()> { + let cur_vlog_file = self.pick_log_by_vlog_id(&self.max_fid.load(Ordering::Acquire)); + let to_disk = || -> Result<()> { + if self.buf.borrow().buffer().is_empty() { + return Ok(()); + } + info!( + " Flushing {} blocks of total size: {}", + reqs.len(), + self.buf.borrow().buffer().len() + ); + + let n = cur_vlog_file + .write() + .fd + .as_mut() + .unwrap() + .write(self.buf.borrow().buffer())?; + // todo add metrics + info!("Done"); + self.writable_log_offset + .fetch_add(n as u32, Ordering::Release); + self.buf.borrow_mut().get_mut().clear(); + + if self.writable_log_offset.load(Ordering::Acquire) + > self.opt.value_log_file_size as u32 + { + cur_vlog_file + .write() + .done_writing(self.writable_log_offset.load(Ordering::Acquire))?; + + let new_id = self.max_fid.fetch_add(1, Ordering::Release); + assert!(new_id < 1 << 16, "newid will overflow u16: {}", new_id); + *cur_vlog_file.write() = + self.create_mmap_vlog_file(new_id, 2 * self.opt.value_log_file_size)?; + } + Ok(()) + }; - fn pick_log(&self) -> Option> { - let files = self.files.read(); - let fid_set = files - .files_map - .keys() - .map(|fid| *fid) - .collect::>(); - let mut fids = fid_set - .symmetric_difference(&files.files_to_be_deleted) - .collect::>(); - fids.sort(); - if fids.len() <= 1 { + for req in reqs { + for (idx, entry) in req.entries.iter().enumerate() { + if !self.opt.sync_writes && entry.borrow().value.len() < self.opt.value_threshold { + // No need to write to value log. + req.ptrs.borrow_mut()[idx] = None; + continue; + } + + let mut ptr = ValuePointer::default(); + ptr.fid = cur_vlog_file.read().fid; + // Use the offset including buffer length so far. + ptr.offset = self.writable_log_offset.load(Ordering::Acquire) + + self.buf.borrow().buffer().len() as u32; + let mut buf = self.buf.borrow_mut(); + entry.borrow_mut().enc(&mut *buf)?; + } + } + to_disk() + } + + fn pick_log(&self) -> Option>> { + let vlogs_guard = self.pick_log_guard(); + if vlogs_guard.vlogs.len() <= 1 { return None; } // This file shouldn't be being written to. - let mut idx = random::() % fids.len(); + let mut idx = random::() % vlogs_guard.fids.len(); if idx > 0 { // Another level of rand to favor smaller fids. idx = random::() % idx; } - let log = RwLockReadGuard::try_map(files, |fs| fs.files_map.get(&(idx as u32))).unwrap(); - Some(log) + let fid = vlogs_guard.fids.index(idx); + let vlog = vlogs_guard.vlogs.get(fid).unwrap(); + Some(vlog.clone()) } - fn pick_mut_log(&self) -> Option> { - let files = self.files.write(); - let fid_set = files - .files_map - .keys() + fn pick_log_by_vlog_id(&self, id: &u32) -> Arc> { + let pick_vlogs = self.pick_log_guard(); + let vlogs = pick_vlogs.vlogs.get(id); + let vlog = vlogs.unwrap(); + vlog.clone() + } + + // Note: it not including dirty file + fn pick_log_guard(&self) -> PickVlogsGuardsReadLock { + let vlogs = self.vlogs.read(); + let vlogs_fids = vlogs.keys().map(|fid| *fid).collect::>(); + let dirty_vlogs = self.dirty_vlogs.read(); + let mut fids = vlogs_fids + .difference(&*dirty_vlogs) .map(|fid| *fid) - .collect::>(); - let mut fids = fid_set - .symmetric_difference(&files.files_to_be_deleted) .collect::>(); fids.sort(); - if fids.len() <= 1 { - return None; - } - // This file shouldn't be being written to. - let mut idx = random::() % fids.len(); - if idx > 0 { - // Another level of rand to favor smaller fids. 
- idx = random::() % idx; - } - let log = - RwLockWriteGuard::try_map(files, |fs| fs.files_map.get_mut(&(idx as u32))).unwrap(); - Some(log) + PickVlogsGuardsReadLock { vlogs, fids } } fn get_data_files(path: &str) -> Result> { @@ -546,6 +620,7 @@ impl ValueLogCore { ps.sort(); Ok(ps) } + fn parse_file_ids(data_files: &mut Vec) -> Result> { let mut data_file_ids = data_files .iter_mut() @@ -562,11 +637,29 @@ impl ValueLogCore { Ok(data_file_ids) } - fn wait_gc(&self) {todo!()} - fn run_gc(&self) -> Result<()> {todo!()} + fn wait_gc(&self) { + todo!() + } + fn run_gc(&self) -> Result<()> { + todo!() + } +} +struct PickVlogsGuardsReadLock<'a> { + vlogs: lock_api::RwLockReadGuard< + 'a, + RawRwLock, + HashMap>>, + >, + fids: Vec, } +// impl <'a> PickVlogsGuardsReadLock<'a> { +// fn to_owned(self) -> lock_api::RwLockReadGuard<'a, RawRwLock, HashMap>>> { +// self.vlogs +// } +// } + struct ValueLogIterator<'a> { fd: &'a File, } @@ -586,43 +679,99 @@ impl<'a> ValueLogIterator<'a> { fn it() { use parking_lot::*; struct Flock { - df: RwLock>, + df: RwLock>>, age: u32, } - impl Flock { - fn get_df( + // impl Flock { + // fn get_df( + // &self, + // ) -> std::result::Result< + // lock_api::MappedRwLockReadGuard<'_, RawRwLock, String>, + // lock_api::RwLockReadGuard<'_, RawRwLock, HashMap>, + // > { + // RwLockReadGuard::try_map(self.df.read(), |df| df.get(&0)) + // } + // + // fn get_mut( + // &self, + // idx: u32, + // ) -> std::result::Result< + // lock_api::MappedRwLockWriteGuard<'_, RawRwLock, String>, + // lock_api::RwLockWriteGuard<'_, RawRwLock, HashMap>, + // > { + // RwLockWriteGuard::try_map(self.df.write(), |df| df.get_mut(&idx)) + // } + // } + + let mut flock = Flock { + df: RwLock::new(HashMap::new()), + age: 19, + }; + { + flock + .df + .write() + .insert(0, RwLock::new("foobat".to_string())); + flock.df.write().insert(1, RwLock::new("ok!".to_string())); + } + // let lock1 = flock.df.write().get(&0).as_mut().unwrap(); + // let lock2 = flock.df.write().get(&1).as_mut().unwrap(); + // flock.df.write().insert(3, RwLock::new("ok!".to_string())); + // let value = RwLockReadGuard::try_map(lock1.read(), |df| Some(df)); + // println!("WHat??? 
{:?}", value); +} + +#[tokio::test] +async fn lock() { + use parking_lot::*; + + #[derive(Debug)] + struct FileLog {} + + #[derive(Debug)] + struct FileLogProxy { + files: HashMap>, + } + + impl FileLogProxy { + fn get_file( &self, - ) -> std::result::Result< - lock_api::MappedRwLockReadGuard<'_, RawRwLock, String>, - lock_api::RwLockReadGuard<'_, RawRwLock, HashMap>, - > { - RwLockReadGuard::try_map(self.df.read(), |df| df.get(&0)) + idx: u32, + ) -> parking_lot::lock_api::RwLockReadGuard<'_, RawRwLock, FileLog> { + let flog = self.files.get(&idx).unwrap(); + let c = flog.read(); + c } - fn get_mut( + fn get_mut_file( &self, + idx: u32, ) -> std::result::Result< - lock_api::MappedRwLockWriteGuard<'_, RawRwLock, String>, - lock_api::RwLockWriteGuard<'_, RawRwLock, HashMap>, + parking_lot::lock_api::MappedRwLockWriteGuard<'_, RawRwLock, FileLog>, + parking_lot::lock_api::RwLockWriteGuard<'_, RawRwLock, FileLog>, > { - RwLockWriteGuard::try_map(self.df.write(), |df| df.get_mut(&0)) + let flog = self.files.get(&idx).unwrap(); + RwLockWriteGuard::try_map(flog.write(), |df| Some(df)) } } -} -#[test] -fn arc_lock() { - let fd = Arc::new(parking_lot::RwLock::new((100, 200))); - let mut v = vec![]; - for i in 0..100 { - let mut fd = fd.clone(); - let a = thread::spawn(move || { - fd.write().0 += 1; - }); - v.push(a); - } - for i in v { - i.join().unwrap(); - } - println!("{:?}", fd.read()); + struct ValueLog { + df: RwLock, + age: u32, + } + + impl ValueLog { + // fn max_vlog_rl( + // &self, + // ) -> parking_lot::lock_api::RwLockReadGuard<'_, RawRwLock, FileLog> { + // let rl = self.rl(); + // let vlog = rl.get_file(0); + // vlog + // } + + // fn rl(&self) -> parking_lot::lock_api::RwLockReadGuard<'_, RawRwLock, FileLog> { + // let df = self.df.read().get_file(0); + // df + // } + } } diff --git a/src/value_log_tests.rs b/src/value_log_tests.rs index a086799..f7d93c9 100644 --- a/src/value_log_tests.rs +++ b/src/value_log_tests.rs @@ -1,4 +1,62 @@ +use crate::kv::KV; +use crate::options::Options; +use crate::test_util::{mock_log, mock_log_terminal, random_tmp_dir}; +use crate::value_log::{Entry, MetaBit, Request}; +use std::cell::RefCell; +use std::env::temp_dir; +use std::fs; +use std::sync::Arc; +use awaitgroup::WaitGroup; + +fn new_test_options(dir: String) -> Options { + let mut opt = Options::default(); + opt.max_table_size = 1 << 15; // Force more compaction + opt.level_one_size = 4 << 15; // Force more compaction. + opt.dir = Box::new(dir.clone()); + opt.value_dir = Box::new(dir.clone()); + return opt; +} + #[test] fn value_basic() { - -} \ No newline at end of file + mock_log_terminal(); + let dir = random_tmp_dir(); + println!("{}", dir); + let mut kv = KV::new(new_test_options(dir)).unwrap(); + // Use value big enough that the value log writes them even if SyncWrites is false. 
+ let val1 = b"sampleval012345678901234567890123"; + let val2 = b"samplevalb012345678901234567890123"; + assert!(val1.len() >= kv.opt.value_threshold); + + let entry1 = Entry { + key: b"samplekey".to_vec(), + value: val1.to_vec(), + meta: MetaBit::BitValuePointer.bits(), + cas_counter_check: 22222, + cas_counter: 33333, + offset: 0, + user_meta: 0, + }; + let entry2 = Entry { + key: b"samplekeyb".to_vec(), + value: val2.to_vec(), + meta: MetaBit::BitValuePointer.bits(), + cas_counter_check: 22225, + cas_counter: 33335, + offset: 0, + user_meta: 0, + }; + + let mut wait = WaitGroup::new(); + let b = Request { + entries: vec![RefCell::new(entry1), RefCell::new(entry2)], + ptrs: RefCell::new(vec![]), + wait_group: RefCell::new(Some(wait.worker())), + err: RefCell::new(Arc::new(Ok(()))), + }; + // todo add event stats + + kv.must_mut_vlog() + .write(&vec![b]) + .expect("TODO: panic message"); +} diff --git a/src/y/mod.rs b/src/y/mod.rs index 112f612..bca0478 100644 --- a/src/y/mod.rs +++ b/src/y/mod.rs @@ -60,6 +60,10 @@ pub enum Error { /// Returned if the user request is invalid. #[error("Invalid request")] ValueInvalidRequest, + #[error("Invalid Dir, directory does not exist")] + InValidDir, + #[error("Invalid ValueLogFileSize, must be between 1MB and 2GB")] + ValueLogSize, ////////////////////////////////// // valueLog error @@ -183,7 +187,7 @@ pub(crate) fn parallel_load_block_key(fp: File, offsets: Vec) -> Vec Result if synced { flags |= datasyncFileFlag; } - File::options().mode(0).custom_flags(flags).open(file_name).map_err(|err| err.into()) + File::options() + .mode(0) + .custom_flags(flags) + .open(file_name) + .map_err(|err| err.into()) } pub(crate) fn create_synced_file(file_name: &str, synce: bool) -> Result { - use std::os::unix::fs::OpenOptionsExt; - let mut flags = libc::O_RDWR | libc::O_CREAT | libc::O_EXCL; - if synce { - flags |= datasyncFileFlag; - } - File::options().mode(0666).custom_flags(flags).open(file_name).map_err(|err| err.into()) + // use std::os::unix::fs::OpenOptionsExt; + // let mut flags = libc::O_RDWR | libc::O_CREAT | libc::O_EXCL; + // if synce { + // // flags |= datasyncFileFlag; + // } + // File::options().custom_flags(flags).open(file_name).map_err(|err| err.into()) + OpenOptions::new() + .write(true) + .read(true) + .create(true) + .append(true) + .open(file_name) + .map_err(|err| err.into()) } #[test] From 028b1c297b217cc1cf210b1ca7473652dedef5f6 Mon Sep 17 00:00:00 2001 From: Rg Date: Sat, 12 Nov 2022 22:11:31 +0800 Subject: [PATCH 02/77] :cat::dog: --- src/kv.rs | 4 ++-- src/lib.rs | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/kv.rs b/src/kv.rs index 467d177..6a7e709 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -141,7 +141,7 @@ impl KV { // replaying, we will think that these CAS operations should fail, when they are actually // valid. - // There is code (in flushMemtable) whose correctness depends on us generating CAS Counter + // There is code (in flush_mem_table) whose correctness depends on us generating CAS Counter // values _before_ we modify s.vptr here. 
for req in reqs { let counter_base = self.new_cas_counter(req.entries.len() as u64); @@ -157,7 +157,7 @@ impl KV { return _ok.clone(); } - info!("Writing to memtable"); + info!("Writing to memory table"); let mut count = 0; for req in reqs { if req.entries.is_empty() { diff --git a/src/lib.rs b/src/lib.rs index ed3acaa..2b72302 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -18,6 +18,7 @@ use std::mem::align_of; mod event; mod kv; +mod level_handler; mod log_file; mod manifest; mod options; @@ -25,6 +26,7 @@ mod skl; mod table; mod types; mod value_log; +#[cfg(test)] mod value_log_tests; mod y; From 7c327105a33cdaaaa40332c6e86094a4241961fa Mon Sep 17 00:00:00 2001 From: zhenghuarong Date: Wed, 16 Nov 2022 20:40:34 +0800 Subject: [PATCH 03/77] add log --- src/kv.rs | 38 ++++++++++- src/lib.rs | 1 - src/types.rs | 8 +-- src/value_log.rs | 163 +++++++++++++++++++++++------------------------ src/y/mod.rs | 18 +++++- 5 files changed, 136 insertions(+), 92 deletions(-) diff --git a/src/kv.rs b/src/kv.rs index 6a7e709..965d8ff 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -16,7 +16,8 @@ use std::path::Path; use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; use std::sync::Arc; -const _BADGER_PREFIX: &[u8; 8] = b"!badger!"; // Prefix for internal keys used by badger. +const _BADGER_PREFIX: &[u8; 8] = b"!badger!"; +// Prefix for internal keys used by badger. const _HEAD: &[u8; 11] = b"!bager!head"; // For Storing value offset for replay. struct Closers { @@ -43,7 +44,10 @@ pub struct KV { dir_lock_guard: File, value_dir_guard: File, closers: Closers, + // Our latest (actively written) in-memory table. mt: SkipList, + // Add here only AFTER pushing to flush_ch + imm: Vec, // Incremented in the non-concurrently accessed write loop. But also accessed outside. So // we use an atomic op. last_used_cas_counter: AtomicU64, @@ -103,6 +107,7 @@ impl KV { value_dir_guard, closers, mt, + imm: Vec::new(), last_used_cas_counter: Default::default(), }; let mut vlog = ValueLogCore::default(); @@ -121,6 +126,37 @@ impl KV { } impl KV { + // get returns the value in `mem_table` or disk for given key. + // Note that value will include meta byte. + fn get(&self, key: &[u8]) -> Result { + let tables = self.get_mem_tables(); + + // TODO add metrics + for tb in tables { + let vs = tb.get(); + if vs.is_none() { + continue; + } + + } + } + + // Returns the current `mem_tables` and get references. + fn get_mem_tables(&self) -> Vec<&SkipList> { + // TODO add kv lock + let mut tables = Vec::with_capacity(self.imm.len() + 1); + // Get mutable `mem_tables`. + tables.push(&self.mt); + tables[0].incr_ref(); + + // Get immutable `mem_tables`. 
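+        // Newest table first: a lookup must see the most recently written value.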
+ for tb in self.imm.iter().rev() { + tb.incr_ref(); + tables.push(tb); + } + tables + } + fn write_requests(&self, reqs: &[Request]) -> Arc> { if reqs.is_empty() { return Arc::new(Ok(())); diff --git a/src/lib.rs b/src/lib.rs index 2b72302..fe81e1c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -18,7 +18,6 @@ use std::mem::align_of; mod event; mod kv; -mod level_handler; mod log_file; mod manifest; mod options; diff --git a/src/types.rs b/src/types.rs index 0407b14..d140cc6 100644 --- a/src/types.rs +++ b/src/types.rs @@ -3,7 +3,7 @@ use std::sync::atomic::{AtomicIsize, AtomicUsize, Ordering}; use std::sync::Arc; use std::time::Duration; -use async_channel::{bounded, Receiver, RecvError, SendError, Sender, TryRecvError}; +use async_channel::{bounded, Receiver, RecvError, SendError, Sender, TryRecvError, TrySendError}; use tokio::time::sleep; #[derive(Clone)] @@ -20,14 +20,14 @@ impl Channel { tx: Some(tx), } } - async fn try_send(&self, msg: T) -> Result<(), SendError> { + pub(crate) fn try_send(&self, msg: T) -> Result<(), TrySendError> { if let Some(tx) = &self.tx { - return tx.send(msg).await; + return tx.try_send(msg); } Ok(()) } - pub(crate) async fn try_recv(&self) -> Result { + pub(crate) fn try_recv(&self) -> Result { if let Some(rx) = &self.rx { return rx.try_recv(); } diff --git a/src/value_log.rs b/src/value_log.rs index d60b1ec..d925335 100644 --- a/src/value_log.rs +++ b/src/value_log.rs @@ -28,20 +28,23 @@ use crate::log_file::LogFile; use crate::options::Options; use crate::skl::BlockBytes; use crate::table::iterator::BlockSlice; -use crate::y::{create_synced_file, is_eof, open_existing_synced_file, read_at, Decode, Encode}; +use crate::types::Channel; +use crate::y::{ + create_synced_file, is_eof, open_existing_synced_file, read_at, sync_directory, Decode, Encode, +}; use crate::Error::Unexpected; use crate::{Error, Result}; -// Values have their first byte being byteData or byteDelete. This helps us distinguish between -// a key that has never been seen and a key that has been explicitly deleted. +/// Values have their first byte being byteData or byteDelete. This helps us distinguish between +/// a key that has never been seen and a key that has been explicitly deleted. bitflags! { pub struct MetaBit: u8{ - // Set if the key has been deleted. + /// Set if the key has been deleted. const BitDelete = 1; - // Set if the value is NOT stored directly next to key. + /// Set if the value is NOT stored directly next to key. const BitValuePointer = 2; const BitUnused = 4; - // Set if the key is set using SetIfAbsent. + /// Set if the key is set using SetIfAbsent. 
const BitSetIfAbsent = 8; } } @@ -65,45 +68,15 @@ impl Header { } } -impl From<&[u8]> for Header { - fn from(buf: &[u8]) -> Self { - let mut cur = Cursor::new(buf); - let mut h = Header::default(); - h.k_len = cur.read_u32::().unwrap(); - h.v_len = cur.read_u32::().unwrap(); - h.meta = cur.read_u8().unwrap(); - h.user_mata = cur.read_u8().unwrap(); - h.cas_counter = cur.read_u64::().unwrap(); - h.cas_counter_check = cur.read_u64::().unwrap(); - h - } -} - -impl Into> for Header { - fn into(self) -> Vec { - let mut cur = Cursor::new(vec![0u8; Header::encoded_size()]); - cur.write_u32::(self.k_len).unwrap(); - cur.write_u32::(self.v_len).unwrap(); - cur.write_u8(self.meta).unwrap(); - cur.write_u8(self.user_mata).unwrap(); - cur.write_u64::(self.cas_counter).unwrap(); - cur.write_u64::(self.cas_counter_check).unwrap(); - cur.into_inner() - } -} - impl Encode for Header { fn enc(&self, wt: &mut dyn Write) -> Result { - let mut cur = Cursor::new(vec![0u8; Header::encoded_size()]); - cur.write_u32::(self.k_len)?; - cur.write_u32::(self.v_len)?; - cur.write_u8(self.meta)?; - cur.write_u8(self.user_mata)?; - cur.write_u64::(self.cas_counter)?; - cur.write_u64::(self.cas_counter_check)?; - let block = cur.into_inner(); - wt.write_all(&block)?; - Ok(block.len()) + wt.write_u32::(self.k_len)?; + wt.write_u32::(self.v_len)?; + wt.write_u8(self.meta)?; + wt.write_u8(self.user_mata)?; + wt.write_u64::(self.cas_counter)?; + wt.write_u64::(self.cas_counter_check)?; + Ok(Self::encoded_size()) } } @@ -119,8 +92,8 @@ impl Decode for Header { } } -/// Entry provides Key, Value and if required, CASCounterCheck to kv.BatchSet() API. -/// If CASCounterCheck is provided, it would be compared against the current casCounter +/// Entry provides Key, Value and if required, cas_counter_check to kv.batch_set() API. +/// If cas_counter_check is provided, it would be compared against the current `cas_counter` /// assigned to this key-value. Set be done on this key only if the counters match. #[derive(Default)] pub struct Entry { @@ -209,7 +182,7 @@ impl ValuePointer { self.fid == 0 && self.offset == 0 && self.len == 0 } - pub(crate) const fn value_pointer_encoded_size() -> usize { + pub(crate) const fn value_pointer_encoded_size() -> usize { size_of::() } } @@ -232,27 +205,6 @@ impl Decode for ValuePointer { } } -impl From<&[u8]> for ValuePointer { - fn from(buf: &[u8]) -> Self { - let mut cur = Cursor::new(buf); - let mut value = ValuePointer::default(); - value.fid = cur.read_u32::().unwrap(); - value.len = cur.read_u32::().unwrap(); - value.offset = cur.read_u32::().unwrap(); - value - } -} - -impl Into> for ValuePointer { - fn into(self) -> Vec { - let mut cur = Cursor::new(vec![0u8; ValuePointer::value_pointer_encoded_size()]); - cur.write_u32::(self.fid).unwrap(); - cur.write_u32::(self.len).unwrap(); - cur.write_u32::(self.offset).unwrap(); - cur.into_inner() - } -} - pub(crate) struct Request { // Input values pub(crate) entries: Vec>, @@ -265,13 +217,20 @@ pub(crate) struct Request { pub struct ValueLogCore { dir_path: Box, max_fid: AtomicU32, + // TODO + // guards our view of which files exist, which to be deleted, how many active iterators + files_log: Arc>, vlogs: Arc>>>>, dirty_vlogs: Arc>>, + // TODO why? + // A refcount of iterators -- when this hits zero, we can delete the files_to_be_deleted. num_active_iterators: AtomicI32, writable_log_offset: AtomicU32, buf: RefCell>>, opt: Options, kv: *const KV, + // Only allow one GC at a time. 
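+    // (A bounded(1) channel used as a try-lock: a successful try_send claims the GC
+    // slot and the matching recv releases it.)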
+ garbage_ch: Channel<()>, } impl Default for ValueLogCore { @@ -279,6 +238,7 @@ impl Default for ValueLogCore { ValueLogCore { dir_path: Box::new("".to_string()), max_fid: Default::default(), + files_log: Arc::new(Default::default()), vlogs: Arc::new(Default::default()), dirty_vlogs: Arc::new(Default::default()), num_active_iterators: Default::default(), @@ -286,16 +246,11 @@ impl Default for ValueLogCore { buf: RefCell::new(BufWriter::new(vec![0u8; 0])), opt: Default::default(), kv: std::ptr::null(), + garbage_ch: Channel::new(1), } } } -#[derive(Debug, Default)] -struct ValueFiles { - files_to_be_deleted: HashSet, - files_map: HashMap, -} - impl ValueLogCore { fn vlog_file_path(dir_path: &str, fid: u32) -> String { let mut path = Path::new(dir_path).join(format!("{:06}.vlog", fid)); @@ -305,7 +260,6 @@ impl ValueLogCore { ValueLogCore::vlog_file_path(&self.dir_path, fid) } - // todo add logFile as return value. fn create_vlog_file(&self, fid: u32) -> Result { let _path = self.fpath(fid); let mut log_file = LogFile { @@ -318,26 +272,26 @@ impl ValueLogCore { self.writable_log_offset.store(0, Ordering::Release); let fd = create_synced_file(&_path, self.opt.sync_writes)?; log_file.fd.replace(fd); - // todo sync directly + sync_directory(&self.dir_path)?; Ok(log_file) } fn create_mmap_vlog_file(&self, fid: u32, offset: u64) -> Result { let mut vlog_file = self.create_vlog_file(fid)?; vlog_file.fd.as_mut().unwrap().set_len(offset)?; - vlog_file.fd.as_mut().unwrap().set_len(offset)?; let mut _mmap = unsafe { MmapMut::map_mut(vlog_file.fd.as_ref().unwrap())? }; vlog_file._mmap.replace(_mmap); Ok(vlog_file) } + // TODO Use Arc to replace it pub(crate) fn open(&mut self, kv: &KV, opt: Options) -> Result<()> { self.dir_path = opt.value_dir.clone(); self.opt = opt; let kv = kv as *const KV; self.kv = kv; self.open_create_files()?; - // todo add garbage + // todo add garbage and metrics Ok(()) } @@ -363,7 +317,7 @@ impl ValueLogCore { Ok(()) } - fn open_create_files(&mut self) -> Result<()> { + fn open_create_files(&self) -> Result<()> { match fs::create_dir_all(self.dir_path.as_str()) { Ok(_) => {} Err(err) if err.kind() == std::io::ErrorKind::AlreadyExists => {} @@ -568,6 +522,31 @@ impl ValueLogCore { to_disk() } + fn rewrite(&self, lf: &LogFile) -> Result<()> { + let max_fid = self.max_fid.load(Ordering::Relaxed); + assert!( + lf.fid < max_fid, + "fid to move: {}. 
Current max fid: {}", + lf.fid, + max_fid + ); + // TODO add metrics + + let mut wb = Vec::with_capacity(1000); + let mut size = 0i64; + let mut count = 0; + let fe = |e: &Entry| -> Result<()> { + count += 1; + if count % 1000 == 0 { + info!("Processing entry {}", count); + } + let vs = self.get_kv().get(&e.key); + Ok(()) + }; + + Ok(()) + } + fn pick_log(&self) -> Option>> { let vlogs_guard = self.pick_log_guard(); if vlogs_guard.vlogs.len() <= 1 { @@ -654,12 +633,6 @@ struct PickVlogsGuardsReadLock<'a> { fids: Vec, } -// impl <'a> PickVlogsGuardsReadLock<'a> { -// fn to_owned(self) -> lock_api::RwLockReadGuard<'a, RawRwLock, HashMap>>> { -// self.vlogs -// } -// } - struct ValueLogIterator<'a> { fd: &'a File, } @@ -675,6 +648,28 @@ impl<'a> ValueLogIterator<'a> { } } +pub struct SafeValueLog { + gc_channel: Channel<()>, + value_log: Arc, +} + +impl SafeValueLog { + async fn trigger_gc(&self, gc_threshold: f64) -> Result<()> { + return match self.gc_channel.try_send(()) { + Ok(()) => { + let ok = self.do_run_gcc(gc_threshold).await; + self.gc_channel.recv().await.unwrap(); + ok + } + Err(err) => Err(Error::ValueRejected), + }; + } + + async fn do_run_gcc(&self, gc_threshold: f64) -> Result<()> { + Ok(()) + } +} + #[test] fn it() { use parking_lot::*; diff --git a/src/y/mod.rs b/src/y/mod.rs index bca0478..049c304 100644 --- a/src/y/mod.rs +++ b/src/y/mod.rs @@ -13,9 +13,10 @@ use std::collections::HashMap; use std::error::Error as _; use std::fs::{File, OpenOptions}; use std::hash::Hasher; -use std::io::ErrorKind; +use std::io::{ErrorKind, Write}; use std::sync::{Arc, RwLock}; use std::{cmp, io}; +use libc::fsync; use thiserror::Error; /// Constants use in serialization sizes, and in ValueStruct serialization @@ -187,7 +188,7 @@ pub(crate) fn parallel_load_block_key(fp: File, offsets: Vec) -> Vec cmp::Ordering { } const datasyncFileFlag: libc::c_int = 0x0; + pub(crate) fn open_existing_synced_file(file_name: &str, synced: bool) -> Result { use std::os::unix::fs::OpenOptionsExt; let mut flags = libc::O_RDWR; @@ -238,7 +240,19 @@ pub(crate) fn create_synced_file(file_name: &str, synce: bool) -> Result { .map_err(|err| err.into()) } +pub(crate) fn sync_directory(d: &String) -> Result<()> { + let mut fp = File::open(d)?; + fp.sync_all().map_err(|err| err.into()) +} + #[test] fn it_cpu() { println!("{:?}", num_cpu()); } + + +#[test] +fn sync_dir() { + let ok = sync_directory(&"/tmp".to_string()); + println!("{:?}", ok); +} \ No newline at end of file From f090cbea6efdc6df22d87e5650f2cc50e829cad8 Mon Sep 17 00:00:00 2001 From: rg Date: Thu, 17 Nov 2022 01:48:07 +0800 Subject: [PATCH 04/77] :dog: --- src/compaction.rs | 3 +++ src/kv.rs | 24 +++++++++++++++++++++--- src/level_handler.rs | 25 +++++++++++++++++++++++++ src/levels.rs | 42 ++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 3 +++ src/skl/skip.rs | 2 +- src/table/mod.rs | 2 +- 7 files changed, 96 insertions(+), 5 deletions(-) create mode 100644 src/compaction.rs create mode 100644 src/level_handler.rs create mode 100644 src/levels.rs diff --git a/src/compaction.rs b/src/compaction.rs new file mode 100644 index 0000000..1fcc52d --- /dev/null +++ b/src/compaction.rs @@ -0,0 +1,3 @@ +pub(crate) struct CompactStatus{ + +} \ No newline at end of file diff --git a/src/kv.rs b/src/kv.rs index 965d8ff..10de3dd 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -14,7 +14,7 @@ use std::fs::{try_exists, OpenOptions}; use std::io::Write; use std::path::Path; use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; -use std::sync::Arc; +use 
std::sync::{Arc, Weak}; const _BADGER_PREFIX: &[u8; 8] = b"!badger!"; // Prefix for internal keys used by badger. @@ -128,7 +128,7 @@ impl KV { impl KV { // get returns the value in `mem_table` or disk for given key. // Note that value will include meta byte. - fn get(&self, key: &[u8]) -> Result { + pub(crate) fn get(&self, key: &[u8]) -> Result { let tables = self.get_mem_tables(); // TODO add metrics @@ -137,7 +137,11 @@ impl KV { if vs.is_none() { continue; } - + let vs = vs.unwrap(); + // TODO why + if vs.meta != 0 && !vs.value.is_empty() { + return Ok(vs); + } } } @@ -244,6 +248,20 @@ impl KV { } } +pub struct WeakKV(Weak); + +impl WeakKV { + pub(crate) fn new() -> Self { Self(Weak::new()) } + pub(crate) fn upgrade(&self) -> Option { + // self.0.upgrade().map() + todo!() + } + pub(crate) fn from(kv: &ArcKV) -> Self { + // Self(Arc::downgrade(&kv.0)) + todo!() + } +} + fn write_level0_table(st: &SkipList, mut f: &File) -> Result<()> { let cur = st.new_cursor(); let mut builder = Builder::default(); diff --git a/src/level_handler.rs b/src/level_handler.rs new file mode 100644 index 0000000..c9f6e3e --- /dev/null +++ b/src/level_handler.rs @@ -0,0 +1,25 @@ +use std::sync::Arc; +use std::sync::atomic::AtomicI32; +use parking_lot::RwLock; +use crate::kv::WeakKV; +use crate::table::builder; +use crate::table::table::TableCore; + +pub(crate) struct LevelHandler { + // Guards tables, total_size. + tables: Arc, i64)>>, + // The following are initialized once and const. + level: Arc, + str_level: Arc, + max_total_size: Arc, + kv: WeakKV, +} + +impl LevelHandler { + fn init_tables(&self, tables: Vec) { + let total_size = tables.iter().fold(0, |acc, &table| acc + table.size()); + let tb = self.tables.write(); + tb.0 = tables; + tb.1 = total_size; + } +} \ No newline at end of file diff --git a/src/levels.rs b/src/levels.rs new file mode 100644 index 0000000..4c8f425 --- /dev/null +++ b/src/levels.rs @@ -0,0 +1,42 @@ +use std::sync::Arc; +use std::sync::atomic::AtomicI64; +use awaitgroup::WaitGroup; +use crate::compaction::CompactStatus; +use crate::kv::{KV, WeakKV}; +use crate::level_handler::LevelHandler; + +pub(crate) struct LevelsController { + // The following are initialized once and const + levels: Arc>, + kv: *const KV, + // Atomic + next_file_id: AtomicI64, + // For ending compactions. + compact_worker_wg: WaitGroup, + c_status: CompactStatus, +} + +impl Default for LevelsController { + fn default() -> Self { + todo!() + } +} + +struct LevelsControllerInner { + // The following are initialized once and const + levels: Arc>, + kv: WeakKV, + // Atomic + next_file_id: AtomicI64, + // For ending compactions. + compact_worker_wg: WaitGroup, + c_status: CompactStatus, +} + +impl LevelsControllerInner { + // Returns true if level zero may be compacted, without accounting for compactions that already + // might be happening. + fn is_level0_compact_table(&self) -> bool { + // self.levels[0] + } +} \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs index fe81e1c..3f30e8d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -31,6 +31,9 @@ mod y; #[cfg(test)] mod test_util; +mod levels; +mod level_handler; +mod compaction; pub use skl::{Arena, Node, SkipList}; pub use y::{Error, Result}; diff --git a/src/skl/skip.rs b/src/skl/skip.rs index 66b5178..f9bac1e 100644 --- a/src/skl/skip.rs +++ b/src/skl/skip.rs @@ -333,7 +333,7 @@ impl SkipList { // gets the value associated with the key. 
// FIXME: maybe return Option<&ValueStruct> - fn get(&self, key: &[u8]) -> Option { + pub(crate) fn get(&self, key: &[u8]) -> Option { let (node, found) = self.find_near(key, false, true); if !found { return None; diff --git a/src/table/mod.rs b/src/table/mod.rs index 6b7cc37..ec24163 100644 --- a/src/table/mod.rs +++ b/src/table/mod.rs @@ -1,4 +1,4 @@ pub(crate) mod builder; pub(crate) mod iterator; -mod table; +pub(crate) mod table; mod tests; From 3dd5431b34b69b508728204adedffd3182821fb5 Mon Sep 17 00:00:00 2001 From: Rg Date: Wed, 28 Dec 2022 01:09:06 +0800 Subject: [PATCH 05/77] fix compiled --- src/kv.rs | 4 +++- src/level_handler.rs | 8 ++++---- src/levels.rs | 1 + src/manifest.rs | 45 ++++++++++++++++++++++++++++++++++++++------ src/value_log.rs | 2 +- 5 files changed, 48 insertions(+), 12 deletions(-) diff --git a/src/kv.rs b/src/kv.rs index 10de3dd..44161a5 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -133,7 +133,7 @@ impl KV { // TODO add metrics for tb in tables { - let vs = tb.get(); + let vs = tb.get(key); if vs.is_none() { continue; } @@ -143,6 +143,8 @@ impl KV { return Ok(vs); } } + + todo!() } // Returns the current `mem_tables` and get references. diff --git a/src/level_handler.rs b/src/level_handler.rs index c9f6e3e..55ff906 100644 --- a/src/level_handler.rs +++ b/src/level_handler.rs @@ -17,9 +17,9 @@ pub(crate) struct LevelHandler { impl LevelHandler { fn init_tables(&self, tables: Vec) { - let total_size = tables.iter().fold(0, |acc, &table| acc + table.size()); - let tb = self.tables.write(); - tb.0 = tables; - tb.1 = total_size; + // let total_size = tables.iter().fold(0, |acc, &table| acc + table.size()); + // let mut tb = self.tables.write(); + // tb.0 = tables; + // tb.1 = total_size as i64; } } \ No newline at end of file diff --git a/src/levels.rs b/src/levels.rs index 4c8f425..f4541db 100644 --- a/src/levels.rs +++ b/src/levels.rs @@ -38,5 +38,6 @@ impl LevelsControllerInner { // might be happening. fn is_level0_compact_table(&self) -> bool { // self.levels[0] + todo!() } } \ No newline at end of file diff --git a/src/manifest.rs b/src/manifest.rs index 5e447c5..5ee0d4b 100644 --- a/src/manifest.rs +++ b/src/manifest.rs @@ -1,12 +1,26 @@ +use std::collections::{HashMap, HashSet}; +use crate::Result; use parking_lot::RwLock; use std::fs::File; -use crate::Result; -/// TableManifest contains information about a specific level +// Manifest file +const MANIFEST_FILENAME: &str = "MANIFEST"; +const MANIFEST_REWRITE_FILENAME: &str = "MANIFEST-REWRITE"; +const MANIFEST_DELETIONS_REWRITE_THRESHOLD: usize = 10000; +const MANIFEST_DELETIONS_RATIO: usize = 10; + + +/// Contains information about LSM tree levels +/// in the *MANIFEST* file. +pub struct LevelManifest { + tables: HashSet, // Set of table id's +} + +/// *TableManifest* contains information about a specific level /// in the LSM tree. #[derive(Default)] pub struct TableManifest { - Level: u8, + level: u8, } #[derive(Default)] @@ -21,7 +35,8 @@ pub(crate) struct ManifestFile { // Used to track the current state of the manifest, used when rewriting. manifest: Manifest, } -/// Manifest represnts the contents of the MANIFEST file in a Badger store. + +/// Manifest represents the contents of the MANIFEST file in a Badger store. /// /// The MANIFEST file describes the startup state of the db -- all LSM files and what level they're /// at. @@ -30,9 +45,27 @@ pub(crate) struct ManifestFile { /// and contains a sequence of ManifestChange's (file creations/deletions) which we use to /// reconstruct the manifest at startup. 
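///
/// Each ManifestChangeSet is stored length-prefixed (u32, big-endian) together with a
/// CRC32 of its payload, after a 4-byte "bdgr" magic and a u32 version header.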
#[derive(Default)] -pub struct Manifest {} +pub struct Manifest { + levels: Vec, + tables: HashMap, + // Contains total number of creation and deletion changes in the manifest --- used to compute + // whether it'd be useful to rewrite the manifest + creations: usize, + deletions: usize, +} + +impl Manifest { + pub fn new() -> Self { + Manifest { + levels: vec![], + tables: HashMap::default(), + creations: Default::default(), + deletions: Default::default(), + } + } +} // todo -pub(crate) fn open_or_create_manifest_file(dir: &str) -> Result<(ManifestFile, Manifest)>{ +pub(crate) fn open_or_create_manifest_file(dir: &str) -> Result<(ManifestFile, Manifest)> { Ok((ManifestFile::default(), Manifest::default())) } diff --git a/src/value_log.rs b/src/value_log.rs index d925335..4102af3 100644 --- a/src/value_log.rs +++ b/src/value_log.rs @@ -532,7 +532,7 @@ impl ValueLogCore { ); // TODO add metrics - let mut wb = Vec::with_capacity(1000); + // let mut wb = Vec::with_capacity(1000); let mut size = 0i64; let mut count = 0; let fe = |e: &Entry| -> Result<()> { From 95a89544194c2c1bc8c588590275f48e88e5d461 Mon Sep 17 00:00:00 2001 From: Rg Date: Wed, 28 Dec 2022 15:30:37 +0800 Subject: [PATCH 06/77] :dog: --- Cargo.toml | 1 + build.rs | 4 + src/kv.rs | 2 +- src/lib.rs | 10 +-- src/manifest.rs | 205 +++++++++++++++++++++++++++++++++++++++++++++++- src/pb/mod.rs | 3 + src/y/mod.rs | 3 + 7 files changed, 218 insertions(+), 10 deletions(-) create mode 100644 build.rs create mode 100644 src/pb/mod.rs diff --git a/Cargo.toml b/Cargo.toml index 9dc5cc5..6627f17 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,6 +35,7 @@ async-channel = "1.7.1" file-guard = "0.1.0" fs2 = "0.4.3" awaitgroup = "0.6.0" +protobuf = { version = "3.2.0", features = ["with-bytes"] } [dev-dependencies] chrono = "0.4.22" env_logger = "0.9.1" diff --git a/build.rs b/build.rs new file mode 100644 index 0000000..36e2d35 --- /dev/null +++ b/build.rs @@ -0,0 +1,4 @@ + +fn main () { + // protoc --rust_out=src/pb src/pb/badgerpb3.proto +} \ No newline at end of file diff --git a/src/kv.rs b/src/kv.rs index 44161a5..81c3c8f 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -1,4 +1,4 @@ -use crate::manifest::{open_or_create_manifest_file, Manifest}; +use crate::manifest::{Manifest, open_or_create_manifest_file}; use crate::options::Options; use crate::table::builder::Builder; use crate::table::iterator::IteratorImpl; diff --git a/src/lib.rs b/src/lib.rs index 3f30e8d..193eac9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -10,8 +10,7 @@ #![feature(path_file_prefix)] #![feature(fs_try_exists)] #![feature(generic_associated_types)] - -extern crate core; +#![feature(unwrap_infallible)] use std::io; use std::mem::align_of; @@ -29,11 +28,12 @@ mod value_log; mod value_log_tests; mod y; +mod compaction; +mod level_handler; +mod levels; +mod pb; #[cfg(test)] mod test_util; -mod levels; -mod level_handler; -mod compaction; pub use skl::{Arena, Node, SkipList}; pub use y::{Error, Result}; diff --git a/src/manifest.rs b/src/manifest.rs index 5ee0d4b..6df3654 100644 --- a/src/manifest.rs +++ b/src/manifest.rs @@ -1,7 +1,18 @@ -use std::collections::{HashMap, HashSet}; -use crate::Result; +// use crate::pb::badgerpb3::{ManifestChange, ManifestChangeSet, ManifestChange_Operation}; +use crate::pb::badgerpb3::manifest_change::Operation; +use crate::pb::badgerpb3::{ManifestChange, ManifestChangeSet}; +use crate::y::is_eof; +use crate::Error::{BadMagic, Unexpected}; +use crate::{Error, Result}; +use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; +use 
 use parking_lot::RwLock;
-use std::fs::File;
+use protobuf::{Enum, EnumOrUnknown, Message};
+use std::collections::{HashMap, HashSet};
+use std::fs::{rename, File};
+use std::io::{Cursor, Read, Seek, SeekFrom, Write};
+use std::path::Path;
 
 // Manifest file
 const MANIFEST_FILENAME: &str = "MANIFEST";
@@ -9,9 +20,15 @@
 const MANIFEST_REWRITE_FILENAME: &str = "MANIFEST-REWRITE";
 const MANIFEST_DELETIONS_REWRITE_THRESHOLD: usize = 10000;
 const MANIFEST_DELETIONS_RATIO: usize = 10;
 
+// Has to be 4 bytes. The value can never change, ever, anyway.
+const MAGIC_TEXT: &[u8; 4] = b"bdgr";
+
+// The magic version number
+const MAGIC_VERSION: u32 = 2;
 
 /// Contains information about LSM tree levels
 /// in the *MANIFEST* file.
+#[derive(Default)]
 pub struct LevelManifest {
     tables: HashSet<u64>, // Set of table id's
 }
@@ -63,9 +80,189 @@ impl Manifest {
             deletions: Default::default(),
         }
     }
+
+    /// Reads the manifest file and constructs two manifest objects. (We need one immutable
+    /// copy and one mutable copy of the manifest. Easiest way is to construct two of them.)
+    /// Also, returns the last offset after a completely read manifest entry -- the file must be
+    /// truncated at that point before further appends are made (if there is a partial entry after
+    /// that). In normal conditions, trunc_offset is the file size.
+    pub fn replay_manifest_file(fp: &mut File) -> Result<(Manifest, usize)> {
+        let mut magic = vec![0u8; 4];
+        if fp.read(&mut magic)? != 4 {
+            return Err(BadMagic);
+        }
+        if MAGIC_TEXT[..] != magic[..4] {
+            return Err(BadMagic);
+        }
+        if MAGIC_VERSION != fp.read_u32::<BigEndian>()? {
+            return Err(BadMagic);
+        }
+
+        let mut build = Manifest::new();
+        let mut offset = 8;
+        loop {
+            // A clean EOF here means every complete record has been replayed.
+            let sz = match fp.read_u32::<BigEndian>() {
+                Ok(sz) => sz,
+                Err(ref err) if err.kind() == std::io::ErrorKind::UnexpectedEof => break,
+                Err(err) => return Err(err.into()),
+            };
+            let crc32 = fp.read_u32::<BigEndian>()?;
+            offset += 8;
+            let mut buffer = vec![0u8; sz as usize];
+            offset += fp.read(&mut buffer)?;
+            if crc32 != crc32fast::hash(&buffer) {
+                // A checksum mismatch marks a partially written record: stop replaying
+                // here and let the caller truncate the file at `offset`.
+                break;
+            }
+            let mf_set = ManifestChangeSet::parse_from_bytes(&buffer).map_err(|_| BadMagic)?;
+            apply_manifest_change_set(&mut build, &mf_set)?;
+        }
+
+        Ok((build, offset))
+    }
+
+    pub fn rewrite(&self, dir: &str) -> Result<(File, usize)> {
+        let rewrite_path = Path::new(dir).join(MANIFEST_REWRITE_FILENAME);
+        // We explicitly sync.
+        let mut fp = File::options()
+            .create(true)
+            .write(true)
+            .truncate(true)
+            .read(true)
+            .open(&rewrite_path)?;
+        let mut wt = Cursor::new(vec![]);
+        wt.write_all(MAGIC_TEXT)?;
+        wt.write_u32::<BigEndian>(MAGIC_VERSION)?;
+
+        let net_creations = self.tables.len();
+        let mut mf_set = ManifestChangeSet::new();
+        mf_set.changes = self.as_changes();
+        let mf_buffer = mf_set.write_to_bytes().unwrap();
+        wt.write_u32::<BigEndian>(mf_buffer.len() as u32)?;
+        let crc32 = crc32fast::hash(&*mf_buffer);
+        wt.write_u32::<BigEndian>(crc32)?;
+        wt.write_all(&*mf_buffer)?;
+        fp.write_all(&*wt.into_inner())?;
+        fp.sync_all()?;
+        drop(fp);
+
+        let manifest_path = Path::new(dir).join(MANIFEST_FILENAME);
+        rename(&rewrite_path, &manifest_path)?;
+        // TODO add directory sync
+
+        // Reopen the freshly renamed manifest and seek to its end; reopening it with
+        // truncate(true) here would wipe the file we just renamed into place.
+        let mut fp = File::options()
+            .write(true)
+            .read(true)
+            .open(manifest_path)?;
+        fp.seek(SeekFrom::End(0))?;
+        Ok((fp, net_creations))
+    }
+
+    fn as_changes(&self) -> Vec<ManifestChange> {
+        self.tables
+            .iter()
+            .map(|(id, tb)| {
+                ManifestChangeBuilder::new(*id)
+                    .with_op(Operation::CREATE)
+                    .with_level(tb.level as u32)
+                    .build()
+            })
+            .collect::<Vec<_>>()
+    }
+}
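+
+// A sketch (not part of this patch series' API) of how `open_or_create_manifest_file`,
+// still a stub below, could wire replay and truncation together. The name
+// `open_manifest_sketch` and the set_len/seek flow are assumptions drawn from the
+// doc comment on `replay_manifest_file`, not a final design:
+#[allow(dead_code)]
+fn open_manifest_sketch(dir: &str) -> Result<(File, Manifest)> {
+    let path = Path::new(dir).join(MANIFEST_FILENAME);
+    let mut fp = File::options().read(true).write(true).open(&path)?;
+    let (manifest, trunc_offset) = Manifest::replay_manifest_file(&mut fp)?;
+    // Drop any partially written tail so appends resume at a record boundary.
+    fp.set_len(trunc_offset as u64)?;
+    fp.seek(SeekFrom::End(0))?;
+    Ok((fp, manifest))
+}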
+
+// this is not a "recoverable" error -- opening the KV store fails because the MANIFEST file
+// is just plain broken.
+fn apply_manifest_change_set(build: &mut Manifest, mf_set: &ManifestChangeSet) -> Result<()> {
+    for change in mf_set.changes.iter() {
+        apply_manifest_change(build, change)?;
+    }
+    Ok(())
+}
+
+fn apply_manifest_change(build: &mut Manifest, tc: &ManifestChange) -> Result<()> {
+    // Surface an unknown op as an error instead of panicking on unwrap.
+    let op = tc.Op.enum_value().map_err(|_| {
+        Unexpected("MANIFEST file has invalid manifest_change op".into())
+    })?;
+    match op {
+        Operation::CREATE => {
+            if build.tables.contains_key(&tc.Id) {
+                return Err(Unexpected(format!(
+                    "MANIFEST invalid, table {} exists",
+                    tc.Id
+                )));
+            }
+            let table_mf = TableManifest {
+                level: tc.Level as u8,
+            };
+            for _ in build.levels.len()..=tc.Level as usize {
+                build.levels.push(LevelManifest::default());
+            }
+            build.tables.insert(tc.Id, table_mf);
+            build.levels[tc.Level as usize].tables.insert(tc.Id);
+            build.creations += 1;
+        }
+
+        Operation::DELETE => {
+            let has = build.tables.remove(&tc.Id);
+            if has.is_none() {
+                return Err(Unexpected(format!(
+                    "MANIFEST removes non-existing table {}",
+                    tc.Id
+                )));
+            }
+            let has = build
+                .levels
+                .get_mut(tc.Level as usize)
+                .unwrap()
+                .tables
+                .remove(&tc.Id);
+            assert!(has);
+        }
+    }
+
+    Ok(())
+}
-// todo
 pub(crate) fn open_or_create_manifest_file(dir: &str) -> Result<(ManifestFile, Manifest)> {
     Ok((ManifestFile::default(), Manifest::default()))
 }
+
+struct ManifestChangeBuilder {
+    id: u64,
+    level: u32,
+    op: Operation,
+}
+
+impl ManifestChangeBuilder {
+    fn new(id: u64) -> Self {
+        ManifestChangeBuilder {
+            id,
+            level: 0,
+            op: Default::default(),
+        }
+    }
+
+    // fn id(mut self, id: u64) -> Self {
+    //     self.id = id;
+    //     self
+    // }
+
+    fn with_level(mut self, level: u32) -> Self {
+        self.level = level;
+        self
+    }
+
+    fn with_op(mut self, op: Operation) -> Self {
+        self.op = op;
+        self
+    }
+
+    fn build(self) -> ManifestChange {
+        let mut mf = ManifestChange::new();
+        mf.Id = self.id;
+        mf.Level = self.level;
+        mf.Op = EnumOrUnknown::new(self.op);
+        mf
+    }
+}
diff --git a/src/pb/mod.rs b/src/pb/mod.rs
new file mode 100644
index 0000000..3c074c6
--- /dev/null
+++ b/src/pb/mod.rs
@@ -0,0 +1,3 @@
+// @generated
+
+pub mod badgerpb3;
diff --git a/src/y/mod.rs b/src/y/mod.rs
index 049c304..120aba0 100644
--- a/src/y/mod.rs
+++ b/src/y/mod.rs
@@ -75,6 +75,9 @@ pub enum Error {
     /// and encountering the end of slice.
#[error("End of mapped region")] EOF, + #[error("Manifest has bad magic")] + BadMagic, + ///////////////////////////////// } impl Default for Error { From 6f7b92f69a3d266d687f42ef775306f23467da27 Mon Sep 17 00:00:00 2001 From: Rg Date: Wed, 28 Dec 2022 17:34:56 +0800 Subject: [PATCH 07/77] add badgerpb3 --- src/iterator.rs | 0 src/manifest.rs | 3 +- src/pb/badgerpb3.rs | 1648 +++++++++++++++++++++++++++++++++++++++++++ src/y/mod.rs | 12 +- 4 files changed, 1653 insertions(+), 10 deletions(-) create mode 100644 src/iterator.rs create mode 100644 src/pb/badgerpb3.rs diff --git a/src/iterator.rs b/src/iterator.rs new file mode 100644 index 0000000..e69de29 diff --git a/src/manifest.rs b/src/manifest.rs index 6df3654..b9df165 100644 --- a/src/manifest.rs +++ b/src/manifest.rs @@ -228,6 +228,7 @@ pub(crate) fn open_or_create_manifest_file(dir: &str) -> Result<(ManifestFile, M Ok((ManifestFile::default(), Manifest::default())) } +#[derive(Debug)] struct ManifestChangeBuilder { id: u64, level: u32, @@ -239,7 +240,7 @@ impl ManifestChangeBuilder { ManifestChangeBuilder { id, level: 0, - op: Default::default(), + op: Operation::CREATE, } } diff --git a/src/pb/badgerpb3.rs b/src/pb/badgerpb3.rs new file mode 100644 index 0000000..c02286a --- /dev/null +++ b/src/pb/badgerpb3.rs @@ -0,0 +1,1648 @@ +// This file is generated by rust-protobuf 3.2.0. Do not edit +// .proto file is parsed by protoc --rust-out=... +// @generated + +// https://github.com/rust-lang/rust-clippy/issues/702 +#![allow(unknown_lints)] +#![allow(clippy::all)] + +#![allow(unused_attributes)] +#![cfg_attr(rustfmt, rustfmt::skip)] + +#![allow(box_pointers)] +#![allow(dead_code)] +#![allow(missing_docs)] +#![allow(non_camel_case_types)] +#![allow(non_snake_case)] +#![allow(non_upper_case_globals)] +#![allow(trivial_casts)] +#![allow(unused_results)] +#![allow(unused_mut)] + +//! Generated file from `src/pb/badgerpb3.proto` + +/// Generated files are compatible only with the same version +/// of protobuf runtime. +const _PROTOBUF_VERSION_CHECK: () = ::protobuf::VERSION_3_2_0; + +#[derive(PartialEq,Clone,Default,Debug)] +// @@protoc_insertion_point(message:badgerpb3.KV) +pub struct KV { + // message fields + // @@protoc_insertion_point(field:badgerpb3.KV.key) + pub key: ::std::vec::Vec, + // @@protoc_insertion_point(field:badgerpb3.KV.value) + pub value: ::std::vec::Vec, + // @@protoc_insertion_point(field:badgerpb3.KV.user_meta) + pub user_meta: ::std::vec::Vec, + // @@protoc_insertion_point(field:badgerpb3.KV.version) + pub version: u64, + // @@protoc_insertion_point(field:badgerpb3.KV.expires_at) + pub expires_at: u64, + // @@protoc_insertion_point(field:badgerpb3.KV.meta) + pub meta: ::std::vec::Vec, + /// Stream id is used to identify which stream the KV came from. + // @@protoc_insertion_point(field:badgerpb3.KV.stream_id) + pub stream_id: u32, + /// Stream done is used to indicate end of stream. 
+ // @@protoc_insertion_point(field:badgerpb3.KV.stream_done) + pub stream_done: bool, + // @@protoc_insertion_point(field:badgerpb3.KV.kind) + pub kind: ::protobuf::EnumOrUnknown, + // special fields + // @@protoc_insertion_point(special_field:badgerpb3.KV.special_fields) + pub special_fields: ::protobuf::SpecialFields, +} + +impl<'a> ::std::default::Default for &'a KV { + fn default() -> &'a KV { + ::default_instance() + } +} + +impl KV { + pub fn new() -> KV { + ::std::default::Default::default() + } + + fn generated_message_descriptor_data() -> ::protobuf::reflect::GeneratedMessageDescriptorData { + let mut fields = ::std::vec::Vec::with_capacity(9); + let mut oneofs = ::std::vec::Vec::with_capacity(0); + fields.push(::protobuf::reflect::rt::v2::make_simpler_field_accessor::<_, _>( + "key", + |m: &KV| { &m.key }, + |m: &mut KV| { &mut m.key }, + )); + fields.push(::protobuf::reflect::rt::v2::make_simpler_field_accessor::<_, _>( + "value", + |m: &KV| { &m.value }, + |m: &mut KV| { &mut m.value }, + )); + fields.push(::protobuf::reflect::rt::v2::make_simpler_field_accessor::<_, _>( + "user_meta", + |m: &KV| { &m.user_meta }, + |m: &mut KV| { &mut m.user_meta }, + )); + fields.push(::protobuf::reflect::rt::v2::make_simpler_field_accessor::<_, _>( + "version", + |m: &KV| { &m.version }, + |m: &mut KV| { &mut m.version }, + )); + fields.push(::protobuf::reflect::rt::v2::make_simpler_field_accessor::<_, _>( + "expires_at", + |m: &KV| { &m.expires_at }, + |m: &mut KV| { &mut m.expires_at }, + )); + fields.push(::protobuf::reflect::rt::v2::make_simpler_field_accessor::<_, _>( + "meta", + |m: &KV| { &m.meta }, + |m: &mut KV| { &mut m.meta }, + )); + fields.push(::protobuf::reflect::rt::v2::make_simpler_field_accessor::<_, _>( + "stream_id", + |m: &KV| { &m.stream_id }, + |m: &mut KV| { &mut m.stream_id }, + )); + fields.push(::protobuf::reflect::rt::v2::make_simpler_field_accessor::<_, _>( + "stream_done", + |m: &KV| { &m.stream_done }, + |m: &mut KV| { &mut m.stream_done }, + )); + fields.push(::protobuf::reflect::rt::v2::make_simpler_field_accessor::<_, _>( + "kind", + |m: &KV| { &m.kind }, + |m: &mut KV| { &mut m.kind }, + )); + ::protobuf::reflect::GeneratedMessageDescriptorData::new_2::( + "KV", + fields, + oneofs, + ) + } +} + +impl ::protobuf::Message for KV { + const NAME: &'static str = "KV"; + + fn is_initialized(&self) -> bool { + true + } + + fn merge_from(&mut self, is: &mut ::protobuf::CodedInputStream<'_>) -> ::protobuf::Result<()> { + while let Some(tag) = is.read_raw_tag_or_eof()? 
{ + match tag { + 10 => { + self.key = is.read_bytes()?; + }, + 18 => { + self.value = is.read_bytes()?; + }, + 26 => { + self.user_meta = is.read_bytes()?; + }, + 32 => { + self.version = is.read_uint64()?; + }, + 40 => { + self.expires_at = is.read_uint64()?; + }, + 50 => { + self.meta = is.read_bytes()?; + }, + 80 => { + self.stream_id = is.read_uint32()?; + }, + 88 => { + self.stream_done = is.read_bool()?; + }, + 96 => { + self.kind = is.read_enum_or_unknown()?; + }, + tag => { + ::protobuf::rt::read_unknown_or_skip_group(tag, is, self.special_fields.mut_unknown_fields())?; + }, + }; + } + ::std::result::Result::Ok(()) + } + + // Compute sizes of nested messages + #[allow(unused_variables)] + fn compute_size(&self) -> u64 { + let mut my_size = 0; + if !self.key.is_empty() { + my_size += ::protobuf::rt::bytes_size(1, &self.key); + } + if !self.value.is_empty() { + my_size += ::protobuf::rt::bytes_size(2, &self.value); + } + if !self.user_meta.is_empty() { + my_size += ::protobuf::rt::bytes_size(3, &self.user_meta); + } + if self.version != 0 { + my_size += ::protobuf::rt::uint64_size(4, self.version); + } + if self.expires_at != 0 { + my_size += ::protobuf::rt::uint64_size(5, self.expires_at); + } + if !self.meta.is_empty() { + my_size += ::protobuf::rt::bytes_size(6, &self.meta); + } + if self.stream_id != 0 { + my_size += ::protobuf::rt::uint32_size(10, self.stream_id); + } + if self.stream_done != false { + my_size += 1 + 1; + } + if self.kind != ::protobuf::EnumOrUnknown::new(kv::Kind::KEY) { + my_size += ::protobuf::rt::int32_size(12, self.kind.value()); + } + my_size += ::protobuf::rt::unknown_fields_size(self.special_fields.unknown_fields()); + self.special_fields.cached_size().set(my_size as u32); + my_size + } + + fn write_to_with_cached_sizes(&self, os: &mut ::protobuf::CodedOutputStream<'_>) -> ::protobuf::Result<()> { + if !self.key.is_empty() { + os.write_bytes(1, &self.key)?; + } + if !self.value.is_empty() { + os.write_bytes(2, &self.value)?; + } + if !self.user_meta.is_empty() { + os.write_bytes(3, &self.user_meta)?; + } + if self.version != 0 { + os.write_uint64(4, self.version)?; + } + if self.expires_at != 0 { + os.write_uint64(5, self.expires_at)?; + } + if !self.meta.is_empty() { + os.write_bytes(6, &self.meta)?; + } + if self.stream_id != 0 { + os.write_uint32(10, self.stream_id)?; + } + if self.stream_done != false { + os.write_bool(11, self.stream_done)?; + } + if self.kind != ::protobuf::EnumOrUnknown::new(kv::Kind::KEY) { + os.write_enum(12, ::protobuf::EnumOrUnknown::value(&self.kind))?; + } + os.write_unknown_fields(self.special_fields.unknown_fields())?; + ::std::result::Result::Ok(()) + } + + fn special_fields(&self) -> &::protobuf::SpecialFields { + &self.special_fields + } + + fn mut_special_fields(&mut self) -> &mut ::protobuf::SpecialFields { + &mut self.special_fields + } + + fn new() -> KV { + KV::new() + } + + fn clear(&mut self) { + self.key.clear(); + self.value.clear(); + self.user_meta.clear(); + self.version = 0; + self.expires_at = 0; + self.meta.clear(); + self.stream_id = 0; + self.stream_done = false; + self.kind = ::protobuf::EnumOrUnknown::new(kv::Kind::KEY); + self.special_fields.clear(); + } + + fn default_instance() -> &'static KV { + static instance: KV = KV { + key: ::std::vec::Vec::new(), + value: ::std::vec::Vec::new(), + user_meta: ::std::vec::Vec::new(), + version: 0, + expires_at: 0, + meta: ::std::vec::Vec::new(), + stream_id: 0, + stream_done: false, + kind: ::protobuf::EnumOrUnknown::from_i32(0), + special_fields: 
::protobuf::SpecialFields::new(), + }; + &instance + } +} + +impl ::protobuf::MessageFull for KV { + fn descriptor() -> ::protobuf::reflect::MessageDescriptor { + static descriptor: ::protobuf::rt::Lazy<::protobuf::reflect::MessageDescriptor> = ::protobuf::rt::Lazy::new(); + descriptor.get(|| file_descriptor().message_by_package_relative_name("KV").unwrap()).clone() + } +} + +impl ::std::fmt::Display for KV { + fn fmt(&self, f: &mut ::std::fmt::Formatter<'_>) -> ::std::fmt::Result { + ::protobuf::text_format::fmt(self, f) + } +} + +impl ::protobuf::reflect::ProtobufValue for KV { + type RuntimeType = ::protobuf::reflect::rt::RuntimeTypeMessage; +} + +/// Nested message and enums of message `KV` +pub mod kv { + #[derive(Clone,Copy,PartialEq,Eq,Debug,Hash)] + // @@protoc_insertion_point(enum:badgerpb3.KV.Kind) + pub enum Kind { + // @@protoc_insertion_point(enum_value:badgerpb3.KV.Kind.KEY) + KEY = 0, + // @@protoc_insertion_point(enum_value:badgerpb3.KV.Kind.DATA_KEY) + DATA_KEY = 1, + // @@protoc_insertion_point(enum_value:badgerpb3.KV.Kind.FILE) + FILE = 2, + } + + impl ::protobuf::Enum for Kind { + const NAME: &'static str = "Kind"; + + fn value(&self) -> i32 { + *self as i32 + } + + fn from_i32(value: i32) -> ::std::option::Option { + match value { + 0 => ::std::option::Option::Some(Kind::KEY), + 1 => ::std::option::Option::Some(Kind::DATA_KEY), + 2 => ::std::option::Option::Some(Kind::FILE), + _ => ::std::option::Option::None + } + } + + const VALUES: &'static [Kind] = &[ + Kind::KEY, + Kind::DATA_KEY, + Kind::FILE, + ]; + } + + impl ::protobuf::EnumFull for Kind { + fn enum_descriptor() -> ::protobuf::reflect::EnumDescriptor { + static descriptor: ::protobuf::rt::Lazy<::protobuf::reflect::EnumDescriptor> = ::protobuf::rt::Lazy::new(); + descriptor.get(|| super::file_descriptor().enum_by_package_relative_name("KV.Kind").unwrap()).clone() + } + + fn descriptor(&self) -> ::protobuf::reflect::EnumValueDescriptor { + let index = *self as usize; + Self::enum_descriptor().value_by_index(index) + } + } + + impl ::std::default::Default for Kind { + fn default() -> Self { + Kind::KEY + } + } + + impl Kind { + pub(in super) fn generated_enum_descriptor_data() -> ::protobuf::reflect::GeneratedEnumDescriptorData { + ::protobuf::reflect::GeneratedEnumDescriptorData::new::("KV.Kind") + } + } +} + +#[derive(PartialEq,Clone,Default,Debug)] +// @@protoc_insertion_point(message:badgerpb3.KVList) +pub struct KVList { + // message fields + // @@protoc_insertion_point(field:badgerpb3.KVList.kv) + pub kv: ::std::vec::Vec, + /// alloc_ref used internally for memory management. 
+ // @@protoc_insertion_point(field:badgerpb3.KVList.alloc_ref) + pub alloc_ref: u64, + // special fields + // @@protoc_insertion_point(special_field:badgerpb3.KVList.special_fields) + pub special_fields: ::protobuf::SpecialFields, +} + +impl<'a> ::std::default::Default for &'a KVList { + fn default() -> &'a KVList { + ::default_instance() + } +} + +impl KVList { + pub fn new() -> KVList { + ::std::default::Default::default() + } + + fn generated_message_descriptor_data() -> ::protobuf::reflect::GeneratedMessageDescriptorData { + let mut fields = ::std::vec::Vec::with_capacity(2); + let mut oneofs = ::std::vec::Vec::with_capacity(0); + fields.push(::protobuf::reflect::rt::v2::make_vec_simpler_accessor::<_, _>( + "kv", + |m: &KVList| { &m.kv }, + |m: &mut KVList| { &mut m.kv }, + )); + fields.push(::protobuf::reflect::rt::v2::make_simpler_field_accessor::<_, _>( + "alloc_ref", + |m: &KVList| { &m.alloc_ref }, + |m: &mut KVList| { &mut m.alloc_ref }, + )); + ::protobuf::reflect::GeneratedMessageDescriptorData::new_2::( + "KVList", + fields, + oneofs, + ) + } +} + +impl ::protobuf::Message for KVList { + const NAME: &'static str = "KVList"; + + fn is_initialized(&self) -> bool { + true + } + + fn merge_from(&mut self, is: &mut ::protobuf::CodedInputStream<'_>) -> ::protobuf::Result<()> { + while let Some(tag) = is.read_raw_tag_or_eof()? { + match tag { + 10 => { + self.kv.push(is.read_message()?); + }, + 80 => { + self.alloc_ref = is.read_uint64()?; + }, + tag => { + ::protobuf::rt::read_unknown_or_skip_group(tag, is, self.special_fields.mut_unknown_fields())?; + }, + }; + } + ::std::result::Result::Ok(()) + } + + // Compute sizes of nested messages + #[allow(unused_variables)] + fn compute_size(&self) -> u64 { + let mut my_size = 0; + for value in &self.kv { + let len = value.compute_size(); + my_size += 1 + ::protobuf::rt::compute_raw_varint64_size(len) + len; + }; + if self.alloc_ref != 0 { + my_size += ::protobuf::rt::uint64_size(10, self.alloc_ref); + } + my_size += ::protobuf::rt::unknown_fields_size(self.special_fields.unknown_fields()); + self.special_fields.cached_size().set(my_size as u32); + my_size + } + + fn write_to_with_cached_sizes(&self, os: &mut ::protobuf::CodedOutputStream<'_>) -> ::protobuf::Result<()> { + for v in &self.kv { + ::protobuf::rt::write_message_field_with_cached_size(1, v, os)?; + }; + if self.alloc_ref != 0 { + os.write_uint64(10, self.alloc_ref)?; + } + os.write_unknown_fields(self.special_fields.unknown_fields())?; + ::std::result::Result::Ok(()) + } + + fn special_fields(&self) -> &::protobuf::SpecialFields { + &self.special_fields + } + + fn mut_special_fields(&mut self) -> &mut ::protobuf::SpecialFields { + &mut self.special_fields + } + + fn new() -> KVList { + KVList::new() + } + + fn clear(&mut self) { + self.kv.clear(); + self.alloc_ref = 0; + self.special_fields.clear(); + } + + fn default_instance() -> &'static KVList { + static instance: KVList = KVList { + kv: ::std::vec::Vec::new(), + alloc_ref: 0, + special_fields: ::protobuf::SpecialFields::new(), + }; + &instance + } +} + +impl ::protobuf::MessageFull for KVList { + fn descriptor() -> ::protobuf::reflect::MessageDescriptor { + static descriptor: ::protobuf::rt::Lazy<::protobuf::reflect::MessageDescriptor> = ::protobuf::rt::Lazy::new(); + descriptor.get(|| file_descriptor().message_by_package_relative_name("KVList").unwrap()).clone() + } +} + +impl ::std::fmt::Display for KVList { + fn fmt(&self, f: &mut ::std::fmt::Formatter<'_>) -> ::std::fmt::Result { + ::protobuf::text_format::fmt(self, 
f) + } +} + +impl ::protobuf::reflect::ProtobufValue for KVList { + type RuntimeType = ::protobuf::reflect::rt::RuntimeTypeMessage; +} + +#[derive(PartialEq,Clone,Default,Debug)] +// @@protoc_insertion_point(message:badgerpb3.ManifestChangeSet) +pub struct ManifestChangeSet { + // message fields + /// A set of changes that are applied atomically. + // @@protoc_insertion_point(field:badgerpb3.ManifestChangeSet.changes) + pub changes: ::std::vec::Vec, + // special fields + // @@protoc_insertion_point(special_field:badgerpb3.ManifestChangeSet.special_fields) + pub special_fields: ::protobuf::SpecialFields, +} + +impl<'a> ::std::default::Default for &'a ManifestChangeSet { + fn default() -> &'a ManifestChangeSet { + ::default_instance() + } +} + +impl ManifestChangeSet { + pub fn new() -> ManifestChangeSet { + ::std::default::Default::default() + } + + fn generated_message_descriptor_data() -> ::protobuf::reflect::GeneratedMessageDescriptorData { + let mut fields = ::std::vec::Vec::with_capacity(1); + let mut oneofs = ::std::vec::Vec::with_capacity(0); + fields.push(::protobuf::reflect::rt::v2::make_vec_simpler_accessor::<_, _>( + "changes", + |m: &ManifestChangeSet| { &m.changes }, + |m: &mut ManifestChangeSet| { &mut m.changes }, + )); + ::protobuf::reflect::GeneratedMessageDescriptorData::new_2::( + "ManifestChangeSet", + fields, + oneofs, + ) + } +} + +impl ::protobuf::Message for ManifestChangeSet { + const NAME: &'static str = "ManifestChangeSet"; + + fn is_initialized(&self) -> bool { + true + } + + fn merge_from(&mut self, is: &mut ::protobuf::CodedInputStream<'_>) -> ::protobuf::Result<()> { + while let Some(tag) = is.read_raw_tag_or_eof()? { + match tag { + 10 => { + self.changes.push(is.read_message()?); + }, + tag => { + ::protobuf::rt::read_unknown_or_skip_group(tag, is, self.special_fields.mut_unknown_fields())?; + }, + }; + } + ::std::result::Result::Ok(()) + } + + // Compute sizes of nested messages + #[allow(unused_variables)] + fn compute_size(&self) -> u64 { + let mut my_size = 0; + for value in &self.changes { + let len = value.compute_size(); + my_size += 1 + ::protobuf::rt::compute_raw_varint64_size(len) + len; + }; + my_size += ::protobuf::rt::unknown_fields_size(self.special_fields.unknown_fields()); + self.special_fields.cached_size().set(my_size as u32); + my_size + } + + fn write_to_with_cached_sizes(&self, os: &mut ::protobuf::CodedOutputStream<'_>) -> ::protobuf::Result<()> { + for v in &self.changes { + ::protobuf::rt::write_message_field_with_cached_size(1, v, os)?; + }; + os.write_unknown_fields(self.special_fields.unknown_fields())?; + ::std::result::Result::Ok(()) + } + + fn special_fields(&self) -> &::protobuf::SpecialFields { + &self.special_fields + } + + fn mut_special_fields(&mut self) -> &mut ::protobuf::SpecialFields { + &mut self.special_fields + } + + fn new() -> ManifestChangeSet { + ManifestChangeSet::new() + } + + fn clear(&mut self) { + self.changes.clear(); + self.special_fields.clear(); + } + + fn default_instance() -> &'static ManifestChangeSet { + static instance: ManifestChangeSet = ManifestChangeSet { + changes: ::std::vec::Vec::new(), + special_fields: ::protobuf::SpecialFields::new(), + }; + &instance + } +} + +impl ::protobuf::MessageFull for ManifestChangeSet { + fn descriptor() -> ::protobuf::reflect::MessageDescriptor { + static descriptor: ::protobuf::rt::Lazy<::protobuf::reflect::MessageDescriptor> = ::protobuf::rt::Lazy::new(); + descriptor.get(|| 
file_descriptor().message_by_package_relative_name("ManifestChangeSet").unwrap()).clone() + } +} + +impl ::std::fmt::Display for ManifestChangeSet { + fn fmt(&self, f: &mut ::std::fmt::Formatter<'_>) -> ::std::fmt::Result { + ::protobuf::text_format::fmt(self, f) + } +} + +impl ::protobuf::reflect::ProtobufValue for ManifestChangeSet { + type RuntimeType = ::protobuf::reflect::rt::RuntimeTypeMessage; +} + +#[derive(PartialEq,Clone,Default,Debug)] +// @@protoc_insertion_point(message:badgerpb3.ManifestChange) +pub struct ManifestChange { + // message fields + // @@protoc_insertion_point(field:badgerpb3.ManifestChange.Id) + pub Id: u64, + // @@protoc_insertion_point(field:badgerpb3.ManifestChange.Op) + pub Op: ::protobuf::EnumOrUnknown, + // @@protoc_insertion_point(field:badgerpb3.ManifestChange.Level) + pub Level: u32, + // @@protoc_insertion_point(field:badgerpb3.ManifestChange.key_id) + pub key_id: u64, + // @@protoc_insertion_point(field:badgerpb3.ManifestChange.encryption_algo) + pub encryption_algo: ::protobuf::EnumOrUnknown, + // @@protoc_insertion_point(field:badgerpb3.ManifestChange.compression) + pub compression: u32, + // special fields + // @@protoc_insertion_point(special_field:badgerpb3.ManifestChange.special_fields) + pub special_fields: ::protobuf::SpecialFields, +} + +impl<'a> ::std::default::Default for &'a ManifestChange { + fn default() -> &'a ManifestChange { + ::default_instance() + } +} + +impl ManifestChange { + pub fn new() -> ManifestChange { + ::std::default::Default::default() + } + + fn generated_message_descriptor_data() -> ::protobuf::reflect::GeneratedMessageDescriptorData { + let mut fields = ::std::vec::Vec::with_capacity(6); + let mut oneofs = ::std::vec::Vec::with_capacity(0); + fields.push(::protobuf::reflect::rt::v2::make_simpler_field_accessor::<_, _>( + "Id", + |m: &ManifestChange| { &m.Id }, + |m: &mut ManifestChange| { &mut m.Id }, + )); + fields.push(::protobuf::reflect::rt::v2::make_simpler_field_accessor::<_, _>( + "Op", + |m: &ManifestChange| { &m.Op }, + |m: &mut ManifestChange| { &mut m.Op }, + )); + fields.push(::protobuf::reflect::rt::v2::make_simpler_field_accessor::<_, _>( + "Level", + |m: &ManifestChange| { &m.Level }, + |m: &mut ManifestChange| { &mut m.Level }, + )); + fields.push(::protobuf::reflect::rt::v2::make_simpler_field_accessor::<_, _>( + "key_id", + |m: &ManifestChange| { &m.key_id }, + |m: &mut ManifestChange| { &mut m.key_id }, + )); + fields.push(::protobuf::reflect::rt::v2::make_simpler_field_accessor::<_, _>( + "encryption_algo", + |m: &ManifestChange| { &m.encryption_algo }, + |m: &mut ManifestChange| { &mut m.encryption_algo }, + )); + fields.push(::protobuf::reflect::rt::v2::make_simpler_field_accessor::<_, _>( + "compression", + |m: &ManifestChange| { &m.compression }, + |m: &mut ManifestChange| { &mut m.compression }, + )); + ::protobuf::reflect::GeneratedMessageDescriptorData::new_2::( + "ManifestChange", + fields, + oneofs, + ) + } +} + +impl ::protobuf::Message for ManifestChange { + const NAME: &'static str = "ManifestChange"; + + fn is_initialized(&self) -> bool { + true + } + + fn merge_from(&mut self, is: &mut ::protobuf::CodedInputStream<'_>) -> ::protobuf::Result<()> { + while let Some(tag) = is.read_raw_tag_or_eof()? 
{ + match tag { + 8 => { + self.Id = is.read_uint64()?; + }, + 16 => { + self.Op = is.read_enum_or_unknown()?; + }, + 24 => { + self.Level = is.read_uint32()?; + }, + 32 => { + self.key_id = is.read_uint64()?; + }, + 40 => { + self.encryption_algo = is.read_enum_or_unknown()?; + }, + 48 => { + self.compression = is.read_uint32()?; + }, + tag => { + ::protobuf::rt::read_unknown_or_skip_group(tag, is, self.special_fields.mut_unknown_fields())?; + }, + }; + } + ::std::result::Result::Ok(()) + } + + // Compute sizes of nested messages + #[allow(unused_variables)] + fn compute_size(&self) -> u64 { + let mut my_size = 0; + if self.Id != 0 { + my_size += ::protobuf::rt::uint64_size(1, self.Id); + } + if self.Op != ::protobuf::EnumOrUnknown::new(manifest_change::Operation::CREATE) { + my_size += ::protobuf::rt::int32_size(2, self.Op.value()); + } + if self.Level != 0 { + my_size += ::protobuf::rt::uint32_size(3, self.Level); + } + if self.key_id != 0 { + my_size += ::protobuf::rt::uint64_size(4, self.key_id); + } + if self.encryption_algo != ::protobuf::EnumOrUnknown::new(EncryptionAlgo::aes) { + my_size += ::protobuf::rt::int32_size(5, self.encryption_algo.value()); + } + if self.compression != 0 { + my_size += ::protobuf::rt::uint32_size(6, self.compression); + } + my_size += ::protobuf::rt::unknown_fields_size(self.special_fields.unknown_fields()); + self.special_fields.cached_size().set(my_size as u32); + my_size + } + + fn write_to_with_cached_sizes(&self, os: &mut ::protobuf::CodedOutputStream<'_>) -> ::protobuf::Result<()> { + if self.Id != 0 { + os.write_uint64(1, self.Id)?; + } + if self.Op != ::protobuf::EnumOrUnknown::new(manifest_change::Operation::CREATE) { + os.write_enum(2, ::protobuf::EnumOrUnknown::value(&self.Op))?; + } + if self.Level != 0 { + os.write_uint32(3, self.Level)?; + } + if self.key_id != 0 { + os.write_uint64(4, self.key_id)?; + } + if self.encryption_algo != ::protobuf::EnumOrUnknown::new(EncryptionAlgo::aes) { + os.write_enum(5, ::protobuf::EnumOrUnknown::value(&self.encryption_algo))?; + } + if self.compression != 0 { + os.write_uint32(6, self.compression)?; + } + os.write_unknown_fields(self.special_fields.unknown_fields())?; + ::std::result::Result::Ok(()) + } + + fn special_fields(&self) -> &::protobuf::SpecialFields { + &self.special_fields + } + + fn mut_special_fields(&mut self) -> &mut ::protobuf::SpecialFields { + &mut self.special_fields + } + + fn new() -> ManifestChange { + ManifestChange::new() + } + + fn clear(&mut self) { + self.Id = 0; + self.Op = ::protobuf::EnumOrUnknown::new(manifest_change::Operation::CREATE); + self.Level = 0; + self.key_id = 0; + self.encryption_algo = ::protobuf::EnumOrUnknown::new(EncryptionAlgo::aes); + self.compression = 0; + self.special_fields.clear(); + } + + fn default_instance() -> &'static ManifestChange { + static instance: ManifestChange = ManifestChange { + Id: 0, + Op: ::protobuf::EnumOrUnknown::from_i32(0), + Level: 0, + key_id: 0, + encryption_algo: ::protobuf::EnumOrUnknown::from_i32(0), + compression: 0, + special_fields: ::protobuf::SpecialFields::new(), + }; + &instance + } +} + +impl ::protobuf::MessageFull for ManifestChange { + fn descriptor() -> ::protobuf::reflect::MessageDescriptor { + static descriptor: ::protobuf::rt::Lazy<::protobuf::reflect::MessageDescriptor> = ::protobuf::rt::Lazy::new(); + descriptor.get(|| file_descriptor().message_by_package_relative_name("ManifestChange").unwrap()).clone() + } +} + +impl ::std::fmt::Display for ManifestChange { + fn fmt(&self, f: &mut 
::std::fmt::Formatter<'_>) -> ::std::fmt::Result { + ::protobuf::text_format::fmt(self, f) + } +} + +impl ::protobuf::reflect::ProtobufValue for ManifestChange { + type RuntimeType = ::protobuf::reflect::rt::RuntimeTypeMessage; +} + +/// Nested message and enums of message `ManifestChange` +pub mod manifest_change { + #[derive(Clone,Copy,PartialEq,Eq,Debug,Hash)] + // @@protoc_insertion_point(enum:badgerpb3.ManifestChange.Operation) + pub enum Operation { + // @@protoc_insertion_point(enum_value:badgerpb3.ManifestChange.Operation.CREATE) + CREATE = 0, + // @@protoc_insertion_point(enum_value:badgerpb3.ManifestChange.Operation.DELETE) + DELETE = 1, + } + + impl ::protobuf::Enum for Operation { + const NAME: &'static str = "Operation"; + + fn value(&self) -> i32 { + *self as i32 + } + + fn from_i32(value: i32) -> ::std::option::Option { + match value { + 0 => ::std::option::Option::Some(Operation::CREATE), + 1 => ::std::option::Option::Some(Operation::DELETE), + _ => ::std::option::Option::None + } + } + + const VALUES: &'static [Operation] = &[ + Operation::CREATE, + Operation::DELETE, + ]; + } + + impl ::protobuf::EnumFull for Operation { + fn enum_descriptor() -> ::protobuf::reflect::EnumDescriptor { + static descriptor: ::protobuf::rt::Lazy<::protobuf::reflect::EnumDescriptor> = ::protobuf::rt::Lazy::new(); + descriptor.get(|| super::file_descriptor().enum_by_package_relative_name("ManifestChange.Operation").unwrap()).clone() + } + + fn descriptor(&self) -> ::protobuf::reflect::EnumValueDescriptor { + let index = *self as usize; + Self::enum_descriptor().value_by_index(index) + } + } + + impl ::std::default::Default for Operation { + fn default() -> Self { + Operation::CREATE + } + } + + impl Operation { + pub(in super) fn generated_enum_descriptor_data() -> ::protobuf::reflect::GeneratedEnumDescriptorData { + ::protobuf::reflect::GeneratedEnumDescriptorData::new::("ManifestChange.Operation") + } + } +} + +#[derive(PartialEq,Clone,Default,Debug)] +// @@protoc_insertion_point(message:badgerpb3.Checksum) +pub struct Checksum { + // message fields + // @@protoc_insertion_point(field:badgerpb3.Checksum.algo) + pub algo: ::protobuf::EnumOrUnknown, + // @@protoc_insertion_point(field:badgerpb3.Checksum.sum) + pub sum: u64, + // special fields + // @@protoc_insertion_point(special_field:badgerpb3.Checksum.special_fields) + pub special_fields: ::protobuf::SpecialFields, +} + +impl<'a> ::std::default::Default for &'a Checksum { + fn default() -> &'a Checksum { + ::default_instance() + } +} + +impl Checksum { + pub fn new() -> Checksum { + ::std::default::Default::default() + } + + fn generated_message_descriptor_data() -> ::protobuf::reflect::GeneratedMessageDescriptorData { + let mut fields = ::std::vec::Vec::with_capacity(2); + let mut oneofs = ::std::vec::Vec::with_capacity(0); + fields.push(::protobuf::reflect::rt::v2::make_simpler_field_accessor::<_, _>( + "algo", + |m: &Checksum| { &m.algo }, + |m: &mut Checksum| { &mut m.algo }, + )); + fields.push(::protobuf::reflect::rt::v2::make_simpler_field_accessor::<_, _>( + "sum", + |m: &Checksum| { &m.sum }, + |m: &mut Checksum| { &mut m.sum }, + )); + ::protobuf::reflect::GeneratedMessageDescriptorData::new_2::( + "Checksum", + fields, + oneofs, + ) + } +} + +impl ::protobuf::Message for Checksum { + const NAME: &'static str = "Checksum"; + + fn is_initialized(&self) -> bool { + true + } + + fn merge_from(&mut self, is: &mut ::protobuf::CodedInputStream<'_>) -> ::protobuf::Result<()> { + while let Some(tag) = is.read_raw_tag_or_eof()? 
{ + match tag { + 8 => { + self.algo = is.read_enum_or_unknown()?; + }, + 16 => { + self.sum = is.read_uint64()?; + }, + tag => { + ::protobuf::rt::read_unknown_or_skip_group(tag, is, self.special_fields.mut_unknown_fields())?; + }, + }; + } + ::std::result::Result::Ok(()) + } + + // Compute sizes of nested messages + #[allow(unused_variables)] + fn compute_size(&self) -> u64 { + let mut my_size = 0; + if self.algo != ::protobuf::EnumOrUnknown::new(checksum::Algorithm::CRC32C) { + my_size += ::protobuf::rt::int32_size(1, self.algo.value()); + } + if self.sum != 0 { + my_size += ::protobuf::rt::uint64_size(2, self.sum); + } + my_size += ::protobuf::rt::unknown_fields_size(self.special_fields.unknown_fields()); + self.special_fields.cached_size().set(my_size as u32); + my_size + } + + fn write_to_with_cached_sizes(&self, os: &mut ::protobuf::CodedOutputStream<'_>) -> ::protobuf::Result<()> { + if self.algo != ::protobuf::EnumOrUnknown::new(checksum::Algorithm::CRC32C) { + os.write_enum(1, ::protobuf::EnumOrUnknown::value(&self.algo))?; + } + if self.sum != 0 { + os.write_uint64(2, self.sum)?; + } + os.write_unknown_fields(self.special_fields.unknown_fields())?; + ::std::result::Result::Ok(()) + } + + fn special_fields(&self) -> &::protobuf::SpecialFields { + &self.special_fields + } + + fn mut_special_fields(&mut self) -> &mut ::protobuf::SpecialFields { + &mut self.special_fields + } + + fn new() -> Checksum { + Checksum::new() + } + + fn clear(&mut self) { + self.algo = ::protobuf::EnumOrUnknown::new(checksum::Algorithm::CRC32C); + self.sum = 0; + self.special_fields.clear(); + } + + fn default_instance() -> &'static Checksum { + static instance: Checksum = Checksum { + algo: ::protobuf::EnumOrUnknown::from_i32(0), + sum: 0, + special_fields: ::protobuf::SpecialFields::new(), + }; + &instance + } +} + +impl ::protobuf::MessageFull for Checksum { + fn descriptor() -> ::protobuf::reflect::MessageDescriptor { + static descriptor: ::protobuf::rt::Lazy<::protobuf::reflect::MessageDescriptor> = ::protobuf::rt::Lazy::new(); + descriptor.get(|| file_descriptor().message_by_package_relative_name("Checksum").unwrap()).clone() + } +} + +impl ::std::fmt::Display for Checksum { + fn fmt(&self, f: &mut ::std::fmt::Formatter<'_>) -> ::std::fmt::Result { + ::protobuf::text_format::fmt(self, f) + } +} + +impl ::protobuf::reflect::ProtobufValue for Checksum { + type RuntimeType = ::protobuf::reflect::rt::RuntimeTypeMessage; +} + +/// Nested message and enums of message `Checksum` +pub mod checksum { + #[derive(Clone,Copy,PartialEq,Eq,Debug,Hash)] + // @@protoc_insertion_point(enum:badgerpb3.Checksum.Algorithm) + pub enum Algorithm { + // @@protoc_insertion_point(enum_value:badgerpb3.Checksum.Algorithm.CRC32C) + CRC32C = 0, + // @@protoc_insertion_point(enum_value:badgerpb3.Checksum.Algorithm.XXHash64) + XXHash64 = 1, + } + + impl ::protobuf::Enum for Algorithm { + const NAME: &'static str = "Algorithm"; + + fn value(&self) -> i32 { + *self as i32 + } + + fn from_i32(value: i32) -> ::std::option::Option { + match value { + 0 => ::std::option::Option::Some(Algorithm::CRC32C), + 1 => ::std::option::Option::Some(Algorithm::XXHash64), + _ => ::std::option::Option::None + } + } + + const VALUES: &'static [Algorithm] = &[ + Algorithm::CRC32C, + Algorithm::XXHash64, + ]; + } + + impl ::protobuf::EnumFull for Algorithm { + fn enum_descriptor() -> ::protobuf::reflect::EnumDescriptor { + static descriptor: ::protobuf::rt::Lazy<::protobuf::reflect::EnumDescriptor> = ::protobuf::rt::Lazy::new(); + descriptor.get(|| 
super::file_descriptor().enum_by_package_relative_name("Checksum.Algorithm").unwrap()).clone() + } + + fn descriptor(&self) -> ::protobuf::reflect::EnumValueDescriptor { + let index = *self as usize; + Self::enum_descriptor().value_by_index(index) + } + } + + impl ::std::default::Default for Algorithm { + fn default() -> Self { + Algorithm::CRC32C + } + } + + impl Algorithm { + pub(in super) fn generated_enum_descriptor_data() -> ::protobuf::reflect::GeneratedEnumDescriptorData { + ::protobuf::reflect::GeneratedEnumDescriptorData::new::("Checksum.Algorithm") + } + } +} + +#[derive(PartialEq,Clone,Default,Debug)] +// @@protoc_insertion_point(message:badgerpb3.DataKey) +pub struct DataKey { + // message fields + // @@protoc_insertion_point(field:badgerpb3.DataKey.key_id) + pub key_id: u64, + // @@protoc_insertion_point(field:badgerpb3.DataKey.data) + pub data: ::std::vec::Vec, + // @@protoc_insertion_point(field:badgerpb3.DataKey.iv) + pub iv: ::std::vec::Vec, + // @@protoc_insertion_point(field:badgerpb3.DataKey.created_at) + pub created_at: i64, + // special fields + // @@protoc_insertion_point(special_field:badgerpb3.DataKey.special_fields) + pub special_fields: ::protobuf::SpecialFields, +} + +impl<'a> ::std::default::Default for &'a DataKey { + fn default() -> &'a DataKey { + ::default_instance() + } +} + +impl DataKey { + pub fn new() -> DataKey { + ::std::default::Default::default() + } + + fn generated_message_descriptor_data() -> ::protobuf::reflect::GeneratedMessageDescriptorData { + let mut fields = ::std::vec::Vec::with_capacity(4); + let mut oneofs = ::std::vec::Vec::with_capacity(0); + fields.push(::protobuf::reflect::rt::v2::make_simpler_field_accessor::<_, _>( + "key_id", + |m: &DataKey| { &m.key_id }, + |m: &mut DataKey| { &mut m.key_id }, + )); + fields.push(::protobuf::reflect::rt::v2::make_simpler_field_accessor::<_, _>( + "data", + |m: &DataKey| { &m.data }, + |m: &mut DataKey| { &mut m.data }, + )); + fields.push(::protobuf::reflect::rt::v2::make_simpler_field_accessor::<_, _>( + "iv", + |m: &DataKey| { &m.iv }, + |m: &mut DataKey| { &mut m.iv }, + )); + fields.push(::protobuf::reflect::rt::v2::make_simpler_field_accessor::<_, _>( + "created_at", + |m: &DataKey| { &m.created_at }, + |m: &mut DataKey| { &mut m.created_at }, + )); + ::protobuf::reflect::GeneratedMessageDescriptorData::new_2::( + "DataKey", + fields, + oneofs, + ) + } +} + +impl ::protobuf::Message for DataKey { + const NAME: &'static str = "DataKey"; + + fn is_initialized(&self) -> bool { + true + } + + fn merge_from(&mut self, is: &mut ::protobuf::CodedInputStream<'_>) -> ::protobuf::Result<()> { + while let Some(tag) = is.read_raw_tag_or_eof()? 
{ + match tag { + 8 => { + self.key_id = is.read_uint64()?; + }, + 18 => { + self.data = is.read_bytes()?; + }, + 26 => { + self.iv = is.read_bytes()?; + }, + 32 => { + self.created_at = is.read_int64()?; + }, + tag => { + ::protobuf::rt::read_unknown_or_skip_group(tag, is, self.special_fields.mut_unknown_fields())?; + }, + }; + } + ::std::result::Result::Ok(()) + } + + // Compute sizes of nested messages + #[allow(unused_variables)] + fn compute_size(&self) -> u64 { + let mut my_size = 0; + if self.key_id != 0 { + my_size += ::protobuf::rt::uint64_size(1, self.key_id); + } + if !self.data.is_empty() { + my_size += ::protobuf::rt::bytes_size(2, &self.data); + } + if !self.iv.is_empty() { + my_size += ::protobuf::rt::bytes_size(3, &self.iv); + } + if self.created_at != 0 { + my_size += ::protobuf::rt::int64_size(4, self.created_at); + } + my_size += ::protobuf::rt::unknown_fields_size(self.special_fields.unknown_fields()); + self.special_fields.cached_size().set(my_size as u32); + my_size + } + + fn write_to_with_cached_sizes(&self, os: &mut ::protobuf::CodedOutputStream<'_>) -> ::protobuf::Result<()> { + if self.key_id != 0 { + os.write_uint64(1, self.key_id)?; + } + if !self.data.is_empty() { + os.write_bytes(2, &self.data)?; + } + if !self.iv.is_empty() { + os.write_bytes(3, &self.iv)?; + } + if self.created_at != 0 { + os.write_int64(4, self.created_at)?; + } + os.write_unknown_fields(self.special_fields.unknown_fields())?; + ::std::result::Result::Ok(()) + } + + fn special_fields(&self) -> &::protobuf::SpecialFields { + &self.special_fields + } + + fn mut_special_fields(&mut self) -> &mut ::protobuf::SpecialFields { + &mut self.special_fields + } + + fn new() -> DataKey { + DataKey::new() + } + + fn clear(&mut self) { + self.key_id = 0; + self.data.clear(); + self.iv.clear(); + self.created_at = 0; + self.special_fields.clear(); + } + + fn default_instance() -> &'static DataKey { + static instance: DataKey = DataKey { + key_id: 0, + data: ::std::vec::Vec::new(), + iv: ::std::vec::Vec::new(), + created_at: 0, + special_fields: ::protobuf::SpecialFields::new(), + }; + &instance + } +} + +impl ::protobuf::MessageFull for DataKey { + fn descriptor() -> ::protobuf::reflect::MessageDescriptor { + static descriptor: ::protobuf::rt::Lazy<::protobuf::reflect::MessageDescriptor> = ::protobuf::rt::Lazy::new(); + descriptor.get(|| file_descriptor().message_by_package_relative_name("DataKey").unwrap()).clone() + } +} + +impl ::std::fmt::Display for DataKey { + fn fmt(&self, f: &mut ::std::fmt::Formatter<'_>) -> ::std::fmt::Result { + ::protobuf::text_format::fmt(self, f) + } +} + +impl ::protobuf::reflect::ProtobufValue for DataKey { + type RuntimeType = ::protobuf::reflect::rt::RuntimeTypeMessage; +} + +#[derive(PartialEq,Clone,Default,Debug)] +// @@protoc_insertion_point(message:badgerpb3.Match) +pub struct Match { + // message fields + // @@protoc_insertion_point(field:badgerpb3.Match.prefix) + pub prefix: ::std::vec::Vec, + // @@protoc_insertion_point(field:badgerpb3.Match.ignore_bytes) + pub ignore_bytes: ::std::string::String, + // special fields + // @@protoc_insertion_point(special_field:badgerpb3.Match.special_fields) + pub special_fields: ::protobuf::SpecialFields, +} + +impl<'a> ::std::default::Default for &'a Match { + fn default() -> &'a Match { + ::default_instance() + } +} + +impl Match { + pub fn new() -> Match { + ::std::default::Default::default() + } + + fn generated_message_descriptor_data() -> ::protobuf::reflect::GeneratedMessageDescriptorData { + let mut fields = 
::std::vec::Vec::with_capacity(2); + let mut oneofs = ::std::vec::Vec::with_capacity(0); + fields.push(::protobuf::reflect::rt::v2::make_simpler_field_accessor::<_, _>( + "prefix", + |m: &Match| { &m.prefix }, + |m: &mut Match| { &mut m.prefix }, + )); + fields.push(::protobuf::reflect::rt::v2::make_simpler_field_accessor::<_, _>( + "ignore_bytes", + |m: &Match| { &m.ignore_bytes }, + |m: &mut Match| { &mut m.ignore_bytes }, + )); + ::protobuf::reflect::GeneratedMessageDescriptorData::new_2::( + "Match", + fields, + oneofs, + ) + } +} + +impl ::protobuf::Message for Match { + const NAME: &'static str = "Match"; + + fn is_initialized(&self) -> bool { + true + } + + fn merge_from(&mut self, is: &mut ::protobuf::CodedInputStream<'_>) -> ::protobuf::Result<()> { + while let Some(tag) = is.read_raw_tag_or_eof()? { + match tag { + 10 => { + self.prefix = is.read_bytes()?; + }, + 18 => { + self.ignore_bytes = is.read_string()?; + }, + tag => { + ::protobuf::rt::read_unknown_or_skip_group(tag, is, self.special_fields.mut_unknown_fields())?; + }, + }; + } + ::std::result::Result::Ok(()) + } + + // Compute sizes of nested messages + #[allow(unused_variables)] + fn compute_size(&self) -> u64 { + let mut my_size = 0; + if !self.prefix.is_empty() { + my_size += ::protobuf::rt::bytes_size(1, &self.prefix); + } + if !self.ignore_bytes.is_empty() { + my_size += ::protobuf::rt::string_size(2, &self.ignore_bytes); + } + my_size += ::protobuf::rt::unknown_fields_size(self.special_fields.unknown_fields()); + self.special_fields.cached_size().set(my_size as u32); + my_size + } + + fn write_to_with_cached_sizes(&self, os: &mut ::protobuf::CodedOutputStream<'_>) -> ::protobuf::Result<()> { + if !self.prefix.is_empty() { + os.write_bytes(1, &self.prefix)?; + } + if !self.ignore_bytes.is_empty() { + os.write_string(2, &self.ignore_bytes)?; + } + os.write_unknown_fields(self.special_fields.unknown_fields())?; + ::std::result::Result::Ok(()) + } + + fn special_fields(&self) -> &::protobuf::SpecialFields { + &self.special_fields + } + + fn mut_special_fields(&mut self) -> &mut ::protobuf::SpecialFields { + &mut self.special_fields + } + + fn new() -> Match { + Match::new() + } + + fn clear(&mut self) { + self.prefix.clear(); + self.ignore_bytes.clear(); + self.special_fields.clear(); + } + + fn default_instance() -> &'static Match { + static instance: Match = Match { + prefix: ::std::vec::Vec::new(), + ignore_bytes: ::std::string::String::new(), + special_fields: ::protobuf::SpecialFields::new(), + }; + &instance + } +} + +impl ::protobuf::MessageFull for Match { + fn descriptor() -> ::protobuf::reflect::MessageDescriptor { + static descriptor: ::protobuf::rt::Lazy<::protobuf::reflect::MessageDescriptor> = ::protobuf::rt::Lazy::new(); + descriptor.get(|| file_descriptor().message_by_package_relative_name("Match").unwrap()).clone() + } +} + +impl ::std::fmt::Display for Match { + fn fmt(&self, f: &mut ::std::fmt::Formatter<'_>) -> ::std::fmt::Result { + ::protobuf::text_format::fmt(self, f) + } +} + +impl ::protobuf::reflect::ProtobufValue for Match { + type RuntimeType = ::protobuf::reflect::rt::RuntimeTypeMessage; +} + +#[derive(Clone,Copy,PartialEq,Eq,Debug,Hash)] +// @@protoc_insertion_point(enum:badgerpb3.EncryptionAlgo) +pub enum EncryptionAlgo { + // @@protoc_insertion_point(enum_value:badgerpb3.EncryptionAlgo.aes) + aes = 0, +} + +impl ::protobuf::Enum for EncryptionAlgo { + const NAME: &'static str = "EncryptionAlgo"; + + fn value(&self) -> i32 { + *self as i32 + } + + fn from_i32(value: i32) -> 
::std::option::Option { + match value { + 0 => ::std::option::Option::Some(EncryptionAlgo::aes), + _ => ::std::option::Option::None + } + } + + const VALUES: &'static [EncryptionAlgo] = &[ + EncryptionAlgo::aes, + ]; +} + +impl ::protobuf::EnumFull for EncryptionAlgo { + fn enum_descriptor() -> ::protobuf::reflect::EnumDescriptor { + static descriptor: ::protobuf::rt::Lazy<::protobuf::reflect::EnumDescriptor> = ::protobuf::rt::Lazy::new(); + descriptor.get(|| file_descriptor().enum_by_package_relative_name("EncryptionAlgo").unwrap()).clone() + } + + fn descriptor(&self) -> ::protobuf::reflect::EnumValueDescriptor { + let index = *self as usize; + Self::enum_descriptor().value_by_index(index) + } +} + +impl ::std::default::Default for EncryptionAlgo { + fn default() -> Self { + EncryptionAlgo::aes + } +} + +impl EncryptionAlgo { + fn generated_enum_descriptor_data() -> ::protobuf::reflect::GeneratedEnumDescriptorData { + ::protobuf::reflect::GeneratedEnumDescriptorData::new::("EncryptionAlgo") + } +} + +static file_descriptor_proto_data: &'static [u8] = b"\ + \n\x16src/pb/badgerpb3.proto\x12\tbadgerpb3\"\xa5\x02\n\x02KV\x12\x10\n\ + \x03key\x18\x01\x20\x01(\x0cR\x03key\x12\x14\n\x05value\x18\x02\x20\x01(\ + \x0cR\x05value\x12\x1b\n\tuser_meta\x18\x03\x20\x01(\x0cR\x08userMeta\ + \x12\x18\n\x07version\x18\x04\x20\x01(\x04R\x07version\x12\x1d\n\nexpire\ + s_at\x18\x05\x20\x01(\x04R\texpiresAt\x12\x12\n\x04meta\x18\x06\x20\x01(\ + \x0cR\x04meta\x12\x1b\n\tstream_id\x18\n\x20\x01(\rR\x08streamId\x12\x1f\ + \n\x0bstream_done\x18\x0b\x20\x01(\x08R\nstreamDone\x12&\n\x04kind\x18\ + \x0c\x20\x01(\x0e2\x12.badgerpb3.KV.KindR\x04kind\"'\n\x04Kind\x12\x07\n\ + \x03KEY\x10\0\x12\x0c\n\x08DATA_KEY\x10\x01\x12\x08\n\x04FILE\x10\x02\"D\ + \n\x06KVList\x12\x1d\n\x02kv\x18\x01\x20\x03(\x0b2\r.badgerpb3.KVR\x02kv\ + \x12\x1b\n\talloc_ref\x18\n\x20\x01(\x04R\x08allocRef\"H\n\x11ManifestCh\ + angeSet\x123\n\x07changes\x18\x01\x20\x03(\x0b2\x19.badgerpb3.ManifestCh\ + angeR\x07changes\"\x8d\x02\n\x0eManifestChange\x12\x0e\n\x02Id\x18\x01\ + \x20\x01(\x04R\x02Id\x123\n\x02Op\x18\x02\x20\x01(\x0e2#.badgerpb3.Manif\ + estChange.OperationR\x02Op\x12\x14\n\x05Level\x18\x03\x20\x01(\rR\x05Lev\ + el\x12\x15\n\x06key_id\x18\x04\x20\x01(\x04R\x05keyId\x12B\n\x0fencrypti\ + on_algo\x18\x05\x20\x01(\x0e2\x19.badgerpb3.EncryptionAlgoR\x0eencryptio\ + nAlgo\x12\x20\n\x0bcompression\x18\x06\x20\x01(\rR\x0bcompression\"#\n\t\ + Operation\x12\n\n\x06CREATE\x10\0\x12\n\n\x06DELETE\x10\x01\"v\n\x08Chec\ + ksum\x121\n\x04algo\x18\x01\x20\x01(\x0e2\x1d.badgerpb3.Checksum.Algorit\ + hmR\x04algo\x12\x10\n\x03sum\x18\x02\x20\x01(\x04R\x03sum\"%\n\tAlgorith\ + m\x12\n\n\x06CRC32C\x10\0\x12\x0c\n\x08XXHash64\x10\x01\"c\n\x07DataKey\ + \x12\x15\n\x06key_id\x18\x01\x20\x01(\x04R\x05keyId\x12\x12\n\x04data\ + \x18\x02\x20\x01(\x0cR\x04data\x12\x0e\n\x02iv\x18\x03\x20\x01(\x0cR\x02\ + iv\x12\x1d\n\ncreated_at\x18\x04\x20\x01(\x03R\tcreatedAt\"B\n\x05Match\ + \x12\x16\n\x06prefix\x18\x01\x20\x01(\x0cR\x06prefix\x12!\n\x0cignore_by\ + tes\x18\x02\x20\x01(\tR\x0bignoreBytes*\x19\n\x0eEncryptionAlgo\x12\x07\ + \n\x03aes\x10\0B#Z!github.com/dgraph-io/badger/v3/pbJ\x8b\x19\n\x06\x12\ + \x04\x11\0\\\x01\n\x96\x05\n\x01\x0c\x12\x03\x11\0\x12\x1a.\x20Use\x20pr\ + otos/gen.sh\x20to\x20generate\x20.pb.go\x20files.\r\n2\xdb\x04\r\n\x20Co\ + pyright\x20(C)\x202017\x20Dgraph\x20Labs,\x20Inc.\x20and\x20Contributors\ + \r\n\r\n\x20Licensed\x20under\x20the\x20Apache\x20License,\x20Version\ + 
\x202.0\x20(the\x20\"License\");\r\n\x20you\x20may\x20not\x20use\x20this\ + \x20file\x20except\x20in\x20compliance\x20with\x20the\x20License.\r\n\ + \x20You\x20may\x20obtain\x20a\x20copy\x20of\x20the\x20License\x20at\r\n\ + \r\n\x20\x20\x20\x20http://www.apache.org/licenses/LICENSE-2.0\r\n\r\n\ + \x20Unless\x20required\x20by\x20applicable\x20law\x20or\x20agreed\x20to\ + \x20in\x20writing,\x20software\r\n\x20distributed\x20under\x20the\x20Lic\ + ense\x20is\x20distributed\x20on\x20an\x20\"AS\x20IS\"\x20BASIS,\r\n\x20W\ + ITHOUT\x20WARRANTIES\x20OR\x20CONDITIONS\x20OF\x20ANY\x20KIND,\x20either\ + \x20express\x20or\x20implied.\r\n\x20See\x20the\x20License\x20for\x20the\ + \x20specific\x20language\x20governing\x20permissions\x20and\r\n\x20limit\ + ations\x20under\x20the\x20License.\r\n\n\x08\n\x01\x02\x12\x03\x13\0\x12\ + \n\x08\n\x01\x08\x12\x03\x15\08\n\t\n\x02\x08\x0b\x12\x03\x15\08\n\n\n\ + \x02\x04\0\x12\x04\x17\0*\x01\n\n\n\x03\x04\0\x01\x12\x03\x17\x08\n\n\ + \x0b\n\x04\x04\0\x02\0\x12\x03\x18\x02\x10\n\x0c\n\x05\x04\0\x02\0\x05\ + \x12\x03\x18\x02\x07\n\x0c\n\x05\x04\0\x02\0\x01\x12\x03\x18\x08\x0b\n\ + \x0c\n\x05\x04\0\x02\0\x03\x12\x03\x18\x0e\x0f\n\x0b\n\x04\x04\0\x02\x01\ + \x12\x03\x19\x02\x12\n\x0c\n\x05\x04\0\x02\x01\x05\x12\x03\x19\x02\x07\n\ + \x0c\n\x05\x04\0\x02\x01\x01\x12\x03\x19\x08\r\n\x0c\n\x05\x04\0\x02\x01\ + \x03\x12\x03\x19\x10\x11\n\x0b\n\x04\x04\0\x02\x02\x12\x03\x1a\x02\x16\n\ + \x0c\n\x05\x04\0\x02\x02\x05\x12\x03\x1a\x02\x07\n\x0c\n\x05\x04\0\x02\ + \x02\x01\x12\x03\x1a\x08\x11\n\x0c\n\x05\x04\0\x02\x02\x03\x12\x03\x1a\ + \x14\x15\n\x0b\n\x04\x04\0\x02\x03\x12\x03\x1b\x02\x15\n\x0c\n\x05\x04\0\ + \x02\x03\x05\x12\x03\x1b\x02\x08\n\x0c\n\x05\x04\0\x02\x03\x01\x12\x03\ + \x1b\t\x10\n\x0c\n\x05\x04\0\x02\x03\x03\x12\x03\x1b\x13\x14\n\x0b\n\x04\ + \x04\0\x02\x04\x12\x03\x1c\x02\x18\n\x0c\n\x05\x04\0\x02\x04\x05\x12\x03\ + \x1c\x02\x08\n\x0c\n\x05\x04\0\x02\x04\x01\x12\x03\x1c\t\x13\n\x0c\n\x05\ + \x04\0\x02\x04\x03\x12\x03\x1c\x16\x17\n\x0b\n\x04\x04\0\x02\x05\x12\x03\ + \x1d\x02\x11\n\x0c\n\x05\x04\0\x02\x05\x05\x12\x03\x1d\x02\x07\n\x0c\n\ + \x05\x04\0\x02\x05\x01\x12\x03\x1d\x08\x0c\n\x0c\n\x05\x04\0\x02\x05\x03\ + \x12\x03\x1d\x0f\x10\nL\n\x04\x04\0\x02\x06\x12\x03\x20\x02\x18\x1a?\x20\ + Stream\x20id\x20is\x20used\x20to\x20identify\x20which\x20stream\x20the\ + \x20KV\x20came\x20from.\r\n\n\x0c\n\x05\x04\0\x02\x06\x05\x12\x03\x20\ + \x02\x08\n\x0c\n\x05\x04\0\x02\x06\x01\x12\x03\x20\t\x12\n\x0c\n\x05\x04\ + \0\x02\x06\x03\x12\x03\x20\x15\x17\n>\n\x04\x04\0\x02\x07\x12\x03\"\x02\ + \x18\x1a1\x20Stream\x20done\x20is\x20used\x20to\x20indicate\x20end\x20of\ + \x20stream.\r\n\n\x0c\n\x05\x04\0\x02\x07\x05\x12\x03\"\x02\x06\n\x0c\n\ + \x05\x04\0\x02\x07\x01\x12\x03\"\x07\x12\n\x0c\n\x05\x04\0\x02\x07\x03\ + \x12\x03\"\x15\x17\n\x0c\n\x04\x04\0\x04\0\x12\x04$\x02(\x03\n\x0c\n\x05\ + \x04\0\x04\0\x01\x12\x03$\x07\x0b\n\r\n\x06\x04\0\x04\0\x02\0\x12\x03%\ + \x04\x0c\n\x0e\n\x07\x04\0\x04\0\x02\0\x01\x12\x03%\x04\x07\n\x0e\n\x07\ + \x04\0\x04\0\x02\0\x02\x12\x03%\n\x0b\n\r\n\x06\x04\0\x04\0\x02\x01\x12\ + \x03&\x04\x11\n\x0e\n\x07\x04\0\x04\0\x02\x01\x01\x12\x03&\x04\x0c\n\x0e\ + \n\x07\x04\0\x04\0\x02\x01\x02\x12\x03&\x0f\x10\n\r\n\x06\x04\0\x04\0\ + \x02\x02\x12\x03'\x04\r\n\x0e\n\x07\x04\0\x04\0\x02\x02\x01\x12\x03'\x04\ + \x08\n\x0e\n\x07\x04\0\x04\0\x02\x02\x02\x12\x03'\x0b\x0c\n\x0b\n\x04\ + \x04\0\x02\x08\x12\x03)\x02\x11\n\x0c\n\x05\x04\0\x02\x08\x06\x12\x03)\ + \x02\x06\n\x0c\n\x05\x04\0\x02\x08\x01\x12\x03)\x07\x0b\n\x0c\n\x05\x04\ + 
\0\x02\x08\x03\x12\x03)\x0e\x10\n\n\n\x02\x04\x01\x12\x04,\01\x01\n\n\n\ + \x03\x04\x01\x01\x12\x03,\x08\x0e\n\x0b\n\x04\x04\x01\x02\0\x12\x03-\x02\ + \x15\n\x0c\n\x05\x04\x01\x02\0\x04\x12\x03-\x02\n\n\x0c\n\x05\x04\x01\ + \x02\0\x06\x12\x03-\x0b\r\n\x0c\n\x05\x04\x01\x02\0\x01\x12\x03-\x0e\x10\ + \n\x0c\n\x05\x04\x01\x02\0\x03\x12\x03-\x13\x14\n@\n\x04\x04\x01\x02\x01\ + \x12\x030\x02\x18\x1a3\x20alloc_ref\x20used\x20internally\x20for\x20memo\ + ry\x20management.\r\n\n\x0c\n\x05\x04\x01\x02\x01\x05\x12\x030\x02\x08\n\ + \x0c\n\x05\x04\x01\x02\x01\x01\x12\x030\t\x12\n\x0c\n\x05\x04\x01\x02\ + \x01\x03\x12\x030\x15\x17\n\n\n\x02\x04\x02\x12\x043\06\x01\n\n\n\x03\ + \x04\x02\x01\x12\x033\x08\x19\n=\n\x04\x04\x02\x02\0\x12\x035\x02&\x1a0\ + \x20A\x20set\x20of\x20changes\x20that\x20are\x20applied\x20atomically.\r\ + \n\n\x0c\n\x05\x04\x02\x02\0\x04\x12\x035\x02\n\n\x0c\n\x05\x04\x02\x02\ + \0\x06\x12\x035\x0b\x19\n\x0c\n\x05\x04\x02\x02\0\x01\x12\x035\x1a!\n\ + \x0c\n\x05\x04\x02\x02\0\x03\x12\x035$%\n\n\n\x02\x05\0\x12\x048\0:\x01\ + \n\n\n\x03\x05\0\x01\x12\x038\x05\x13\n\x0b\n\x04\x05\0\x02\0\x12\x039\ + \x02\n\n\x0c\n\x05\x05\0\x02\0\x01\x12\x039\x02\x05\n\x0c\n\x05\x05\0\ + \x02\0\x02\x12\x039\x08\t\n\n\n\x02\x04\x03\x12\x04<\0G\x01\n\n\n\x03\ + \x04\x03\x01\x12\x03<\x08\x16\n\x19\n\x04\x04\x03\x02\0\x12\x03=\x02\x10\ + \"\x0c\x20Table\x20ID.\r\n\n\x0c\n\x05\x04\x03\x02\0\x05\x12\x03=\x02\ + \x08\n\x0c\n\x05\x04\x03\x02\0\x01\x12\x03=\t\x0b\n\x0c\n\x05\x04\x03\ + \x02\0\x03\x12\x03=\x0e\x0f\n\x0c\n\x04\x04\x03\x04\0\x12\x04>\x02A\x03\ + \n\x0c\n\x05\x04\x03\x04\0\x01\x12\x03>\x07\x10\n\r\n\x06\x04\x03\x04\0\ + \x02\0\x12\x03?\x04\x0f\n\x0e\n\x07\x04\x03\x04\0\x02\0\x01\x12\x03?\x04\ + \n\n\x0e\n\x07\x04\x03\x04\0\x02\0\x02\x12\x03?\r\x0e\n\r\n\x06\x04\x03\ + \x04\0\x02\x01\x12\x03@\x04\x0f\n\x0e\n\x07\x04\x03\x04\0\x02\x01\x01\ + \x12\x03@\x04\n\n\x0e\n\x07\x04\x03\x04\0\x02\x01\x02\x12\x03@\r\x0e\n\ + \x0b\n\x04\x04\x03\x02\x01\x12\x03B\x02\x15\n\x0c\n\x05\x04\x03\x02\x01\ + \x06\x12\x03B\x02\x0b\n\x0c\n\x05\x04\x03\x02\x01\x01\x12\x03B\x0c\x0e\n\ + \x0c\n\x05\x04\x03\x02\x01\x03\x12\x03B\x13\x14\n%\n\x04\x04\x03\x02\x02\ + \x12\x03C\x02\x15\"\x18\x20Only\x20used\x20for\x20CREATE.\r\n\n\x0c\n\ + \x05\x04\x03\x02\x02\x05\x12\x03C\x02\x08\n\x0c\n\x05\x04\x03\x02\x02\ + \x01\x12\x03C\t\x0e\n\x0c\n\x05\x04\x03\x02\x02\x03\x12\x03C\x13\x14\n\ + \x0b\n\x04\x04\x03\x02\x03\x12\x03D\x02\x15\n\x0c\n\x05\x04\x03\x02\x03\ + \x05\x12\x03D\x02\x08\n\x0c\n\x05\x04\x03\x02\x03\x01\x12\x03D\t\x0f\n\ + \x0c\n\x05\x04\x03\x02\x03\x03\x12\x03D\x13\x14\n\x0b\n\x04\x04\x03\x02\ + \x04\x12\x03E\x02%\n\x0c\n\x05\x04\x03\x02\x04\x06\x12\x03E\x02\x10\n\ + \x0c\n\x05\x04\x03\x02\x04\x01\x12\x03E\x11\x20\n\x0c\n\x05\x04\x03\x02\ + \x04\x03\x12\x03E#$\n(\n\x04\x04\x03\x02\x05\x12\x03F\x02\x19\"\x1b\x20O\ + nly\x20used\x20for\x20CREATE\x20Op.\r\n\n\x0c\n\x05\x04\x03\x02\x05\x05\ + \x12\x03F\x02\x08\n\x0c\n\x05\x04\x03\x02\x05\x01\x12\x03F\t\x14\n\x0c\n\ + \x05\x04\x03\x02\x05\x03\x12\x03F\x17\x18\n\n\n\x02\x04\x04\x12\x04I\0P\ + \x01\n\n\n\x03\x04\x04\x01\x12\x03I\x08\x10\n\x0c\n\x04\x04\x04\x04\0\ + \x12\x04J\x02M\x03\n\x0c\n\x05\x04\x04\x04\0\x01\x12\x03J\x07\x10\n\r\n\ + \x06\x04\x04\x04\0\x02\0\x12\x03K\x04\x0f\n\x0e\n\x07\x04\x04\x04\0\x02\ + \0\x01\x12\x03K\x04\n\n\x0e\n\x07\x04\x04\x04\0\x02\0\x02\x12\x03K\r\x0e\ + \n\r\n\x06\x04\x04\x04\0\x02\x01\x12\x03L\x04\x11\n\x0e\n\x07\x04\x04\ + \x04\0\x02\x01\x01\x12\x03L\x04\x0c\n\x0e\n\x07\x04\x04\x04\0\x02\x01\ + 
\x02\x12\x03L\x0f\x10\n;\n\x04\x04\x04\x02\0\x12\x03N\x02\x15\".\x20For\ + \x20storing\x20type\x20of\x20Checksum\x20algorithm\x20used\r\n\n\x0c\n\ + \x05\x04\x04\x02\0\x06\x12\x03N\x02\x0b\n\x0c\n\x05\x04\x04\x02\0\x01\ + \x12\x03N\x0c\x10\n\x0c\n\x05\x04\x04\x02\0\x03\x12\x03N\x13\x14\n\x0b\n\ + \x04\x04\x04\x02\x01\x12\x03O\x02\x11\n\x0c\n\x05\x04\x04\x02\x01\x05\ + \x12\x03O\x02\x08\n\x0c\n\x05\x04\x04\x02\x01\x01\x12\x03O\t\x0c\n\x0c\n\ + \x05\x04\x04\x02\x01\x03\x12\x03O\x0f\x10\n\n\n\x02\x04\x05\x12\x04R\0W\ + \x01\n\n\n\x03\x04\x05\x01\x12\x03R\x08\x0f\n\x0b\n\x04\x04\x05\x02\0\ + \x12\x03S\x02\x19\n\x0c\n\x05\x04\x05\x02\0\x05\x12\x03S\x02\x08\n\x0c\n\ + \x05\x04\x05\x02\0\x01\x12\x03S\t\x0f\n\x0c\n\x05\x04\x05\x02\0\x03\x12\ + \x03S\x17\x18\n\x0b\n\x04\x04\x05\x02\x01\x12\x03T\x02\x18\n\x0c\n\x05\ + \x04\x05\x02\x01\x05\x12\x03T\x02\x07\n\x0c\n\x05\x04\x05\x02\x01\x01\ + \x12\x03T\t\r\n\x0c\n\x05\x04\x05\x02\x01\x03\x12\x03T\x16\x17\n\x0b\n\ + \x04\x04\x05\x02\x02\x12\x03U\x02\x18\n\x0c\n\x05\x04\x05\x02\x02\x05\ + \x12\x03U\x02\x07\n\x0c\n\x05\x04\x05\x02\x02\x01\x12\x03U\t\x0b\n\x0c\n\ + \x05\x04\x05\x02\x02\x03\x12\x03U\x16\x17\n\x0b\n\x04\x04\x05\x02\x03\ + \x12\x03V\x02\x18\n\x0c\n\x05\x04\x05\x02\x03\x05\x12\x03V\x02\x07\n\x0c\ + \n\x05\x04\x05\x02\x03\x01\x12\x03V\t\x13\n\x0c\n\x05\x04\x05\x02\x03\ + \x03\x12\x03V\x16\x17\n\n\n\x02\x04\x06\x12\x04Y\0\\\x01\n\n\n\x03\x04\ + \x06\x01\x12\x03Y\x08\r\n\x0b\n\x04\x04\x06\x02\0\x12\x03Z\x02\x13\n\x0c\ + \n\x05\x04\x06\x02\0\x05\x12\x03Z\x02\x07\n\x0c\n\x05\x04\x06\x02\0\x01\ + \x12\x03Z\x08\x0e\n\x0c\n\x05\x04\x06\x02\0\x03\x12\x03Z\x11\x12\nN\n\ + \x04\x04\x06\x02\x01\x12\x03[\x02\x1a\"A\x20Comma\x20separated\x20with\ + \x20dash\x20to\x20represent\x20ranges\x20\"1,\x202-3,\x204-7,\x209\"\r\n\ + \n\x0c\n\x05\x04\x06\x02\x01\x05\x12\x03[\x02\x08\n\x0c\n\x05\x04\x06\ + \x02\x01\x01\x12\x03[\t\x15\n\x0c\n\x05\x04\x06\x02\x01\x03\x12\x03[\x18\ + \x19b\x06proto3\ +"; + +/// `FileDescriptorProto` object which was a source for this generated file +fn file_descriptor_proto() -> &'static ::protobuf::descriptor::FileDescriptorProto { + static file_descriptor_proto_lazy: ::protobuf::rt::Lazy<::protobuf::descriptor::FileDescriptorProto> = ::protobuf::rt::Lazy::new(); + file_descriptor_proto_lazy.get(|| { + ::protobuf::Message::parse_from_bytes(file_descriptor_proto_data).unwrap() + }) +} + +/// `FileDescriptor` object which allows dynamic access to files +pub fn file_descriptor() -> &'static ::protobuf::reflect::FileDescriptor { + static generated_file_descriptor_lazy: ::protobuf::rt::Lazy<::protobuf::reflect::GeneratedFileDescriptor> = ::protobuf::rt::Lazy::new(); + static file_descriptor: ::protobuf::rt::Lazy<::protobuf::reflect::FileDescriptor> = ::protobuf::rt::Lazy::new(); + file_descriptor.get(|| { + let generated_file_descriptor = generated_file_descriptor_lazy.get(|| { + let mut deps = ::std::vec::Vec::with_capacity(0); + let mut messages = ::std::vec::Vec::with_capacity(7); + messages.push(KV::generated_message_descriptor_data()); + messages.push(KVList::generated_message_descriptor_data()); + messages.push(ManifestChangeSet::generated_message_descriptor_data()); + messages.push(ManifestChange::generated_message_descriptor_data()); + messages.push(Checksum::generated_message_descriptor_data()); + messages.push(DataKey::generated_message_descriptor_data()); + messages.push(Match::generated_message_descriptor_data()); + let mut enums = ::std::vec::Vec::with_capacity(4); + enums.push(EncryptionAlgo::generated_enum_descriptor_data()); + 
enums.push(kv::Kind::generated_enum_descriptor_data()); + enums.push(manifest_change::Operation::generated_enum_descriptor_data()); + enums.push(checksum::Algorithm::generated_enum_descriptor_data()); + ::protobuf::reflect::GeneratedFileDescriptor::new_generated( + file_descriptor_proto(), + deps, + messages, + enums, + ) + }); + ::protobuf::reflect::FileDescriptor::new_generated_2(generated_file_descriptor) + }) +} diff --git a/src/y/mod.rs b/src/y/mod.rs index 120aba0..d9d09e7 100644 --- a/src/y/mod.rs +++ b/src/y/mod.rs @@ -2,21 +2,16 @@ mod codec; pub(crate) mod iterator; mod metrics; -use crate::Error::Unexpected; pub use codec::{Decode, Encode}; pub use iterator::ValueStruct; -use memmap::{Mmap, MmapMut}; -use std::backtrace; -use std::cmp::Ordering; +use memmap::MmapMut; use std::collections::hash_map::DefaultHasher; -use std::collections::HashMap; use std::error::Error as _; use std::fs::{File, OpenOptions}; use std::hash::Hasher; use std::io::{ErrorKind, Write}; use std::sync::{Arc, RwLock}; use std::{cmp, io}; -use libc::fsync; use thiserror::Error; /// Constants use in serialization sizes, and in ValueStruct serialization @@ -191,7 +186,7 @@ pub(crate) fn parallel_load_block_key(fp: File, offsets: Vec) -> Vec Date: Thu, 29 Dec 2022 01:11:14 +0800 Subject: [PATCH 08/77] :cat: --- src/manifest.rs | 150 ++++++++++++++++++++++++++++++++++++++++-------- src/y/mod.rs | 5 +- 2 files changed, 129 insertions(+), 26 deletions(-) diff --git a/src/manifest.rs b/src/manifest.rs index b9df165..e303266 100644 --- a/src/manifest.rs +++ b/src/manifest.rs @@ -1,7 +1,7 @@ // use crate::pb::badgerpb3::{ManifestChange, ManifestChangeSet, ManifestChange_Operation}; use crate::pb::badgerpb3::manifest_change::Operation; use crate::pb::badgerpb3::{ManifestChange, ManifestChangeSet}; -use crate::y::is_eof; +use crate::y::{is_eof, open_existing_synced_file}; use crate::Error::{BadMagic, Unexpected}; use crate::{Error, Result}; use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; @@ -13,6 +13,8 @@ use std::collections::{HashMap, HashSet}; use std::fs::{rename, File}; use std::io::{Cursor, Read, Write}; use std::path::Path; +use std::sync::atomic::AtomicU32; +use std::sync::Arc; // Manifest file const MANIFEST_FILENAME: &str = "MANIFEST"; @@ -28,14 +30,14 @@ const MAGIC_VERSION: u32 = 2; /// Contains information about LSM tree levels /// in the *MANIFEST* file. -#[derive(Default)] +#[derive(Default, Clone)] pub struct LevelManifest { tables: HashSet, // Set of table id's } /// *TableManifest* contains information about a specific level /// in the LSM tree. -#[derive(Default)] +#[derive(Default, Clone)] pub struct TableManifest { level: u8, } @@ -45,12 +47,106 @@ pub(crate) struct ManifestFile { fp: Option, directory: String, // We make this configurable so that unit tests can hit rewrite() code quickly - deletions_rewrite_threshold: usize, - // Guards appends, which includes access to the manifest field. - append_lock: RwLock<()>, + deletions_rewrite_threshold: AtomicU32, + // Access must be with a lock. // Used to track the current state of the manifest, used when rewriting. - manifest: Manifest, + manifest: Arc>, +} + +impl ManifestFile { + /// Write a batch of changes, atomically, to the file. By "atomically" that means when + /// we replay the *MANIFEST* file, we'll either replay all the changes or none of them. 
(The truth of + /// this depends on the filesystem) + pub fn add_changes(&mut self, changes: Vec) -> Result<()> { + let mut mf_changes = ManifestChangeSet::new(); + mf_changes.changes.extend(changes); + let mf_buffer = mf_changes.write_to_bytes().unwrap(); + // Maybe we could user O_APPEND instead (on certain file systems) + apply_manifest_change_set(self.manifest.clone(), &mf_changes)?; + // Rewrite manifest if it'd shrink by 1/10 and it's big enough to care + let rewrite = { + let mf_lck = self.manifest.read(); + mf_lck.deletions + > self + .deletions_rewrite_threshold + .load(atomic::Ordering::Relaxed) as usize + && mf_lck.deletions + > MANIFEST_DELETIONS_RATIO * (mf_lck.creations - mf_lck.deletions) + }; + if rewrite { + self.rewrite()?; + } else { + let mut buffer = Cursor::new(vec![]); + buffer.write_u32::(mf_buffer.len() as u32)?; + let crc32 = crc32fast::hash(&mf_buffer); + buffer.write_u32::(crc32)?; + buffer.write_all(&mf_buffer)?; + self.fp.as_mut().unwrap().write_all(&buffer.into_inner())?; + } + self.fp.as_mut().unwrap().sync_all()?; + Ok(()) + } + + /// Must be called while appendLock is held. + pub fn rewrite(&mut self) -> Result<()> { + { + self.fp.take(); + } + let (fp, n) = self.help_rewrite(&self.directory, self.manifest.clone())?; + self.fp = Some(fp); + let mut m_lck = self.manifest.write(); + m_lck.creations = n; + m_lck.deletions = 0; + Ok(()) + } + + fn help_rewrite(&self, dir: &str, m: Arc>) -> Result<(File, usize)> { + let rewrite_path = Path::new(dir).join(MANIFEST_REWRITE_FILENAME); + // We explicitly sync. + let mut fp = File::options() + .create(true) + .write(true) + .truncate(true) + .read(true) + .open(&rewrite_path)?; + let mut wt = Cursor::new(vec![]); + wt.write_all(MAGIC_TEXT)?; + wt.write_u32::(MAGIC_VERSION)?; + + let m_lck = m.read(); + let net_creations = m_lck.tables.len(); + let mut mf_set = ManifestChangeSet::new(); + mf_set.changes = m_lck.as_changes(); + let mf_buffer = mf_set.write_to_bytes().unwrap(); + wt.write_u32::(mf_buffer.len() as u32)?; + let crc32 = crc32fast::hash(&*mf_buffer); + wt.write_u32::(crc32)?; + wt.write_all(&*mf_buffer)?; + fp.write_all(&*wt.into_inner())?; + fp.sync_all()?; + drop(fp); + + let manifest_path = Path::new(dir).join(MANIFEST_FILENAME); + rename(&rewrite_path, &manifest_path)?; + // TODO add directory sync + + let fp = File::options() + .create(true) + .write(true) + .truncate(true) + .read(true) + .open(manifest_path)?; + Ok((fp, net_creations)) + } + + fn open_or_create_manifest_file(dir: &str, deletions_threshold: u32) -> Result { + let path = Path::new(dir).join(MANIFEST_FILENAME); + // We explicitly sync in add_changes, outside the lock. + let fp = open_existing_synced_file(path.to_str().unwrap(), false)?; + + todo! + } } /// Manifest represents the contents of the MANIFEST file in a Badger store. @@ -61,7 +157,7 @@ pub(crate) struct ManifestFile { /// It consists of a sequence of ManifestChangeSet objects. Each of these is treated atomically, /// and contains a sequence of ManifestChange's (file creations/deletions) which we use to /// reconstruct the manifest at startup. 
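// Editor's note: a minimal sketch of the on-disk record framing that
// add_changes() above writes and the replay path below reads back. This is a
// hedged illustration, not part of the patch; the helper name frame_record is
// invented. After the 8-byte header (MAGIC_TEXT + MAGIC_VERSION), the MANIFEST
// is a sequence of length-prefixed, checksummed records:
// [payload len: u32 BE][crc32 of payload: u32 BE][payload bytes],
// where the payload is a serialized ManifestChangeSet.
fn frame_record(payload: &[u8]) -> std::io::Result<Vec<u8>> {
    use byteorder::{BigEndian, WriteBytesExt};
    use std::io::Write;
    let mut buf = Vec::with_capacity(8 + payload.len());
    buf.write_u32::<BigEndian>(payload.len() as u32)?; // length first
    buf.write_u32::<BigEndian>(crc32fast::hash(payload))?; // then the checksum
    buf.write_all(payload)?; // then the serialized change set itself
    Ok(buf)
}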
-#[derive(Default)] +#[derive(Default, Clone)] pub struct Manifest { levels: Vec, tables: HashMap, @@ -98,26 +194,35 @@ impl Manifest { return Err(BadMagic); } - let mut build = Manifest::new(); + let build = Arc::new(RwLock::new(Manifest::new())); let mut offset = 8; loop { - let sz = fp.read_u32::()?; - let crc32 = fp.read_u32::()?; + let sz = fp.read_u32::(); + if is_eof(&sz) { + break; + } + let sz = sz?; + let crc32 = fp.read_u32::(); + if is_eof(&crc32) { + break; + } + let crc32 = crc32?; offset += 8; let mut buffer = vec![0u8; sz as usize]; offset += fp.read(&mut buffer)?; if crc32 != crc32fast::hash(&buffer) { - // TODO why break; } - let mut mf_set = ManifestChangeSet::parse_from_bytes(&buffer).map_err(|_| BadMagic)?; - apply_manifest_change_set(&mut build, &mf_set)?; + let mf_set = ManifestChangeSet::parse_from_bytes(&buffer).map_err(|_| BadMagic)?; + apply_manifest_change_set(build.clone(), &mf_set)?; } + let build = build.write().clone(); + // so, return the lasted ManifestFile Ok((build, offset)) } - pub fn rewrite(&self, dir: &str) -> Result<(File, usize)> { + fn help_rewrite(&self, dir: &str) -> Result<(File, usize)> { let rewrite_path = Path::new(dir).join(MANIFEST_REWRITE_FILENAME); // We explicitly sync. let mut fp = File::options() @@ -170,15 +275,19 @@ impl Manifest { // this is not a "recoverable" error -- opening the KV store fails because the MANIFEST file // is just plain broken. -fn apply_manifest_change_set(build: &mut Manifest, mf_set: &ManifestChangeSet) -> Result<()> { +fn apply_manifest_change_set( + build: Arc>, + mf_set: &ManifestChangeSet, +) -> Result<()> { for change in mf_set.changes.iter() { - apply_manifest_change(build, change)?; + apply_manifest_change(build.clone(), change)?; } Ok(()) } -fn apply_manifest_change(build: &mut Manifest, tc: &ManifestChange) -> Result<()> { +fn apply_manifest_change(build: Arc>, tc: &ManifestChange) -> Result<()> { let op = Operation::from_i32(tc.Op.value()).unwrap(); + let mut build = build.write(); match op { Operation::CREATE => { if build.tables.contains_key(&tc.Id) { @@ -214,11 +323,6 @@ fn apply_manifest_change(build: &mut Manifest, tc: &ManifestChange) -> Result<() .remove(&tc.Id); assert!(has); } - _ => { - return Err(Unexpected( - "MANIFEST file has invalid manifest_change op".into(), - )) - } } Ok(()) diff --git a/src/y/mod.rs b/src/y/mod.rs index d9d09e7..270fee3 100644 --- a/src/y/mod.rs +++ b/src/y/mod.rs @@ -10,7 +10,6 @@ use std::error::Error as _; use std::fs::{File, OpenOptions}; use std::hash::Hasher; use std::io::{ErrorKind, Write}; -use std::sync::{Arc, RwLock}; use std::{cmp, io}; use thiserror::Error; @@ -207,13 +206,13 @@ pub(crate) fn slice_cmp_gte(a: &[u8], b: &[u8]) -> cmp::Ordering { } } -const datasyncFileFlag: libc::c_int = 0x0; +const DATA_SYNC_FILE_FLAG: libc::c_int = 0x0; pub(crate) fn open_existing_synced_file(file_name: &str, synced: bool) -> Result { use std::os::unix::fs::OpenOptionsExt; let mut flags = libc::O_RDWR; if synced { - flags |= datasyncFileFlag; + flags |= DATA_SYNC_FILE_FLAG; } File::options() .mode(0) From c762b1c32ba935f3241c0865d3c229e03be7c0bf Mon Sep 17 00:00:00 2001 From: Rg Date: Thu, 29 Dec 2022 18:51:49 +0800 Subject: [PATCH 09/77] :coffee: --- src/compaction.rs | 108 ++++++++++++++++++++++++++++++++++++++++++- src/level_handler.rs | 78 +++++++++++++++++++++++++------ src/levels.rs | 51 ++++++++++++++++---- src/manifest.rs | 50 ++++++++++++++------ src/table/table.rs | 24 +++++++++- src/types.rs | 34 +++++++++++++- src/y/mod.rs | 65 +++++++++++++++++++++----- 7 
files changed, 360 insertions(+), 50 deletions(-) diff --git a/src/compaction.rs b/src/compaction.rs index 1fcc52d..94280c1 100644 --- a/src/compaction.rs +++ b/src/compaction.rs @@ -1,3 +1,107 @@ -pub(crate) struct CompactStatus{ +use crate::table::table::TableCore; +use parking_lot::Mutex; +use std::fmt::{Display, Formatter}; +use std::sync::Arc; -} \ No newline at end of file +pub(crate) struct CompactStatus { + levels: Arc>>, +} + +impl CompactStatus { + fn to_log(&self) { + todo!() + } + + // fn compare_and_add(&self, cd:) + + fn overlaps_with(&self, level: usize, this: &KeyRange) -> bool { + let compact_status = self.levels.lock(); + compact_status[level].overlaps_with(this) + } + + fn del_size(&self, level: usize) -> i64 { + let compact_status = self.levels.lock(); + compact_status[level].del_size + } +} + +#[derive(Debug)] +pub(crate) struct LevelCompactStatus { + ranges: Vec, + del_size: i64, +} + +impl LevelCompactStatus { + fn overlaps_with(&self, dst: &KeyRange) -> bool { + self.ranges.iter().any(|r| r.overlaps_with(dst)) + } + + fn remove(&mut self, dst: &KeyRange) -> bool { + let len = self.ranges.len(); + self.ranges.retain(|r| r.equals(dst)); + len > self.ranges.len() + } +} + +#[derive(Clone, Debug)] +pub(crate) struct KeyRange { + left: Vec, // TODO zero Copy + right: Vec, + inf: bool, +} + +impl Display for KeyRange { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "[left={:?}, right={:?}, inf={}]", + self.left, self.right, self.inf + ) + } +} + +const INFO_RANGE: KeyRange = KeyRange { + left: vec![], + right: vec![], + inf: false, +}; + +impl KeyRange { + fn get_range(tables: &Vec) -> KeyRange { + assert!(!tables.is_empty()); + let mut smallest = tables[0].smallest(); + let mut biggest = tables[0].biggest(); + for i in 1..tables.len() { + if tables[i].smallest() < smallest { + smallest = tables[i].smallest(); + } + if tables[i].biggest() > biggest { + biggest = tables[i].biggest(); + } + } + + KeyRange { + left: smallest.to_vec(), + right: biggest.to_vec(), + inf: false, + } + } + + fn equals(&self, other: &KeyRange) -> bool { + self.left == other.left && self.right == self.right && self.inf == self.inf + } + + fn overlaps_with(&self, other: &KeyRange) -> bool { + if self.inf || other.inf { + return true; + } + + if self.left > other.right { + return false; + } + if self.right < other.left { + return false; + } + true + } +} diff --git a/src/level_handler.rs b/src/level_handler.rs index 55ff906..7d97ac7 100644 --- a/src/level_handler.rs +++ b/src/level_handler.rs @@ -1,25 +1,77 @@ -use std::sync::Arc; -use std::sync::atomic::AtomicI32; -use parking_lot::RwLock; use crate::kv::WeakKV; -use crate::table::builder; use crate::table::table::TableCore; +use crate::types::{XArc, XWeak}; +use crate::Result; +use parking_lot::lock_api::{RwLockReadGuard, RwLockWriteGuard}; +use parking_lot::{RawRwLock, RwLock}; +use std::collections::HashSet; +use std::sync::atomic::{AtomicI32, AtomicI64, AtomicU64, Ordering}; +use std::sync::{Arc, Weak}; -pub(crate) struct LevelHandler { +pub(crate) type LevelHandler = XArc; +pub(crate) type WeakLevelHandler = XWeak; + +pub(crate) struct LevelHandlerInner { // Guards tables, total_size. - tables: Arc, i64)>>, + // For level >= 1, *tables* are sorted by key ranges, which do not overlap. + // For level 0, *tables* are sorted by time. + // For level 0, *newest* table are at the back. Compact the oldest one first, which is at the front. 
+ tables: Arc>>, + total_size: AtomicU64, // The following are initialized once and const. - level: Arc, + level: AtomicI32, str_level: Arc, max_total_size: Arc, kv: WeakKV, } -impl LevelHandler { +impl LevelHandlerInner { fn init_tables(&self, tables: Vec) { - // let total_size = tables.iter().fold(0, |acc, &table| acc + table.size()); - // let mut tb = self.tables.write(); - // tb.0 = tables; - // tb.1 = total_size as i64; + let total_size = tables.iter().fold(0, |acc, table| acc + table.size()); + self.total_size.store(total_size as u64, Ordering::Relaxed); + let mut tb_wl = self.tables_wl(); + (*tb_wl) = tables; + if self.level.load(Ordering::Relaxed) == 0 { + // key range will overlap. Just sort by file_id in ascending order + // because newer tables are at the end of level 0. + tb_wl.sort_by_key(|tb| tb.id()); + } else { + // Sort tables by keys. + // TODO avoid copy + tb_wl.sort_by_key(|tb| tb.smallest().to_vec()); + } + } + + // TODO add deference table deleted + fn delete_tables(&self, to_del: Vec) { + let to_del = to_del.iter().map(|id| *id).collect::>(); + let mut tb_wl = self.tables_wl(); + tb_wl.retain(|tb| !to_del.contains(&tb.id())); } -} \ No newline at end of file + + // Replace tables[left:right] with new_tables, Note this EXCLUDES tables[right]. + // You must be call decr() to delete the old tables _after_ writing the update to the manifest. + fn replace_tables(&self, new_tables: Vec) -> Result<()> { + // Need to re-search the range of tables in this level to be replaced as other goroutines might + // be changing it as well. (They can't touch our tables, but if they add/remove other tables, + // the indices get shifted around.) + if new_tables.is_empty() { + return Ok(()); + } + + Ok(()) + } + + fn overlapping_tables(&self) {} + + fn get_total_siz(&self) -> u64 { + self.total_size.load(Ordering::Relaxed) + } + + fn tables_wl(&self) -> RwLockWriteGuard<'_, RawRwLock, Vec> { + self.tables.write() + } + fn tables_rd(&self) -> RwLockReadGuard<'_, RawRwLock, Vec> { + self.tables.read() + } +} diff --git a/src/levels.rs b/src/levels.rs index f4541db..70ffeb3 100644 --- a/src/levels.rs +++ b/src/levels.rs @@ -1,13 +1,21 @@ -use std::sync::Arc; -use std::sync::atomic::AtomicI64; -use awaitgroup::WaitGroup; use crate::compaction::CompactStatus; -use crate::kv::{KV, WeakKV}; -use crate::level_handler::LevelHandler; +use crate::kv::{WeakKV, KV}; +use crate::level_handler::LevelHandlerInner; +use crate::manifest::Manifest; +use crate::table::table::new_file_name; +use crate::Error::Unexpected; +use crate::Result; +use awaitgroup::WaitGroup; +use log::{error, info}; +use std::collections::HashSet; +use std::fs::remove_file; +use std::path::Path; +use std::sync::atomic::AtomicI64; +use std::sync::Arc; pub(crate) struct LevelsController { // The following are initialized once and const - levels: Arc>, + levels: Arc>, kv: *const KV, // Atomic next_file_id: AtomicI64, @@ -24,7 +32,7 @@ impl Default for LevelsController { struct LevelsControllerInner { // The following are initialized once and const - levels: Arc>, + levels: Arc>, kv: WeakKV, // Atomic next_file_id: AtomicI64, @@ -37,7 +45,32 @@ impl LevelsControllerInner { // Returns true if level zero may be compacted, without accounting for compactions that already // might be happening. fn is_level0_compact_table(&self) -> bool { - // self.levels[0] + // self.levels[0] todo!() } -} \ No newline at end of file + + // Checks that all necessary table files exist and removes all table files not + // referenced by the manifest. 
*ids* is a set of table file id's that were read from the directory + // listing. + fn revert_to_manifest(dir: &str, mf: Manifest, ids: &HashSet) -> Result<()> { + // 1. check all files in manifest exists. + for (id, _) in &mf.tables { + if !ids.contains(id) { + return Err(format!("file does not exist for table {}", id).into()); + } + } + // 2. delete files that shouldn't exist + for id in ids { + if !mf.tables.contains_key(id) { + info!("Table file {} not referenced in MANIFEST", id); + let file_name = new_file_name(*id, dir.clone().parse().unwrap()); + if let Err(err) = remove_file(file_name) { + error!("While removing table {}, err: {:?}", id, err); + } + } + } + Ok(()) + } +} + +pub(crate) struct CompactDef {} diff --git a/src/manifest.rs b/src/manifest.rs index e303266..01e132f 100644 --- a/src/manifest.rs +++ b/src/manifest.rs @@ -3,15 +3,14 @@ use crate::pb::badgerpb3::manifest_change::Operation; use crate::pb::badgerpb3::{ManifestChange, ManifestChangeSet}; use crate::y::{is_eof, open_existing_synced_file}; use crate::Error::{BadMagic, Unexpected}; -use crate::{Error, Result}; +use crate::Result; use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; -use bytes::buf::Reader; -use libc::bind; +use log::info; use parking_lot::RwLock; use protobuf::{Enum, EnumOrUnknown, Message}; use std::collections::{HashMap, HashSet}; use std::fs::{rename, File}; -use std::io::{Cursor, Read, Write}; +use std::io::{Cursor, Read, Seek, SeekFrom, Write}; use std::path::Path; use std::sync::atomic::AtomicU32; use std::sync::Arc; @@ -93,7 +92,7 @@ impl ManifestFile { { self.fp.take(); } - let (fp, n) = self.help_rewrite(&self.directory, self.manifest.clone())?; + let (fp, n) = Self::help_rewrite(&self.directory, self.manifest.clone())?; self.fp = Some(fp); let mut m_lck = self.manifest.write(); m_lck.creations = n; @@ -101,7 +100,7 @@ impl ManifestFile { Ok(()) } - fn help_rewrite(&self, dir: &str, m: Arc>) -> Result<(File, usize)> { + fn help_rewrite(dir: &str, m: Arc>) -> Result<(File, usize)> { let rewrite_path = Path::new(dir).join(MANIFEST_REWRITE_FILENAME); // We explicitly sync. let mut fp = File::options() @@ -140,12 +139,37 @@ impl ManifestFile { Ok((fp, net_creations)) } - fn open_or_create_manifest_file(dir: &str, deletions_threshold: u32) -> Result { + fn open_or_create_manifest_file(dir: &str, deletions_threshold: u32) -> Result { let path = Path::new(dir).join(MANIFEST_FILENAME); // We explicitly sync in add_changes, outside the lock. - let fp = open_existing_synced_file(path.to_str().unwrap(), false)?; - - todo! 
+ let fp = open_existing_synced_file(path.to_str().unwrap(), false); + return match fp { + Ok(mut fp) => { + let (manifest, trunc_offset) = Manifest::replay_manifest_file(&mut fp)?; + fp.set_len(trunc_offset as u64)?; + fp.seek(SeekFrom::End(0))?; + info!("recover a new manifest, offset: {}", trunc_offset); + Ok(ManifestFile { + fp: Some(fp), + directory: dir.to_string(), + deletions_rewrite_threshold: AtomicU32::new(deletions_threshold), + manifest: Arc::new(RwLock::new(manifest)), + }) + } + Err(err) if err.is_io_notfound() => { + let mf = Arc::new(RwLock::new(Manifest::new())); + let (fp, n) = Self::help_rewrite(dir, mf.clone())?; + assert_eq!(n, 0); + info!("create a new manifest"); + Ok(ManifestFile { + fp: Some(fp), + directory: dir.to_string(), + deletions_rewrite_threshold: AtomicU32::new(deletions_threshold), + manifest: mf, + }) + } + Err(err) => Err(err), + }; } } @@ -160,7 +184,7 @@ impl ManifestFile { #[derive(Default, Clone)] pub struct Manifest { levels: Vec, - tables: HashMap, + pub(crate) tables: HashMap, // Contains total number of creation and deletion changes in the manifest --- used to compute // whether it'd be useful to rewrite the manifest creations: usize, @@ -207,14 +231,14 @@ impl Manifest { break; } let crc32 = crc32?; - offset += 8; let mut buffer = vec![0u8; sz as usize]; - offset += fp.read(&mut buffer)?; + assert_eq!(sz as usize, fp.read(&mut buffer)?); if crc32 != crc32fast::hash(&buffer) { break; } let mf_set = ManifestChangeSet::parse_from_bytes(&buffer).map_err(|_| BadMagic)?; apply_manifest_change_set(build.clone(), &mf_set)?; + offset = offset + 8 + sz as usize; } let build = build.write().clone(); diff --git a/src/table/table.rs b/src/table/table.rs index 9f7e9ac..7b17269 100644 --- a/src/table/table.rs +++ b/src/table/table.rs @@ -7,8 +7,9 @@ use byteorder::{BigEndian, ReadBytesExt}; use filename::file_name; use growable_bloom_filter::GrowableBloom; use memmap::{Mmap, MmapMut}; +use std::collections::HashSet; use std::fmt::{Display, Formatter}; -use std::fs::{remove_file, File}; +use std::fs::{read_dir, remove_file, File}; use std::io::{Cursor, Seek, SeekFrom}; use std::path::Path; use std::sync::atomic::{AtomicI32, Ordering}; @@ -22,6 +23,7 @@ use crate::y::iterator::Xiterator; use serde_json::to_vec; #[cfg(target_os = "windows")] use std::os::windows::fs::FileExt; +use std::process::id; use std::str::pattern::Pattern; pub(crate) const FILE_SUFFIX: &str = ".sst"; @@ -330,6 +332,26 @@ pub(crate) struct Block { type ByKey = Vec; +pub fn get_id_map(dir: &str) -> HashSet { + let dir = read_dir(dir).unwrap(); + let mut ids = HashSet::new(); + for el in dir { + if el.is_err() { + continue; + } + let dir_el = el.unwrap(); + if dir_el.metadata().unwrap().is_dir() { + continue; + } + let fid = parse_file_id(dir_el.file_name().to_str().unwrap()); + if fid.is_err() { + continue; + } + ids.insert(fid.unwrap()); + } + ids +} + pub fn parse_file_id(name: &str) -> Result { use std::str::pattern::Pattern; let path = Path::new(name); diff --git a/src/types.rs b/src/types.rs index d140cc6..523bb4b 100644 --- a/src/types.rs +++ b/src/types.rs @@ -1,6 +1,7 @@ use parking_lot::*; +use std::fmt::Debug; use std::sync::atomic::{AtomicIsize, AtomicUsize, Ordering}; -use std::sync::Arc; +use std::sync::{Arc, Weak}; use std::time::Duration; use async_channel::{bounded, Receiver, RecvError, SendError, Sender, TryRecvError, TrySendError}; @@ -111,6 +112,37 @@ impl Closer { } } +#[derive(Debug, Clone)] +pub struct XWeak { + x: Weak, +} + +pub struct XArc { + x: Arc, +} + +impl 
XArc { + fn new(x: T) -> XArc { + XArc { x: Arc::new(x) } + } +} + +impl XWeak { + pub fn new() -> Self { + Self { x: Weak::new() } + } + + pub fn upgrade(&self) -> Option> { + self.x.upgrade().map(|x| XArc { x }) + } + + pub fn from(xarc: &XArc) -> Self { + Self { + x: Arc::downgrade(&xarc.x), + } + } +} + #[test] fn it_closer() { let mut runtime = tokio::runtime::Runtime::new().unwrap(); diff --git a/src/y/mod.rs b/src/y/mod.rs index 270fee3..3a6694a 100644 --- a/src/y/mod.rs +++ b/src/y/mod.rs @@ -4,10 +4,11 @@ mod metrics; pub use codec::{Decode, Encode}; pub use iterator::ValueStruct; +use libc::{O_DSYNC, O_WRONLY}; use memmap::MmapMut; use std::collections::hash_map::DefaultHasher; use std::error::Error as _; -use std::fs::{File, OpenOptions}; +use std::fs::{File, OpenOptions, Permissions}; use std::hash::Hasher; use std::io::{ErrorKind, Write}; use std::{cmp, io}; @@ -83,7 +84,22 @@ impl Default for Error { impl Error { pub fn is_io_eof(&self) -> bool { match self { - Error::StdIO(err) if err.kind() == io::ErrorKind::UnexpectedEof => true, + Error::StdIO(err) if err.kind() == ErrorKind::UnexpectedEof => true, + _ => false, + } + } + + pub fn is_io_existing(&self) -> bool { + match self { + Error::StdIO(err) if err.kind() == ErrorKind::AlreadyExists => true, + _ => false, + } + } + + + pub fn is_io_notfound(&self) -> bool { + match self { + Error::StdIO(err) if err.kind() == ErrorKind::NotFound => true, _ => false, } } @@ -115,6 +131,16 @@ pub fn is_eof(ret: &io::Result) -> bool { } } +pub fn is_existing(ret: &io::Result) -> bool { + if ret.is_ok() { + return false; + } + match ret { + Err(err) if err.kind() == ErrorKind::AlreadyExists => true, + _ => false, + } +} + pub fn hash(buffer: &[u8]) -> u64 { let mut hasher = DefaultHasher::default(); hasher.write(buffer); @@ -206,19 +232,22 @@ pub(crate) fn slice_cmp_gte(a: &[u8], b: &[u8]) -> cmp::Ordering { } } -const DATA_SYNC_FILE_FLAG: libc::c_int = 0x0; - pub(crate) fn open_existing_synced_file(file_name: &str, synced: bool) -> Result { use std::os::unix::fs::OpenOptionsExt; - let mut flags = libc::O_RDWR; if synced { - flags |= DATA_SYNC_FILE_FLAG; + File::options() + .write(true) + .read(true) + .custom_flags(O_DSYNC) + .open(file_name) + .map_err(|err| err.into()) + } else { + File::options() + .write(true) + .read(true) + .open(file_name) + .map_err(|err| err.into()) } - File::options() - .mode(0) - .custom_flags(flags) - .open(file_name) - .map_err(|err| err.into()) } pub(crate) fn create_synced_file(file_name: &str, synce: bool) -> Result { @@ -252,3 +281,17 @@ fn sync_dir() { let ok = sync_directory(&"/tmp".to_string()); println!("{:?}", ok); } + +#[test] +fn dsync() { + use std::fs::OpenOptions; + use std::os::unix::fs::OpenOptionsExt; + + let mut options = OpenOptions::new(); + options.write(true); + // if cfg!(unix) { + options.custom_flags(libc::O_WRONLY); + // } + let file = options.open("foo.txt"); + println!("{:?}", file.err()); +} From 28e0dfb93b69e4252cdf1c77305417621226e784 Mon Sep 17 00:00:00 2001 From: Rg Date: Tue, 3 Jan 2023 02:30:38 +0800 Subject: [PATCH 10/77] :dog: --- src/compaction.rs | 17 ++-- src/kv.rs | 22 ++--- src/level_handler.rs | 185 +++++++++++++++++++++++++++++++++++++---- src/levels.rs | 194 +++++++++++++++++++++++++++++++++++++++++-- src/lib.rs | 4 + src/table/table.rs | 30 ++++++- src/types.rs | 29 ++++++- 7 files changed, 427 insertions(+), 54 deletions(-) diff --git a/src/compaction.rs b/src/compaction.rs index 94280c1..13f77d4 100644 --- a/src/compaction.rs +++ b/src/compaction.rs @@ -3,6 
+3,7 @@ use parking_lot::Mutex; use std::fmt::{Display, Formatter}; use std::sync::Arc; +#[derive(Clone)] pub(crate) struct CompactStatus { levels: Arc>>, } @@ -14,12 +15,12 @@ impl CompactStatus { // fn compare_and_add(&self, cd:) - fn overlaps_with(&self, level: usize, this: &KeyRange) -> bool { + pub(crate) fn overlaps_with(&self, level: usize, this: &KeyRange) -> bool { let compact_status = self.levels.lock(); compact_status[level].overlaps_with(this) } - fn del_size(&self, level: usize) -> i64 { + pub(crate) fn del_size(&self, level: usize) -> u64 { let compact_status = self.levels.lock(); compact_status[level].del_size } @@ -28,7 +29,7 @@ impl CompactStatus { #[derive(Debug)] pub(crate) struct LevelCompactStatus { ranges: Vec, - del_size: i64, + del_size: u64, } impl LevelCompactStatus { @@ -45,9 +46,9 @@ impl LevelCompactStatus { #[derive(Clone, Debug)] pub(crate) struct KeyRange { - left: Vec, // TODO zero Copy - right: Vec, - inf: bool, + pub(crate) left: Vec, // TODO zero Copy + pub(crate) right: Vec, + pub(crate) inf: bool, } impl Display for KeyRange { @@ -60,10 +61,10 @@ impl Display for KeyRange { } } -const INFO_RANGE: KeyRange = KeyRange { +pub(crate) const INFO_RANGE: KeyRange = KeyRange { left: vec![], right: vec![], - inf: false, + inf: true, }; impl KeyRange { diff --git a/src/kv.rs b/src/kv.rs index 81c3c8f..fa9aa8c 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -1,8 +1,8 @@ -use crate::manifest::{Manifest, open_or_create_manifest_file}; +use crate::manifest::{open_or_create_manifest_file, Manifest}; use crate::options::Options; use crate::table::builder::Builder; use crate::table::iterator::IteratorImpl; -use crate::types::{Channel, Closer}; +use crate::types::{Channel, Closer, XArc, XWeak}; use crate::value_log::{Request, ValueLogCore, ValuePointer}; use crate::y::{Encode, Result, ValueStruct}; use crate::{Error, Node, SkipList}; @@ -33,8 +33,6 @@ struct FlushTask { vptr: ValuePointer, } -pub struct ArcKV(KV); - pub struct KV { pub opt: Options, pub vlog: Option, @@ -250,17 +248,13 @@ impl KV { } } -pub struct WeakKV(Weak); +pub type WeakKV = XWeak; -impl WeakKV { - pub(crate) fn new() -> Self { Self(Weak::new()) } - pub(crate) fn upgrade(&self) -> Option { - // self.0.upgrade().map() - todo!() - } - pub(crate) fn from(kv: &ArcKV) -> Self { - // Self(Arc::downgrade(&kv.0)) - todo!() +pub type ArcKV = XArc; + +impl Clone for WeakKV { + fn clone(&self) -> Self { + XWeak { x: self.x.clone() } } } diff --git a/src/level_handler.rs b/src/level_handler.rs index 7d97ac7..0f2712c 100644 --- a/src/level_handler.rs +++ b/src/level_handler.rs @@ -1,32 +1,81 @@ +use crate::compaction::KeyRange; use crate::kv::WeakKV; -use crate::table::table::TableCore; +use crate::table::iterator::{BlockIteratorItem, IteratorImpl, IteratorItem}; +use crate::table::table::{Table, TableCore}; use crate::types::{XArc, XWeak}; +use crate::y::iterator::Xiterator; +use crate::y::ValueStruct; use crate::Result; +use core::slice::SlicePattern; +use libc::truncate; use parking_lot::lock_api::{RwLockReadGuard, RwLockWriteGuard}; use parking_lot::{RawRwLock, RwLock}; use std::collections::HashSet; +use std::mem; +use std::ops::Deref; use std::sync::atomic::{AtomicI32, AtomicI64, AtomicU64, Ordering}; use std::sync::{Arc, Weak}; + pub(crate) type LevelHandler = XArc; pub(crate) type WeakLevelHandler = XWeak; +impl LevelHandler { + pub(crate) fn close(&self) -> Result<()> { + self.x.close() + } + + pub(crate) fn num_tables(&self) -> usize { + self.x.num_tables() + } + + // Returns true if the non-zero level may 
be compacted. *del_size* provides the size of the tables + // which are currently being compacted so that we treat them as already having started being + // compacted (because they have been, yet their size is already counted in get_total_size). + pub(crate) fn is_compactable(&self, del_size: u64) -> bool { + self.get_total_size() - del_size >= self.get_max_total_size() + } + + pub(crate) fn get_total_size(&self) -> u64 { + self.x.total_size.load(Ordering::Relaxed) + } + + pub(crate) fn get_max_total_size(&self) -> u64 { + self.x.max_total_size.load(Ordering::Relaxed) + } + + // TODO add deference table deleted + pub(crate) fn delete_tables(&self, to_del: Vec) { + let to_del = to_del.iter().map(|id| *id).collect::>(); + let mut tb_wl = self.x.tables_wl(); + tb_wl.retain_mut(|tb| { + if to_del.contains(&tb.x.id()) { + tb.decr_ref(); + return false; + } + true + }); + } +} + pub(crate) struct LevelHandlerInner { + // TODO this lock maybe global, not only for compacted + pub(crate) self_lock: Arc>, // Guards tables, total_size. // For level >= 1, *tables* are sorted by key ranges, which do not overlap. // For level 0, *tables* are sorted by time. // For level 0, *newest* table are at the back. Compact the oldest one first, which is at the front. - tables: Arc>>, - total_size: AtomicU64, + pub(crate) tables: Arc>>, + pub(crate) total_size: AtomicU64, // The following are initialized once and const. - level: AtomicI32, + pub(crate) level: AtomicI32, str_level: Arc, - max_total_size: Arc, + pub(crate) max_total_size: AtomicU64, kv: WeakKV, } impl LevelHandlerInner { - fn init_tables(&self, tables: Vec) { + fn init_tables(&self, tables: Vec) { let total_size = tables.iter().fold(0, |acc, table| acc + table.size()); self.total_size.store(total_size as u64, Ordering::Relaxed); let mut tb_wl = self.tables_wl(); @@ -34,7 +83,7 @@ impl LevelHandlerInner { if self.level.load(Ordering::Relaxed) == 0 { // key range will overlap. Just sort by file_id in ascending order // because newer tables are at the end of level 0. - tb_wl.sort_by_key(|tb| tb.id()); + tb_wl.sort_by_key(|tb| tb.x.id()); } else { // Sort tables by keys. // TODO avoid copy @@ -42,36 +91,136 @@ impl LevelHandlerInner { } } - // TODO add deference table deleted - fn delete_tables(&self, to_del: Vec) { - let to_del = to_del.iter().map(|id| *id).collect::>(); - let mut tb_wl = self.tables_wl(); - tb_wl.retain(|tb| !to_del.contains(&tb.id())); - } - // Replace tables[left:right] with new_tables, Note this EXCLUDES tables[right]. // You must be call decr() to delete the old tables _after_ writing the update to the manifest. - fn replace_tables(&self, new_tables: Vec) -> Result<()> { + fn replace_tables(&self, new_tables: Vec
) -> Result<()> { // Need to re-search the range of tables in this level to be replaced as other goroutines might // be changing it as well. (They can't touch our tables, but if they add/remove other tables, // the indices get shifted around.) if new_tables.is_empty() { return Ok(()); } + // Increase total_size first. + for tb in &new_tables { + self.total_size + .fetch_add(tb.size() as u64, Ordering::Relaxed); + // add table reference + tb.incr_ref(); + } + let key_range = KeyRange { + left: new_tables.first().unwrap().smallest().to_vec(), + right: new_tables.last().unwrap().biggest().to_vec(), + inf: false, + }; + + // TODO Opz code + { + let mut tables_lck = self.tables_wl(); + tables_lck.retain_mut(|tb| { + let left = tb.biggest() <= key_range.left.as_slice(); + let right = tb.smallest() > key_range.right.as_slice(); + if left || right { + return true; + } else { + // TODO it should be not a good idea decr reference here, slow lock + // decr table reference + tb.decr_ref(); + self.total_size + .fetch_sub(tb.size() as u64, Ordering::Relaxed); + false + } + }); + tables_lck.extend(new_tables); + // TODO avoid resort + tables_lck.sort_by(|a, b| a.smallest().cmp(b.smallest())); + } + Ok(()) + } + + // Return true if ok and no stalling. + fn try_add_level0_table(&self, t: Table) -> bool { + assert_eq!(self.level.load(Ordering::Relaxed), 0); + let tw = self.tables_wl(); + if tw.len() >= self.kv.upgrade().unwrap().x.opt.num_level_zero_tables_stall { + return false; + } + t.incr_ref(); + self.total_size + .fetch_add(t.size() as u64, Ordering::Relaxed); + self.tables.write().push(t); + true + } + fn num_tables(&self) -> usize { + self.tables_rd().len() + } + + // Must be call only once + fn close(&self) -> Result<()> { + let tw = self.tables_wl(); + tw.iter().for_each(|tb| tb.decr_ref()); Ok(()) } - fn overlapping_tables(&self) {} + // Acquires a read-lock to access s.tables. It return a list of table_handlers. + fn get_table_for_key(&self, key: &[u8]) -> Option { + return if self.level.load(Ordering::Relaxed) == 0 { + let tw = self.tables_rd(); + for tb in tw.iter().rev() { + tb.incr_ref(); + let it = IteratorImpl::new(tb.to_ref(), false); + let item = it.seek(key); + tb.decr_ref(); + if item.is_none() { + // todo add metrics + } + } + None + } else { + let tw = self.tables_rd(); + let ok = tw.binary_search_by(|tb| tb.biggest().cmp(key)); + if ok.is_err() { + // todo add metrics + return None; + } + let tb = tw.get(ok.unwrap()).unwrap(); + tb.incr_ref(); + let it = IteratorImpl::new(tb.to_ref(), false); + let item = it.seek(key); + tb.decr_ref(); + if item.is_none() { + // todo add metrics + } + item + }; + } + + // Returns the tables that intersect with key range. Returns a half-interval. + // This function should already have acquired a read lock, and this is so important the caller must + // pass an empty parameter declaring such. + // TODO Opz me + fn overlapping_tables(&self, key_range: &KeyRange) { + let left = self + .tables + .read() + .binary_search_by(|tb| key_range.left.as_slice().cmp(tb.biggest())); + let left_index = left.map_err(|n| n).unwrap(); + + let right = self + .tables + .read() + .binary_search_by(|tb| key_range.right.as_slice().cmp(tb.smallest())); + // let right_index = right + } fn get_total_siz(&self) -> u64 { self.total_size.load(Ordering::Relaxed) } - fn tables_wl(&self) -> RwLockWriteGuard<'_, RawRwLock, Vec> { + fn tables_wl(&self) -> RwLockWriteGuard<'_, RawRwLock, Vec
<Table>> {
        self.tables.write()
    }
    fn tables_rd(&self) -> RwLockReadGuard<'_, RawRwLock, Vec<Table>
> { self.tables.read() } } diff --git a/src/levels.rs b/src/levels.rs index 70ffeb3..b7eb094 100644 --- a/src/levels.rs +++ b/src/levels.rs @@ -1,35 +1,162 @@ -use crate::compaction::CompactStatus; +use crate::compaction::{CompactStatus, KeyRange, INFO_RANGE}; use crate::kv::{WeakKV, KV}; -use crate::level_handler::LevelHandlerInner; +use crate::level_handler::{LevelHandler, LevelHandlerInner, WeakLevelHandler}; use crate::manifest::Manifest; -use crate::table::table::new_file_name; +use crate::options::Options; +use crate::table::table::{new_file_name, Table, TableCore}; +use crate::types::{Closer, XArc, XWeak}; use crate::Error::Unexpected; use crate::Result; use awaitgroup::WaitGroup; use log::{error, info}; +use parking_lot::lock_api::RawRwLock; +use parking_lot::{RwLock, RwLockReadGuard}; +use std::cell::RefCell; use std::collections::HashSet; use std::fs::remove_file; +use std::ops::Deref; use std::path::Path; -use std::sync::atomic::AtomicI64; +use std::sync::atomic::{AtomicI64, AtomicU64}; use std::sync::Arc; +use std::time::Duration; +#[derive(Clone)] pub(crate) struct LevelsController { // The following are initialized once and const - levels: Arc>, - kv: *const KV, + levels: Arc>, + kv: WeakKV, // Atomic - next_file_id: AtomicI64, + next_file_id: Arc, // For ending compactions. - compact_worker_wg: WaitGroup, + compact_worker_wg: Arc, c_status: CompactStatus, } +unsafe impl Sync for LevelsController {} + +unsafe impl Send for LevelsController {} + impl Default for LevelsController { fn default() -> Self { todo!() } } +impl LevelsController { + fn cleanup_levels(&self) -> Result<()> { + for level in self.levels.iter() { + level.close()?; + } + Ok(()) + } + + fn start_compact(&self, lc: Closer) { + for i in 0..self.must_kv().opt.num_compactors { + lc.add_running(1); + let lc = lc.clone(); + let _self = self.clone(); + tokio::spawn(async move { + _self.run_worker(lc).await; + }); + } + } + async fn run_worker(&self, lc: Closer) { + if self.must_kv().opt.do_not_compact { + lc.done(); + return; + } + lc.done(); + // add random time + tokio::time::sleep(Duration::from_millis(1000)).await; + let mut interval = tokio::time::interval(Duration::from_secs(1)); + loop { + // why interval can life long + let done = lc.has_been_closed(); + tokio::select! { + _ = interval.tick() => { + let pick: Vec = self.pick_compact_levels(); + for p in pick.iter() { + + } + }, + _ = done.recv() => {return;} + } + } + } + // Picks some table on level l and compacts it away to the next level. + fn do_compact(&self, p: CompactionPriority) -> Result { + let level = p.level; + assert!(level + 1 < self.must_kv().opt.max_levels); // Sanity check. + let mut cd = CompactDef::default(); + cd.this_level = (self.levels[level]).clone(); + cd.next_level = (self.levels[level + 1]).clone(); + info!("Got compaction priority: {:?}", p); + // While picking tables to be compacted, both level's tables are expected to + // remain unchanged. + if level == 0 {} + Ok(true) + } + + fn fill_tables_l0<'a>( + &'a self, + cd: &'a CompactDef, + ) -> parking_lot::lock_api::RwLockReadGuard<'_, parking_lot::RawRwLock, Vec
<Table>> {
+        cd.lock_levels();
+        let top = cd.this_level.to_ref().tables.read();
+        cd.unlock_levels();
+        top
+    }
+
+    fn f<'a>(&'a self, cd: &'a mut CompactDef<'a>) {
+        let c = self.fill_tables_l0(cd);
+        let _ = cd._c.replace(c);
+    }
+
+    // Determines which level to compact.
+    // Based on: https://github.com/facebook/rocksdb/wiki/Leveled-Compaction.
+    fn pick_compact_levels(&self) -> Vec<CompactionPriority> {
+        // This function must use identical criteria for guaranteeing compaction's progress that
+        // add_level0_table uses.
+
+        let mut prios = vec![];
+        // c_status is checked to see if level 0's tables are already being compacted.
+        if !self.c_status.overlaps_with(0, &INFO_RANGE) && self.is_level0_compactable() {
+            prios.push(CompactionPriority {
+                level: 0,
+                score: (self.levels[0].num_tables() as f64)
+                    / (self.must_kv().opt.num_level_zero_tables as f64),
+            })
+        }
+
+        for (i, level) in self.levels[1..].iter().enumerate() {
+            // Don't consider those tables that are already being compacted right now.
+            let del_size = self.c_status.del_size(i + 1);
+            if level.is_compactable(del_size) {
+                prios.push(CompactionPriority {
+                    level: i + 1,
+                    score: (level.get_total_size() as f64 / level.get_max_total_size() as f64),
+                });
+            }
+        }
+        prios.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap());
+        prios
+    }
+
+    fn is_level0_compactable(&self) -> bool {
+        self.levels[0].num_tables() >= self.must_kv().opt.num_level_zero_tables
+    }
+
+    fn must_kv(&self) -> Arc<KV> {
+        self.kv.x.upgrade().unwrap()
+    }
+}
+
+#[derive(Debug, Clone)]
+struct CompactionPriority {
+    level: usize,
+    score: f64,
+}
+
 struct LevelsControllerInner {
     // The following are initialized once and const
-    levels: Arc<Vec<LevelHandlerInner>>,
+    levels: Arc<Vec<LevelHandler>>,
     kv: WeakKV,
     // Atomic
     next_file_id: AtomicI64,
@@ -73,4 +200,53 @@ impl LevelsControllerInner {
     }
 }
 
-pub(crate) struct CompactDef {}
+pub(crate) struct CompactDef<'a> {
+    this_level: LevelHandler,
+    next_level: LevelHandler,
+    // top: RwLockReadGuard<'a, Vec<Table>
>, + bot: RwLock>, + _c: RefCell>>, + this_range: KeyRange, + next_range: KeyRange, + this_size: AtomicU64, +} + +impl<'a> Default for CompactDef<'a> { + fn default() -> Self { + // CompactDef { + // this_level: XWeak::new(), + // next_level: XWeak::new(), + // top: RwLockReadGuard::, + // bot: RwLock::new(vec![]), + // this_range: KeyRange { + // left: vec![], + // right: vec![], + // inf: false, + // }, + // next_range: KeyRange { + // left: vec![], + // right: vec![], + // inf: false, + // }, + // this_size: Default::default(), + // } + todo!() + } +} + +impl<'a> CompactDef<'a> { + fn lock_levels(&self) { + use parking_lot::lock_api::RawRwLock; + unsafe { + self.this_level.x.self_lock.raw().lock_shared(); + self.next_level.x.self_lock.raw().lock_shared(); + } + } + + fn unlock_levels(&self) { + unsafe { + self.next_level.x.self_lock.raw().unlock_shared(); + self.this_level.x.self_lock.raw().unlock_shared(); + } + } +} diff --git a/src/lib.rs b/src/lib.rs index 193eac9..f652a82 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -11,6 +11,10 @@ #![feature(fs_try_exists)] #![feature(generic_associated_types)] #![feature(unwrap_infallible)] +#![feature(slice_pattern)] +#![feature(slice_take)] + +extern crate core; use std::io; use std::mem::align_of; diff --git a/src/table/table.rs b/src/table/table.rs index 7b17269..0cfa2fa 100644 --- a/src/table/table.rs +++ b/src/table/table.rs @@ -19,6 +19,7 @@ use std::{fmt, io}; #[cfg(target_os = "macos")] use std::os::unix::fs::FileExt; +use crate::types::{XArc, XWeak}; use crate::y::iterator::Xiterator; use serde_json::to_vec; #[cfg(target_os = "windows")] @@ -44,7 +45,29 @@ impl fmt::Display for KeyOffset { } } -pub struct Table {} +pub type Table = XArc; +pub type WeakTable = XWeak; + +impl Table { + pub(crate) fn incr_ref(&self) { + self.x.incr_ref() + } + + pub(crate) fn decr_ref(&self) { + self.x.decr_ref() + } + + pub(crate) fn size(&self) -> usize { + self.x.size() + } + + pub(crate) fn biggest(&self) -> &[u8] { + &self.x.biggest + } + pub(crate) fn smallest(&self) -> &[u8] { + &self.x.smallest + } +} pub struct TableCore { _ref: AtomicI32, @@ -120,11 +143,11 @@ impl TableCore { } // increments the refcount (having to do with whether the file should be deleted) - fn incr_ref(&self) { + pub(crate) fn incr_ref(&self) { self._ref.fetch_add(1, Ordering::Relaxed); } // decrements the refcount and possibly deletes the table - fn decr_ref(&self) { + pub(crate) fn decr_ref(&self) { self._ref.fetch_sub(1, Ordering::Relaxed); } @@ -153,6 +176,7 @@ impl TableCore { self.read(off, sz).unwrap() } + // TODO maybe use &self fn read_index(&mut self) -> Result<()> { let mut read_pos = self.table_size; // Read bloom filter. 
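// Editor's note: a minimal usage sketch of the Table refcount protocol added
// above. The helper name read_with_pin is hypothetical; it mirrors the seek
// path in level_handler.rs. The point of incr_ref()/decr_ref() is to pin the
// table for the whole read so a concurrent compaction cannot delete the .sst
// file out from under the iterator.
fn read_with_pin(t: &Table, key: &[u8]) -> Option<IteratorItem> {
    t.incr_ref(); // pin: the backing file must outlive this read
    let it = IteratorImpl::new(t.to_ref(), false);
    let item = it.seek(key);
    t.decr_ref(); // unpin: the last decr_ref may delete the file
    item
}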
diff --git a/src/types.rs b/src/types.rs index 523bb4b..1c9aeaf 100644 --- a/src/types.rs +++ b/src/types.rs @@ -114,17 +114,28 @@ impl Closer { #[derive(Debug, Clone)] pub struct XWeak { - x: Weak, + pub(crate) x: Weak, } +#[derive(Debug)] pub struct XArc { - x: Arc, + pub(crate) x: Arc, +} + +impl Clone for XArc { + fn clone(&self) -> Self { + XArc { x: self.x.clone() } + } } impl XArc { fn new(x: T) -> XArc { XArc { x: Arc::new(x) } } + + pub fn to_ref(&self) -> &T { + self.x.as_ref() + } } impl XWeak { @@ -158,3 +169,17 @@ fn it_closer() { println!("Hello Word"); }); } + +#[test] +fn lck() { + // #![feature(slice_take)] + // use bytes::Buf; + // + // let x = Arc::new(RwLock::new(vec![0u8; 10])); + // let xr = x.read(); + // let mut v1 = xr.take(1).into_inner(); + // // let mut v2 = x.read().take(2).into_inner(); + // x.write()[0] = 10; + // + // println!("{:?}", v1); +} From 29beed0b614d73fd96f1eb5678e4db46411607da Mon Sep 17 00:00:00 2001 From: Rg Date: Tue, 3 Jan 2023 18:49:59 +0800 Subject: [PATCH 11/77] :coffee: --- Cargo.toml | 1 + src/compaction.rs | 14 +++- src/level_handler.rs | 148 +++++++++++++++++++++---------------------- src/levels.rs | 40 +++++++----- src/types.rs | 70 ++++++++++++++++---- 5 files changed, 165 insertions(+), 108 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 6627f17..cc6ae54 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -36,6 +36,7 @@ file-guard = "0.1.0" fs2 = "0.4.3" awaitgroup = "0.6.0" protobuf = { version = "3.2.0", features = ["with-bytes"] } +range-lock = "0.2.2" [dev-dependencies] chrono = "0.4.22" env_logger = "0.9.1" diff --git a/src/compaction.rs b/src/compaction.rs index 13f77d4..df073cc 100644 --- a/src/compaction.rs +++ b/src/compaction.rs @@ -1,7 +1,9 @@ -use crate::table::table::TableCore; +use crate::levels::CompactDef; +use crate::table::table::{Table, TableCore}; use parking_lot::Mutex; use std::fmt::{Display, Formatter}; use std::sync::Arc; +use std::sync::atomic::Ordering; #[derive(Clone)] pub(crate) struct CompactStatus { @@ -24,6 +26,14 @@ impl CompactStatus { let compact_status = self.levels.lock(); compact_status[level].del_size } + + // Check whether we can run this `CompactDef`. That it doesn't overlap with any + // other running compaction. If it can be run, it would store this run in the `compact_status` state. + pub(crate) fn compare_and_add(&self, cd: CompactDef) { + let compact_status = self.levels.lock(); + let level = cd.this_level.x.level.load(Ordering::Relaxed); + // assert!(level < compact_status.levels.len()) + } } #[derive(Debug)] @@ -68,7 +78,7 @@ pub(crate) const INFO_RANGE: KeyRange = KeyRange { }; impl KeyRange { - fn get_range(tables: &Vec) -> KeyRange { + pub fn get_range(tables: &Vec
) -> KeyRange { assert!(!tables.is_empty()); let mut smallest = tables[0].smallest(); let mut biggest = tables[0].biggest(); diff --git a/src/level_handler.rs b/src/level_handler.rs index 0f2712c..009509d 100644 --- a/src/level_handler.rs +++ b/src/level_handler.rs @@ -1,34 +1,22 @@ use crate::compaction::KeyRange; -use crate::kv::WeakKV; -use crate::table::iterator::{BlockIteratorItem, IteratorImpl, IteratorItem}; -use crate::table::table::{Table, TableCore}; +use crate::kv::{WeakKV, KV}; +use crate::table::iterator::{IteratorImpl, IteratorItem}; +use crate::table::table::Table; use crate::types::{XArc, XWeak}; use crate::y::iterator::Xiterator; -use crate::y::ValueStruct; use crate::Result; use core::slice::SlicePattern; -use libc::truncate; + use parking_lot::lock_api::{RwLockReadGuard, RwLockWriteGuard}; use parking_lot::{RawRwLock, RwLock}; use std::collections::HashSet; -use std::mem; -use std::ops::Deref; -use std::sync::atomic::{AtomicI32, AtomicI64, AtomicU64, Ordering}; -use std::sync::{Arc, Weak}; - +use std::sync::atomic::{AtomicI32, AtomicU64, Ordering}; +use std::sync::Arc; pub(crate) type LevelHandler = XArc; pub(crate) type WeakLevelHandler = XWeak; impl LevelHandler { - pub(crate) fn close(&self) -> Result<()> { - self.x.close() - } - - pub(crate) fn num_tables(&self) -> usize { - self.x.num_tables() - } - // Returns true if the non-zero level may be compacted. *del_size* provides the size of the tables // which are currently being compacted so that we treat them as already having started being // compacted (because they have been, yet their size is already counted in get_total_size). @@ -47,7 +35,7 @@ impl LevelHandler { // TODO add deference table deleted pub(crate) fn delete_tables(&self, to_del: Vec) { let to_del = to_del.iter().map(|id| *id).collect::>(); - let mut tb_wl = self.x.tables_wl(); + let mut tb_wl = self.tables_wl(); tb_wl.retain_mut(|tb| { if to_del.contains(&tb.x.id()) { tb.decr_ref(); @@ -56,31 +44,15 @@ impl LevelHandler { true }); } -} -pub(crate) struct LevelHandlerInner { - // TODO this lock maybe global, not only for compacted - pub(crate) self_lock: Arc>, - // Guards tables, total_size. - // For level >= 1, *tables* are sorted by key ranges, which do not overlap. - // For level 0, *tables* are sorted by time. - // For level 0, *newest* table are at the back. Compact the oldest one first, which is at the front. - pub(crate) tables: Arc>>, - pub(crate) total_size: AtomicU64, - // The following are initialized once and const. - pub(crate) level: AtomicI32, - str_level: Arc, - pub(crate) max_total_size: AtomicU64, - kv: WeakKV, -} - -impl LevelHandlerInner { - fn init_tables(&self, tables: Vec
<Table>) {
+    pub(crate) fn init_tables(&self, tables: Vec<Table>
) { let total_size = tables.iter().fold(0, |acc, table| acc + table.size()); - self.total_size.store(total_size as u64, Ordering::Relaxed); + self.x + .total_size + .store(total_size as u64, Ordering::Relaxed); let mut tb_wl = self.tables_wl(); (*tb_wl) = tables; - if self.level.load(Ordering::Relaxed) == 0 { + if self.x.level.load(Ordering::Relaxed) == 0 { // key range will overlap. Just sort by file_id in ascending order // because newer tables are at the end of level 0. tb_wl.sort_by_key(|tb| tb.x.id()); @@ -91,6 +63,32 @@ impl LevelHandlerInner { } } + fn tables_wl(&self) -> RwLockWriteGuard<'_, RawRwLock, Vec
<Table>> {
+        self.x.tables.write()
+    }
+    fn tables_rd(&self) -> RwLockReadGuard<'_, RawRwLock, Vec<Table>
> {
+        self.x.tables.read()
+    }
+
+    // Returns the tables that intersect with key range. Returns a half-interval.
+    // This function should already have acquired a read lock, and this is so important the caller must
+    // pass an empty parameter declaring such.
+    // TODO Opz me
+    pub(crate) fn overlapping_tables(&self, key_range: &KeyRange) -> (usize, usize) {
+        let left = self
+            .tables_rd()
+            .binary_search_by(|tb| key_range.left.as_slice().cmp(tb.biggest()));
+
+        let right = self
+            .tables_rd()
+            .binary_search_by(|tb| key_range.right.as_slice().cmp(tb.smallest()));
+        (left.unwrap(), right.unwrap())
+    }
+
+    pub(crate) fn get_total_siz(&self) -> u64 {
+        self.x.total_size.load(Ordering::Relaxed)
+    }
+
     // Replace tables[left:right] with new_tables, Note this EXCLUDES tables[right].
     // You must be call decr() to delete the old tables _after_ writing the update to the manifest.
     fn replace_tables(&self, new_tables: Vec<Table>
) -> Result<()> { @@ -102,7 +100,8 @@ impl LevelHandlerInner { } // Increase total_size first. for tb in &new_tables { - self.total_size + self.x + .total_size .fetch_add(tb.size() as u64, Ordering::Relaxed); // add table reference tb.incr_ref(); @@ -125,7 +124,8 @@ impl LevelHandlerInner { // TODO it should be not a good idea decr reference here, slow lock // decr table reference tb.decr_ref(); - self.total_size + self.x + .total_size .fetch_sub(tb.size() as u64, Ordering::Relaxed); false } @@ -138,33 +138,34 @@ impl LevelHandlerInner { } // Return true if ok and no stalling. - fn try_add_level0_table(&self, t: Table) -> bool { - assert_eq!(self.level.load(Ordering::Relaxed), 0); + pub(crate) fn try_add_level0_table(&self, t: Table) -> bool { + assert_eq!(self.x.level.load(Ordering::Relaxed), 0); let tw = self.tables_wl(); - if tw.len() >= self.kv.upgrade().unwrap().x.opt.num_level_zero_tables_stall { + if tw.len() >= self.kv().x.opt.num_level_zero_tables_stall { return false; } t.incr_ref(); - self.total_size + self.x + .total_size .fetch_add(t.size() as u64, Ordering::Relaxed); - self.tables.write().push(t); + self.tables_wl().push(t); true } - fn num_tables(&self) -> usize { + pub(crate) fn num_tables(&self) -> usize { self.tables_rd().len() } // Must be call only once - fn close(&self) -> Result<()> { + pub(crate) fn close(&self) -> Result<()> { let tw = self.tables_wl(); tw.iter().for_each(|tb| tb.decr_ref()); Ok(()) } // Acquires a read-lock to access s.tables. It return a list of table_handlers. - fn get_table_for_key(&self, key: &[u8]) -> Option { - return if self.level.load(Ordering::Relaxed) == 0 { + pub(crate) fn get_table_for_key(&self, key: &[u8]) -> Option { + return if self.x.level.load(Ordering::Relaxed) == 0 { let tw = self.tables_rd(); for tb in tw.iter().rev() { tb.incr_ref(); @@ -195,32 +196,25 @@ impl LevelHandlerInner { }; } - // Returns the tables that intersect with key range. Returns a half-interval. - // This function should already have acquired a read lock, and this is so important the caller must - // pass an empty parameter declaring such. - // TODO Opz me - fn overlapping_tables(&self, key_range: &KeyRange) { - let left = self - .tables - .read() - .binary_search_by(|tb| key_range.left.as_slice().cmp(tb.biggest())); - let left_index = left.map_err(|n| n).unwrap(); - - let right = self - .tables - .read() - .binary_search_by(|tb| key_range.right.as_slice().cmp(tb.smallest())); - // let right_index = right - } - - fn get_total_siz(&self) -> u64 { - self.total_size.load(Ordering::Relaxed) + fn kv(&self) -> XArc { + self.x.kv.upgrade().unwrap() } +} - fn tables_wl(&self) -> RwLockWriteGuard<'_, RawRwLock, Vec
> {
-        self.tables.write()
-    }
-    fn tables_rd(&self) -> RwLockReadGuard<'_, RawRwLock, Vec<Table>
> {
-        self.tables.read()
-    }
+pub(crate) struct LevelHandlerInner {
+    // TODO this lock maybe should be global, not only for compaction
+    pub(crate) self_lock: Arc<RwLock<()>>,
+    // Guards tables, total_size.
+    // For level >= 1, *tables* are sorted by key ranges, which do not overlap.
+    // For level 0, *tables* are sorted by time.
+    // For level 0, the *newest* tables are at the back. Compact the oldest one first, which is at the front.
+    pub(crate) tables: Arc<RwLock<Vec<Table>>>,
+    pub(crate) total_size: AtomicU64,
+    // The following are initialized once and const.
+    pub(crate) level: AtomicI32,
+    str_level: Arc<String>,
+    pub(crate) max_total_size: AtomicU64,
+    kv: WeakKV,
+}
+
+impl LevelHandlerInner {}
diff --git a/src/levels.rs b/src/levels.rs
index b7eb094..bb67a9a 100644
--- a/src/levels.rs
+++ b/src/levels.rs
@@ -97,19 +97,30 @@ impl LevelsController {
         Ok(true)
     }

-    fn fill_tables_l0<'a>(
-        &'a self,
-        cd: &'a CompactDef,
-    ) -> parking_lot::lock_api::RwLockReadGuard<'_, parking_lot::RawRwLock, Vec<Table>
> { + fn fill_tables_l0(&self, cd: &mut CompactDef) -> bool { cd.lock_levels(); let top = cd.this_level.to_ref().tables.read(); + // TODO here maybe have some issue that i don't understand + let tables = top.to_vec(); + cd.top.borrow_mut().extend(tables); + if cd.top.borrow().is_empty() { + cd.unlock_levels(); + return false; + } + cd.this_range = INFO_RANGE; + let kr = KeyRange::get_range(cd.top.borrow().as_ref()); + let (left, right) = cd.next_level.overlapping_tables(&kr); + let bot = cd.next_level.to_ref().tables.read(); + let tables = bot.to_vec(); + cd.bot.borrow_mut().extend(tables[left..right].to_vec()); + if cd.bot.borrow().is_empty() { + cd.next_range = kr; + } else { + cd.next_range = KeyRange::get_range(cd.bot.borrow().as_ref()); + } + // if !self.c_status. cd.unlock_levels(); - top - } - - fn f<'a>(&'a self, cd: &'a mut CompactDef<'a>) { - let c = self.fill_tables_l0(cd); - let _ = cd._c.replace(c); + true } // Determines which level to compact. @@ -201,10 +212,10 @@ impl LevelsControllerInner { } pub(crate) struct CompactDef<'a> { - this_level: LevelHandler, - next_level: LevelHandler, - // top: RwLockReadGuard<'a, Vec
>, - bot: RwLock>, + pub(crate) this_level: LevelHandler, + pub(crate) next_level: LevelHandler, + top: RefCell>, + bot: RefCell>, _c: RefCell>>, this_range: KeyRange, next_range: KeyRange, @@ -236,7 +247,6 @@ impl<'a> Default for CompactDef<'a> { impl<'a> CompactDef<'a> { fn lock_levels(&self) { - use parking_lot::lock_api::RawRwLock; unsafe { self.this_level.x.self_lock.raw().lock_shared(); self.next_level.x.self_lock.raw().lock_shared(); diff --git a/src/types.rs b/src/types.rs index 1c9aeaf..0a09d4c 100644 --- a/src/types.rs +++ b/src/types.rs @@ -1,10 +1,14 @@ use parking_lot::*; use std::fmt::Debug; -use std::sync::atomic::{AtomicIsize, AtomicUsize, Ordering}; -use std::sync::{Arc, Weak}; +use std::ops::{Deref, RangeBounds}; +use std::sync::atomic::{AtomicI32, AtomicIsize, AtomicUsize, Ordering}; +use std::sync::{Arc, TryLockResult, Weak}; use std::time::Duration; +use std::{hint, thread}; use async_channel::{bounded, Receiver, RecvError, SendError, Sender, TryRecvError, TrySendError}; + +use range_lock::{VecRangeLock, VecRangeLockGuard}; use tokio::time::sleep; #[derive(Clone)] @@ -154,11 +158,52 @@ impl XWeak { } } +#[derive(Clone)] +pub struct XVec(pub Arc>); + +impl XVec { + pub fn new(v: Vec) -> Self { + XVec(Arc::new(VecRangeLock::new(v))) + } + + pub fn lock(&self, left: usize, right: usize) { + loop { + let range = left..right; + if self.0.try_lock(range).is_ok() { + break; + } else { + hint::spin_loop(); + } + } + } + + pub fn try_lock(&self, range: impl RangeBounds) -> TryLockResult> { + self.0.try_lock(range) + } + + // fn to_owned(self) -> Vec { + // self.0.into_inner() + // } +} + +impl Deref for XVec { + type Target = VecRangeLock; + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +// impl DerefMut for XVec { +// fn deref_mut(&mut self) -> &mut Self::Target { +// &mut self.0 +// } +// } + #[test] fn it_closer() { - let mut runtime = tokio::runtime::Runtime::new().unwrap(); + let runtime = tokio::runtime::Runtime::new().unwrap(); runtime.block_on(async { - let mut closer = Closer::new(1); + let closer = Closer::new(1); let c = closer.clone(); tokio::spawn(async move { sleep(Duration::from_millis(20000)).await; @@ -172,14 +217,11 @@ fn it_closer() { #[test] fn lck() { - // #![feature(slice_take)] - // use bytes::Buf; - // - // let x = Arc::new(RwLock::new(vec![0u8; 10])); - // let xr = x.read(); - // let mut v1 = xr.take(1).into_inner(); - // // let mut v2 = x.read().take(2).into_inner(); - // x.write()[0] = 10; - // - // println!("{:?}", v1); + // let x: &'static [i32; 3] = Box::leak(Box::new([1, 2, 3])); + // thread::spawn(move || dbg!(x)); + // thread::spawn(move || dbg!(x)); + let v = Arc::new(RwLock::new(vec![Arc::new(AtomicI32::new(10))])); + let lck = v.write().to_vec(); + lck[0].store(100, Ordering::Relaxed); + println!("{:?}", v.read()); } From 8b5a5eae3ea7a48b2cef86039fa84ddb8271d7c0 Mon Sep 17 00:00:00 2001 From: Rg Date: Wed, 4 Jan 2023 01:58:22 +0800 Subject: [PATCH 12/77] :dog: --- src/compaction.rs | 60 +++++++++++++++++++++++++++++++++++--------- src/level_handler.rs | 4 +++ src/levels.rs | 17 ++++++------- 3 files changed, 60 insertions(+), 21 deletions(-) diff --git a/src/compaction.rs b/src/compaction.rs index df073cc..0f8567f 100644 --- a/src/compaction.rs +++ b/src/compaction.rs @@ -1,13 +1,15 @@ use crate::levels::CompactDef; use crate::table::table::{Table, TableCore}; -use parking_lot::Mutex; +use parking_lot::lock_api::{RwLockReadGuard, RwLockWriteGuard}; +use parking_lot::{RawRwLock, RwLock}; use std::fmt::{Display, Formatter}; -use 
std::sync::Arc; use std::sync::atomic::Ordering; +use std::sync::Arc; +use tokio::sync::RwLockMappedWriteGuard; #[derive(Clone)] pub(crate) struct CompactStatus { - levels: Arc>>, + levels: Arc>>, } impl CompactStatus { @@ -15,24 +17,58 @@ impl CompactStatus { todo!() } - // fn compare_and_add(&self, cd:) + // Check whether we can run this *CompactDef*. That it doesn't overlap with any + // other running Compaction. If it can be run, it would store this run in the compactStatus state. + pub(crate) fn compare_and_add(&self, cd: &mut CompactDef) -> bool { + let level = cd.this_level.level(); + assert!( + level < self.rl().len() - 1, + "Got level {}, max level {}", + level, + self.rl().len() + ); + + let mut this_level = + RwLockWriteGuard::map(self.levels.write(), |lc| lc.get_mut(level).unwrap()); + let mut next_level = + RwLockWriteGuard::map(self.levels.write(), |lc| lc.get_mut(level + 1).unwrap()); + + if this_level.overlaps_with(&cd.this_range) { + return false; + } + if next_level.overlaps_with(&cd.next_range) { + return false; + } + + // Check whether this level really needs compaction or not. Otherwise, we'll end up + // running parallel compactions for the same level. + // *NOTE*: We can directly call this_level.total_size, because we already have acquire a read lock + // over this and the next level. + if cd.this_level.get_total_size() - this_level.del_size < cd.this_level.get_max_total_size() + { + return false; + } + this_level.ranges.push(cd.this_range.clone()); + next_level.ranges.push(cd.next_range.clone()); + this_level.del_size += cd.this_size.load(Ordering::Relaxed); + true + } pub(crate) fn overlaps_with(&self, level: usize, this: &KeyRange) -> bool { - let compact_status = self.levels.lock(); + let compact_status = self.wl(); compact_status[level].overlaps_with(this) } pub(crate) fn del_size(&self, level: usize) -> u64 { - let compact_status = self.levels.lock(); + let compact_status = self.rl(); compact_status[level].del_size } - // Check whether we can run this `CompactDef`. That it doesn't overlap with any - // other running compaction. If it can be run, it would store this run in the `compact_status` state. 
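Review note on the compare_and_add added above: it is a range-reservation scheme, a compaction may run only if its key range collides with nothing already reserved on this level or the next, and on success it records its own ranges so concurrent pickers back off. A minimal sketch of the idea with simplified stand-in types (not the patch's own API):

    #[derive(Clone, Debug)]
    struct Range { left: Vec<u8>, right: Vec<u8>, inf: bool }

    impl Range {
        // Byte vectors compare lexicographically, matching the cmp calls used elsewhere.
        fn overlaps(&self, o: &Range) -> bool {
            if self.inf || o.inf { return true; } // inf stands for "covers everything"
            self.left <= o.right && o.left <= self.right
        }
    }

    // A new job may be reserved only if it overlaps nothing already running.
    fn can_reserve(reserved: &[Range], want: &Range) -> bool {
        reserved.iter().all(|r| !r.overlaps(want))
    }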
- pub(crate) fn compare_and_add(&self, cd: CompactDef) { - let compact_status = self.levels.lock(); - let level = cd.this_level.x.level.load(Ordering::Relaxed); - // assert!(level < compact_status.levels.len()) + fn wl(&self) -> RwLockWriteGuard<'_, RawRwLock, Vec> { + self.levels.write() + } + fn rl(&self) -> RwLockReadGuard<'_, RawRwLock, Vec> { + self.levels.read() } } diff --git a/src/level_handler.rs b/src/level_handler.rs index 009509d..5b0a32e 100644 --- a/src/level_handler.rs +++ b/src/level_handler.rs @@ -196,6 +196,10 @@ impl LevelHandler { }; } + pub(crate) fn level(&self) -> usize { + self.x.level.load(Ordering::Relaxed) as usize + } + fn kv(&self) -> XArc { self.x.kv.upgrade().unwrap() } diff --git a/src/levels.rs b/src/levels.rs index bb67a9a..203b3e1 100644 --- a/src/levels.rs +++ b/src/levels.rs @@ -211,18 +211,17 @@ impl LevelsControllerInner { } } -pub(crate) struct CompactDef<'a> { +pub(crate) struct CompactDef { pub(crate) this_level: LevelHandler, pub(crate) next_level: LevelHandler, - top: RefCell>, - bot: RefCell>, - _c: RefCell>>, - this_range: KeyRange, - next_range: KeyRange, - this_size: AtomicU64, + pub(crate) top: RefCell>, + pub(crate) bot: RefCell>, + pub(crate) this_range: KeyRange, + pub(crate) next_range: KeyRange, + pub(crate) this_size: AtomicU64, } -impl<'a> Default for CompactDef<'a> { +impl Default for CompactDef { fn default() -> Self { // CompactDef { // this_level: XWeak::new(), @@ -245,7 +244,7 @@ impl<'a> Default for CompactDef<'a> { } } -impl<'a> CompactDef<'a> { +impl CompactDef { fn lock_levels(&self) { unsafe { self.this_level.x.self_lock.raw().lock_shared(); From 62797323b0ed51b0fd6428d357dd19497d484a04 Mon Sep 17 00:00:00 2001 From: Rg Date: Wed, 4 Jan 2023 20:19:53 +0800 Subject: [PATCH 13/77] :coffee: --- src/compaction.rs | 76 ++++++++++++--------- src/levels.rs | 163 ++++++++++++++++++++++++++++++---------------- src/types.rs | 81 +++++++++++------------ 3 files changed, 189 insertions(+), 131 deletions(-) diff --git a/src/compaction.rs b/src/compaction.rs index 0f8567f..e7ee304 100644 --- a/src/compaction.rs +++ b/src/compaction.rs @@ -1,25 +1,21 @@ use crate::levels::CompactDef; -use crate::table::table::{Table, TableCore}; +use crate::table::table::Table; use parking_lot::lock_api::{RwLockReadGuard, RwLockWriteGuard}; use parking_lot::{RawRwLock, RwLock}; use std::fmt::{Display, Formatter}; -use std::sync::atomic::Ordering; +use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::Arc; -use tokio::sync::RwLockMappedWriteGuard; -#[derive(Clone)] +#[derive(Debug)] pub(crate) struct CompactStatus { - levels: Arc>>, + // every level has a *CompactionStatus* that includes multipart *KeyRange* + levels: RwLock>, } impl CompactStatus { - fn to_log(&self) { - todo!() - } - // Check whether we can run this *CompactDef*. That it doesn't overlap with any // other running Compaction. If it can be run, it would store this run in the compactStatus state. 
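One more note before the next hunk: the draft deleted below mapped two write guards out of the same lock, calling RwLockWriteGuard::map(self.levels.write(), ...) twice in a row. parking_lot locks are not reentrant, so the second self.levels.write() on the same thread blocks forever, which is presumably why the rewrite switches to a single read lock plus interior RwLock and atomic fields. A tiny repro sketch:

    use parking_lot::RwLock;

    fn main() {
        let levels: RwLock<Vec<u64>> = RwLock::new(vec![0, 0]);
        let _this = levels.write();
        // A second levels.write() here never returns: parking_lot::RwLock
        // is not reentrant, so the thread deadlocks against itself.
        // let _next = levels.write();
    }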
- pub(crate) fn compare_and_add(&self, cd: &mut CompactDef) -> bool { + pub(crate) fn compare_and_add(&self, cd: &CompactDef) -> bool { let level = cd.this_level.level(); assert!( level < self.rl().len() - 1, @@ -27,12 +23,9 @@ impl CompactStatus { level, self.rl().len() ); - - let mut this_level = - RwLockWriteGuard::map(self.levels.write(), |lc| lc.get_mut(level).unwrap()); - let mut next_level = - RwLockWriteGuard::map(self.levels.write(), |lc| lc.get_mut(level + 1).unwrap()); - + let lc = self.levels.read(); + let this_level = lc.get(level).unwrap(); + let next_level = lc.get(level + 1).unwrap(); if this_level.overlaps_with(&cd.this_range) { return false; } @@ -44,13 +37,14 @@ impl CompactStatus { // running parallel compactions for the same level. // *NOTE*: We can directly call this_level.total_size, because we already have acquire a read lock // over this and the next level. - if cd.this_level.get_total_size() - this_level.del_size < cd.this_level.get_max_total_size() + if cd.this_level.get_total_size() - this_level.get_del_size() + < cd.this_level.get_max_total_size() { return false; } - this_level.ranges.push(cd.this_range.clone()); - next_level.ranges.push(cd.next_range.clone()); - this_level.del_size += cd.this_size.load(Ordering::Relaxed); + this_level.ranges.write().push(cd.this_range.clone()); + next_level.ranges.write().push(cd.next_range.clone()); + this_level.incr_del_size(cd.this_size.load(Ordering::Relaxed)); true } @@ -59,34 +53,51 @@ impl CompactStatus { compact_status[level].overlaps_with(this) } + // Return level's deleted data count pub(crate) fn del_size(&self, level: usize) -> u64 { let compact_status = self.rl(); - compact_status[level].del_size + compact_status[level].get_del_size() } + // Return Level's compaction status with *WriteLockGuard* fn wl(&self) -> RwLockWriteGuard<'_, RawRwLock, Vec> { self.levels.write() } + + // Return Level's compaction status with *ReadLockGuard* fn rl(&self) -> RwLockReadGuard<'_, RawRwLock, Vec> { self.levels.read() } } -#[derive(Debug)] +#[derive(Clone, Debug)] pub(crate) struct LevelCompactStatus { - ranges: Vec, - del_size: u64, + ranges: Arc>>, + del_size: Arc, } impl LevelCompactStatus { fn overlaps_with(&self, dst: &KeyRange) -> bool { - self.ranges.iter().any(|r| r.overlaps_with(dst)) + self.ranges.write().iter().any(|r| r.overlaps_with(dst)) } fn remove(&mut self, dst: &KeyRange) -> bool { - let len = self.ranges.len(); - self.ranges.retain(|r| r.equals(dst)); - len > self.ranges.len() + let mut rlock = self.ranges.write(); + let len = rlock.len(); + rlock.retain(|r| r.equals(dst)); + len > rlock.len() + } + + fn get_del_size(&self) -> u64 { + self.del_size.load(Ordering::Relaxed) + } + + fn incr_del_size(&self, n: u64) { + self.del_size.fetch_add(n, Ordering::Relaxed); + } + + fn decr_del_size(&self, n: u64) { + self.del_size.fetch_sub(n, Ordering::Relaxed); } } @@ -114,7 +125,8 @@ pub(crate) const INFO_RANGE: KeyRange = KeyRange { }; impl KeyRange { - pub fn get_range(tables: &Vec
) -> KeyRange {
+    // Get the KeyRange of tables
+    pub(crate) fn get_range(tables: &Vec<Table>
) -> KeyRange { assert!(!tables.is_empty()); let mut smallest = tables[0].smallest(); let mut biggest = tables[0].biggest(); @@ -134,11 +146,13 @@ impl KeyRange { } } - fn equals(&self, other: &KeyRange) -> bool { + // Left, right, inf all same, indicate equal + pub(crate) fn equals(&self, other: &KeyRange) -> bool { self.left == other.left && self.right == self.right && self.inf == self.inf } - fn overlaps_with(&self, other: &KeyRange) -> bool { + // Check for overlap, *Notice*, if a and b are all inf, indicate has overlap. + pub(crate) fn overlaps_with(&self, other: &KeyRange) -> bool { if self.inf || other.inf { return true; } diff --git a/src/levels.rs b/src/levels.rs index 203b3e1..a377178 100644 --- a/src/levels.rs +++ b/src/levels.rs @@ -7,6 +7,7 @@ use crate::table::table::{new_file_name, Table, TableCore}; use crate::types::{Closer, XArc, XWeak}; use crate::Error::Unexpected; use crate::Result; +use atomic::Ordering; use awaitgroup::WaitGroup; use log::{error, info}; use parking_lot::lock_api::RawRwLock; @@ -19,17 +20,17 @@ use std::path::Path; use std::sync::atomic::{AtomicI64, AtomicU64}; use std::sync::Arc; use std::time::Duration; +use tokio::macros::support::thread_rng_n; #[derive(Clone)] pub(crate) struct LevelsController { // The following are initialized once and const levels: Arc>, kv: WeakKV, - // Atomic next_file_id: Arc, // For ending compactions. compact_worker_wg: Arc, - c_status: CompactStatus, + c_status: Arc, } unsafe impl Sync for LevelsController {} @@ -50,6 +51,7 @@ impl LevelsController { Ok(()) } + // start compact fn start_compact(&self, lc: Closer) { for i in 0..self.must_kv().opt.num_compactors { lc.add_running(1); @@ -60,14 +62,16 @@ impl LevelsController { }); } } + async fn run_worker(&self, lc: Closer) { if self.must_kv().opt.do_not_compact { lc.done(); return; } - lc.done(); - // add random time - tokio::time::sleep(Duration::from_millis(1000)).await; + { + let duration = thread_rng_n(1000); + tokio::time::sleep(Duration::from_millis(duration as u64)).await; + } let mut interval = tokio::time::interval(Duration::from_secs(1)); loop { // why interval can life long @@ -75,25 +79,46 @@ impl LevelsController { tokio::select! { _ = interval.tick() => { let pick: Vec = self.pick_compact_levels(); - for p in pick.iter() { - + for p in pick { + match self.do_compact(p) { + Ok(true) => { + info!("succeed to compacted") + }, + Ok(false) => { + info!("failed to do compacted"); + break; + }, + Err(err) => { // TODO handle error + error!("failed to do compacted, {:?}", err); + }, + } } }, - _ = done.recv() => {return;} + _ = done.recv() => { + info!("closing compact job"); + return; + } } } } + // Picks some table on level l and compacts it away to the next level. fn do_compact(&self, p: CompactionPriority) -> Result { - let level = p.level; - assert!(level + 1 < self.must_kv().opt.max_levels); // Sanity check. + let l = p.level; + assert!(l + 1 < self.must_kv().opt.max_levels); // Sanity check. let mut cd = CompactDef::default(); - cd.this_level = (self.levels[level]).clone(); - cd.next_level = (self.levels[level + 1]).clone(); + cd.this_level = (self.levels[l]).clone(); + cd.next_level = (self.levels[l + 1]).clone(); info!("Got compaction priority: {:?}", p); // While picking tables to be compacted, both level's tables are expected to // remain unchanged. 
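Two spots in the compaction.rs hunks above look like copy-paste slips worth flagging in review: equals compares self.right == self.right and self.inf == self.inf, which are always true, so any two ranges with equal left bounds compare equal; and LevelCompactStatus::remove retains the ranges that match dst, which deletes every range except the one it was asked to remove. The intended bodies are presumably:

    // Suggested corrections (sketch): compare every field against `other`,
    // and retain the ranges that do NOT equal the one being removed.
    pub(crate) fn equals(&self, other: &KeyRange) -> bool {
        self.left == other.left && self.right == other.right && self.inf == other.inf
    }

    fn remove(&mut self, dst: &KeyRange) -> bool {
        let mut rlock = self.ranges.write();
        let len = rlock.len();
        rlock.retain(|r| !r.equals(dst));
        len > rlock.len()
    }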
- if level == 0 {} + if l == 0 { + if !self.fill_tables_l0(&mut cd) { + info!("failed to fill tables for level {}", l); + return Ok(false); + } + } else { + } Ok(true) } @@ -123,6 +148,67 @@ impl LevelsController { true } + fn fill_tables(&self, cd: &mut CompactDef) -> bool { + cd.lock_levels(); + let mut tables = cd.this_level.to_ref().tables.read().to_vec(); + if tables.is_empty() { + cd.unlock_levels(); + return false; + } + // Find the biggest table, and compact taht first. + // TODO: Try other table picking strategies. + tables.sort_by(|a, b| b.size().cmp(&a.size())); + for t in tables { + cd.this_size.store(t.size() as u64, Ordering::Relaxed); + cd.this_range = KeyRange { + left: t.smallest().to_vec(), + right: t.biggest().to_vec(), + inf: false, + }; + if self + .c_status + .overlaps_with(cd.this_level.level(), &cd.this_range) + { + continue; + } + + cd.top.borrow_mut().clear(); + cd.top.borrow_mut().push(t); + let (left, right) = cd.next_level.overlapping_tables(&cd.this_range); + let bot = cd.next_level.to_ref().tables.read(); + let tables = bot.to_vec(); + cd.bot.borrow_mut().clear(); + cd.bot.borrow_mut().extend(tables[left..right].to_vec()); + + if cd.bot.borrow().is_empty() { + cd.bot.borrow_mut().clear(); + cd.next_range = cd.this_range.clone(); + if !self.c_status.compare_and_add(cd) { + continue; + } + cd.unlock_levels(); + return true; + } + + cd.next_range = KeyRange::get_range(cd.bot.borrow().as_ref()); + + if self + .c_status + .overlaps_with(cd.next_level.level(), &cd.next_range) + { + continue; + } + + if !self.c_status.compare_and_add(&cd) { + continue; + } + cd.unlock_levels(); + return true; + } + cd.unlock_levels(); + false + } + // Determines which level to compact. // Base on: https://github.com/facebook/rocksdb/wiki/Leveled-Compaction. fn pick_compact_levels(&self) -> Vec { @@ -131,6 +217,7 @@ impl LevelsController { let mut prios = vec![]; // cstatus is checked to see if level 0's tables are already being compacted. + // *NOTICE* level 0 only has one compact job if !self.c_status.overlaps_with(0, &INFO_RANGE) && self.is_level0_compactable() { prios.push(CompactionPriority { level: 0, @@ -139,20 +226,25 @@ impl LevelsController { }) } + // stats level 1..n for (i, level) in self.levels[1..].iter().enumerate() { // Don't consider those tables that are already being compacted right now. let del_size = self.c_status.del_size(i + 1); if level.is_compactable(del_size) { prios.push(CompactionPriority { level: i + 1, - score: (level.get_total_size() as f64 / level.get_max_total_size() as f64), + score: ((level.get_total_size() - del_size) as f64 + / level.get_max_total_size() as f64), }); } } + // sort from big to small. prios.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap()); prios } + // Return true if level zero may be compacted, without accounting for compactions that already + // might be happening. fn is_level0_compactable(&self) -> bool { self.levels[0].num_tables() >= self.must_kv().opt.num_level_zero_tables } @@ -168,49 +260,6 @@ struct CompactionPriority { score: f64, } -struct LevelsControllerInner { - // The following are initialized once and const - levels: Arc>, - kv: WeakKV, - // Atomic - next_file_id: AtomicI64, - // For ending compactions. - compact_worker_wg: WaitGroup, - c_status: CompactStatus, -} - -impl LevelsControllerInner { - // Returns true if level zero may be compacted, without accounting for compactions that already - // might be happening. 
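For readers skimming the series: the scoring rule in pick_compact_levels above (following the RocksDB leveled-compaction write-up it links) boils down to the sketch below. The struct and names here are illustrative stand-ins, and the gating condition is an assumption about is_compactable, not the patch's exact API:

    struct LevelStat { total: u64, deleted: u64, max_total: u64 }

    // Levels >= 1 score by how far the bytes not already under compaction
    // exceed the level's size target; the highest score is compacted first.
    fn scores(levels: &[LevelStat]) -> Vec<(usize, f64)> {
        let mut prios: Vec<(usize, f64)> = levels
            .iter()
            .enumerate()
            .map(|(i, s)| (i + 1, (s.total - s.deleted) as f64 / s.max_total as f64))
            .filter(|(_, score)| *score >= 1.0)
            .collect();
        prios.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
        prios
    }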
- fn is_level0_compact_table(&self) -> bool { - // self.levels[0] - todo!() - } - - // Checks that all necessary table files exist and removes all table files not - // referenced by the manifest. *ids* is a set of table file id's that were read from the directory - // listing. - fn revert_to_manifest(dir: &str, mf: Manifest, ids: &HashSet) -> Result<()> { - // 1. check all files in manifest exists. - for (id, _) in &mf.tables { - if !ids.contains(id) { - return Err(format!("file does not exist for table {}", id).into()); - } - } - // 2. delete files that shouldn't exist - for id in ids { - if !mf.tables.contains_key(id) { - info!("Table file {} not referenced in MANIFEST", id); - let file_name = new_file_name(*id, dir.clone().parse().unwrap()); - if let Err(err) = remove_file(file_name) { - error!("While removing table {}, err: {:?}", id, err); - } - } - } - Ok(()) - } -} - pub(crate) struct CompactDef { pub(crate) this_level: LevelHandler, pub(crate) next_level: LevelHandler, diff --git a/src/types.rs b/src/types.rs index 0a09d4c..7bc20b6 100644 --- a/src/types.rs +++ b/src/types.rs @@ -1,10 +1,11 @@ use parking_lot::*; use std::fmt::Debug; +use std::mem::ManuallyDrop; use std::ops::{Deref, RangeBounds}; -use std::sync::atomic::{AtomicI32, AtomicIsize, AtomicUsize, Ordering}; +use std::sync::atomic::{AtomicI32, AtomicIsize, AtomicPtr, AtomicUsize, Ordering}; use std::sync::{Arc, TryLockResult, Weak}; use std::time::Duration; -use std::{hint, thread}; +use std::{hint, mem, thread}; use async_channel::{bounded, Receiver, RecvError, SendError, Sender, TryRecvError, TrySendError}; @@ -58,10 +59,9 @@ impl Channel { } pub(crate) fn close(&self) { - if self.tx.is_none() { - return; + if let Some(tx) = &self.tx { + tx.close(); } - self.tx.as_ref().unwrap().close(); } } @@ -76,6 +76,7 @@ pub(crate) struct Closer { impl Closer { pub(crate) fn new(initial: isize) -> Self { + assert!(initial >= 0, "Sanity check"); let mut close = Closer { closed: Channel::new(1), wait: Arc::from(AtomicIsize::new(initial)), @@ -83,33 +84,40 @@ impl Closer { close } + // Incr delta to the WaitGroup. pub(crate) fn add_running(&self, delta: isize) { - self.wait.fetch_add(delta, Ordering::Relaxed); + let old = self.wait.fetch_add(delta, Ordering::Relaxed); + assert!(old >= 0, "Sanity check"); + } + + // Decr delta to the WaitGroup. + pub(crate) fn done(&self) { + let old = self.wait.fetch_sub(1, Ordering::Relaxed); + assert!(old >= 0, "Sanity check"); } + // Signals the `has_been_closed` signal. pub(crate) fn signal(&self) { self.closed.close(); } - // todo + // Gets signaled when signal() is called. 
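Since Closer keeps growing across these patches, a usage sketch may help (tokio runtime assumed, API as of this patch). One caveat worth noting: done() asserts on the pre-decrement value, so a single stray extra done() still passes the assert and drives the count negative.

    async fn worker_lifecycle(closer: Closer) {
        closer.add_running(1);
        let worker = closer.clone();
        tokio::spawn(async move {
            // A real worker would loop until worker.has_been_closed().recv() fires.
            worker.done(); // exactly one done() per add_running(1)
        });
        // Owner side: close the signal channel, then poll until the count reaches 0.
        closer.signal_and_wait().await;
    }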
pub(crate) fn has_been_closed(&self) -> Channel<()> { self.closed.clone() } - pub(crate) fn done(&self) { - self.wait.fetch_sub(1, Ordering::Relaxed); - } - + // Waiting until done pub(crate) async fn wait(&self) { loop { if self.wait.load(Ordering::Relaxed) <= 0 { break; } - println!("wait"); + hint::spin_loop(); sleep(Duration::from_millis(10)).await; } } + // Send a close signal and waiting util done pub(crate) async fn signal_and_wait(&self) { self.signal(); self.wait().await; @@ -166,6 +174,11 @@ impl XVec { XVec(Arc::new(VecRangeLock::new(v))) } + pub fn lock_all(&self) { + let right = self.0.data_len(); + self.lock(0, right) + } + pub fn lock(&self, left: usize, right: usize) { loop { let range = left..right; @@ -180,10 +193,6 @@ impl XVec { pub fn try_lock(&self, range: impl RangeBounds) -> TryLockResult> { self.0.try_lock(range) } - - // fn to_owned(self) -> Vec { - // self.0.into_inner() - // } } impl Deref for XVec { @@ -193,35 +202,21 @@ impl Deref for XVec { } } -// impl DerefMut for XVec { -// fn deref_mut(&mut self) -> &mut Self::Target { -// &mut self.0 -// } -// } - #[test] fn it_closer() { - let runtime = tokio::runtime::Runtime::new().unwrap(); - runtime.block_on(async { - let closer = Closer::new(1); - let c = closer.clone(); - tokio::spawn(async move { - sleep(Duration::from_millis(20000)).await; - println!("Hello Word1"); - c.done(); - }); - closer.signal_and_wait().await; - println!("Hello Word"); - }); + // let runtime = tokio::runtime::Runtime::new().unwrap(); + // runtime.block_on(async { + // let closer = Closer::new(1); + // let c = closer.clone(); + // tokio::spawn(async move { + // sleep(Duration::from_millis(20000)).await; + // println!("Hello Word1"); + // c.done(); + // }); + // closer.signal_and_wait().await; + // println!("Hello Word"); + // }); } #[test] -fn lck() { - // let x: &'static [i32; 3] = Box::leak(Box::new([1, 2, 3])); - // thread::spawn(move || dbg!(x)); - // thread::spawn(move || dbg!(x)); - let v = Arc::new(RwLock::new(vec![Arc::new(AtomicI32::new(10))])); - let lck = v.write().to_vec(); - lck[0].store(100, Ordering::Relaxed); - println!("{:?}", v.read()); -} +fn lck() {} From 4ec9d92c48311d43b1d664f61085739f9c125bb0 Mon Sep 17 00:00:00 2001 From: Rg Date: Thu, 5 Jan 2023 01:15:15 +0800 Subject: [PATCH 14/77] :dog: --- src/levels.rs | 7 ++++ src/value_log_tests.rs | 86 +++++++++++++++++++++--------------------- 2 files changed, 50 insertions(+), 43 deletions(-) diff --git a/src/levels.rs b/src/levels.rs index a377178..ea2cd8c 100644 --- a/src/levels.rs +++ b/src/levels.rs @@ -118,7 +118,14 @@ impl LevelsController { return Ok(false); } } else { + if !self.fill_tables(&mut cd) { + info!("failed to fill tables for level {}", l); + return Ok(false); + } } + info!("Running for level: {}", cd.this_level.level()); + info!("{:?}", self.c_status); + info!("Compaction for level: {} DONE", cd.this_level.level()); Ok(true) } diff --git a/src/value_log_tests.rs b/src/value_log_tests.rs index f7d93c9..d93e910 100644 --- a/src/value_log_tests.rs +++ b/src/value_log_tests.rs @@ -17,46 +17,46 @@ fn new_test_options(dir: String) -> Options { return opt; } -#[test] -fn value_basic() { - mock_log_terminal(); - let dir = random_tmp_dir(); - println!("{}", dir); - let mut kv = KV::new(new_test_options(dir)).unwrap(); - // Use value big enough that the value log writes them even if SyncWrites is false. 
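Context for the test being reworked here: values at least value_threshold bytes long are not stored inline in the LSM tree; the tree keeps an entry flagged with the value-pointer meta bit while the bytes live in the value log. Roughly, as a sketch (the flag value 2 matches the MetaBit definitions later in the series; the helper itself is hypothetical):

    const BIT_VALUE_POINTER: u8 = 2;

    fn meta_for(value_len: usize, value_threshold: usize, mut meta: u8) -> u8 {
        if value_len >= value_threshold {
            meta |= BIT_VALUE_POINTER; // the tree entry points into the value log
        }
        meta
    }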
- let val1 = b"sampleval012345678901234567890123"; - let val2 = b"samplevalb012345678901234567890123"; - assert!(val1.len() >= kv.opt.value_threshold); - - let entry1 = Entry { - key: b"samplekey".to_vec(), - value: val1.to_vec(), - meta: MetaBit::BitValuePointer.bits(), - cas_counter_check: 22222, - cas_counter: 33333, - offset: 0, - user_meta: 0, - }; - let entry2 = Entry { - key: b"samplekeyb".to_vec(), - value: val2.to_vec(), - meta: MetaBit::BitValuePointer.bits(), - cas_counter_check: 22225, - cas_counter: 33335, - offset: 0, - user_meta: 0, - }; - - let mut wait = WaitGroup::new(); - let b = Request { - entries: vec![RefCell::new(entry1), RefCell::new(entry2)], - ptrs: RefCell::new(vec![]), - wait_group: RefCell::new(Some(wait.worker())), - err: RefCell::new(Arc::new(Ok(()))), - }; - // todo add event stats - - kv.must_mut_vlog() - .write(&vec![b]) - .expect("TODO: panic message"); -} +// #[test] +// fn value_basic() { +// mock_log_terminal(); +// let dir = random_tmp_dir(); +// println!("{}", dir); +// let mut kv = KV::new(new_test_options(dir)).unwrap(); +// // Use value big enough that the value log writes them even if SyncWrites is false. +// let val1 = b"sampleval012345678901234567890123"; +// let val2 = b"samplevalb012345678901234567890123"; +// assert!(val1.len() >= kv.opt.value_threshold); +// +// let entry1 = Entry { +// key: b"samplekey".to_vec(), +// value: val1.to_vec(), +// meta: MetaBit::BitValuePointer.bits(), +// cas_counter_check: 22222, +// cas_counter: 33333, +// offset: 0, +// user_meta: 0, +// }; +// let entry2 = Entry { +// key: b"samplekeyb".to_vec(), +// value: val2.to_vec(), +// meta: MetaBit::BitValuePointer.bits(), +// cas_counter_check: 22225, +// cas_counter: 33335, +// offset: 0, +// user_meta: 0, +// }; +// +// let mut wait = WaitGroup::new(); +// let b = Request { +// entries: vec![RefCell::new(entry1), RefCell::new(entry2)], +// ptrs: RefCell::new(vec![]), +// wait_group: RefCell::new(Some(wait.worker())), +// err: RefCell::new(Arc::new(Ok(()))), +// }; +// // todo add event stats +// +// kv.must_mut_vlog() +// .write(&vec![b]) +// .expect("TODO: panic message"); +// } From cecbbfca045edd4517e81ff03675f3e0097e2e88 Mon Sep 17 00:00:00 2001 From: Rg Date: Thu, 5 Jan 2023 20:01:09 +0800 Subject: [PATCH 15/77] :coffee: --- Cargo.toml | 3 +++ src/level_handler.rs | 20 +++++++++++++- src/levels.rs | 62 +++++++++++++++++++++++++++++++------------- src/test_util.rs | 50 +++++++++++++++++++++++++++++++++++ src/types.rs | 2 +- 5 files changed, 117 insertions(+), 20 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index cc6ae54..5fe35b7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -37,7 +37,10 @@ fs2 = "0.4.3" awaitgroup = "0.6.0" protobuf = { version = "3.2.0", features = ["with-bytes"] } range-lock = "0.2.2" +tracing = "0.1.37" [dev-dependencies] +tracing-subscriber = "0.3.16" +tracing-log = "0.1.3" chrono = "0.4.22" env_logger = "0.9.1" console_log = { version = "0.2.0", features = ["color"] } diff --git a/src/level_handler.rs b/src/level_handler.rs index 5b0a32e..3a0efc0 100644 --- a/src/level_handler.rs +++ b/src/level_handler.rs @@ -16,6 +16,12 @@ use std::sync::Arc; pub(crate) type LevelHandler = XArc; pub(crate) type WeakLevelHandler = XWeak; +impl From for LevelHandler { + fn from(value: LevelHandlerInner) -> Self { + XArc::new(value) + } +} + impl LevelHandler { // Returns true if the non-zero level may be compacted. 
*del_size* provides the size of the tables // which are currently being compacted so that we treat them as already having started being @@ -221,4 +227,16 @@ pub(crate) struct LevelHandlerInner { kv: WeakKV, } -impl LevelHandlerInner {} +impl LevelHandlerInner { + pub(crate) fn new(kv: WeakKV, level: usize) -> LevelHandlerInner { + LevelHandlerInner { + self_lock: Arc::new(Default::default()), + tables: Arc::new(Default::default()), + total_size: Default::default(), + level: Default::default(), + str_level: Arc::new(format!("L{}", level)), + max_total_size: Default::default(), + kv, + } + } +} diff --git a/src/levels.rs b/src/levels.rs index ea2cd8c..a3f4745 100644 --- a/src/levels.rs +++ b/src/levels.rs @@ -1,5 +1,5 @@ use crate::compaction::{CompactStatus, KeyRange, INFO_RANGE}; -use crate::kv::{WeakKV, KV}; +use crate::kv::{ArcKV, WeakKV, KV}; use crate::level_handler::{LevelHandler, LevelHandlerInner, WeakLevelHandler}; use crate::manifest::Manifest; use crate::options::Options; @@ -19,7 +19,7 @@ use std::ops::Deref; use std::path::Path; use std::sync::atomic::{AtomicI64, AtomicU64}; use std::sync::Arc; -use std::time::Duration; +use std::time::{Duration, SystemTime}; use tokio::macros::support::thread_rng_n; #[derive(Clone)] @@ -44,6 +44,20 @@ impl Default for LevelsController { } impl LevelsController { + fn new(kv: ArcKV, mf: &Manifest) -> Result { + assert!(kv.x.opt.num_level_zero_tables_stall > kv.x.opt.num_level_zero_tables); + let mut levels = vec![]; + for i in 0..kv.x.opt.max_levels { + let lh = LevelHandlerInner::new(WeakKV::from(&kv), i); + levels.push(LevelHandler::from(lh)); + if i == 0 { + } else if i == 1 { + } else { + } + } + todo!() + } + fn cleanup_levels(&self) -> Result<()> { for level in self.levels.iter() { level.close()?; @@ -125,30 +139,42 @@ impl LevelsController { } info!("Running for level: {}", cd.this_level.level()); info!("{:?}", self.c_status); + info!("Compaction for level: {} DONE", cd.this_level.level()); Ok(true) } + fn run_compact_def(&self, l: usize, cd: &mut CompactDef) -> Result<()> { + let time_start = SystemTime::now(); + let this_level = cd.this_level.clone(); + let next_level = cd.next_level.clone(); + + if this_level.level() >= 1 && cd.bot.is_empty() { + assert_eq!(cd.top.len(), 1); + } + todo!() + } + fn fill_tables_l0(&self, cd: &mut CompactDef) -> bool { cd.lock_levels(); let top = cd.this_level.to_ref().tables.read(); // TODO here maybe have some issue that i don't understand let tables = top.to_vec(); - cd.top.borrow_mut().extend(tables); - if cd.top.borrow().is_empty() { + cd.top.extend(tables); + if cd.top.is_empty() { cd.unlock_levels(); return false; } cd.this_range = INFO_RANGE; - let kr = KeyRange::get_range(cd.top.borrow().as_ref()); + let kr = KeyRange::get_range(cd.top.as_ref()); let (left, right) = cd.next_level.overlapping_tables(&kr); let bot = cd.next_level.to_ref().tables.read(); let tables = bot.to_vec(); - cd.bot.borrow_mut().extend(tables[left..right].to_vec()); - if cd.bot.borrow().is_empty() { + cd.bot.extend(tables[left..right].to_vec()); + if cd.bot.is_empty() { cd.next_range = kr; } else { - cd.next_range = KeyRange::get_range(cd.bot.borrow().as_ref()); + cd.next_range = KeyRange::get_range(cd.bot.as_ref()); } // if !self.c_status. 
cd.unlock_levels(); @@ -179,16 +205,16 @@ impl LevelsController { continue; } - cd.top.borrow_mut().clear(); - cd.top.borrow_mut().push(t); + cd.top.clear(); + cd.top.push(t); let (left, right) = cd.next_level.overlapping_tables(&cd.this_range); let bot = cd.next_level.to_ref().tables.read(); let tables = bot.to_vec(); - cd.bot.borrow_mut().clear(); - cd.bot.borrow_mut().extend(tables[left..right].to_vec()); + cd.bot.clear(); + cd.bot.extend(tables[left..right].to_vec()); - if cd.bot.borrow().is_empty() { - cd.bot.borrow_mut().clear(); + if cd.bot.is_empty() { + cd.bot.clear(); cd.next_range = cd.this_range.clone(); if !self.c_status.compare_and_add(cd) { continue; @@ -197,7 +223,7 @@ impl LevelsController { return true; } - cd.next_range = KeyRange::get_range(cd.bot.borrow().as_ref()); + cd.next_range = KeyRange::get_range(cd.bot.as_ref()); if self .c_status @@ -217,7 +243,7 @@ impl LevelsController { } // Determines which level to compact. - // Base on: https://github.com/facebook/rocksdb/wiki/Leveled-Compaction. + // Base on https://github.com/facebook/rocksdb/wiki/Leveled-Compaction. fn pick_compact_levels(&self) -> Vec { // This function must use identical criteria for guaranteeing compaction's progress that // add_level0_table use. @@ -270,8 +296,8 @@ struct CompactionPriority { pub(crate) struct CompactDef { pub(crate) this_level: LevelHandler, pub(crate) next_level: LevelHandler, - pub(crate) top: RefCell>, - pub(crate) bot: RefCell>, + pub(crate) top: Vec
,
+    pub(crate) bot: Vec<Table>
, pub(crate) this_range: KeyRange, pub(crate) next_range: KeyRange, pub(crate) this_size: AtomicU64, diff --git a/src/test_util.rs b/src/test_util.rs index 73660d3..dc71e25 100644 --- a/src/test_util.rs +++ b/src/test_util.rs @@ -1,8 +1,12 @@ +use chrono::Local; use log::{info, kv::source::as_map, kv::Source, Level}; use rand::random; use std::collections::HashMap; use std::env::temp_dir; use std::fs::create_dir_all; +use std::io; +use tracing_subscriber::fmt::format::Writer; +use tracing_subscriber::fmt::time::FormatTime; #[cfg(test)] pub(crate) fn mock_log() { @@ -54,9 +58,55 @@ pub(crate) fn mock_log_terminal() { console_log::init_with_level(Level::Debug); } +#[cfg(test)] +pub(crate) fn tracing_log() { + use tracing::{info, Level}; + use tracing_subscriber; + struct LocalTimer; + + impl FormatTime for LocalTimer { + fn format_time(&self, w: &mut Writer<'_>) -> std::fmt::Result { + write!(w, "{}", Local::now().format("%FT%T%.3f")) + } + } + + let _ = tracing_log::LogTracer::init(); + let format = tracing_subscriber::fmt::format() + .with_level(true) + .with_target(true) + .with_timer(LocalTimer); + + let _ = tracing_subscriber::fmt() + .with_max_level(tracing::Level::TRACE) + .with_writer(io::stdout) + .with_ansi(true) + .event_format(format) + .try_init(); + tracing::info!("log setting done"); +} + pub fn random_tmp_dir() -> String { let id = random::(); let path = temp_dir().join(id.to_string()).join("badger"); // create_dir_all(&path).unwrap(); path.to_str().unwrap().to_string() } + +#[test] +fn itwork() { + + #[tracing::instrument(skip_all)] + fn call() { + info!("call c"); + } + + #[tracing::instrument(skip_all)] + fn my_function(my_arg: usize) { + info!("execute my function"); + call(); + } + + tracing_log(); + my_function(1000); + info!("Hello Body"); +} diff --git a/src/types.rs b/src/types.rs index 7bc20b6..30d8b44 100644 --- a/src/types.rs +++ b/src/types.rs @@ -141,7 +141,7 @@ impl Clone for XArc { } impl XArc { - fn new(x: T) -> XArc { + pub fn new(x: T) -> XArc { XArc { x: Arc::new(x) } } From f3bca941403d5b69512e0a7a9b316f1a32034e31 Mon Sep 17 00:00:00 2001 From: Rg Date: Wed, 18 Jan 2023 18:16:13 +0800 Subject: [PATCH 16/77] :coffee: --- src/kv.rs | 44 +++++++++++++++++++++++++------------------- src/value_log.rs | 15 +++++++-------- src/y/mod.rs | 2 ++ 3 files changed, 34 insertions(+), 27 deletions(-) diff --git a/src/kv.rs b/src/kv.rs index fa9aa8c..870b4a1 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -5,6 +5,7 @@ use crate::table::iterator::IteratorImpl; use crate::types::{Channel, Closer, XArc, XWeak}; use crate::value_log::{Request, ValueLogCore, ValuePointer}; use crate::y::{Encode, Result, ValueStruct}; +use crate::Error::Unexpected; use crate::{Error, Node, SkipList}; use fs2::FileExt; use log::info; @@ -51,19 +52,8 @@ pub struct KV { last_used_cas_counter: AtomicU64, } -impl Drop for KV { - fn drop(&mut self) { - self.dir_lock_guard.unlock().unwrap(); - self.value_dir_guard.unlock().unwrap(); - self.closers.compactors.signal_and_wait(); - self.closers.mem_table.signal_and_wait(); - self.closers.writes.signal_and_wait(); - self.closers.update_size.signal_and_wait(); - } -} - impl KV { - pub fn new(opt: Options) -> Result { + pub fn new(opt: Options) -> Result> { let mut _opt = opt.clone(); _opt.max_batch_size = (15 * opt.max_table_size) / 100; _opt.max_batch_count = opt.max_batch_size / Node::size() as u64; @@ -94,7 +84,7 @@ impl KV { value_gc: Closer::new(0), }; // go out.updateSize(out.closers.updateSize) - let mut mt = SkipList::new(arena_size(&opt)); + let mt = 
SkipList::new(arena_size(&opt)); let mut out = KV { opt: opt.clone(), vlog: None, @@ -110,16 +100,32 @@ impl KV { }; let mut vlog = ValueLogCore::default(); vlog.open(&out, opt)?; + out.vlog = Some(vlog); - Ok(out) + Ok(XArc::new(out)) } - pub fn must_vlog(&self) -> &ValueLogCore { - self.vlog.as_ref().unwrap() - } + // pub fn must_vlog(&self) -> &ValueLogCore { + // self.vlog.as_ref().unwrap() + // } + + // pub fn must_mut_vlog(&mut self) -> &mut ValueLogCore { + // self.vlog.as_mut().unwrap() + // } - pub fn must_mut_vlog(&mut self) -> &mut ValueLogCore { - self.vlog.as_mut().unwrap() + /// close kv, should be call only once + pub async fn close(&self) -> Result<()> { + self.dir_lock_guard + .unlock() + .map_err(|err| Unexpected(err.to_string()))?; + self.value_dir_guard + .unlock() + .map_err(|err| Unexpected(err.to_string()))?; + self.closers.compactors.signal_and_wait().await; + self.closers.mem_table.signal_and_wait().await; + self.closers.writes.signal_and_wait().await; + self.closers.update_size.signal_and_wait().await; + Ok(()) } } diff --git a/src/value_log.rs b/src/value_log.rs index 4102af3..d0f6413 100644 --- a/src/value_log.rs +++ b/src/value_log.rs @@ -23,12 +23,12 @@ use std::sync::Arc; use std::{fmt, fs, thread}; use tabled::object::Entity::Cell; -use crate::kv::KV; +use crate::kv::{ArcKV, KV, WeakKV}; use crate::log_file::LogFile; use crate::options::Options; use crate::skl::BlockBytes; use crate::table::iterator::BlockSlice; -use crate::types::Channel; +use crate::types::{Channel, XArc}; use crate::y::{ create_synced_file, is_eof, open_existing_synced_file, read_at, sync_directory, Decode, Encode, }; @@ -40,12 +40,12 @@ use crate::{Error, Result}; bitflags! { pub struct MetaBit: u8{ /// Set if the key has been deleted. - const BitDelete = 1; + const BIT_DELETE = 1; /// Set if the value is NOT stored directly next to key. - const BitValuePointer = 2; - const BitUnused = 4; + const BIT_VALUE_POINTER = 2; + const BIT_UNUSED = 4; /// Set if the key is set using SetIfAbsent. 
- const BitSetIfAbsent = 8; + const BIT_SET_IF_ABSENT = 8; } } @@ -116,7 +116,6 @@ impl Entry { impl Encode for Entry { fn enc(&self, wt: &mut dyn Write) -> Result { - use crc32fast::Hasher; let mut h = Header::default(); h.k_len = self.key.len() as u32; h.v_len = self.value.len() as u32; @@ -387,7 +386,7 @@ impl ValueLogCore { let mut cursor = Cursor::new(buffer); let mut h = Header::default(); h.dec(&mut cursor)?; - if (h.meta & MetaBit::BitDelete.bits()) != 0 { + if (h.meta & MetaBit::BIT_DELETE.bits()) != 0 { // Tombstone key return consumer(&vec![]); } diff --git a/src/y/mod.rs b/src/y/mod.rs index 3a6694a..4ed851a 100644 --- a/src/y/mod.rs +++ b/src/y/mod.rs @@ -121,6 +121,7 @@ impl From for Error { pub type Result = std::result::Result; +#[inline] pub fn is_eof(ret: &io::Result) -> bool { if ret.is_ok() { return false; @@ -131,6 +132,7 @@ pub fn is_eof(ret: &io::Result) -> bool { } } +#[inline] pub fn is_existing(ret: &io::Result) -> bool { if ret.is_ok() { return false; From e74213994fd49c0ea45a87ef15d794d97a10745c Mon Sep 17 00:00:00 2001 From: Rg Date: Thu, 19 Jan 2023 18:14:44 +0800 Subject: [PATCH 17/77] :coffee: --- src/level_handler.rs | 8 ++++++-- src/levels.rs | 1 + 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/level_handler.rs b/src/level_handler.rs index 3a0efc0..3ce1ac9 100644 --- a/src/level_handler.rs +++ b/src/level_handler.rs @@ -5,7 +5,6 @@ use crate::table::table::Table; use crate::types::{XArc, XWeak}; use crate::y::iterator::Xiterator; use crate::Result; -use core::slice::SlicePattern; use parking_lot::lock_api::{RwLockReadGuard, RwLockWriteGuard}; use parking_lot::{RawRwLock, RwLock}; @@ -13,6 +12,7 @@ use std::collections::HashSet; use std::sync::atomic::{AtomicI32, AtomicU64, Ordering}; use std::sync::Arc; + pub(crate) type LevelHandler = XArc; pub(crate) type WeakLevelHandler = XWeak; @@ -38,12 +38,12 @@ impl LevelHandler { self.x.max_total_size.load(Ordering::Relaxed) } - // TODO add deference table deleted pub(crate) fn delete_tables(&self, to_del: Vec) { let to_del = to_del.iter().map(|id| *id).collect::>(); let mut tb_wl = self.tables_wl(); tb_wl.retain_mut(|tb| { if to_del.contains(&tb.x.id()) { + // delete table reference tb.decr_ref(); return false; } @@ -51,6 +51,7 @@ impl LevelHandler { }); } + /// init with tables pub(crate) fn init_tables(&self, tables: Vec
) {
         let total_size = tables.iter().fold(0, |acc, table| acc + table.size());
         self.x
@@ -69,9 +70,12 @@ impl LevelHandler {
         }
     }

+    // Get table write lock guards.
     fn tables_wl(&self) -> RwLockWriteGuard<'_, RawRwLock, Vec<Table>
> {
         self.x.tables.write()
     }
+
+    // Get table read lock guards.
     fn tables_rd(&self) -> RwLockReadGuard<'_, RawRwLock, Vec<Table>
> { self.x.tables.read() } diff --git a/src/levels.rs b/src/levels.rs index a3f4745..fd61245 100644 --- a/src/levels.rs +++ b/src/levels.rs @@ -293,6 +293,7 @@ struct CompactionPriority { score: f64, } +// Compact deference pub(crate) struct CompactDef { pub(crate) this_level: LevelHandler, pub(crate) next_level: LevelHandler, From 1d4d3400831700748d51b97cb6380550d04a20d0 Mon Sep 17 00:00:00 2001 From: Rg Date: Tue, 31 Jan 2023 01:49:02 +0800 Subject: [PATCH 18/77] ?:dog: --- src/compaction.rs | 2 ++ src/event/mod.rs | 6 ++---- src/table/table.rs | 16 ++++++++++------ src/types.rs | 35 +++++++++++++++++++++++------------ 4 files changed, 37 insertions(+), 22 deletions(-) diff --git a/src/compaction.rs b/src/compaction.rs index e7ee304..cc387b1 100644 --- a/src/compaction.rs +++ b/src/compaction.rs @@ -101,6 +101,7 @@ impl LevelCompactStatus { } } +// [left, right], Special inf is range all if it be set `true` #[derive(Clone, Debug)] pub(crate) struct KeyRange { pub(crate) left: Vec, // TODO zero Copy @@ -118,6 +119,7 @@ impl Display for KeyRange { } } +// Including all keys pub(crate) const INFO_RANGE: KeyRange = KeyRange { left: vec![], right: vec![], diff --git a/src/event/mod.rs b/src/event/mod.rs index 9898eb2..154fa03 100644 --- a/src/event/mod.rs +++ b/src/event/mod.rs @@ -1,7 +1,5 @@ - - /// An `EventLog` provides a log of events associated with a specific object. -pub trait EventLog{ +pub trait EventLog { /// Formats its arguments with fmt.Sprintf and adds the /// result to the event log. fn printf(&self); @@ -12,4 +10,4 @@ pub trait EventLog{ /// Declares that this event log is complete. /// The event log should not be used after calling this method. fn finish(&self); -} \ No newline at end of file +} diff --git a/src/table/table.rs b/src/table/table.rs index 0cfa2fa..071c061 100644 --- a/src/table/table.rs +++ b/src/table/table.rs @@ -13,7 +13,6 @@ use std::fs::{read_dir, remove_file, File}; use std::io::{Cursor, Seek, SeekFrom}; use std::path::Path; use std::sync::atomic::{AtomicI32, Ordering}; -use std::time::Duration; use std::{fmt, io}; #[cfg(target_os = "macos")] @@ -49,22 +48,23 @@ pub type Table = XArc; pub type WeakTable = XWeak; impl Table { - pub(crate) fn incr_ref(&self) { + pub fn incr_ref(&self) { self.x.incr_ref() } - pub(crate) fn decr_ref(&self) { + pub fn decr_ref(&self) { self.x.decr_ref() } - pub(crate) fn size(&self) -> usize { + pub fn size(&self) -> usize { self.x.size() } - pub(crate) fn biggest(&self) -> &[u8] { + pub fn biggest(&self) -> &[u8] { &self.x.biggest } - pub(crate) fn smallest(&self) -> &[u8] { + + pub fn smallest(&self) -> &[u8] { &self.x.smallest } } @@ -307,6 +307,10 @@ impl TableCore { impl Drop for TableCore { fn drop(&mut self) { + dbg!( + "table reference count: {}", + self._ref.load(Ordering::Relaxed) + ); // We can safely delete this file, because for all the current files, we always have // at least one reference pointing to them. 
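The Drop comment above leans on the table reference-count protocol used throughout the series: incr_ref before handing a table out, decr_ref when done, and the backing file may be unlinked once the count reaches zero. A compressed model of it, with assumed names only:

    use std::sync::atomic::{AtomicI32, Ordering};

    struct TableRef { refs: AtomicI32 }

    impl TableRef {
        fn incr_ref(&self) { self.refs.fetch_add(1, Ordering::AcqRel); }
        // Returns true when the caller released the last reference, i.e. the
        // point at which the backing SSTable file may safely be deleted.
        fn decr_ref(&self) -> bool {
            self.refs.fetch_sub(1, Ordering::AcqRel) == 1
        }
    }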
#[cfg(any(target_os = "macos", target_os = "linux"))] diff --git a/src/types.rs b/src/types.rs index 30d8b44..49055f7 100644 --- a/src/types.rs +++ b/src/types.rs @@ -12,6 +12,7 @@ use async_channel::{bounded, Receiver, RecvError, SendError, Sender, TryRecvErro use range_lock::{VecRangeLock, VecRangeLockGuard}; use tokio::time::sleep; +// Channel like to go's channel #[derive(Clone)] pub(crate) struct Channel { rx: Option>, @@ -19,6 +20,7 @@ pub(crate) struct Channel { } impl Channel { + // create a *Channel* with n cap pub(crate) fn new(n: usize) -> Self { let (tx, rx) = bounded(n); Channel { @@ -26,6 +28,8 @@ impl Channel { tx: Some(tx), } } + + // try to send message T without blocking pub(crate) fn try_send(&self, msg: T) -> Result<(), TrySendError> { if let Some(tx) = &self.tx { return tx.try_send(msg); @@ -33,6 +37,7 @@ impl Channel { Ok(()) } + // try to receive a message without blocking pub(crate) fn try_recv(&self) -> Result { if let Some(rx) = &self.rx { return rx.try_recv(); @@ -40,24 +45,29 @@ impl Channel { Err(TryRecvError::Empty) } + // async receive a message with blocking pub(crate) async fn recv(&self) -> Result { let rx = self.rx.as_ref().unwrap(); rx.recv().await } + // async send a message with blocking pub(crate) async fn send(&self, msg: T) -> Result<(), SendError> { let tx = self.tx.as_ref().unwrap(); tx.send(msg).await } + // returns Sender pub(crate) fn tx(&self) -> Sender { self.tx.as_ref().unwrap().clone() } + // consume tx and return it if exist pub(crate) fn take_tx(&mut self) -> Option> { self.tx.take() } + // close *Channel*, Sender will be consumed pub(crate) fn close(&self) { if let Some(tx) = &self.tx { tx.close(); @@ -75,6 +85,7 @@ pub(crate) struct Closer { } impl Closer { + // create a Closer with *initial* cap Workers pub(crate) fn new(initial: isize) -> Self { assert!(initial >= 0, "Sanity check"); let mut close = Closer { @@ -204,18 +215,18 @@ impl Deref for XVec { #[test] fn it_closer() { - // let runtime = tokio::runtime::Runtime::new().unwrap(); - // runtime.block_on(async { - // let closer = Closer::new(1); - // let c = closer.clone(); - // tokio::spawn(async move { - // sleep(Duration::from_millis(20000)).await; - // println!("Hello Word1"); - // c.done(); - // }); - // closer.signal_and_wait().await; - // println!("Hello Word"); - // }); + let runtime = tokio::runtime::Runtime::new().unwrap(); + runtime.block_on(async { + let closer = Closer::new(1); + let c = closer.clone(); + tokio::spawn(async move { + sleep(Duration::from_millis(200)).await; + println!("Hello Word1"); + c.done(); + }); + closer.signal_and_wait().await; + println!("Hello Word"); + }); } #[test] From f92bb7c045b1856dbce19e1f7ecdc824247027db Mon Sep 17 00:00:00 2001 From: Rg Date: Tue, 31 Jan 2023 18:45:05 +0800 Subject: [PATCH 19/77] :dog: --- src/level_handler.rs | 17 ++++++---- src/levels.rs | 13 ++++--- src/table/builder.rs | 81 +++++++++++++++++++++++++++----------------- src/table/table.rs | 5 +-- src/types.rs | 44 ++++++++++++++++++------ src/y/codec.rs | 11 ++++-- src/y/mod.rs | 2 +- 7 files changed, 111 insertions(+), 62 deletions(-) diff --git a/src/level_handler.rs b/src/level_handler.rs index 3ce1ac9..4f03ff3 100644 --- a/src/level_handler.rs +++ b/src/level_handler.rs @@ -12,9 +12,8 @@ use std::collections::HashSet; use std::sync::atomic::{AtomicI32, AtomicU64, Ordering}; use std::sync::Arc; - pub(crate) type LevelHandler = XArc; -pub(crate) type WeakLevelHandler = XWeak; +// pub(crate) type WeakLevelHandler = XWeak; impl From for LevelHandler { fn from(value: 
LevelHandlerInner) -> Self {
@@ -38,6 +37,7 @@ impl LevelHandler {
         self.x.max_total_size.load(Ordering::Relaxed)
     }

+    // delete the tables listed in to_del from the current level
     pub(crate) fn delete_tables(&self, to_del: Vec<u64>) {
         let to_del = to_del.iter().map(|id| *id).collect::<HashSet<_>>();
         let mut tb_wl = self.tables_wl();
         tb_wl.retain_mut(|tb| {
             if to_del.contains(&tb.x.id()) {
+                // delete table reference
                 tb.decr_ref();
                 return false;
             }
@@ -51,7 +51,7 @@ impl LevelHandler {
         });
     }

-    /// init with tables
+    // init with tables
     pub(crate) fn init_tables(&self, tables: Vec<Table>
) { let total_size = tables.iter().fold(0, |acc, table| acc + table.size()); self.x @@ -65,7 +65,6 @@ impl LevelHandler { tb_wl.sort_by_key(|tb| tb.x.id()); } else { // Sort tables by keys. - // TODO avoid copy tb_wl.sort_by_key(|tb| tb.smallest().to_vec()); } } @@ -80,10 +79,9 @@ impl LevelHandler { self.x.tables.read() } - // Returns the tables that intersect with key range. Returns a half-interval. + // Returns the tables that intersect with key range. Returns a half-interval [left, right]. // This function should already have acquired a read lock, and this is so important the caller must // pass an empty parameter declaring such. - // TODO Opz me pub(crate) fn overlapping_tables(&self, key_range: &KeyRange) -> (usize, usize) { let left = self .tables_rd() @@ -108,6 +106,7 @@ impl LevelHandler { if new_tables.is_empty() { return Ok(()); } + // TODO Add lock (think of level's sharing lock) // Increase total_size first. for tb in &new_tables { self.x @@ -173,7 +172,7 @@ impl LevelHandler { Ok(()) } - // Acquires a read-lock to access s.tables. It return a list of table_handlers. + // Acquires a read-lock to access s.tables. It returns a list of table_handlers. pub(crate) fn get_table_for_key(&self, key: &[u8]) -> Option { return if self.x.level.load(Ordering::Relaxed) == 0 { let tw = self.tables_rd(); @@ -184,6 +183,8 @@ impl LevelHandler { tb.decr_ref(); if item.is_none() { // todo add metrics + } else { + return item; } } None @@ -206,6 +207,7 @@ impl LevelHandler { }; } + // returns current level pub(crate) fn level(&self) -> usize { self.x.level.load(Ordering::Relaxed) as usize } @@ -222,6 +224,7 @@ pub(crate) struct LevelHandlerInner { // For level >= 1, *tables* are sorted by key ranges, which do not overlap. // For level 0, *tables* are sorted by time. // For level 0, *newest* table are at the back. Compact the oldest one first, which is at the front. + // TODO tables and total_size maybe should be lock with same lock. pub(crate) tables: Arc>>, pub(crate) total_size: AtomicU64, // The following are initialized once and const. 
diff --git a/src/levels.rs b/src/levels.rs index fd61245..20f2bff 100644 --- a/src/levels.rs +++ b/src/levels.rs @@ -1,6 +1,6 @@ use crate::compaction::{CompactStatus, KeyRange, INFO_RANGE}; use crate::kv::{ArcKV, WeakKV, KV}; -use crate::level_handler::{LevelHandler, LevelHandlerInner, WeakLevelHandler}; +use crate::level_handler::{LevelHandler, LevelHandlerInner}; use crate::manifest::Manifest; use crate::options::Options; use crate::table::table::{new_file_name, Table, TableCore}; @@ -58,6 +58,7 @@ impl LevelsController { todo!() } + // cleanup all level's handler fn cleanup_levels(&self) -> Result<()> { for level in self.levels.iter() { level.close()?; @@ -68,8 +69,7 @@ impl LevelsController { // start compact fn start_compact(&self, lc: Closer) { for i in 0..self.must_kv().opt.num_compactors { - lc.add_running(1); - let lc = lc.clone(); + let lc = lc.spawn(); let _self = self.clone(); tokio::spawn(async move { _self.run_worker(lc).await; @@ -77,15 +77,18 @@ impl LevelsController { } } + // compact worker async fn run_worker(&self, lc: Closer) { if self.must_kv().opt.do_not_compact { lc.done(); return; } + // random sleep avoid all worker compact at same time { let duration = thread_rng_n(1000); tokio::time::sleep(Duration::from_millis(duration as u64)).await; } + // 1 seconds to check compact let mut interval = tokio::time::interval(Duration::from_secs(1)); loop { // why interval can life long @@ -99,7 +102,7 @@ impl LevelsController { info!("succeed to compacted") }, Ok(false) => { - info!("failed to do compacted"); + info!("skip to do compacted"); break; }, Err(err) => { // TODO handle error @@ -109,7 +112,7 @@ impl LevelsController { } }, _ = done.recv() => { - info!("closing compact job"); + info!("receive a closer signal for closing compact job"); return; } } diff --git a/src/table/builder.rs b/src/table/builder.rs index 6fe16b8..205ed47 100644 --- a/src/table/builder.rs +++ b/src/table/builder.rs @@ -1,5 +1,6 @@ -use crate::y::{hash, is_eof, ValueStruct}; +use crate::y::{hash, is_eof, AsyncEncDec, Decode, Encode, ValueStruct}; use crate::Error; +use async_trait::async_trait; use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use bytes::{Buf, BufMut, BytesMut}; use growable_bloom_filter::GrowableBloom; @@ -11,6 +12,7 @@ use std::fmt; use std::fmt::Formatter; use std::hash::Hasher; use std::io::{self, Cursor, Read, Write}; +use std::str::pattern::Searcher; #[derive(Clone, Default)] pub(crate) struct Header { @@ -24,7 +26,7 @@ impl fmt::Display for Header { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { write!( f, - "plen:{}, klen:{}, vlen:{}, prev:{}", + "p_len:{}, k_len:{}, v_len:{}, prev:{}", self.p_len, self.k_len, self.v_len, self.prev ) } @@ -34,49 +36,65 @@ impl Header { pub(crate) const fn size() -> usize { 10 } - fn decode(buffer: &[u8]) -> Self { - let mut header = Header::default(); - let mut cursor = Cursor::new(buffer); - header.p_len = cursor.read_u16::().unwrap(); - header.k_len = cursor.read_u16::().unwrap(); - header.v_len = cursor.read_u16::().unwrap(); - header.prev = cursor.read_u32::().unwrap(); - header + + pub(crate) fn is_dummy(&self) -> bool { + self.k_len == 0 && self.p_len == 0 } +} - fn encode(&self, buffer: &mut [u8]) { - let mut cursor = Cursor::new(buffer); - cursor.write_u16::(self.p_len).unwrap(); - cursor.write_u16::(self.k_len).unwrap(); - cursor.write_u16::(self.v_len).unwrap(); - cursor.write_u32::(self.prev).unwrap(); +impl Decode for Header { + fn dec(&mut self, rd: &mut dyn Read) -> crate::Result<()> { + self.p_len = 
rd.read_u16::()?; + self.k_len = rd.read_u16::()?; + self.v_len = rd.read_u16::()?; + self.prev = rd.read_u32::()?; + Ok(()) } +} - pub(crate) fn is_dummy(&self) -> bool { - self.k_len == 0 && self.p_len == 0 +impl Encode for Header { + fn enc(&self, wt: &mut dyn Write) -> crate::Result { + wt.write_u16::(self.p_len)?; + wt.write_u16::(self.k_len)?; + wt.write_u16::(self.v_len)?; + wt.write_u32::(self.prev)?; + Ok(Header::size()) } } +// #[async_trait] +// impl AsyncEncDec for Header +// where +// R: AsyncRead + Unpin + Sync + Send, +// W: AsyncWrite + Unpin + Sync + Send, +// { +// async fn enc(&self, wt: &mut W) -> crate::Result { +// wt.write_u16(self.p_len).await?; +// wt.write_u16(self.k_len).await?; +// wt.write_u16(self.v_len).await?; +// wt.write_u32(self.prev).await?; +// wt.flush().await?; +// Ok(Header::size()) +// } +// +// async fn dec(&mut self, rd: &R) -> crate::Result<()> { +// todo!() +// } +// } + impl From<&[u8]> for Header { fn from(buffer: &[u8]) -> Self { let mut header = Header::default(); - let mut cursor = Cursor::new(buffer); - header.p_len = cursor.read_u16::().unwrap(); - header.k_len = cursor.read_u16::().unwrap(); - header.v_len = cursor.read_u16::().unwrap(); - header.prev = cursor.read_u32::().unwrap(); + Decode::dec(&mut header, &mut Cursor::new(buffer)).unwrap(); header } } impl Into> for Header { fn into(self) -> Vec { - let mut cursor = Cursor::new(vec![0u8; Header::size()]); - cursor.write_u16::(self.p_len).unwrap(); - cursor.write_u16::(self.k_len).unwrap(); - cursor.write_u16::(self.v_len).unwrap(); - cursor.write_u32::(self.prev).unwrap(); - cursor.into_inner() + let mut wt = Cursor::new(vec![0u8; Header::size()]); + Encode::enc(&self, &mut wt).unwrap(); + wt.into_inner() } } @@ -94,7 +112,6 @@ pub struct Builder { impl Builder { const RESTART_INTERVAL: usize = 100; - fn close(&self) {} fn empty(&self) -> bool { self.buf.is_empty() @@ -189,10 +206,10 @@ impl Builder { // ReachedCapacity returns true if we... roughly (?) reached capacity? fn reached_capacity(&self, cap: u64) -> bool { - let estimateSz = + let estimate_sz = self.buf.get_ref().len() + 8 /* empty header */ + 4*self.restarts.len() + 8; // 8 = end of buf offset + len(restarts). - estimateSz as u64 > cap + estimate_sz as u64 > cap } // blockIndex generates the block index for the table. diff --git a/src/table/table.rs b/src/table/table.rs index 071c061..90e3d64 100644 --- a/src/table/table.rs +++ b/src/table/table.rs @@ -307,10 +307,7 @@ impl TableCore { impl Drop for TableCore { fn drop(&mut self) { - dbg!( - "table reference count: {}", - self._ref.load(Ordering::Relaxed) - ); + dbg!(self._ref.load(Ordering::Relaxed)); // We can safely delete this file, because for all the current files, we always have // at least one reference pointing to them. 
#[cfg(any(target_os = "macos", target_os = "linux"))] diff --git a/src/types.rs b/src/types.rs index 49055f7..6b2c76e 100644 --- a/src/types.rs +++ b/src/types.rs @@ -1,7 +1,7 @@ use parking_lot::*; use std::fmt::Debug; use std::mem::ManuallyDrop; -use std::ops::{Deref, RangeBounds}; +use std::ops::{Deref, DerefMut, RangeBounds}; use std::sync::atomic::{AtomicI32, AtomicIsize, AtomicPtr, AtomicUsize, Ordering}; use std::sync::{Arc, TryLockResult, Weak}; use std::time::Duration; @@ -84,6 +84,12 @@ pub(crate) struct Closer { wait: Arc, } +impl Drop for Closer { + fn drop(&mut self) { + assert!(self.wait.load(Ordering::Relaxed) >= 0, "Sanity check!"); + } +} + impl Closer { // create a Closer with *initial* cap Workers pub(crate) fn new(initial: isize) -> Self { @@ -101,7 +107,13 @@ impl Closer { assert!(old >= 0, "Sanity check"); } - // Decr delta to the WaitGroup. + // Spawn a worker + pub(crate) fn spawn(&self) -> Self { + self.add_running(1); + self.clone() + } + + // Decr delta to the WaitGroup(Note: must be call for every worker avoid leak). pub(crate) fn done(&self) { let old = self.wait.fetch_sub(1, Ordering::Relaxed); assert!(old >= 0, "Sanity check"); @@ -145,6 +157,14 @@ pub struct XArc { pub(crate) x: Arc, } +impl Deref for XArc { + type Target = T; + + fn deref(&self) -> &Self::Target { + self.x.deref() + } +} + impl Clone for XArc { fn clone(&self) -> Self { XArc { x: self.x.clone() } @@ -217,15 +237,19 @@ impl Deref for XVec { fn it_closer() { let runtime = tokio::runtime::Runtime::new().unwrap(); runtime.block_on(async { - let closer = Closer::new(1); - let c = closer.clone(); - tokio::spawn(async move { - sleep(Duration::from_millis(200)).await; - println!("Hello Word1"); - c.done(); - }); + let closer = Closer::new(0); + let count = Arc::new(AtomicUsize::new(100)); + for i in 0..count.load(Ordering::Relaxed) { + let c = closer.spawn(); + let n = count.clone(); + tokio::spawn(async move { + sleep(Duration::from_millis(200)).await; + n.fetch_add(1, Ordering::Relaxed); + c.done(); + }); + } closer.signal_and_wait().await; - println!("Hello Word"); + assert_eq!(count.load(Ordering::Relaxed), 200); }); } diff --git a/src/y/codec.rs b/src/y/codec.rs index 4b8436f..88f492e 100644 --- a/src/y/codec.rs +++ b/src/y/codec.rs @@ -10,9 +10,14 @@ pub trait Encode { pub trait Decode { fn dec(&mut self, rd: &mut dyn Read) -> Result<()>; } +use tokio::io::{AsyncRead, AsyncWrite}; #[async_trait] -pub trait AsyncEncDec { - async fn enc(&self, wt: &mut dyn Write) -> Result; - async fn dec(&mut self, rd: &mut dyn Read) -> Result<()>; +pub trait AsyncEncDec +where + R: AsyncRead, + W: AsyncWrite, +{ + async fn enc(&self, wt: &mut W) -> Result; + async fn dec(&mut self, rd: &R) -> Result<()>; } diff --git a/src/y/mod.rs b/src/y/mod.rs index 4ed851a..5505be0 100644 --- a/src/y/mod.rs +++ b/src/y/mod.rs @@ -2,7 +2,7 @@ mod codec; pub(crate) mod iterator; mod metrics; -pub use codec::{Decode, Encode}; +pub use codec::{Decode, Encode, AsyncEncDec}; pub use iterator::ValueStruct; use libc::{O_DSYNC, O_WRONLY}; use memmap::MmapMut; From c89a933ff36482fe7538b104d383113dbe19a484 Mon Sep 17 00:00:00 2001 From: Rg Date: Wed, 1 Feb 2023 19:39:22 +0800 Subject: [PATCH 20/77] :coffee: --- Cargo.toml | 1 + src/compaction.rs | 21 +++- src/kv.rs | 17 +++- src/level_handler.rs | 94 +++++++++++++++++- src/levels.rs | 219 ++++++++++++++++++++++++++++++++++++------ src/lib.rs | 2 - src/manifest.rs | 12 +-- src/table/builder.rs | 2 +- src/table/iterator.rs | 1 + src/table/table.rs | 11 ++- src/types.rs | 3 + 11 files 
changed, 328 insertions(+), 55 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 5fe35b7..36685cb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -38,6 +38,7 @@ awaitgroup = "0.6.0" protobuf = { version = "3.2.0", features = ["with-bytes"] } range-lock = "0.2.2" tracing = "0.1.37" +drop_cell = "0.0.0" [dev-dependencies] tracing-subscriber = "0.3.16" tracing-log = "0.1.3" diff --git a/src/compaction.rs b/src/compaction.rs index cc387b1..199187e 100644 --- a/src/compaction.rs +++ b/src/compaction.rs @@ -35,7 +35,7 @@ impl CompactStatus { // Check whether this level really needs compaction or not. Otherwise, we'll end up // running parallel compactions for the same level. - // *NOTE*: We can directly call this_level.total_size, because we already have acquire a read lock + // *NOTE*: We can directly call this_level.total_size, because we already have acquired a read lock // over this and the next level. if cd.this_level.get_total_size() - this_level.get_del_size() < cd.this_level.get_max_total_size() @@ -48,6 +48,18 @@ impl CompactStatus { true } + pub(crate) fn delete(&self, cd: &CompactDef) { + let level = cd.this_level.level(); + let levels = self.wl(); + assert!( + level < levels.len() - 1, + "Got level {}, Max levels {}", + level, + levels.len() + ); + + } + pub(crate) fn overlaps_with(&self, level: usize, this: &KeyRange) -> bool { let compact_status = self.wl(); compact_status[level].overlaps_with(this) @@ -70,17 +82,20 @@ impl CompactStatus { } } +// Every level compacted status(ranges). #[derive(Clone, Debug)] pub(crate) struct LevelCompactStatus { - ranges: Arc>>, - del_size: Arc, + ranges: Arc>>, // not any overlaps + del_size: Arc, // all KeyRange size } impl LevelCompactStatus { + // returns true if self.ranges and dst has overlap, otherwise returns false fn overlaps_with(&self, dst: &KeyRange) -> bool { self.ranges.write().iter().any(|r| r.overlaps_with(dst)) } + // remove dst from self.ranges fn remove(&mut self, dst: &KeyRange) -> bool { let mut rlock = self.ranges.write(); let len = rlock.len(); diff --git a/src/kv.rs b/src/kv.rs index 870b4a1..75f3fa2 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -1,4 +1,4 @@ -use crate::manifest::{open_or_create_manifest_file, Manifest}; +use crate::manifest::{open_or_create_manifest_file, Manifest, ManifestFile}; use crate::options::Options; use crate::table::builder::Builder; use crate::table::iterator::IteratorImpl; @@ -16,6 +16,7 @@ use std::io::Write; use std::path::Path; use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; use std::sync::{Arc, Weak}; +use tokio::sync::{RwLock, RwLockWriteGuard}; const _BADGER_PREFIX: &[u8; 8] = b"!badger!"; // Prefix for internal keys used by badger. 
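The LevelCompactStatus bookkeeping added in this patch serializes compactions by recording, per level, the key ranges currently being compacted; its overlaps_with is a plain lexicographic interval-intersection test over byte keys. A minimal sketch of that test, under the assumption that a KeyRange is just a pair of ordered byte keys (the real type in src/compaction.rs carries more state, such as the INFO_RANGE marker):

struct KeyRange {
    left: Vec<u8>,
    right: Vec<u8>,
}

impl KeyRange {
    // Two closed ranges overlap unless one ends strictly before the other begins.
    // Byte slices compare lexicographically, matching key order in the LSM tree.
    fn overlaps_with(&self, other: &KeyRange) -> bool {
        !(self.right < other.left || other.right < self.left)
    }
}

fn main() {
    let a = KeyRange { left: b"a".to_vec(), right: b"k".to_vec() };
    let b = KeyRange { left: b"f".to_vec(), right: b"z".to_vec() };
    let c = KeyRange { left: b"m".to_vec(), right: b"z".to_vec() };
    assert!(a.overlaps_with(&b));
    assert!(!a.overlaps_with(&c));
}

A level refuses a new compaction whenever the candidate's this_range or next_range overlaps a range already registered for that level, which is the check compare_and_add performs before admitting a CompactDef.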
@@ -37,7 +38,7 @@ struct FlushTask { pub struct KV { pub opt: Options, pub vlog: Option<ValueLogCore>, - pub manifest: Manifest, + pub manifest: Arc<RwLock<ManifestFile>>, flush_chan: Channel<FlushTask>, // write_chan: Channel<Request>, dir_lock_guard: File, @@ -52,6 +53,10 @@ pub struct KV { last_used_cas_counter: AtomicU64, } +// TODO: do not add the lines below +unsafe impl Send for KV {} +unsafe impl Sync for KV {} + impl KV { pub fn new(opt: Options) -> Result<XArc<KV>> { let mut _opt = opt.clone(); @@ -88,7 +93,7 @@ impl KV { let mut out = KV { opt: opt.clone(), vlog: None, - manifest, + manifest: Arc::new(RwLock::new(manifest_file)), flush_chan: Channel::new(1), // write_chan: Channel::new(1), dir_lock_guard, @@ -258,6 +263,12 @@ pub type WeakKV = XWeak<KV>; pub type ArcKV = XArc<KV>; +impl ArcKV { + pub async fn manifest_wl(&self) -> RwLockWriteGuard<'_, ManifestFile> { + self.manifest.write().await + } +} + impl Clone for WeakKV { fn clone(&self) -> Self { XWeak { x: self.x.clone() } diff --git a/src/level_handler.rs index 4f03ff3..0a453b9 100644 --- a/src/level_handler.rs +++ b/src/level_handler.rs @@ -1,19 +1,21 @@ use crate::compaction::KeyRange; use crate::kv::{WeakKV, KV}; -use crate::table::iterator::{IteratorImpl, IteratorItem}; -use crate::table::table::Table; +use crate::table::iterator::{ConcatIterator, IteratorImpl, IteratorItem}; +use crate::table::table::{Table, TableCore}; use crate::types::{XArc, XWeak}; use crate::y::iterator::Xiterator; use crate::Result; +use core::slice::SlicePattern; +use crate::levels::CompactDef; use parking_lot::lock_api::{RwLockReadGuard, RwLockWriteGuard}; use parking_lot::{RawRwLock, RwLock}; use std::collections::HashSet; +use std::ops::Deref; use std::sync::atomic::{AtomicI32, AtomicU64, Ordering}; use std::sync::Arc; pub(crate) type LevelHandler = XArc<LevelHandlerInner>; -// pub(crate) type WeakLevelHandler = XWeak<LevelHandlerInner>; impl From<LevelHandlerInner> for LevelHandler { fn from(value: LevelHandlerInner) -> Self { @@ -99,7 +101,7 @@ impl LevelHandler { // Replace tables[left:right] with new_tables, Note this EXCLUDES tables[right]. // You must call decr() to delete the old tables _after_ writing the update to the manifest. - fn replace_tables(&self, new_tables: Vec<Table>
) -> Result<()> { + pub(crate) fn replace_tables(&self, new_tables: Vec<Table>
) -> Result<()> { // Need to re-search the range of tables in this level to be replaced as other goroutines might // be changing it as well. (They can't touch our tables, but if they add/remove other tables, // the indices get shifted around.) @@ -215,6 +217,39 @@ impl LevelHandler { fn kv(&self) -> XArc { self.x.kv.upgrade().unwrap() } + + // Merge top tables and bot tables to from a List of new tables. + pub(crate) fn compact_build_tables(&self, l: usize, cd: &CompactDef) -> Result
<Vec<Table>> { + let top_tables = &cd.top; + let bot_tables = &cd.bot; + + // Create iterators across all the tables involved first. + let mut itr: Vec<Box<dyn Xiterator<Output = IteratorItem>>> = vec![]; + if l == 0 { + Self::append_iterators_reversed(&mut itr, top_tables, false); + } else { + assert_eq!(1, top_tables.len()); + Self::append_iterators_reversed(&mut itr, &top_tables[..1].to_vec(), false); + } + + // Next level has level>=1 and we can use ConcatIterator as key ranges do not overlap. + // TODO + // itr.push(ConcatIterator::new(bot_tables, false)); + todo!() + } + + // TODO + fn append_iterators_reversed( + out: &mut Vec<Box<dyn Xiterator<Output = IteratorItem>>>, + th: &Vec<Table>
, + reversed: bool, + ) { + // for itr_th in th.iter().rev() { + // // This will increment the reference of the table handler. + // let itr = IteratorImpl::new(itr_th, reversed); + // out.push(Box::new(itr)); + // } + } } pub(crate) struct LevelHandlerInner { @@ -246,4 +281,55 @@ impl LevelHandlerInner { kv, } } + + #[inline] + pub(crate) fn lock_shared(&self) { + use parking_lot::lock_api::RawRwLock; + unsafe { self.self_lock.raw().lock_shared() } + } + + #[inline] + pub(crate) fn try_lock_share(&self) -> bool { + use parking_lot::lock_api::RawRwLock; + unsafe { self.self_lock.raw().try_lock_shared() } + } + + #[inline] + pub(crate) fn unlock_shared(&self) { + use parking_lot::lock_api::RawRwLock; + unsafe { self.self_lock.raw().unlock_shared() } + } + + #[inline] + pub(crate) fn lock_exclusive(&self) { + use parking_lot::lock_api::RawRwLock; + unsafe { self.self_lock.raw().lock_exclusive() } + } + + #[inline] + pub(crate) fn try_lock_exclusive(&self) -> bool { + use parking_lot::lock_api::RawRwLock; + unsafe { self.self_lock.raw().try_lock_exclusive() } + } + + #[inline] + pub(crate) fn unlock_exclusive(&self) { + use parking_lot::lock_api::RawRwLock; + unsafe { self.self_lock.raw().unlock_exclusive() } + } +} + +#[test] +fn raw_lock() { + let lock = LevelHandlerInner::new(WeakKV::new(), 10); + lock.lock_shared(); + lock.lock_shared(); + assert_eq!(false, lock.try_lock_exclusive()); + lock.unlock_shared(); + lock.unlock_shared(); + + assert_eq!(true, lock.try_lock_exclusive()); + assert_eq!(false, lock.try_lock_share()); + lock.unlock_exclusive(); + assert_eq!(true, lock.try_lock_share()); } diff --git a/src/levels.rs b/src/levels.rs index 20f2bff..b408100 100644 --- a/src/levels.rs +++ b/src/levels.rs @@ -1,25 +1,30 @@ use crate::compaction::{CompactStatus, KeyRange, INFO_RANGE}; use crate::kv::{ArcKV, WeakKV, KV}; use crate::level_handler::{LevelHandler, LevelHandlerInner}; -use crate::manifest::Manifest; +use crate::manifest::{Manifest, ManifestChangeBuilder, ManifestFile}; use crate::options::Options; +use crate::pb::badgerpb3::manifest_change::Operation::{CREATE, DELETE}; +use crate::pb::badgerpb3::{ManifestChange, ManifestChangeSet}; use crate::table::table::{new_file_name, Table, TableCore}; use crate::types::{Closer, XArc, XWeak}; use crate::Error::Unexpected; use crate::Result; use atomic::Ordering; use awaitgroup::WaitGroup; +use drop_cell::defer; use log::{error, info}; use parking_lot::lock_api::RawRwLock; use parking_lot::{RwLock, RwLockReadGuard}; use std::cell::RefCell; use std::collections::HashSet; +use std::fmt::{Display, Formatter}; use std::fs::remove_file; use std::ops::Deref; use std::path::Path; use std::sync::atomic::{AtomicI64, AtomicU64}; use std::sync::Arc; use std::time::{Duration, SystemTime}; +use std::vec; use tokio::macros::support::thread_rng_n; #[derive(Clone)] @@ -30,6 +35,7 @@ pub(crate) struct LevelsController { next_file_id: Arc, // For ending compactions. compact_worker_wg: Arc, + // Store compact status that will be run or has running c_status: Arc, } @@ -97,7 +103,7 @@ impl LevelsController { _ = interval.tick() => { let pick: Vec = self.pick_compact_levels(); for p in pick { - match self.do_compact(p) { + match self.do_compact(p).await { Ok(true) => { info!("succeed to compacted") }, @@ -120,7 +126,7 @@ impl LevelsController { } // Picks some table on level l and compacts it away to the next level. 
- fn do_compact(&self, p: CompactionPriority) -> Result { + async fn do_compact(&self, p: CompactionPriority) -> Result { let l = p.level; assert!(l + 1 < self.must_kv().opt.max_levels); // Sanity check. let mut cd = CompactDef::default(); @@ -142,30 +148,136 @@ impl LevelsController { } info!("Running for level: {}", cd.this_level.level()); info!("{:?}", self.c_status); - + let compacted_res = self.run_compact_def(l, cd).await; + if compacted_res.is_err() { + error!( + "LOG Compact FAILED with error: {}", + compacted_res.unwrap_err().to_string() + ); + } + // Done with compaction. So, remove the ranges from compaction status. + // self.c_status.del_size(;) info!("Compaction for level: {} DONE", cd.this_level.level()); Ok(true) } - fn run_compact_def(&self, l: usize, cd: &mut CompactDef) -> Result<()> { + async fn run_compact_def(&self, l: usize, cd: CompactDef) -> Result<()> { let time_start = SystemTime::now(); let this_level = cd.this_level.clone(); let next_level = cd.next_level.clone(); if this_level.level() >= 1 && cd.bot.is_empty() { assert_eq!(cd.top.len(), 1); + let table_lck = cd.top[0].clone(); + // We write to the manifest _before_ we delete files (and after we created files). + // The order matters here -- you can't temporarily have two copies of the same + // table id when reloading the manifest. + // TODO Why? + let delete_change = ManifestChangeBuilder::new(table_lck.id()) + .with_op(DELETE) + .build(); + let create_change = ManifestChangeBuilder::new(table_lck.id()) + .with_level(next_level.level() as u32) + .with_op(CREATE) + .build(); + let changes = vec![delete_change, create_change]; + let kv = self.must_kv(); + let mut manifest = kv.manifest.write().await; + manifest.add_changes(changes)?; + // We have to add to next_level before we remove from this_level, not after. This way, we + // don't have a bug where reads would see keys missing from both levels. + // + // Note: It's critical that we add tables (replace them) in next_level before deleting them + // in this_level. (We could finagle it atomically somehow.) Also, when reading we must + // read, or at least acquire s.rlock(), in increasing order by level, so that we don't skip + // a compaction. + next_level.replace_tables(cd.top.clone())?; + this_level.replace_tables(cd.top.clone())?; + info!( + "LOG Compact-Move {}->{} smallest:{} biggest:{} took {}", + l, + l + 1, + String::from_utf8_lossy(table_lck.smallest()), + String::from_utf8_lossy(table_lck.biggest()), + time_start.elapsed().unwrap().as_millis(), + ); + return Ok(()); } + + // NOTE: table deref + let new_tables = self.compact_build_tables(l, &cd)?; + let deref_tables = || new_tables.iter().for_each(|tb| tb.decr_ref()); + defer! {deref_tables();} + let change_set = Self::build_change_set(&cd, &new_tables); + + // We write to the manifest _before_ we delete files (and after we created files) + { + let kv = self.must_kv(); + let mut manifest = kv.manifest.write().await; + manifest.add_changes(change_set)?; + } + + // See comment earlier in this function about the ordering of these ops, and the order in which + // we access levels whe reading. + next_level.replace_tables(new_tables.clone())?; + this_level.replace_tables(cd.top.clone())?; + + // Note: For level 0, while do_compact is running, it is possible that new tables are added. + // However, the tables are added only to the end, so it is ok to just delete the first table. 
+ info!( + "LOG Compact {}->{}, del {} tables, add {} tables, took {}", + l, + l + 1, + cd.top.len() + cd.bot.len(), + new_tables.len(), + time_start.elapsed().unwrap().as_millis() + ); + + Ok(()) + } + + fn compact_build_tables(&self, l: usize, cd: &CompactDef) -> Result> { todo!() } + fn build_change_set(cd: &CompactDef, new_tables: &Vec
) -> Vec { + let mut changes = vec![]; + for table in new_tables { + changes.push( + ManifestChangeBuilder::new(table.id()) + .with_level(cd.next_level.level() as u32) + .with_op(CREATE) + .build(), + ); + } + + for table in cd.top.iter() { + changes.push( + ManifestChangeBuilder::new(table.id()) + .with_op(DELETE) + .build(), + ); + } + + for table in cd.bot.iter() { + changes.push( + ManifestChangeBuilder::new(table.id()) + .with_op(DELETE) + .build(), + ); + } + + changes + } + fn fill_tables_l0(&self, cd: &mut CompactDef) -> bool { - cd.lock_levels(); + cd.lock_shared_levels(); let top = cd.this_level.to_ref().tables.read(); // TODO here maybe have some issue that i don't understand let tables = top.to_vec(); cd.top.extend(tables); if cd.top.is_empty() { - cd.unlock_levels(); + cd.unlock_shared_levels(); return false; } cd.this_range = INFO_RANGE; @@ -180,18 +292,19 @@ impl LevelsController { cd.next_range = KeyRange::get_range(cd.bot.as_ref()); } // if !self.c_status. - cd.unlock_levels(); + cd.unlock_shared_levels(); true } fn fill_tables(&self, cd: &mut CompactDef) -> bool { - cd.lock_levels(); + // lock current level and next levels, So there is at most one compression process per layer + cd.lock_shared_levels(); let mut tables = cd.this_level.to_ref().tables.read().to_vec(); if tables.is_empty() { - cd.unlock_levels(); + cd.unlock_shared_levels(); return false; } - // Find the biggest table, and compact taht first. + // Find the biggest table, and compact that first. // TODO: Try other table picking strategies. tables.sort_by(|a, b| b.size().cmp(&a.size())); for t in tables { @@ -208,21 +321,29 @@ impl LevelsController { continue; } - cd.top.clear(); - cd.top.push(t); + { + cd.top.clear(); + cd.top.push(t); + } + + // Find next overlap that will be compacted + // TODO [left, right) let (left, right) = cd.next_level.overlapping_tables(&cd.this_range); let bot = cd.next_level.to_ref().tables.read(); let tables = bot.to_vec(); - cd.bot.clear(); - cd.bot.extend(tables[left..right].to_vec()); + { + cd.bot.clear(); + cd.bot.extend(tables[left..right].to_vec()); + } + // not find any overlap at next levels, so sample insert it if cd.bot.is_empty() { - cd.bot.clear(); cd.next_range = cd.this_range.clone(); if !self.c_status.compare_and_add(cd) { + info!("find a conflict compacted, cd: {}", cd); continue; } - cd.unlock_levels(); + cd.unlock_shared_levels(); return true; } @@ -238,10 +359,10 @@ impl LevelsController { if !self.c_status.compare_and_add(&cd) { continue; } - cd.unlock_levels(); + cd.unlock_shared_levels(); return true; } - cd.unlock_levels(); + cd.unlock_shared_levels(); false } @@ -300,11 +421,35 @@ struct CompactionPriority { pub(crate) struct CompactDef { pub(crate) this_level: LevelHandler, pub(crate) next_level: LevelHandler, - pub(crate) top: Vec
<Table>, - pub(crate) bot: Vec<Table>, + pub(crate) top: Vec<Table>, // if the level is not level 0, there should be exactly one table + pub(crate) bot: Vec<Table>
, // may be empty tables set pub(crate) this_range: KeyRange, pub(crate) next_range: KeyRange, - pub(crate) this_size: AtomicU64, + pub(crate) this_size: AtomicU64, // the compacted table's size(NOTE: this level compacted table is only one, not zero level) +} + +impl Display for CompactDef { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + let top = self + .top + .iter() + .map(|table| table.id().to_string()) + .collect::>(); + let bot = self + .bot + .iter() + .map(|table| table.id().to_string()) + .collect::>(); + write!( + f, + "(this_level: {}, next_level: {}, this_sz: {}, top: {:?}, bot: {:?})", + self.this_level.level(), + self.next_level.level(), + self.this_size.load(Ordering::Relaxed), + top, + bot + ) + } } impl Default for CompactDef { @@ -331,17 +476,27 @@ impl Default for CompactDef { } impl CompactDef { - fn lock_levels(&self) { - unsafe { - self.this_level.x.self_lock.raw().lock_shared(); - self.next_level.x.self_lock.raw().lock_shared(); - } + #[inline] + fn lock_shared_levels(&self) { + self.this_level.lock_shared(); + self.next_level.lock_shared(); } - fn unlock_levels(&self) { - unsafe { - self.next_level.x.self_lock.raw().unlock_shared(); - self.this_level.x.self_lock.raw().unlock_shared(); - } + #[inline] + fn unlock_shared_levels(&self) { + self.next_level.unlock_shared(); + self.this_level.unlock_shared(); + } + + #[inline] + fn lock_exclusive_levels(&self) { + self.this_level.lock_exclusive(); + self.next_level.lock_exclusive(); + } + + #[inline] + fn unlock_exclusive_levels(&self) { + self.next_level.unlock_exclusive(); + self.this_level.unlock_exclusive(); } } diff --git a/src/lib.rs b/src/lib.rs index f652a82..2ed9db3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -14,8 +14,6 @@ #![feature(slice_pattern)] #![feature(slice_take)] -extern crate core; - use std::io; use std::mem::align_of; diff --git a/src/manifest.rs b/src/manifest.rs index 01e132f..f6ab6c0 100644 --- a/src/manifest.rs +++ b/src/manifest.rs @@ -42,7 +42,7 @@ pub struct TableManifest { } #[derive(Default)] -pub(crate) struct ManifestFile { +pub struct ManifestFile { fp: Option, directory: String, // We make this configurable so that unit tests can hit rewrite() code quickly @@ -357,14 +357,14 @@ pub(crate) fn open_or_create_manifest_file(dir: &str) -> Result<(ManifestFile, M } #[derive(Debug)] -struct ManifestChangeBuilder { +pub(crate) struct ManifestChangeBuilder { id: u64, level: u32, op: Operation, } impl ManifestChangeBuilder { - fn new(id: u64) -> Self { + pub(crate) fn new(id: u64) -> Self { ManifestChangeBuilder { id, level: 0, @@ -377,17 +377,17 @@ impl ManifestChangeBuilder { // self // } - fn with_level(mut self, level: u32) -> Self { + pub(crate) fn with_level(mut self, level: u32) -> Self { self.level = level; self } - fn with_op(mut self, op: Operation) -> Self { + pub(crate) fn with_op(mut self, op: Operation) -> Self { self.op = op; self } - fn build(self) -> ManifestChange { + pub(crate) fn build(self) -> ManifestChange { let mut mf = ManifestChange::new(); mf.Id = self.id; mf.Level = self.level; diff --git a/src/table/builder.rs b/src/table/builder.rs index 205ed47..fc1194c 100644 --- a/src/table/builder.rs +++ b/src/table/builder.rs @@ -182,7 +182,7 @@ impl Builder { if self.counter >= Self::RESTART_INTERVAL { self.finish_block(); println!( - "create new block: base:{}, pre: {}, base-key {:?}", + "create new block, base:{:<10}, pre: {:5}, base-key: {:?}", self.base_offset, self.prev_offset, String::from_utf8_lossy(&self.base_key) diff --git a/src/table/iterator.rs 
b/src/table/iterator.rs index 3c9a042..fba5fd5 100644 --- a/src/table/iterator.rs +++ b/src/table/iterator.rs @@ -347,6 +347,7 @@ impl<'a> Xiterator for IteratorImpl<'a> { impl<'a> IteratorImpl<'a> { pub fn new(table: &'a TableCore, reversed: bool) -> IteratorImpl<'a> { + table.incr_ref(); // Important IteratorImpl { table, bpos: RefCell::new(0), diff --git a/src/table/table.rs b/src/table/table.rs index 90e3d64..42bde1b 100644 --- a/src/table/table.rs +++ b/src/table/table.rs @@ -40,7 +40,7 @@ impl fmt::Display for KeyOffset { let key = String::from_utf8(self.key.clone()) .map_err(|_| "...") .unwrap(); - write!(f, "key:{}, offset:{}, len:{}", key, self.offset, self.len) + write!(f, "key: {} | offset:{:10}| len:{}", key, self.offset, self.len) } } @@ -338,15 +338,18 @@ impl Display for TableCore { let biggest = String::from_utf8_lossy(self.biggest()); writeln!( f, - "_ref: {}, file_name: {}, block_index: {}, id: {}, table_size:{}, index-size: {:?}, smallest: {}, biggest: {}", + "_ref: {}, file_name: {}, block_index: {}, id: {}, table_size:{}, smallest: {}, biggest: {}", self._ref.load(Ordering::Relaxed), self.file_name, self.block_index.len(), self.id, self.table_size, - index_str, smallest, biggest, - ) + ).unwrap(); + for index in index_str { + writeln!(f, "{}", index).unwrap(); + } + Ok(()) } } diff --git a/src/types.rs b/src/types.rs index 6b2c76e..990a222 100644 --- a/src/types.rs +++ b/src/types.rs @@ -205,11 +205,13 @@ impl XVec { XVec(Arc::new(VecRangeLock::new(v))) } + #[inline] pub fn lock_all(&self) { let right = self.0.data_len(); self.lock(0, right) } + #[inline] pub fn lock(&self, left: usize, right: usize) { loop { let range = left..right; @@ -221,6 +223,7 @@ impl XVec { } } + #[inline] pub fn try_lock(&self, range: impl RangeBounds) -> TryLockResult> { self.0.try_lock(range) } From 434b0a497d829ed6b781142fd25acabb4bcfe94b Mon Sep 17 00:00:00 2001 From: Rg Date: Thu, 2 Feb 2023 01:15:25 +0800 Subject: [PATCH 21/77] :dog: --- src/compaction.rs | 1 - src/level_handler.rs | 30 ++++++++++++++++++++++++------ src/levels.rs | 11 +++++++---- src/table/table.rs | 8 ++++++-- src/table/tests.rs | 15 +++++++++++++++ src/y/iterator.rs | 3 +++ 6 files changed, 55 insertions(+), 13 deletions(-) diff --git a/src/compaction.rs b/src/compaction.rs index 199187e..ae34e5c 100644 --- a/src/compaction.rs +++ b/src/compaction.rs @@ -57,7 +57,6 @@ impl CompactStatus { level, levels.len() ); - } pub(crate) fn overlaps_with(&self, level: usize, this: &KeyRange) -> bool { diff --git a/src/level_handler.rs b/src/level_handler.rs index 0a453b9..d2aa54c 100644 --- a/src/level_handler.rs +++ b/src/level_handler.rs @@ -2,12 +2,13 @@ use crate::compaction::KeyRange; use crate::kv::{WeakKV, KV}; use crate::table::iterator::{ConcatIterator, IteratorImpl, IteratorItem}; use crate::table::table::{Table, TableCore}; -use crate::types::{XArc, XWeak}; -use crate::y::iterator::Xiterator; +use crate::types::{Channel, XArc, XWeak}; +use crate::y::iterator::{MergeIterOverBuilder, Xiterator}; use crate::Result; use core::slice::SlicePattern; use crate::levels::CompactDef; +use drop_cell::defer; use parking_lot::lock_api::{RwLockReadGuard, RwLockWriteGuard}; use parking_lot::{RawRwLock, RwLock}; use std::collections::HashSet; @@ -219,12 +220,16 @@ impl LevelHandler { } // Merge top tables and bot tables to from a List of new tables. - pub(crate) fn compact_build_tables(&self, l: usize, cd: &CompactDef) -> Result
<Vec<Table>> { + pub(crate) async fn compact_build_tables( + &self, + l: usize, + cd: &'static CompactDef, + ) -> Result<Vec<Table>>
{ let top_tables = &cd.top; let bot_tables = &cd.bot; // Create iterators across all the tables involved first. - let mut itr: Vec>> = vec![]; + let mut itr: Vec<&dyn Xiterator> = vec![]; if l == 0 { Self::append_iterators_reversed(&mut itr, top_tables, false); } else { @@ -234,13 +239,26 @@ impl LevelHandler { // Next level has level>=1 and we can use ConcatIterator as key ranges do not overlap. // TODO - // itr.push(ConcatIterator::new(bot_tables, false)); + let bot_tables = bot_tables.iter().map(|t| t.to_ref()).collect::>(); + let citr = ConcatIterator::new(bot_tables, false); + itr.push(&citr); + let mitr = MergeIterOverBuilder::default().add_batch(itr).build(); + // Important to close the iterator to do ref counting. + defer! {mitr.close()}; + mitr.rewind(); + + // Start generating new tables. + struct NewTableResult { + table: Table, + err: Result<()>, + } + let result_ch: Channel = Channel::new(1); todo!() } // TODO fn append_iterators_reversed( - out: &mut Vec>>, + out: &mut Vec<&dyn Xiterator>, th: &Vec
, reversed: bool, ) { diff --git a/src/levels.rs b/src/levels.rs index b408100..0d00f4e 100644 --- a/src/levels.rs +++ b/src/levels.rs @@ -146,7 +146,8 @@ impl LevelsController { return Ok(false); } } - info!("Running for level: {}", cd.this_level.level()); + let level = cd.this_level.level(); + info!("Running for level: {}", level); info!("{:?}", self.c_status); let compacted_res = self.run_compact_def(l, cd).await; if compacted_res.is_err() { @@ -156,8 +157,8 @@ impl LevelsController { ); } // Done with compaction. So, remove the ranges from compaction status. - // self.c_status.del_size(;) - info!("Compaction for level: {} DONE", cd.this_level.level()); + self.c_status.del_size(level); + info!("Compaction for level: {} DONE", level); Ok(true) } @@ -291,7 +292,9 @@ impl LevelsController { } else { cd.next_range = KeyRange::get_range(cd.bot.as_ref()); } - // if !self.c_status. + if !self.c_status.compare_and_add(cd) { + return false; + } cd.unlock_shared_levels(); true } diff --git a/src/table/table.rs b/src/table/table.rs index 42bde1b..4279b65 100644 --- a/src/table/table.rs +++ b/src/table/table.rs @@ -35,12 +35,16 @@ pub(crate) struct KeyOffset { len: usize, } -impl fmt::Display for KeyOffset { +impl Display for KeyOffset { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { let key = String::from_utf8(self.key.clone()) .map_err(|_| "...") .unwrap(); - write!(f, "key: {} | offset:{:10}| len:{}", key, self.offset, self.len) + write!( + f, + "key: {} | offset:{:10}| len:{}", + key, self.offset, self.len + ) } } diff --git a/src/table/tests.rs b/src/table/tests.rs index 0aa9750..a4cfe95 100644 --- a/src/table/tests.rs +++ b/src/table/tests.rs @@ -497,6 +497,21 @@ mod utils { } } + #[test] + fn t_table() { + let f1 = TableBuilder::new() + .mode(FileLoadingMode::MemoryMap) + .key_value(vec![ + (b"k1".to_vec(), b"a1".to_vec()), + (b"k2".to_vec(), b"a2".to_vec()), + ]) + .build(); + let x_table = Table::new(f1); + let core = x_table.to_ref(); + let cores = vec![core]; + let itr = ConcatIterator::new(cores, false); + } + fn build_table(mut key_value: Vec<(Vec, Vec)>) -> (File, String) { let mut builder = Builder::default(); let file_name = format!( diff --git a/src/y/iterator.rs b/src/y/iterator.rs index 4426bda..dad9152 100644 --- a/src/y/iterator.rs +++ b/src/y/iterator.rs @@ -95,6 +95,9 @@ pub trait Xiterator { fn peek(&self) -> Option { todo!() } + fn close(&self) { + todo!() + } } pub trait KeyValue { From 26966c6e3f5f78a644dc5126dc335ac95ca580a2 Mon Sep 17 00:00:00 2001 From: Rg Date: Thu, 2 Feb 2023 21:30:38 +0800 Subject: [PATCH 22/77] :coffee: --- src/level_handler.rs | 53 ++--------------------- src/levels.rs | 101 +++++++++++++++++++++++++++++++++++++++++-- src/table/builder.rs | 4 +- src/table/tests.rs | 2 +- 4 files changed, 103 insertions(+), 57 deletions(-) diff --git a/src/level_handler.rs b/src/level_handler.rs index d2aa54c..09971a9 100644 --- a/src/level_handler.rs +++ b/src/level_handler.rs @@ -8,13 +8,16 @@ use crate::Result; use core::slice::SlicePattern; use crate::levels::CompactDef; +use crate::table::builder::Builder; use drop_cell::defer; +use log::info; use parking_lot::lock_api::{RwLockReadGuard, RwLockWriteGuard}; use parking_lot::{RawRwLock, RwLock}; use std::collections::HashSet; use std::ops::Deref; use std::sync::atomic::{AtomicI32, AtomicU64, Ordering}; use std::sync::Arc; +use std::time::SystemTime; pub(crate) type LevelHandler = XArc; @@ -218,56 +221,6 @@ impl LevelHandler { fn kv(&self) -> XArc { self.x.kv.upgrade().unwrap() } - - // Merge top tables 
and bot tables to from a List of new tables. - pub(crate) async fn compact_build_tables( - &self, - l: usize, - cd: &'static CompactDef, - ) -> Result<Vec<Table>>
{ - let top_tables = &cd.top; - let bot_tables = &cd.bot; - - // Create iterators across all the tables involved first. - let mut itr: Vec<&dyn Xiterator> = vec![]; - if l == 0 { - Self::append_iterators_reversed(&mut itr, top_tables, false); - } else { - assert_eq!(1, top_tables.len()); - Self::append_iterators_reversed(&mut itr, &top_tables[..1].to_vec(), false); - } - - // Next level has level>=1 and we can use ConcatIterator as key ranges do not overlap. - // TODO - let bot_tables = bot_tables.iter().map(|t| t.to_ref()).collect::<Vec<_>>(); - let citr = ConcatIterator::new(bot_tables, false); - itr.push(&citr); - let mitr = MergeIterOverBuilder::default().add_batch(itr).build(); - // Important to close the iterator to do ref counting. - defer! {mitr.close()}; - mitr.rewind(); - - // Start generating new tables. - struct NewTableResult { - table: Table, - err: Result<()>, - } - let result_ch: Channel<NewTableResult> = Channel::new(1); - todo!() - } - - // TODO - fn append_iterators_reversed( - out: &mut Vec<&dyn Xiterator>, - th: &Vec<Table>
, - reversed: bool, - ) { - // for itr_th in th.iter().rev() { - // // This will increment the reference of the table handler. - // let itr = IteratorImpl::new(itr_th, reversed); - // out.push(Box::new(itr)); - // } - } } pub(crate) struct LevelHandlerInner { diff --git a/src/levels.rs b/src/levels.rs index 0d00f4e..041a08e 100644 --- a/src/levels.rs +++ b/src/levels.rs @@ -5,8 +5,11 @@ use crate::manifest::{Manifest, ManifestChangeBuilder, ManifestFile}; use crate::options::Options; use crate::pb::badgerpb3::manifest_change::Operation::{CREATE, DELETE}; use crate::pb::badgerpb3::{ManifestChange, ManifestChangeSet}; +use crate::table::builder::Builder; +use crate::table::iterator::{ConcatIterator, IteratorItem}; use crate::table::table::{new_file_name, Table, TableCore}; -use crate::types::{Closer, XArc, XWeak}; +use crate::types::{Channel, Closer, XArc, XWeak}; +use crate::y::iterator::{MergeIterOverBuilder, Xiterator}; use crate::Error::Unexpected; use crate::Result; use atomic::Ordering; @@ -14,7 +17,7 @@ use awaitgroup::WaitGroup; use drop_cell::defer; use log::{error, info}; use parking_lot::lock_api::RawRwLock; -use parking_lot::{RwLock, RwLockReadGuard}; +use parking_lot::{Mutex, RwLock, RwLockReadGuard}; use std::cell::RefCell; use std::collections::HashSet; use std::fmt::{Display, Formatter}; @@ -205,10 +208,13 @@ impl LevelsController { return Ok(()); } + let cd = Arc::new(tokio::sync::RwLock::new(cd)); // NOTE: table deref - let new_tables = self.compact_build_tables(l, &cd)?; + let new_tables = self.compact_build_tables(l, cd.clone()).await?; let deref_tables = || new_tables.iter().for_each(|tb| tb.decr_ref()); defer! {deref_tables();} + + let cd = cd.write().await; let change_set = Self::build_change_set(&cd, &new_tables); // We write to the manifest _before_ we delete files (and after we created files) @@ -237,10 +243,92 @@ impl LevelsController { Ok(()) } - fn compact_build_tables(&self, l: usize, cd: &CompactDef) -> Result> { + // Merge top tables and bot tables to from a List of new tables. + pub(crate) async fn compact_build_tables( + &self, + l: usize, + cd: Arc>, + ) -> Result> { + let cd = cd.read().await; + let top_tables = &cd.top; + let bot_tables = &cd.bot; + + // Create iterators across all the tables involved first. + let mut itr: Vec<&dyn Xiterator> = vec![]; + if l == 0 { + Self::append_iterators_reversed(&mut itr, top_tables, false); + } else { + assert_eq!(1, top_tables.len()); + Self::append_iterators_reversed(&mut itr, &top_tables[..1].to_vec(), false); + } + + // Next level has level>=1 and we can use ConcatIterator as key ranges do not overlap. + // TODO + let bot_tables = bot_tables.iter().map(|t| t.to_ref()).collect::>(); + let citr = ConcatIterator::new(bot_tables, false); + itr.push(&citr); + let mitr = MergeIterOverBuilder::default().add_batch(itr).build(); + // Important to close the iterator to do ref counting. + defer! {mitr.close()}; + mitr.rewind(); + + // Start generating new tables. + struct NewTableResult { + table: Table, + err: Result<()>, + } + let result_ch: Channel = Channel::new(1); + + // TODO + loop { + let start_time = SystemTime::now(); + let mut builder = Builder::default(); + for value in mitr.next() { + if builder.reached_capacity(self.must_kv().opt.max_table_size) { + break; + } + assert!(builder.add(value.key(), value.value()).is_ok()); + } + if builder.empty() { + break; + } + // It was true that it.Valid() at least once in the loop above, which means we + // called Add() at least once, and builder is not Empty(). 
+ info!( + "LOG Compacted: Iteration to generate one table took: {}", + start_time.elapsed().unwrap().as_millis() + ); + + // TODO + let file_id = self.reserve_file_id(); + // async + } + let mut new_tables = vec![]; + let mut first_err = Ok(()); + // Wait for all table builders to finish. + + while let Ok(ret) = result_ch.recv().await { + new_tables.push(ret.table.clone()); + if ret.err.is_err() { + first_err = ret.err; + } + } todo!() } + // TODO + fn append_iterators_reversed( + out: &mut Vec<&dyn Xiterator>, + th: &Vec<Table>
, + reversed: bool, + ) { + // for itr_th in th.iter().rev() { + // // This will increment the reference of the table handler. + // let itr = IteratorImpl::new(itr_th, reversed); + // out.push(Box::new(itr)); + // } + } + fn build_change_set(cd: &CompactDef, new_tables: &Vec<Table>
) -> Vec { let mut changes = vec![]; for table in new_tables { @@ -412,6 +500,11 @@ impl LevelsController { fn must_kv(&self) -> Arc { self.kv.x.upgrade().unwrap() } + + fn reserve_file_id(&self) -> i64 { + let id = self.next_file_id.fetch_add(1, Ordering::Relaxed); + id + } } #[derive(Debug, Clone)] diff --git a/src/table/builder.rs b/src/table/builder.rs index fc1194c..6e10280 100644 --- a/src/table/builder.rs +++ b/src/table/builder.rs @@ -113,7 +113,7 @@ pub struct Builder { impl Builder { const RESTART_INTERVAL: usize = 100; - fn empty(&self) -> bool { + pub(crate) fn empty(&self) -> bool { self.buf.is_empty() } @@ -205,7 +205,7 @@ impl Builder { // at the end. The diff can vary. // ReachedCapacity returns true if we... roughly (?) reached capacity? - fn reached_capacity(&self, cap: u64) -> bool { + pub(crate) fn reached_capacity(&self, cap: u64) -> bool { let estimate_sz = self.buf.get_ref().len() + 8 /* empty header */ + 4*self.restarts.len() + 8; // 8 = end of buf offset + len(restarts). diff --git a/src/table/tests.rs b/src/table/tests.rs index a4cfe95..3818c0b 100644 --- a/src/table/tests.rs +++ b/src/table/tests.rs @@ -575,7 +575,7 @@ mod utils { format!("{}{:04}", prefix, n) } - struct TableBuilder { + pub(crate) struct TableBuilder { path: String, key_value: Vec<(Vec, Vec)>, mode: FileLoadingMode, From dc29adbeadd9b4b4344168e1e37e62bc2ad684ad Mon Sep 17 00:00:00 2001 From: Rg Date: Thu, 9 Feb 2023 18:44:00 +0800 Subject: [PATCH 23/77] :sleep: --- src/compaction.rs | 19 ++++++- src/kv.rs | 27 ++++++---- src/level_handler.rs | 47 ++++++++++++++++++ src/levels.rs | 115 ++++++++++++++++++++++++++++++++++++++++--- src/manifest.rs | 83 ++++++++++++++++++++----------- src/options/mod.rs | 2 +- src/table/table.rs | 4 +- src/y/mod.rs | 2 +- 8 files changed, 249 insertions(+), 50 deletions(-) diff --git a/src/compaction.rs b/src/compaction.rs index ae34e5c..40d1909 100644 --- a/src/compaction.rs +++ b/src/compaction.rs @@ -9,7 +9,15 @@ use std::sync::Arc; #[derive(Debug)] pub(crate) struct CompactStatus { // every level has a *CompactionStatus* that includes multipart *KeyRange* - levels: RwLock>, + pub(crate) levels: RwLock>, +} + +impl Default for CompactStatus { + fn default() -> Self { + CompactStatus { + levels: RwLock::new(vec![]), + } + } } impl CompactStatus { @@ -88,6 +96,15 @@ pub(crate) struct LevelCompactStatus { del_size: Arc, // all KeyRange size } +impl Default for LevelCompactStatus { + fn default() -> Self { + LevelCompactStatus { + ranges: Arc::new(RwLock::new(Vec::new())), + del_size: Arc::new(AtomicU64::new(0)), + } + } +} + impl LevelCompactStatus { // returns true if self.ranges and dst has overlap, otherwise returns false fn overlaps_with(&self, dst: &KeyRange) -> bool { diff --git a/src/kv.rs b/src/kv.rs index 75f3fa2..f8343f0 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -10,12 +10,13 @@ use crate::{Error, Node, SkipList}; use fs2::FileExt; use log::info; use std::borrow::BorrowMut; -use std::fs::{create_dir_all, read_dir, File}; +use std::fs::{read_dir, File}; use std::fs::{try_exists, OpenOptions}; use std::io::Write; use std::path::Path; use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; use std::sync::{Arc, Weak}; +use tokio::fs::create_dir_all; use tokio::sync::{RwLock, RwLockWriteGuard}; const _BADGER_PREFIX: &[u8; 8] = b"!badger!"; @@ -58,17 +59,15 @@ unsafe impl Send for KV {} unsafe impl Sync for KV {} impl KV { - pub fn new(opt: Options) -> Result> { - let mut _opt = opt.clone(); - _opt.max_batch_size = (15 * opt.max_table_size) / 100; - 
_opt.max_batch_count = opt.max_batch_size / Node::size() as u64; - create_dir_all(opt.dir.as_str())?; - create_dir_all(opt.value_dir.as_str())?; - // todo add directory lock + pub async fn Open(mut opt: Options) -> Result> { + opt.max_batch_size = (15 * opt.max_table_size) / 100; + opt.max_batch_count = opt.max_batch_size / Node::size() as u64; + create_dir_all(opt.dir.as_str()).await?; + create_dir_all(opt.value_dir.as_str()).await?; if !(opt.value_log_file_size <= 2 << 30 && opt.value_log_file_size >= 1 << 20) { return Err(Error::ValueLogSize); } - let (manifest_file, manifest) = open_or_create_manifest_file(opt.dir.as_str())?; + let (manifest_file, manifest) = open_or_create_manifest_file(opt.dir.as_str()).await?; let dir_lock_guard = OpenOptions::new() .write(true) .append(true) @@ -267,6 +266,16 @@ impl ArcKV { pub async fn manifest_wl(&self) -> RwLockWriteGuard<'_, ManifestFile> { self.manifest.write().await } + + pub async fn close(&self) -> Result<()> { + info!("Closing database"); + // Stop value GC first; + self.to_ref().closers.value_gc.signal_and_wait().await; + // Stop writes next. + self.to_ref().closers.writes.signal_and_wait().await; + + Ok(()) + } } impl Clone for WeakKV { diff --git a/src/level_handler.rs b/src/level_handler.rs index 5ea5816..ee26f45 100644 --- a/src/level_handler.rs +++ b/src/level_handler.rs @@ -28,6 +28,53 @@ impl From for LevelHandler { } impl LevelHandler { + // Check does some sanity check on one level of data or in-memory index. + pub(crate) fn validate(&self) -> Result<()> { + self.lock_exclusive(); + defer! {self.unlock_exclusive();} + if self.level() == 0 { + return Ok(()); + } + let tables = self.tables.write(); + let num_tables = tables.len(); + for j in 1..num_tables { + if j >= tables.len() { + return Err(format!( + "Level={}, j={}, number_tables={}", + self.level(), + j, + num_tables + ) + .into()); + } + + if tables[j - 1].biggest() >= tables[j].smallest() { + return Err(format!( + "Inter: {} vs {}: level={} j={} numTables={}", + String::from_utf8_lossy(tables[j - 1].biggest()), + String::from_utf8_lossy(tables[j].smallest()), + self.level(), + j, + num_tables + ) + .into()); + } + if tables[j].smallest() > tables[j].biggest() { + return Err(format!( + "Intra: {} vs {}: level={} j={} numTables={}", + String::from_utf8_lossy(tables[j].smallest()), + String::from_utf8_lossy(tables[j].biggest()), + self.level(), + j, + num_tables + ) + .into()); + } + } + + Ok(()) + } + // Returns true if the non-zero level may be compacted. *del_size* provides the size of the tables // which are currently being compacted so that we treat them as already having started being // compacted (because they have been, yet their size is already counted in get_total_size). 
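The validate() added above enforces two invariants on every level other than level 0: each table's own key bounds are ordered (the Intra check), and adjacent tables are sorted and disjoint (the Inter check), which is what allows overlapping_tables to search a level by key. A standalone sketch of the same checks, with tables reduced to hypothetical (smallest, biggest) byte-key pairs:

fn check_level(tables: &[(Vec<u8>, Vec<u8>)]) -> Result<(), String> {
    for (j, (smallest, biggest)) in tables.iter().enumerate() {
        // Intra: a table's own bounds must be ordered.
        if smallest > biggest {
            return Err(format!("Intra: smallest > biggest in table {}", j));
        }
        // Inter: neighbouring tables must be sorted and must not overlap.
        if j > 0 && tables[j - 1].1 >= *smallest {
            return Err(format!("Inter: table {} overlaps table {}", j - 1, j));
        }
    }
    Ok(())
}

fn main() {
    let ok = vec![(b"a".to_vec(), b"c".to_vec()), (b"d".to_vec(), b"f".to_vec())];
    assert!(check_level(&ok).is_ok());
    let bad = vec![(b"a".to_vec(), b"e".to_vec()), (b"d".to_vec(), b"f".to_vec())];
    assert!(check_level(&bad).is_err());
}

Level 0 is exempt because its tables arrive in flush order and may overlap arbitrarily, which is also why compaction from level 0 merges all of its tables at once.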
diff --git a/src/levels.rs b/src/levels.rs index 9562643..bc6eb7f 100644 --- a/src/levels.rs +++ b/src/levels.rs @@ -1,4 +1,4 @@ -use crate::compaction::{CompactStatus, KeyRange, INFO_RANGE}; +use crate::compaction::{CompactStatus, KeyRange, LevelCompactStatus, INFO_RANGE}; use crate::kv::{ArcKV, WeakKV, KV}; use crate::level_handler::{LevelHandler, LevelHandlerInner}; use crate::manifest::{Manifest, ManifestChangeBuilder}; @@ -6,9 +6,9 @@ use crate::pb::badgerpb3::manifest_change::Operation::{CREATE, DELETE}; use crate::pb::badgerpb3::ManifestChange; use crate::table::builder::Builder; use crate::table::iterator::{ConcatIterator, IteratorImpl, IteratorItem}; -use crate::table::table::{new_file_name, Table, TableCore}; +use crate::table::table::{get_id_map, new_file_name, Table, TableCore}; use crate::types::{Closer, XArc, XWeak}; -use crate::y::{create_synced_file, sync_directory}; +use crate::y::{create_synced_file, open_existing_synced_file, sync_directory}; use crate::Result; use crate::Xiterator; use crate::{MergeIterOverBuilder, MergeIterOverIterator}; @@ -18,15 +18,18 @@ use drop_cell::defer; use log::{error, info}; use parking_lot::lock_api::RawRwLock; use parking_lot::{Mutex, RwLock, RwLockReadGuard}; +use serde_json::ser::CharEscape::Tab; use std::cell::RefCell; -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use std::fmt::{Display, Formatter}; +use std::fs::remove_file; use std::io::Write; use std::sync::atomic::{AtomicI64, AtomicU64}; use std::sync::Arc; use std::time::{Duration, SystemTime}; use std::vec; use tokio::macros::support::thread_rng_n; +use tracing_subscriber::fmt::format; #[derive(Clone)] pub(crate) struct LevelsController { @@ -54,15 +57,87 @@ impl LevelsController { fn new(kv: ArcKV, mf: &Manifest) -> Result { assert!(kv.x.opt.num_level_zero_tables_stall > kv.x.opt.num_level_zero_tables); let mut levels = vec![]; + let cstatus = CompactStatus::default(); for i in 0..kv.x.opt.max_levels { let lh = LevelHandlerInner::new(WeakKV::from(&kv), i); levels.push(LevelHandler::from(lh)); if i == 0 { + // Do nothing } else if i == 1 { + // Level 1 probably shouldn't be too much bigger than level 0. + levels[i] + .max_total_size + .store(kv.opt.level_one_size, Ordering::Relaxed); } else { + levels[i].max_total_size.store( + levels[i - 1].max_total_size.load(Ordering::Relaxed) + * kv.opt.level_size_multiplier, + Ordering::Relaxed, + ); } + cstatus.levels.write().push(LevelCompactStatus::default()); } - todo!() + // Compare manifest against directory, check for existent/non-existent files, and remove. + revert_to_manifest(&kv, mf, get_id_map(&kv.opt.dir))?; + + // Some files may be deleted. Let's reload. 
+ let mut tables: Vec<Vec<Table>> = vec![vec![]; levels.len()]; + let mut max_file_id = 0; + for (file_id, table_manifest) in &mf.tables { + let file_name = new_file_name(*file_id, kv.opt.dir.as_str()); + let fd = open_existing_synced_file(&file_name, true); + if fd.is_err() { + return Err( + format!("Open file: {}, err: {:?}", file_name, fd.unwrap_err()).into(), + ); + } + + let tb = TableCore::open_table(fd.unwrap(), &file_name, kv.opt.table_loading_mode); + if let Err(err) = tb { + return Err(format!("Open file: {}, err: {:?}", file_name, err).into()); + } + let table = Table::new(tb.unwrap()); + tables[table_manifest.level as usize].push(table); + if *file_id > max_file_id { + max_file_id = *file_id; + } + } + + let next_file_id = max_file_id + 1; + for (i, tbls) in tables.into_iter().enumerate() { + levels[i].init_tables(tbls); + } + + // Make sure key ranges do not overlap etc. + let level_controller = LevelsController { + levels: Arc::new(levels), + kv: WeakKV::from(&kv), + next_file_id: Arc::new(AtomicU64::new(next_file_id)), + compact_worker_wg: Arc::new(Default::default()), + c_status: Arc::new(cstatus), + }; + if let Err(err) = level_controller.validate() { + let _ = level_controller.cleanup_levels(); + return Err(format!("Level validation, err:{}", err).into()); + } + // Sync directory (because we have at least removed some files, or previously created the manifest file). + if let Err(err) = sync_directory(kv.opt.dir.as_str()) { + let _ = level_controller.close(); + return Err(err); + } + + Ok(level_controller) + } + + fn validate(&self) -> Result<()> { + for level in self.levels.iter() { + level.validate()?; + } + Ok(()) + } + + fn close(&self) -> Result<()> { + self.cleanup_levels() } // cleanup all level's handler @@ -185,7 +260,7 @@ impl LevelsController { let changes = vec![delete_change, create_change]; let kv = self.must_kv(); let mut manifest = kv.manifest.write().await; - manifest.add_changes(changes)?; + manifest.add_changes(changes).await?; // We have to add to next_level before we remove from this_level, not after. This way, we // don't have a bug where reads would see keys missing from both levels. // @@ -219,7 +294,7 @@ impl LevelsController { { let kv = self.must_kv(); let mut manifest = kv.manifest.write().await; - manifest.add_changes(change_set)?; + manifest.add_changes(change_set).await?; } // See comment earlier in this function about the ordering of these ops, and the order in which @@ -291,7 +366,7 @@ impl LevelsController { let file_id = self.reserve_file_id(); let dir = self.must_kv().opt.dir.clone(); - let file_name = new_file_name(file_id, dir.to_string()); + let file_name = new_file_name(file_id, &dir); let kv = self.must_kv(); let worker = g.worker(); let tx = tx.clone(); @@ -636,3 +711,27 @@ impl CompactDef { self.this_level.unlock_exclusive(); } } +// Checks that all necessary table files exist and removes all table files not +// referenced by the manifest. id_map is a set of table file id's that were read from the directory +// listing. +fn revert_to_manifest(kv: &XArc<KV>, mf: &Manifest, id_map: HashSet<u64>) -> Result<()> { + // 1. Check all files in manifest exist. + for id in &mf.tables { + if !id_map.contains(id.0) { + return Err(format!("file does not exist for table {}", id.0).into()); + } + } + + // 2. Delete files that shouldn't exist.
+ for id in &id_map { + if !mf.tables.contains_key(id) { + error!("table file {} not referenced in MANIFEST", id); + let file_name = new_file_name(*id, &kv.opt.dir); + if let Err(err) = remove_file(file_name) { + error!("While removing table {}, err: {}", id, err); + } + } + } + Ok(()) +} diff --git a/src/manifest.rs b/src/manifest.rs index f6ab6c0..fd3d212 100644 --- a/src/manifest.rs +++ b/src/manifest.rs @@ -3,7 +3,7 @@ use crate::pb::badgerpb3::manifest_change::Operation; use crate::pb::badgerpb3::{ManifestChange, ManifestChangeSet}; use crate::y::{is_eof, open_existing_synced_file}; use crate::Error::{BadMagic, Unexpected}; -use crate::Result; +use crate::{is_existing, Result}; use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use log::info; use parking_lot::RwLock; @@ -38,7 +38,7 @@ pub struct LevelManifest { /// in the LSM tree. #[derive(Default, Clone)] pub struct TableManifest { - level: u8, + pub level: u8, } #[derive(Default)] @@ -50,22 +50,22 @@ pub struct ManifestFile { // Access must be with a lock. // Used to track the current state of the manifest, used when rewriting. - manifest: Arc>, + manifest: Arc>, } impl ManifestFile { /// Write a batch of changes, atomically, to the file. By "atomically" that means when /// we replay the *MANIFEST* file, we'll either replay all the changes or none of them. (The truth of /// this depends on the filesystem) - pub fn add_changes(&mut self, changes: Vec) -> Result<()> { + pub async fn add_changes(&mut self, changes: Vec) -> Result<()> { let mut mf_changes = ManifestChangeSet::new(); mf_changes.changes.extend(changes); let mf_buffer = mf_changes.write_to_bytes().unwrap(); // Maybe we could user O_APPEND instead (on certain file systems) - apply_manifest_change_set(self.manifest.clone(), &mf_changes)?; - // Rewrite manifest if it'd shrink by 1/10 and it's big enough to care + apply_manifest_change_set(self.manifest.clone(), &mf_changes).await?; + // Rewrite manifest if it'd shrink by 1/10, and it's big enough to care let rewrite = { - let mf_lck = self.manifest.read(); + let mf_lck = self.manifest.read().await; mf_lck.deletions > self .deletions_rewrite_threshold @@ -74,7 +74,7 @@ impl ManifestFile { > MANIFEST_DELETIONS_RATIO * (mf_lck.creations - mf_lck.deletions) }; if rewrite { - self.rewrite()?; + self.rewrite().await?; } else { let mut buffer = Cursor::new(vec![]); buffer.write_u32::(mf_buffer.len() as u32)?; @@ -88,19 +88,22 @@ impl ManifestFile { } /// Must be called while appendLock is held. - pub fn rewrite(&mut self) -> Result<()> { + pub async fn rewrite(&mut self) -> Result<()> { { self.fp.take(); } - let (fp, n) = Self::help_rewrite(&self.directory, self.manifest.clone())?; + let (fp, n) = Self::help_rewrite(&self.directory, &self.manifest).await?; self.fp = Some(fp); - let mut m_lck = self.manifest.write(); + let mut m_lck = self.manifest.write().await; m_lck.creations = n; m_lck.deletions = 0; Ok(()) } - fn help_rewrite(dir: &str, m: Arc>) -> Result<(File, usize)> { + async fn help_rewrite( + dir: &str, + m: &Arc>, + ) -> Result<(File, usize)> { let rewrite_path = Path::new(dir).join(MANIFEST_REWRITE_FILENAME); // We explicitly sync. 
let mut fp = File::options() @@ -113,7 +116,7 @@ impl ManifestFile { wt.write_all(MAGIC_TEXT)?; wt.write_u32::(MAGIC_VERSION)?; - let m_lck = m.read(); + let m_lck = m.read().await; let net_creations = m_lck.tables.len(); let mut mf_set = ManifestChangeSet::new(); mf_set.changes = m_lck.as_changes(); @@ -139,13 +142,16 @@ impl ManifestFile { Ok((fp, net_creations)) } - fn open_or_create_manifest_file(dir: &str, deletions_threshold: u32) -> Result { + async fn open_or_create_manifest_file( + dir: &str, + deletions_threshold: u32, + ) -> Result { let path = Path::new(dir).join(MANIFEST_FILENAME); // We explicitly sync in add_changes, outside the lock. let fp = open_existing_synced_file(path.to_str().unwrap(), false); return match fp { Ok(mut fp) => { - let (manifest, trunc_offset) = Manifest::replay_manifest_file(&mut fp)?; + let (manifest, trunc_offset) = Manifest::replay_manifest_file(&mut fp).await?; fp.set_len(trunc_offset as u64)?; fp.seek(SeekFrom::End(0))?; info!("recover a new manifest, offset: {}", trunc_offset); @@ -153,12 +159,12 @@ impl ManifestFile { fp: Some(fp), directory: dir.to_string(), deletions_rewrite_threshold: AtomicU32::new(deletions_threshold), - manifest: Arc::new(RwLock::new(manifest)), + manifest: Arc::new(tokio::sync::RwLock::new(manifest)), }) } Err(err) if err.is_io_notfound() => { - let mf = Arc::new(RwLock::new(Manifest::new())); - let (fp, n) = Self::help_rewrite(dir, mf.clone())?; + let mf = Arc::new(tokio::sync::RwLock::new(Manifest::new())); + let (fp, n) = Self::help_rewrite(dir, &mf).await?; assert_eq!(n, 0); info!("create a new manifest"); Ok(ManifestFile { @@ -206,7 +212,7 @@ impl Manifest { /// Also, returns the last offset after a completely read manifest entry -- the file must be /// truncated at that point before further appends are made (if there is a partial entry after /// that). In normal conditions, trunc_offset is the file size. - pub fn replay_manifest_file(fp: &mut File) -> Result<(Manifest, usize)> { + pub async fn replay_manifest_file(fp: &mut File) -> Result<(Manifest, usize)> { let mut magic = vec![0u8; 4]; if fp.read(&mut magic)? != 4 { return Err(BadMagic); @@ -218,7 +224,7 @@ impl Manifest { return Err(BadMagic); } - let build = Arc::new(RwLock::new(Manifest::new())); + let build = Arc::new(tokio::sync::RwLock::new(Manifest::new())); let mut offset = 8; loop { let sz = fp.read_u32::(); @@ -237,11 +243,11 @@ impl Manifest { break; } let mf_set = ManifestChangeSet::parse_from_bytes(&buffer).map_err(|_| BadMagic)?; - apply_manifest_change_set(build.clone(), &mf_set)?; + apply_manifest_change_set(build.clone(), &mf_set).await?; offset = offset + 8 + sz as usize; } - let build = build.write().clone(); + let build = build.write().await.clone(); // so, return the lasted ManifestFile Ok((build, offset)) } @@ -299,19 +305,22 @@ impl Manifest { // this is not a "recoverable" error -- opening the KV store fails because the MANIFEST file // is just plain broken. 
-fn apply_manifest_change_set( - build: Arc>, +async fn apply_manifest_change_set( + build: Arc>, mf_set: &ManifestChangeSet, ) -> Result<()> { for change in mf_set.changes.iter() { - apply_manifest_change(build.clone(), change)?; + apply_manifest_change(build.clone(), change).await?; } Ok(()) } -fn apply_manifest_change(build: Arc>, tc: &ManifestChange) -> Result<()> { +async fn apply_manifest_change( + build: Arc>, + tc: &ManifestChange, +) -> Result<()> { let op = Operation::from_i32(tc.Op.value()).unwrap(); - let mut build = build.write(); + let mut build = build.write().await; match op { Operation::CREATE => { if build.tables.contains_key(&tc.Id) { @@ -352,10 +361,28 @@ fn apply_manifest_change(build: Arc>, tc: &ManifestChange) -> R Ok(()) } -pub(crate) fn open_or_create_manifest_file(dir: &str) -> Result<(ManifestFile, Manifest)> { +pub(crate) async fn open_or_create_manifest_file(dir: &str) -> Result<(ManifestFile, Manifest)> { + let manifest = Arc::new(tokio::sync::RwLock::new(Manifest::new())); + let (fp, sz) = ManifestFile::help_rewrite(dir, &manifest).await?; + Ok((ManifestFile::default(), Manifest::default())) } +pub(crate) async fn help_open_or_create_manifest_file( + dir: &str, +) -> Result<(ManifestFile, Manifest)> { + let fpath = Path::new(dir).join(MANIFEST_FILENAME); + let fp = open_existing_synced_file(fpath.as_str().unwrap(), true); + if fp.is_err() { + if !is_existing(&fp.map_err()) { + return Err(fp.unwrap_err()); + } + let mt = Manifest::new(); + let fp = mt.help_rewrite(dir)?; + // let fp = mt.help_rewrite(dir).await?; + } +} + #[derive(Debug)] pub(crate) struct ManifestChangeBuilder { id: u64, diff --git a/src/options/mod.rs b/src/options/mod.rs index 1d7010c..0784dd4 100644 --- a/src/options/mod.rs +++ b/src/options/mod.rs @@ -36,7 +36,7 @@ pub struct Options { /// Each table (or file) is at most this size. pub max_table_size: u64, /// Equals SizeOf(Li+1)/SizeOf(Li). - pub level_size_multiplier: usize, + pub level_size_multiplier: u64, /// Maximum number of levels of compaction. pub max_levels: usize, /// If value size >= this threshold, only store value offsets in tree. 
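The add_changes/replay pair in src/manifest.rs above agree on a single record framing for the MANIFEST file: a big-endian u32 payload length, a big-endian u32 CRC32 of the payload, then the protobuf-encoded change set itself. A torn or corrupt tail fails the length read or the checksum comparison, replay stops, and the file is truncated back to the last good offset. A self-contained sketch of that framing using the same byteorder and crc32fast crates the patch uses; write_record and read_record are illustrative names, not the patch's API:

use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
use std::io::{Cursor, Read, Write};

fn write_record(wt: &mut impl Write, payload: &[u8]) -> std::io::Result<()> {
    wt.write_u32::<BigEndian>(payload.len() as u32)?;
    wt.write_u32::<BigEndian>(crc32fast::hash(payload))?;
    wt.write_all(payload)
}

// Returns None on a truncated or corrupt record, mirroring where replay stops.
fn read_record(rd: &mut impl Read) -> Option<Vec<u8>> {
    let len = rd.read_u32::<BigEndian>().ok()?;
    let crc = rd.read_u32::<BigEndian>().ok()?;
    let mut payload = vec![0u8; len as usize];
    rd.read_exact(&mut payload).ok()?;
    (crc32fast::hash(&payload) == crc).then_some(payload)
}

fn main() {
    let mut buf = vec![];
    write_record(&mut buf, b"manifest-change-set").unwrap();
    let mut rd = Cursor::new(buf);
    assert_eq!(read_record(&mut rd).as_deref(), Some(&b"manifest-change-set"[..]));
}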
diff --git a/src/table/table.rs b/src/table/table.rs index 024823b..c1d8c2f 100644 --- a/src/table/table.rs +++ b/src/table/table.rs @@ -403,8 +403,8 @@ pub fn id_to_filename(id: u64) -> String { format!("{}{}", id, FILE_SUFFIX) } -pub fn new_file_name(id: u64, dir: String) -> String { - Path::new(&dir) +pub fn new_file_name(id: u64, dir: &str) -> String { + Path::new(dir) .join(&id_to_filename(id)) .to_str() .unwrap() diff --git a/src/y/mod.rs b/src/y/mod.rs index 6fdcfac..965fbce 100644 --- a/src/y/mod.rs +++ b/src/y/mod.rs @@ -269,7 +269,7 @@ pub(crate) fn create_synced_file(file_name: &str, synce: bool) -> Result { .map_err(|err| err.into()) } -pub(crate) fn sync_directory(d: &String) -> Result<()> { +pub(crate) fn sync_directory(d: &str) -> Result<()> { let mut fp = File::open(d)?; fp.sync_all().map_err(|err| err.into()) } From bd089a5e6f9d86df3432d8848d01e33b2f8bdbfe Mon Sep 17 00:00:00 2001 From: Rg Date: Fri, 10 Feb 2023 01:38:27 +0800 Subject: [PATCH 24/77] :sleep: --- src/kv.rs | 4 +- src/levels.rs | 1 - src/manifest.rs | 107 ++++++++++++++++++++++++++++++------------------ 3 files changed, 69 insertions(+), 43 deletions(-) diff --git a/src/kv.rs b/src/kv.rs index f8343f0..0f6e3c9 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -67,7 +67,7 @@ impl KV { if !(opt.value_log_file_size <= 2 << 30 && opt.value_log_file_size >= 1 << 20) { return Err(Error::ValueLogSize); } - let (manifest_file, manifest) = open_or_create_manifest_file(opt.dir.as_str()).await?; + let manifest_file = open_or_create_manifest_file(opt.dir.as_str()).await?; let dir_lock_guard = OpenOptions::new() .write(true) .append(true) @@ -80,7 +80,7 @@ impl KV { .create(true) .open(Path::new(opt.value_dir.as_str()).join("value_dir_guard.lock"))?; value_dir_guard.lock_exclusive()?; - let mut closers = Closers { + let closers = Closers { update_size: Closer::new(0), compactors: Closer::new(0), mem_table: Closer::new(0), diff --git a/src/levels.rs b/src/levels.rs index bc6eb7f..d02947a 100644 --- a/src/levels.rs +++ b/src/levels.rs @@ -29,7 +29,6 @@ use std::sync::Arc; use std::time::{Duration, SystemTime}; use std::vec; use tokio::macros::support::thread_rng_n; -use tracing_subscriber::fmt::format; #[derive(Clone)] pub(crate) struct LevelsController { diff --git a/src/manifest.rs b/src/manifest.rs index fd3d212..df860c8 100644 --- a/src/manifest.rs +++ b/src/manifest.rs @@ -1,7 +1,7 @@ // use crate::pb::badgerpb3::{ManifestChange, ManifestChangeSet, ManifestChange_Operation}; use crate::pb::badgerpb3::manifest_change::Operation; use crate::pb::badgerpb3::{ManifestChange, ManifestChangeSet}; -use crate::y::{is_eof, open_existing_synced_file}; +use crate::y::{is_eof, open_existing_synced_file, sync_directory}; use crate::Error::{BadMagic, Unexpected}; use crate::{is_existing, Result}; use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; @@ -11,14 +11,16 @@ use protobuf::{Enum, EnumOrUnknown, Message}; use std::collections::{HashMap, HashSet}; use std::fs::{rename, File}; use std::io::{Cursor, Read, Seek, SeekFrom, Write}; +use std::os::fd::AsRawFd; use std::path::Path; use std::sync::atomic::AtomicU32; use std::sync::Arc; +use tokio::io::AsyncWriteExt; // Manifest file const MANIFEST_FILENAME: &str = "MANIFEST"; const MANIFEST_REWRITE_FILENAME: &str = "MANIFEST-REWRITE"; -const MANIFEST_DELETIONS_REWRITE_THRESHOLD: usize = 10000; +const MANIFEST_DELETIONS_REWRITE_THRESHOLD: u32 = 10000; const MANIFEST_DELETIONS_RATIO: usize = 10; // Has to be 4 bytes. The value can never change, ever, anyway. 
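Since those magic constants anchor every read of the MANIFEST file, a small sketch of the 8-byte header contract; the tag value b"Bdgr" and version 1 are assumptions here, the real constants sit next to MAGIC_TEXT and MAGIC_VERSION in manifest.rs:

use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
use std::io::{Read, Write};

const MAGIC_TEXT: &[u8; 4] = b"Bdgr"; // assumed value; must stay exactly 4 bytes
const MAGIC_VERSION: u32 = 1;         // assumed value; bumped on format changes

fn write_header<W: Write>(wt: &mut W) -> std::io::Result<()> {
    wt.write_all(MAGIC_TEXT)?;
    wt.write_u32::<BigEndian>(MAGIC_VERSION)
}

fn header_ok<R: Read>(rd: &mut R) -> std::io::Result<bool> {
    let mut magic = [0u8; 4];
    rd.read_exact(&mut magic)?;
    Ok(&magic == MAGIC_TEXT && rd.read_u32::<BigEndian>()? == MAGIC_VERSION)
}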
@@ -43,14 +45,14 @@ pub struct TableManifest { #[derive(Default)] pub struct ManifestFile { - fp: Option, - directory: String, + pub(crate) fp: Option, + pub(crate) directory: String, // We make this configurable so that unit tests can hit rewrite() code quickly - deletions_rewrite_threshold: AtomicU32, + pub(crate) deletions_rewrite_threshold: AtomicU32, // Access must be with a lock. // Used to track the current state of the manifest, used when rewriting. - manifest: Arc>, + pub(crate) manifest: Arc>, } impl ManifestFile { @@ -76,11 +78,11 @@ impl ManifestFile { if rewrite { self.rewrite().await?; } else { - let mut buffer = Cursor::new(vec![]); - buffer.write_u32::(mf_buffer.len() as u32)?; + let mut buffer = tokio::io::BufWriter::new(vec![]); + buffer.write_u32(mf_buffer.len() as u32).await?; let crc32 = crc32fast::hash(&mf_buffer); - buffer.write_u32::(crc32)?; - buffer.write_all(&mf_buffer)?; + buffer.write_u32(crc32).await?; + buffer.write_all(&mf_buffer).await?; self.fp.as_mut().unwrap().write_all(&buffer.into_inner())?; } self.fp.as_mut().unwrap().sync_all()?; @@ -112,19 +114,19 @@ impl ManifestFile { .truncate(true) .read(true) .open(&rewrite_path)?; - let mut wt = Cursor::new(vec![]); - wt.write_all(MAGIC_TEXT)?; - wt.write_u32::(MAGIC_VERSION)?; + let mut wt = tokio::io::BufWriter::new(vec![]); + wt.write_all(MAGIC_TEXT).await?; + wt.write_u32(MAGIC_VERSION).await?; let m_lck = m.read().await; let net_creations = m_lck.tables.len(); let mut mf_set = ManifestChangeSet::new(); mf_set.changes = m_lck.as_changes(); let mf_buffer = mf_set.write_to_bytes().unwrap(); - wt.write_u32::(mf_buffer.len() as u32)?; + wt.write_u32(mf_buffer.len() as u32).await?; let crc32 = crc32fast::hash(&*mf_buffer); - wt.write_u32::(crc32)?; - wt.write_all(&*mf_buffer)?; + wt.write_u32(crc32).await?; + wt.write_all(&*mf_buffer).await?; fp.write_all(&*wt.into_inner())?; fp.sync_all()?; drop(fp); @@ -252,7 +254,8 @@ impl Manifest { Ok((build, offset)) } - fn help_rewrite(&self, dir: &str) -> Result<(File, usize)> { + async fn help_rewrite(&self, dir: &str) -> Result<(File, usize)> { + use tokio::io::AsyncWriteExt; let rewrite_path = Path::new(dir).join(MANIFEST_REWRITE_FILENAME); // We explicitly sync. 
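Two properties of the Cursor-to-BufWriter swap above are worth pinning down while the help_rewrite body continues below. First, tokio's AsyncWriteExt::write_u32 encodes big-endian, so it is a faithful stand-in for byteorder's write_u32::<BigEndian> and the on-disk framing is unchanged. Second, unlike std::io::BufWriter, tokio's BufWriter::into_inner does not flush -- leftover data in its internal buffer is lost -- so the buffer must be flushed before it is taken. A sketch of the safe order:

use tokio::io::{AsyncWriteExt, BufWriter};

// Frame one change-set payload into a Vec<u8> (sketch).
async fn frame_async(payload: &[u8]) -> std::io::Result<Vec<u8>> {
    let mut wt = BufWriter::new(Vec::new());
    wt.write_u32(payload.len() as u32).await?; // big-endian by definition
    wt.write_u32(crc32fast::hash(payload)).await?;
    wt.write_all(payload).await?;
    wt.flush().await?; // without this, into_inner() would drop the buffered bytes
    Ok(wt.into_inner())
}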
let mut fp = File::options() @@ -261,26 +264,28 @@ impl Manifest { .truncate(true) .read(true) .open(&rewrite_path)?; - let mut wt = Cursor::new(vec![]); - wt.write_all(MAGIC_TEXT)?; - wt.write_u32::(MAGIC_VERSION)?; + let mut fp = tokio::fs::File::from_std(fp); + let mut wt = tokio::io::BufWriter::new(vec![]); + // let mut wt = Cursor::new(vec![]); + wt.write_all(MAGIC_TEXT).await?; + wt.write_u32(MAGIC_VERSION).await?; let net_creations = self.tables.len(); let mut mf_set = ManifestChangeSet::new(); mf_set.changes = self.as_changes(); let mf_buffer = mf_set.write_to_bytes().unwrap(); - wt.write_u32::(mf_buffer.len() as u32)?; + wt.write_u32(mf_buffer.len() as u32).await?; let crc32 = crc32fast::hash(&*mf_buffer); - wt.write_u32::(crc32)?; - wt.write_all(&*mf_buffer)?; - fp.write_all(&*wt.into_inner())?; - fp.sync_all()?; + wt.write_u32(crc32).await?; + wt.write_all(&*mf_buffer).await?; + fp.write_all(&*wt.into_inner()).await?; + fp.flush().await?; + fp.sync_all().await?; drop(fp); let manifest_path = Path::new(dir).join(MANIFEST_FILENAME); - rename(&rewrite_path, &manifest_path)?; - // TODO add directory sync - + tokio::fs::rename(&rewrite_path, &manifest_path).await?; + sync_directory(dir)?; let fp = File::options() .create(true) .write(true) @@ -361,26 +366,48 @@ async fn apply_manifest_change( Ok(()) } -pub(crate) async fn open_or_create_manifest_file(dir: &str) -> Result<(ManifestFile, Manifest)> { - let manifest = Arc::new(tokio::sync::RwLock::new(Manifest::new())); - let (fp, sz) = ManifestFile::help_rewrite(dir, &manifest).await?; - - Ok((ManifestFile::default(), Manifest::default())) +pub(crate) async fn open_or_create_manifest_file(dir: &str) -> Result { + help_open_or_create_manifest_file(dir, MANIFEST_DELETIONS_REWRITE_THRESHOLD).await } +// Open it if not exist, otherwise create a new manifest file with dir directory pub(crate) async fn help_open_or_create_manifest_file( dir: &str, -) -> Result<(ManifestFile, Manifest)> { + deletions_threshold: u32, +) -> Result { let fpath = Path::new(dir).join(MANIFEST_FILENAME); - let fp = open_existing_synced_file(fpath.as_str().unwrap(), true); + let fpath = fpath.to_str(); + // We explicitly sync in add_changes, outside the lock. + let fp = open_existing_synced_file(fpath.unwrap(), true); if fp.is_err() { - if !is_existing(&fp.map_err()) { - return Err(fp.unwrap_err()); + let err = fp.unwrap_err(); + if !err.is_io_existing() { + return Err(err); } - let mt = Manifest::new(); - let fp = mt.help_rewrite(dir)?; - // let fp = mt.help_rewrite(dir).await?; + // open exist Manifest + let mt = Arc::new(tokio::sync::RwLock::new(Manifest::new())); + let (fp, net_creations) = mt.read().await.help_rewrite(dir).await?; + assert_eq!(net_creations, 0); + let mf = ManifestFile { + fp: Some(fp), + directory: dir.to_string(), + deletions_rewrite_threshold: Default::default(), + manifest: mt, + }; + return Ok(mf); } + let mut fp = fp.unwrap(); + let (mf, trunc_offset) = Manifest::replay_manifest_file(&mut fp).await?; + // Truncate file so we don't have a half-written entry at the end. 
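One detail of the recovery step this comment introduces: after set_len, the pre-async revision of this routine seeked to SeekFrom::End(0) so that subsequent add_changes appends land after the last good entry, while the code below seeks to Start(0); whether that is equivalent depends on whether the handle was opened in append mode. A sketch of the end-seeking variant:

use std::fs::File;
use std::io::{Seek, SeekFrom};

// Drop any torn tail, then position the cursor for appends.
fn truncate_torn_tail(fp: &mut File, trunc_offset: u64) -> std::io::Result<u64> {
    fp.set_len(trunc_offset)?; // cut off the half-written entry, if any
    fp.seek(SeekFrom::End(0))  // returns trunc_offset: ready to append
}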
+ fp.set_len(trunc_offset as u64)?; + fp.seek(SeekFrom::Start(0))?; + + Ok(ManifestFile { + fp: Some(fp), + directory: dir.to_string(), + deletions_rewrite_threshold: AtomicU32::new(deletions_threshold), + manifest: Arc::new(tokio::sync::RwLock::new(mf)), + }) } #[derive(Debug)] From 994a99e9ef5b864a9549899ae4f0c45323f73cf5 Mon Sep 17 00:00:00 2001 From: Rg Date: Sun, 12 Feb 2023 17:47:47 +0800 Subject: [PATCH 25/77] :card: --- Cargo.toml | 1 + src/kv.rs | 124 +++++++++++++++++++++++++++++++++++++----- src/level_handler.rs | 15 +++--- src/levels.rs | 126 ++++++++++++++++++++++--------------------- src/types.rs | 68 +++++++++++------------ 5 files changed, 216 insertions(+), 118 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 36685cb..d2ebaca 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -39,6 +39,7 @@ protobuf = { version = "3.2.0", features = ["with-bytes"] } range-lock = "0.2.2" tracing = "0.1.37" drop_cell = "0.0.0" +walkdir = "2.3.2" [dev-dependencies] tracing-subscriber = "0.3.16" tracing-log = "0.1.3" diff --git a/src/kv.rs b/src/kv.rs index 0f6e3c9..4a04d24 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -1,3 +1,4 @@ +use crate::levels::{LevelsController, XLevelsController}; use crate::manifest::{open_or_create_manifest_file, Manifest, ManifestFile}; use crate::options::Options; use crate::table::builder::Builder; @@ -6,14 +7,15 @@ use crate::types::{Channel, Closer, XArc, XWeak}; use crate::value_log::{Request, ValueLogCore, ValuePointer}; use crate::y::{Encode, Result, ValueStruct}; use crate::Error::Unexpected; -use crate::{Error, Node, SkipList}; +use crate::{Decode, Error, Node, SkipList}; +use drop_cell::defer; use fs2::FileExt; -use log::info; +use log::{info, Log}; use std::borrow::BorrowMut; use std::fs::{read_dir, File}; use std::fs::{try_exists, OpenOptions}; -use std::io::Write; -use std::path::Path; +use std::io::{Cursor, Write}; +use std::path::{Path, PathBuf}; use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; use std::sync::{Arc, Weak}; use tokio::fs::create_dir_all; @@ -40,6 +42,7 @@ pub struct KV { pub opt: Options, pub vlog: Option, pub manifest: Arc>, + // lc: XWeak, flush_chan: Channel, // write_chan: Channel, dir_lock_guard: File, @@ -93,6 +96,7 @@ impl KV { opt: opt.clone(), vlog: None, manifest: Arc::new(RwLock::new(manifest_file)), + // lc: Default::default(), flush_chan: Channel::new(1), // write_chan: Channel::new(1), dir_lock_guard, @@ -102,20 +106,55 @@ impl KV { imm: Vec::new(), last_used_cas_counter: Default::default(), }; + + let manifest = out.manifest.clone(); + + // handle levels_controller + let lc = LevelsController::new(manifest.clone(), out.opt.clone()).await?; + lc.start_compact(out.closers.compactors.clone()); + let mut vlog = ValueLogCore::default(); vlog.open(&out, opt)?; - out.vlog = Some(vlog); - Ok(XArc::new(out)) - } - // pub fn must_vlog(&self) -> &ValueLogCore { - // self.vlog.as_ref().unwrap() - // } + let xout = XArc::new(out); + // update size + { + let _out = xout.clone(); + tokio::spawn(async move { + _out.spawn_update_size().await; + }); + } + // memtable closer + { + let _out = xout.clone(); + tokio::spawn(async move { + _out.flush_mem_table(_out.closers.mem_table.clone()).await; + }); + } + + let item = xout.get(_HEAD); + if item.is_err() { + return Err("Retrieving head".into()); + } + let item = item.unwrap(); + + let value = &item.value; + if value != _HEAD { + return Err("Retrieving head".into()); + } + + // lastUsedCasCounter will either be the value stored in !badger!head, or some subsequently + // written value 
log entry that we replay. (Subsequent value log entries might be _less_ + // than lastUsedCasCounter, if there was value log gc so we have to max() values while + // replaying.) + xout.last_used_cas_counter + .store(item.cas_counter, Ordering::Relaxed); - // pub fn must_mut_vlog(&mut self) -> &mut ValueLogCore { - // self.vlog.as_mut().unwrap() - // } + let mut vptr = ValuePointer::default(); + vptr.dec(&mut Cursor::new(value))?; + Ok(xout) + } /// close kv, should be call only once pub async fn close(&self) -> Result<()> { @@ -134,6 +173,24 @@ impl KV { } impl KV { + async fn walk_dir(dir: &str) -> Result<(u64, u64)> { + let mut lsm_size = 0; + let mut vlog_size = 0; + let mut entries = tokio::fs::read_dir("dir").await?; + while let Some(entry) = entries.next_entry().await? { + let meta = entry.metadata().await?; + if meta.is_dir() { + continue; + } + if entry.file_name().to_str().unwrap().ends_with(".sst") { + lsm_size += meta.len(); + } else if entry.file_name().to_str().unwrap().ends_with(".vlog") { + vlog_size += meta.len(); + } + } + Ok((lsm_size, vlog_size)) + } + // get returns the value in `mem_table` or disk for given key. // Note that value will include meta byte. pub(crate) fn get(&self, key: &[u8]) -> Result { @@ -218,7 +275,7 @@ impl KV { Arc::new(Ok(())) } - async fn flush_mem_table(&self, lc: &Closer) -> Result<()> { + async fn flush_mem_table(&self, lc: Closer) -> Result<()> { while let Ok(task) = self.flush_chan.recv().await { if task.mt.is_none() { break; @@ -263,6 +320,36 @@ pub type WeakKV = XWeak; pub type ArcKV = XArc; impl ArcKV { + /// data size stats + /// TODO + pub async fn spawn_update_size(&self) { + let lc = self.closers.update_size.spawn(); + defer! { + lc.done(); + info!("exit update size worker"); + } + + let mut tk = tokio::time::interval(tokio::time::Duration::from_secs(5 * 60)); + let dir = self.opt.dir.clone(); + let vdir = self.opt.value_dir.clone(); + loop { + let c = lc.has_been_closed(); + tokio::select! { + _ = tk.tick() => { + info!("ready to update size"); + // If value directory is different from dir, we'd have to do another walk. 
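A note on the size walk feeding this loop: walk_dir above passes the string literal "dir" to read_dir where the dir parameter is presumably intended, and the re-walk in the branch below scans dir where vdir is presumably intended. A corrected sketch of the walk itself, under those assumptions:

use std::path::Path;

// Tally .sst bytes (LSM) and .vlog bytes (value log) under one directory.
async fn walk_dir_sketch(dir: &str) -> std::io::Result<(u64, u64)> {
    let (mut lsm_size, mut vlog_size) = (0u64, 0u64);
    let mut entries = tokio::fs::read_dir(dir).await?;
    while let Some(entry) = entries.next_entry().await? {
        let meta = entry.metadata().await?;
        if meta.is_dir() {
            continue;
        }
        match Path::new(&entry.file_name()).extension().and_then(|e| e.to_str()) {
            Some("sst") => lsm_size += meta.len(),
            Some("vlog") => vlog_size += meta.len(),
            _ => {}
        }
    }
    Ok((lsm_size, vlog_size))
}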
+ let (lsm_sz, vlog_sz) = KV::walk_dir(dir.as_str()).await.unwrap(); + if dir != vdir { + let (_, vlog_sz) = KV::walk_dir(dir.as_str()).await.unwrap(); + } + }, + _ = c.recv() => { + + }, + } + } + } + pub async fn manifest_wl(&self) -> RwLockWriteGuard<'_, ManifestFile> { self.manifest.write().await } @@ -278,6 +365,15 @@ impl ArcKV { } } +impl ArcKV { + async fn do_writes(&self) { + // TODO add metrics + loop { + + } + } +} + impl Clone for WeakKV { fn clone(&self) -> Self { XWeak { x: self.x.clone() } diff --git a/src/level_handler.rs b/src/level_handler.rs index ee26f45..fc3bd1b 100644 --- a/src/level_handler.rs +++ b/src/level_handler.rs @@ -8,6 +8,7 @@ use crate::Result; use core::slice::SlicePattern; use crate::levels::CompactDef; +use crate::options::Options; use crate::table::builder::Builder; use drop_cell::defer; use log::info; @@ -203,7 +204,7 @@ impl LevelHandler { pub(crate) fn try_add_level0_table(&self, t: Table) -> bool { assert_eq!(self.x.level.load(Ordering::Relaxed), 0); let tw = self.tables_wl(); - if tw.len() >= self.kv().x.opt.num_level_zero_tables_stall { + if tw.len() >= self.opt.num_level_zero_tables_stall { return false; } t.incr_ref(); @@ -264,10 +265,6 @@ impl LevelHandler { pub(crate) fn level(&self) -> usize { self.x.level.load(Ordering::Relaxed) as usize } - - fn kv(&self) -> XArc { - self.x.kv.upgrade().unwrap() - } } pub(crate) struct LevelHandlerInner { @@ -284,11 +281,11 @@ pub(crate) struct LevelHandlerInner { pub(crate) level: AtomicI32, str_level: Arc, pub(crate) max_total_size: AtomicU64, - kv: WeakKV, + opt: Options, } impl LevelHandlerInner { - pub(crate) fn new(kv: WeakKV, level: usize) -> LevelHandlerInner { + pub(crate) fn new(opt: Options, level: usize) -> LevelHandlerInner { LevelHandlerInner { self_lock: Arc::new(Default::default()), tables: Arc::new(Default::default()), @@ -296,7 +293,7 @@ impl LevelHandlerInner { level: Default::default(), str_level: Arc::new(format!("L{}", level)), max_total_size: Default::default(), - kv, + opt, } } @@ -339,7 +336,7 @@ impl LevelHandlerInner { #[test] fn raw_lock() { - let lock = LevelHandlerInner::new(WeakKV::new(), 10); + let lock = LevelHandlerInner::new(Options::default(), 10); lock.lock_shared(); lock.lock_shared(); assert_eq!(false, lock.try_lock_exclusive()); diff --git a/src/levels.rs b/src/levels.rs index d02947a..ab4be3f 100644 --- a/src/levels.rs +++ b/src/levels.rs @@ -1,7 +1,8 @@ use crate::compaction::{CompactStatus, KeyRange, LevelCompactStatus, INFO_RANGE}; use crate::kv::{ArcKV, WeakKV, KV}; use crate::level_handler::{LevelHandler, LevelHandlerInner}; -use crate::manifest::{Manifest, ManifestChangeBuilder}; +use crate::manifest::{Manifest, ManifestChangeBuilder, ManifestFile}; +use crate::options::Options; use crate::pb::badgerpb3::manifest_change::Operation::{CREATE, DELETE}; use crate::pb::badgerpb3::ManifestChange; use crate::table::builder::Builder; @@ -34,31 +35,27 @@ use tokio::macros::support::thread_rng_n; pub(crate) struct LevelsController { // The following are initialized once and const levels: Arc>, - kv: WeakKV, next_file_id: Arc, // For ending compactions. 
compact_worker_wg: Arc, // Store compact status that will be run or has running c_status: Arc, + manifest: Arc>, + opt: Options, } -unsafe impl Sync for LevelsController {} - -unsafe impl Send for LevelsController {} - -impl Default for LevelsController { - fn default() -> Self { - todo!() - } -} +pub(crate) type XLevelsController = XArc; impl LevelsController { - fn new(kv: ArcKV, mf: &Manifest) -> Result { - assert!(kv.x.opt.num_level_zero_tables_stall > kv.x.opt.num_level_zero_tables); + pub(crate) async fn new( + manifest: Arc>, + opt: Options, + ) -> Result { + assert!(opt.num_level_zero_tables_stall > opt.num_level_zero_tables); let mut levels = vec![]; let cstatus = CompactStatus::default(); - for i in 0..kv.x.opt.max_levels { - let lh = LevelHandlerInner::new(WeakKV::from(&kv), i); + for i in 0..opt.max_levels { + let lh = LevelHandlerInner::new(opt.clone(), i); levels.push(LevelHandler::from(lh)); if i == 0 { // Do nothing @@ -66,39 +63,48 @@ impl LevelsController { // Level 1 probably shouldn't be too much bigger than level 0. levels[i] .max_total_size - .store(kv.opt.level_one_size, Ordering::Relaxed); + .store(opt.level_one_size, Ordering::Relaxed); } else { levels[i].max_total_size.store( levels[i - 1].max_total_size.load(Ordering::Relaxed) - * kv.opt.level_size_multiplier, + * opt.level_size_multiplier, Ordering::Relaxed, ); } cstatus.levels.write().push(LevelCompactStatus::default()); } // Compare manifest against directory, check for existent/non-existent files, and remove. - revert_to_manifest(&kv, mf, get_id_map(&kv.opt.dir))?; + let mf = manifest.read().await.manifest.clone(); + { + revert_to_manifest(opt.dir.as_str(), &mf, get_id_map(&opt.dir)).await?; + } // Some files may be deleted. Let's reload. let mut tables: Vec> = vec![vec![]; levels.len()]; let mut max_file_id = 0; - for (file_id, table_manifest) in &mf.tables { - let file_name = new_file_name(*file_id, kv.opt.dir.as_str()); - let fd = open_existing_synced_file(&file_name, true); - if fd.is_err() { - return Err( - format!("Openfile file: {}, err: {:?}", file_name, fd.unwrap_err()).into(), - ); - } + { + let mf = mf.write().await; + for (file_id, table_manifest) in &mf.tables { + let file_name = new_file_name(*file_id, opt.dir.as_str()); + let fd = open_existing_synced_file(&file_name, true); + if fd.is_err() { + return Err(format!( + "Openfile file: {}, err: {:?}", + file_name, + fd.unwrap_err() + ) + .into()); + } - let tb = TableCore::open_table(fd.unwrap(), &file_name, kv.opt.table_loading_mode); - if let Err(err) = tb { - return Err(format!("Openfile file: {}, err: {:?}", file_name, err).into()); - } - let table = Table::new(tb.unwrap()); - tables[table_manifest.level as usize].push(table); - if *file_id > max_file_id { - max_file_id = *file_id; + let tb = TableCore::open_table(fd.unwrap(), &file_name, opt.table_loading_mode); + if let Err(err) = tb { + return Err(format!("Openfile file: {}, err: {:?}", file_name, err).into()); + } + let table = Table::new(tb.unwrap()); + tables[table_manifest.level as usize].push(table); + if *file_id > max_file_id { + max_file_id = *file_id; + } } } @@ -106,21 +112,21 @@ impl LevelsController { for (i, tbls) in tables.into_iter().enumerate() { levels[i].init_tables(tbls); } - // Make sure key ranges do not overlap etc. 
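The max_total_size loop above fixes a geometric series of byte targets per level; in closed form (a sketch, using the option names from this crate):

// L1 = level_one_size; L_i = L_{i-1} * level_size_multiplier for i >= 2.
// Level 0 is bounded by table count (num_level_zero_tables), not bytes.
fn max_total_size_for(level: usize, level_one_size: u64, level_size_multiplier: u64) -> u64 {
    match level {
        0 => 0,
        1 => level_one_size,
        n => level_one_size * level_size_multiplier.pow((n - 1) as u32),
    }
}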
let level_controller = LevelsController { levels: Arc::new(levels), - kv: WeakKV::from(&kv), next_file_id: Arc::new(AtomicU64::new(next_file_id)), compact_worker_wg: Arc::new(Default::default()), c_status: Arc::new(cstatus), + manifest, + opt: opt.clone(), }; if let Err(err) = level_controller.validate() { let _ = level_controller.cleanup_levels(); return Err(format!("Level validation, err:{}", err).into()); } // Sync directory (because we have at least removed some files, or previously created the manifest file). - if let Err(err) = sync_directory(kv.opt.dir.as_str()) { + if let Err(err) = sync_directory(opt.dir.as_str()) { let _ = level_controller.close(); return Err(err); } @@ -148,8 +154,8 @@ impl LevelsController { } // start compact - fn start_compact(&self, lc: Closer) { - for i in 0..self.must_kv().opt.num_compactors { + pub(crate) fn start_compact(&self, lc: Closer) { + for i in 0..self.opt.num_compactors { let lc = lc.spawn(); let _self = self.clone(); tokio::spawn(async move { @@ -160,7 +166,7 @@ impl LevelsController { // compact worker async fn run_worker(&self, lc: Closer) { - if self.must_kv().opt.do_not_compact { + if self.opt.do_not_compact { lc.done(); return; } @@ -203,7 +209,7 @@ impl LevelsController { // Picks some table on level l and compacts it away to the next level. async fn do_compact(&self, p: CompactionPriority) -> Result { let l = p.level; - assert!(l + 1 < self.must_kv().opt.max_levels); // Sanity check. + assert!(l + 1 < self.opt.max_levels); // Sanity check. let mut cd = CompactDef::default(); cd.this_level = (self.levels[l]).clone(); cd.next_level = (self.levels[l + 1]).clone(); @@ -257,8 +263,7 @@ impl LevelsController { .with_op(CREATE) .build(); let changes = vec![delete_change, create_change]; - let kv = self.must_kv(); - let mut manifest = kv.manifest.write().await; + let mut manifest = self.manifest.write().await; manifest.add_changes(changes).await?; // We have to add to next_level before we remove from this_level, not after. This way, we // don't have a bug where reads would see keys missing from both levels. @@ -291,8 +296,7 @@ impl LevelsController { // We write to the manifest _before_ we delete files (and after we created files) { - let kv = self.must_kv(); - let mut manifest = kv.manifest.write().await; + let mut manifest = self.manifest.write().await; manifest.add_changes(change_set).await?; } @@ -348,7 +352,7 @@ impl LevelsController { let start_time = SystemTime::now(); let mut builder = Builder::default(); while let Some(value) = mitr.next() { - if builder.reached_capacity(self.must_kv().opt.max_table_size) { + if builder.reached_capacity(self.opt.max_table_size) { break; } assert!(builder.add(value.key(), value.value()).is_ok()); @@ -364,11 +368,11 @@ impl LevelsController { ); let file_id = self.reserve_file_id(); - let dir = self.must_kv().opt.dir.clone(); + let dir = self.opt.dir.clone(); let file_name = new_file_name(file_id, &dir); - let kv = self.must_kv(); let worker = g.worker(); let tx = tx.clone(); + let loading_mode = self.opt.table_loading_mode; tokio::spawn(async move { defer! 
{worker.done();} let fd = create_synced_file(&file_name, true); @@ -382,8 +386,7 @@ impl LevelsController { tx.send(Err(format!("Unable to write to file: {}", file_id).into())); return; } - let tbl = - TableCore::open_table(fd.unwrap(), &file_name, kv.opt.table_loading_mode); + let tbl = TableCore::open_table(fd.unwrap(), &file_name, loading_mode); if tbl.is_err() { let _ = tx.send(Err(format!("Unable to open table: {}", file_name).into())); } else { @@ -418,7 +421,7 @@ impl LevelsController { // Ensure created files's directory entries are visible, We don't mind the extra latency // from not doing this ASAP after all file creation has finished because this is a // background operation - first_err = sync_directory(&self.must_kv().opt.dir); + first_err = sync_directory(&self.opt.dir); } new_tables.sort_by(|a, b| a.to_ref().biggest().cmp(b.to_ref().biggest())); if first_err.is_err() { @@ -584,7 +587,7 @@ impl LevelsController { prios.push(CompactionPriority { level: 0, score: (self.levels[0].num_tables() as f64) - / (self.must_kv().opt.num_level_zero_tables as f64), + / (self.opt.num_level_zero_tables as f64), }) } @@ -608,11 +611,7 @@ impl LevelsController { // Return true if level zero may be compacted, without accounting for compactions that already // might be happening. fn is_level0_compactable(&self) -> bool { - self.levels[0].num_tables() >= self.must_kv().opt.num_level_zero_tables - } - - fn must_kv(&self) -> Arc { - self.kv.x.upgrade().unwrap() + self.levels[0].num_tables() >= self.opt.num_level_zero_tables } fn reserve_file_id(&self) -> u64 { @@ -714,9 +713,14 @@ impl CompactDef { // Checks that all necessary table files exist and removes all table files not // referenced by the manifest. id_map is a set of table file id's that were read from the directory // listing. -fn revert_to_manifest(kv: &XArc, mf: &Manifest, id_map: HashSet) -> Result<()> { +async fn revert_to_manifest( + dir: &str, + mf: &Arc>, + id_map: HashSet, +) -> Result<()> { + let tables = mf.write().await; // 1. Check all files in manifest exist. - for id in &mf.tables { + for id in &tables.tables { if !id_map.contains(id.0) { return Err(format!("file does not exist for table {}", id.0).into()); } @@ -724,9 +728,9 @@ fn revert_to_manifest(kv: &XArc, mf: &Manifest, id_map: HashSet) -> Res // 2. Delete files that shouldn't exist. 
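In miniature, the two reconciliation rules revert_to_manifest enforces (the deletion loop itself follows below): every id the manifest references must exist on disk, and every on-disk id the manifest does not reference is stale. A sketch over plain id sets:

use std::collections::HashSet;

// Returns the ids to delete, or an error if the manifest references a missing file.
fn reconcile(manifest_ids: &HashSet<u64>, dir_ids: &HashSet<u64>) -> Result<Vec<u64>, String> {
    // 1. Check all files in manifest exist.
    if let Some(missing) = manifest_ids.difference(dir_ids).next() {
        return Err(format!("file does not exist for table {}", missing));
    }
    // 2. Anything on disk the manifest does not reference is removable garbage.
    Ok(dir_ids.difference(manifest_ids).copied().collect())
}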
for id in &id_map {
- if !mf.tables.contains_key(id) {
+ if !tables.tables.contains_key(id) {
 error!("table file {} not referenced in MANIFEST", id);
- let file_name = new_file_name(*id, &kv.opt.dir);
+ let file_name = new_file_name(*id, dir);
 if let Err(err) = remove_file(file_name) {
 error!("While removing table {}, err: {}", id, err);
 }
diff --git a/src/types.rs b/src/types.rs
index 0c2aad6..82e6a60 100644
--- a/src/types.rs
+++ b/src/types.rs
@@ -14,14 +14,14 @@ use tokio::time::sleep;
 // Channel, like Go's channel
 #[derive(Clone)]
-pub(crate) struct Channel<T> {
+pub struct Channel<T> {
 rx: Option<Receiver<T>>,
 tx: Option<Sender<T>>,
 }

impl<T> Channel<T> {
- // create a *Channel* with n cap
- pub(crate) fn new(n: usize) -> Self {
+ /// create a *Channel* with n cap
+ pub fn new(n: usize) -> Self {
 let (tx, rx) = bounded(n);
 Channel {
 rx: Some(rx),
@@ -29,46 +29,46 @@ impl<T> Channel<T> {
 }
 }

- // try to send message T without blocking
- pub(crate) fn try_send(&self, msg: T) -> Result<(), TrySendError<T>> {
+ /// try to send message T without blocking
+ pub fn try_send(&self, msg: T) -> Result<(), TrySendError<T>> {
 if let Some(tx) = &self.tx {
 return tx.try_send(msg);
 }
 Ok(())
 }

- // try to receive a message without blocking
- pub(crate) fn try_recv(&self) -> Result<T, TryRecvError> {
+ /// try to receive a message without blocking
+ pub fn try_recv(&self) -> Result<T, TryRecvError> {
 if let Some(rx) = &self.rx {
 return rx.try_recv();
 }
 Err(TryRecvError::Empty)
 }

- // async receive a message with blocking
- pub(crate) async fn recv(&self) -> Result<T, RecvError> {
+ /// async receive a message with blocking
+ pub async fn recv(&self) -> Result<T, RecvError> {
 let rx = self.rx.as_ref().unwrap();
 rx.recv().await
 }

- // async send a message with blocking
- pub(crate) async fn send(&self, msg: T) -> Result<(), SendError<T>> {
+ /// async send a message with blocking
+ pub async fn send(&self, msg: T) -> Result<(), SendError<T>> {
 let tx = self.tx.as_ref().unwrap();
 tx.send(msg).await
 }

- // returns the Sender
- pub(crate) fn tx(&self) -> Sender<T> {
+ /// returns the Sender
+ pub fn tx(&self) -> Sender<T> {
 self.tx.as_ref().unwrap().clone()
 }

- // consume tx and return it if it exists
- pub(crate) fn take_tx(&mut self) -> Option<Sender<T>> {
+ /// consume tx and return it if it exists
+ pub fn take_tx(&mut self) -> Option<Sender<T>> {
 self.tx.take()
 }

- // close *Channel*; the Sender will be consumed
- pub(crate) fn close(&self) {
+ /// close *Channel*; the Sender will be consumed
+ pub fn close(&self) {
 if let Some(tx) = &self.tx {
 tx.close();
 }
@@ -79,7 +79,7 @@ impl<T> Channel<T> {
 /// Holds the two things we need to close a routine and wait for it to finish: a chan
 /// to tell the routine to shut down, and a wait_group with which to wait for it to finish shutting
 /// down.
 #[derive(Clone)]
-pub(crate) struct Closer {
+pub struct Closer {
 closed: Channel<()>,
 wait: Arc<AtomicIsize>,
 }
@@ -91,8 +91,8 @@ impl Drop for Closer {
 }

impl Closer {
- // create a Closer with *initial* cap Workers
- pub(crate) fn new(initial: isize) -> Self {
+ /// create a Closer with *initial* cap Workers
+ pub fn new(initial: isize) -> Self {
 assert!(initial >= 0, "Sanity check");
 let mut close = Closer {
 closed: Channel::new(1),
@@ -101,36 +101,36 @@ impl Closer {
 close
 }

- // Incr delta to the WaitGroup.
- pub(crate) fn add_running(&self, delta: isize) {
+ /// Incr delta to the WaitGroup.
+ pub fn add_running(&self, delta: isize) {
 let old = self.wait.fetch_add(delta, Ordering::Relaxed);
 assert!(old >= 0, "Sanity check");
 }

- // Spawn a worker
- pub(crate) fn spawn(&self) -> Self {
+ /// Spawn a worker
+ pub fn spawn(&self) -> Self {
 self.add_running(1);
 self.clone()
 }

- // Decr delta to the WaitGroup (Note: must be called for every worker to avoid a leak).
- pub(crate) fn done(&self) {
+ /// Decr delta to the WaitGroup (Note: must be called for every worker to avoid a leak).
+ pub fn done(&self) {
 let old = self.wait.fetch_sub(1, Ordering::Relaxed);
 assert!(old >= 0, "Sanity check");
 }

- // Signals the `has_been_closed` signal.
- pub(crate) fn signal(&self) {
+ /// Signals the `has_been_closed` signal.
+ pub fn signal(&self) {
 self.closed.close();
 }

- // Gets signaled when signal() is called.
- pub(crate) fn has_been_closed(&self) -> Channel<()> {
+ /// Gets signaled when signal() is called.
+ pub fn has_been_closed(&self) -> Channel<()> {
 self.closed.clone()
 }

- // Wait until done
- pub(crate) async fn wait(&self) {
+ /// Wait until done
+ pub async fn wait(&self) {
 loop {
 if self.wait.load(Ordering::Relaxed) <= 0 {
 break;
 }
@@ -140,8 +140,8 @@ impl Closer {
 }
 }

- // Send a close signal and wait until done
- pub(crate) async fn signal_and_wait(&self) {
+ /// Send a close signal and wait until done
+ pub async fn signal_and_wait(&self) {
 self.signal();
 self.wait().await;
 }
From 3a804bcb632c4d653e2eca457da47c91ad815671 Mon Sep 17 00:00:00 2001
From: Rg
Date: Mon, 13 Feb 2023 12:11:30 +0800
Subject: [PATCH 26/77] :coffee:
---
 src/kv.rs        | 41 ++++++++++++++++++-----------------------
 src/value_log.rs | 16 ++++++++--------
 2 files changed, 26 insertions(+), 31 deletions(-)

diff --git a/src/kv.rs b/src/kv.rs
index 4a04d24..a039565 100644
--- a/src/kv.rs
+++ b/src/kv.rs
@@ -228,17 +228,20 @@ impl KV {
 tables
 }

- fn write_requests(&self, reqs: &[Request]) -> Arc<Result<()>> {
+ // Called serially by only one goroutine.
+ fn write_requests(&self, reqs: &Vec<Request>) -> Result<()> {
 if reqs.is_empty() {
- return Arc::new(Ok(()));
+ return Ok(());
 }
- let done = |res: Arc<Result<()>>| {
+ defer! {
+ for req in reqs {
+ let worker = req.wait_group.lock().borrow_mut().take().unwrap();
+ worker.done();
+ }
+ }
+ let done = |res: Result<()>| {
 for req in reqs {
- if res.is_err() {
- // todo
- *req.err.borrow_mut() = res.clone();
- }
- let worker = req.wait_group.borrow_mut().take().unwrap();
+ let worker = req.wait_group.lock().borrow_mut().take().unwrap();
 worker.done();
 }
 };
 info!("write_requests called. Writing to value log");
 // CAS counter for all operations has to go onto value log. Otherwise, if it is just in
 // memtable for a long time, and following CAS operations use that as a check, when
 // replaying, we will think that these CAS operations should fail, when they are actually
 // valid.

 // There is code (in flush_mem_table) whose correctness depends on us generating CAS Counter
 // values _before_ we modify s.vptr here.
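new_cas_counter is not shown in this hunk; a sketch consistent with how counter_base is consumed in the loop below, where a single fetch_add reserves a whole block of counters per request:

use std::sync::atomic::{AtomicU64, Ordering};

// Reserve `how_many` counters; entries then take base, base+1, ..., base+how_many-1.
fn new_cas_counter(last_used: &AtomicU64, how_many: u64) -> u64 {
    // fetch_add returns the previous value, so the first fresh counter is prev + 1.
    last_used.fetch_add(how_many, Ordering::Relaxed) + 1
}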
for req in reqs { - let counter_base = self.new_cas_counter(req.entries.len() as u64); - for (idx, entry) in req.entries.iter().enumerate() { + let counter_base = self.new_cas_counter(req.entries.read().len() as u64); + for (idx, entry) in req.entries.read().iter().enumerate() { entry.borrow_mut().cas_counter = counter_base + idx as u64; } } - let ok = self.vlog.as_ref().unwrap().write(reqs); - if ok.is_err() { - let _ok = Arc::new(ok); - done(_ok.clone()); - return _ok.clone(); - } - + self.vlog.as_ref().unwrap().write(reqs)?; info!("Writing to memory table"); let mut count = 0; for req in reqs { - if req.entries.is_empty() { + if req.entries.read().is_empty() { continue; } - count += req.entries.len(); + count += req.entries.read().len(); } - Arc::new(Ok(())) + Ok(()) } async fn flush_mem_table(&self, lc: Closer) -> Result<()> { @@ -368,9 +365,7 @@ impl ArcKV { impl ArcKV { async fn do_writes(&self) { // TODO add metrics - loop { - - } + loop {} } } diff --git a/src/value_log.rs b/src/value_log.rs index 2980b58..348de1b 100644 --- a/src/value_log.rs +++ b/src/value_log.rs @@ -21,6 +21,7 @@ use std::process::id; use std::sync::atomic::{AtomicI32, AtomicU32, AtomicU64, Ordering}; use std::sync::Arc; use std::{fmt, fs, thread}; +use protobuf::well_known_types::api::Mixin; use tabled::object::Entity::Cell; use crate::kv::{ArcKV, WeakKV, KV}; @@ -205,12 +206,11 @@ impl Decode for ValuePointer { } pub(crate) struct Request { - // Input values - pub(crate) entries: Vec>, + // Input values, NOTE: RefCell is called concurrency + pub(crate) entries: RwLock>>, // Output Values and wait group stuff below - pub(crate) ptrs: RefCell>>, - pub(crate) wait_group: RefCell>, - pub(crate) err: RefCell>>, + pub(crate) ptrs: Mutex>>, + pub(crate) wait_group: Mutex>>, } pub struct ValueLogCore { @@ -462,7 +462,7 @@ impl ValueLogCore { } // write is thread-unsafe by design and should not be called concurrently. - pub(crate) fn write(&self, reqs: &[Request]) -> Result<()> { + pub(crate) fn write(&self, reqs: &Vec) -> Result<()> { let cur_vlog_file = self.pick_log_by_vlog_id(&self.max_fid.load(Ordering::Acquire)); let to_disk = || -> Result<()> { if self.buf.borrow().buffer().is_empty() { @@ -502,10 +502,10 @@ impl ValueLogCore { }; for req in reqs { - for (idx, entry) in req.entries.iter().enumerate() { + for (idx, entry) in req.entries.read().iter().enumerate() { if !self.opt.sync_writes && entry.borrow().value.len() < self.opt.value_threshold { // No need to write to value log. 
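The branch above reduces to a small routing predicate: values below the threshold stay inline in the LSM tree, and only larger ones get a value-log pointer, while sync_writes presumably forces everything through the value log so durability is handled in one place. As a sketch:

// Mirror of `!sync_writes && value_len < value_threshold` deciding to skip the vlog.
fn needs_value_log(sync_writes: bool, value_len: usize, value_threshold: usize) -> bool {
    sync_writes || value_len >= value_threshold
}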
- req.ptrs.borrow_mut()[idx] = None; + req.ptrs.lock()[idx] = None; continue; } From f2f75cfb39187bb6bfd5c3f2ae6a56cc26e22ec4 Mon Sep 17 00:00:00 2001 From: Rg Date: Tue, 14 Feb 2023 19:31:54 +0800 Subject: [PATCH 27/77] :coffee: --- src/kv.rs | 105 ++++++++++++++++++++++++++++++++++------------- src/types.rs | 55 +++++++++++++++++++++---- src/value_log.rs | 68 ++++++++++++++++++++++++++---- 3 files changed, 184 insertions(+), 44 deletions(-) diff --git a/src/kv.rs b/src/kv.rs index a039565..49418c1 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -4,13 +4,15 @@ use crate::options::Options; use crate::table::builder::Builder; use crate::table::iterator::IteratorImpl; use crate::types::{Channel, Closer, XArc, XWeak}; -use crate::value_log::{Request, ValueLogCore, ValuePointer}; +use crate::value_log::{ArcRequest, Request, ValueLogCore, ValuePointer}; use crate::y::{Encode, Result, ValueStruct}; use crate::Error::Unexpected; use crate::{Decode, Error, Node, SkipList}; +use bytes::BufMut; use drop_cell::defer; use fs2::FileExt; use log::{info, Log}; +use parking_lot::Mutex; use std::borrow::BorrowMut; use std::fs::{read_dir, File}; use std::fs::{try_exists, OpenOptions}; @@ -19,6 +21,7 @@ use std::path::{Path, PathBuf}; use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; use std::sync::{Arc, Weak}; use tokio::fs::create_dir_all; +use tokio::io::{AsyncReadExt, AsyncWriteExt}; use tokio::sync::{RwLock, RwLockWriteGuard}; const _BADGER_PREFIX: &[u8; 8] = b"!badger!"; @@ -52,6 +55,7 @@ pub struct KV { mt: SkipList, // Add here only AFTER pushing to flush_ch imm: Vec, + write_ch: Channel, // Incremented in the non-concurrently accessed write loop. But also accessed outside. So // we use an atomic op. last_used_cas_counter: AtomicU64, @@ -84,11 +88,11 @@ impl KV { .open(Path::new(opt.value_dir.as_str()).join("value_dir_guard.lock"))?; value_dir_guard.lock_exclusive()?; let closers = Closers { - update_size: Closer::new(0), - compactors: Closer::new(0), - mem_table: Closer::new(0), - writes: Closer::new(0), - value_gc: Closer::new(0), + update_size: Closer::new(), + compactors: Closer::new(), + mem_table: Closer::new(), + writes: Closer::new(), + value_gc: Closer::new(), }; // go out.updateSize(out.closers.updateSize) let mt = SkipList::new(arena_size(&opt)); @@ -104,6 +108,7 @@ impl KV { closers, mt, imm: Vec::new(), + write_ch: Channel::new(1), last_used_cas_counter: Default::default(), }; @@ -153,6 +158,14 @@ impl KV { let mut vptr = ValuePointer::default(); vptr.dec(&mut Cursor::new(value))?; + + let replay_closer = Closer::new(); + { + let _out = xout.clone(); + tokio::spawn(async move { + _out.do_writes(replay_closer).await; + }); + } Ok(xout) } @@ -229,22 +242,10 @@ impl KV { } // Called serially by only on goroutine - fn write_requests(&self, reqs: &Vec) -> Result<()> { + async fn write_requests(&self, reqs: Arc>) -> Result<()> { if reqs.is_empty() { return Ok(()); } - defer! { - for req in reqs { - let worker = req.wait_group.lock().borrow_mut().take().unwrap(); - worker.done(); - } - } - let done = |res: Result<()>| { - for req in reqs { - let worker = req.wait_group.lock().borrow_mut().take().unwrap(); - worker.done(); - } - }; info!("write_requests called. Writing to value log"); // CAS counter for all operations has to go onto value log. 
Otherwise, if it is just in // memtable for a long time, and following CAS operations use that as a check, when @@ -253,21 +254,21 @@ impl KV { // There is code (in flush_mem_table) whose correctness depends on us generating CAS Counter // values _before_ we modify s.vptr here. - for req in reqs { - let counter_base = self.new_cas_counter(req.entries.read().len() as u64); - for (idx, entry) in req.entries.read().iter().enumerate() { + for req in reqs.iter() { + let counter_base = self.new_cas_counter(req.get_req().entries.read().len() as u64); + for (idx, entry) in req.get_req().entries.read().iter().enumerate() { entry.borrow_mut().cas_counter = counter_base + idx as u64; } } - self.vlog.as_ref().unwrap().write(reqs)?; + self.vlog.as_ref().unwrap().write(reqs.clone())?; info!("Writing to memory table"); let mut count = 0; - for req in reqs { - if req.entries.read().is_empty() { + for req in reqs.iter() { + if req.get_req().entries.read().is_empty() { continue; } - count += req.entries.read().len(); + count += req.get_req().entries.read().len(); } Ok(()) } @@ -363,9 +364,57 @@ impl ArcKV { } impl ArcKV { - async fn do_writes(&self) { + async fn do_writes(&self, lc: Closer) { // TODO add metrics - loop {} + let has_been_close = lc.has_been_closed(); + let write_ch = self.write_ch.clone(); + let reqs = Arc::new(Mutex::new(vec![])); + loop { + tokio::select! { + _ = has_been_close.recv() => { + break; + }, + req = write_ch.recv() => { + reqs.lock().push(req.unwrap()); + } + } + // TODO avoid memory allocate again + if reqs.lock().len() == 100 { + let to_reqs = reqs + .lock() + .clone() + .into_iter() + .map(|req| req.clone()) + .collect::>(); + let to_reqs = Arc::new(to_reqs); + if let Err(err) = self.write_requests(to_reqs).await { + let ret = Arc::new(Err(err)); + reqs.lock().iter().for_each(|req| req.set_err(ret.clone())); + } + reqs.lock().clear(); + } + } + + // clear future requests + write_ch.close(); + loop { + let req = write_ch.try_recv(); + if req.is_err() { + break; + } + let req = req.unwrap(); + reqs.lock().push(req); + let to_reqs = reqs + .lock() + .clone() + .into_iter() + .map(|req| req.clone()) + .collect::>(); + if let Err(err) = self.write_requests(Arc::new(to_reqs)).await { + let ret = Arc::new(Err(err)); + reqs.lock().iter().for_each(|req| req.set_err(ret.clone())); + } + } } } diff --git a/src/types.rs b/src/types.rs index 82e6a60..713dc08 100644 --- a/src/types.rs +++ b/src/types.rs @@ -7,9 +7,13 @@ use std::sync::{Arc, TryLockResult, Weak}; use std::time::Duration; use std::{hint, mem, thread}; -use async_channel::{bounded, Receiver, RecvError, SendError, Sender, TryRecvError, TrySendError}; +use async_channel::{ + bounded, unbounded, Receiver, Recv, RecvError, SendError, Sender, TryRecvError, TrySendError, +}; +use log::info; use range_lock::{VecRangeLock, VecRangeLockGuard}; +use tokio::sync::mpsc::{UnboundedSender, WeakUnboundedSender}; use tokio::time::sleep; // Channel like to go's channel @@ -75,6 +79,40 @@ impl Channel { } } +#[derive(Clone)] +pub struct UnChannel { + rx: Option>, + tx: Option>, +} + +impl UnChannel { + pub fn new() -> UnChannel { + let (tx, rx) = unbounded(); + UnChannel { + rx: Some(rx), + tx: Some(tx), + } + } + + /// returns Sender + pub fn tx(&self) -> Option<&Sender> { + self.tx.as_ref() + } + + // /// async receive a message with blocking + pub async fn recv(&mut self) -> Result { + let rx = self.rx.as_ref().unwrap(); + rx.recv().await + } + + /// close *Channel*, Sender will be consumed + pub fn close(&self) { + if let Some(tx) = 
&self.tx { + tx.close(); + } + } +} + /// Holds the two things we need to close a routine and wait for it to finish: a chan /// to tell the routine to shut down, and a wait_group with which to wait for it to finish shutting /// down. @@ -92,11 +130,10 @@ impl Drop for Closer { impl Closer { /// create a Closer with *initial* cap Workers - pub fn new(initial: isize) -> Self { - assert!(initial >= 0, "Sanity check"); + pub fn new() -> Self { let mut close = Closer { closed: Channel::new(1), - wait: Arc::from(AtomicIsize::new(initial)), + wait: Arc::from(AtomicIsize::new(0)), }; close } @@ -135,7 +172,7 @@ impl Closer { if self.wait.load(Ordering::Relaxed) <= 0 { break; } - hint::spin_loop(); + // hint::spin_loop(); sleep(Duration::from_millis(10)).await; } } @@ -244,19 +281,19 @@ impl Deref for XVec { fn it_closer() { let runtime = tokio::runtime::Runtime::new().unwrap(); runtime.block_on(async { - let closer = Closer::new(0); + let closer = Closer::new(); let count = Arc::new(AtomicUsize::new(100)); for i in 0..count.load(Ordering::Relaxed) { let c = closer.spawn(); let n = count.clone(); tokio::spawn(async move { - sleep(Duration::from_millis(200)).await; - n.fetch_add(1, Ordering::Relaxed); + sleep(Duration::from_millis(10000)).await; + n.fetch_sub(1, Ordering::Relaxed); c.done(); }); } closer.signal_and_wait().await; - assert_eq!(count.load(Ordering::Relaxed), 200); + assert_eq!(count.load(Ordering::Relaxed), 0); }); } diff --git a/src/value_log.rs b/src/value_log.rs index 348de1b..4c66108 100644 --- a/src/value_log.rs +++ b/src/value_log.rs @@ -1,11 +1,14 @@ +use async_channel::RecvError; use awaitgroup::{WaitGroup, Worker}; use bitflags::bitflags; use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use crc32fast::Hasher; +use libc::difftime; use log::info; use log::kv::Source; use memmap::MmapMut; use parking_lot::*; +use protobuf::well_known_types::api::Mixin; use rand::random; use serde_json::to_vec; use std::cell::{Ref, RefCell, RefMut}; @@ -21,7 +24,6 @@ use std::process::id; use std::sync::atomic::{AtomicI32, AtomicU32, AtomicU64, Ordering}; use std::sync::Arc; use std::{fmt, fs, thread}; -use protobuf::well_known_types::api::Mixin; use tabled::object::Entity::Cell; use crate::kv::{ArcKV, WeakKV, KV}; @@ -29,7 +31,7 @@ use crate::log_file::LogFile; use crate::options::Options; use crate::skl::BlockBytes; use crate::table::iterator::BlockSlice; -use crate::types::{Channel, XArc}; +use crate::types::{Channel, Closer, XArc}; use crate::y::{ create_synced_file, is_eof, open_existing_synced_file, read_at, sync_directory, Decode, Encode, }; @@ -210,7 +212,49 @@ pub(crate) struct Request { pub(crate) entries: RwLock>>, // Output Values and wait group stuff below pub(crate) ptrs: Mutex>>, - pub(crate) wait_group: Mutex>>, + pub(crate) res: Channel>, +} + +impl Request { + pub(crate) async fn get_resp(&self) -> Result<()> { + self.res.recv().await.unwrap() + } + + pub(crate) async fn set_resp(&self, ret: Result<()>) { + self.res.send(ret).await.unwrap() + } +} + +#[derive(Clone)] +pub(crate) struct ArcRequest { + inner: Arc, + err: Arc>>>, +} + +unsafe impl Send for ArcRequest {} + +unsafe impl Sync for ArcRequest {} + +impl ArcRequest { + pub(crate) fn get_req(&self) -> Arc { + self.inner.clone() + } + pub(crate) fn set_err(&self, err: Arc>) { + *self.err.lock() = err; + } + + pub(crate) fn to_inner(self) -> Request { + Arc::into_inner(self.inner).unwrap() + } +} + +impl From for ArcRequest { + fn from(value: Request) -> Self { + ArcRequest { + inner: Arc::new(value), + err: 
Arc::new(Mutex::new(Arc::new(Ok(())))), + } + } } pub struct ValueLogCore { @@ -462,7 +506,7 @@ impl ValueLogCore { } // write is thread-unsafe by design and should not be called concurrently. - pub(crate) fn write(&self, reqs: &Vec) -> Result<()> { + pub(crate) fn write(&self, reqs: Arc>) -> Result<()> { let cur_vlog_file = self.pick_log_by_vlog_id(&self.max_fid.load(Ordering::Acquire)); let to_disk = || -> Result<()> { if self.buf.borrow().buffer().is_empty() { @@ -501,11 +545,11 @@ impl ValueLogCore { Ok(()) }; - for req in reqs { - for (idx, entry) in req.entries.read().iter().enumerate() { + for req in reqs.iter() { + for (idx, entry) in req.get_req().entries.read().iter().enumerate() { if !self.opt.sync_writes && entry.borrow().value.len() < self.opt.value_threshold { // No need to write to value log. - req.ptrs.lock()[idx] = None; + req.get_req().ptrs.lock()[idx] = None; continue; } @@ -715,6 +759,16 @@ fn it() { // println!("WHat??? {:?}", value); } +#[tokio::test] +async fn lock1() { + + let req: RwLock>> = RwLock::new(Vec::new()); + + tokio::spawn(async move { + let _a = &req.write()[0]; + }); +} + #[tokio::test] async fn lock() { use parking_lot::*; From 4ef15571b3c5a196365fa851695b8031e47ff92c Mon Sep 17 00:00:00 2001 From: Rg Date: Wed, 15 Feb 2023 02:22:42 +0800 Subject: [PATCH 28/77] :card: --- src/kv.rs | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/src/kv.rs b/src/kv.rs index 49418c1..4e07c00 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -134,7 +134,9 @@ impl KV { { let _out = xout.clone(); tokio::spawn(async move { - _out.flush_mem_table(_out.closers.mem_table.clone()).await; + _out.flush_mem_table(_out.closers.mem_table.clone()) + .await + .expect("TODO: panic message"); }); } @@ -162,10 +164,32 @@ impl KV { let replay_closer = Closer::new(); { let _out = xout.clone(); + let replay_closer = replay_closer.clone(); tokio::spawn(async move { _out.do_writes(replay_closer).await; }); } + + let mut first = true; + xout.vlog.as_ref().unwrap().replay(&vptr, |entry, vptr| { + if first { + info!("First key={}", String::from_utf8_lossy(&entry.key)); + } + first = false; + if xout.last_used_cas_counter.load(Ordering::Relaxed) < entry.cas_counter { + xout.last_used_cas_counter + .store(entry.cas_counter, Ordering::Relaxed); + } + if entry.cas_counter_check != 0 { + let old_value = xout.get(&entry.key)?; + if old_value.cas_counter != entry.cas_counter_check { + return Ok(true); + } + } + todo!() + })?; + // Wait for replay to be applied first. 
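The signal_and_wait below pairs with a worker holding a spawned Closer; a sketch of the handshake using the Channel and Closer types from this crate, with the request handling itself elided:

// `lc` must come from closer.spawn(), which bumps the wait count;
// signal_and_wait() returns only after this worker calls done().
async fn replay_worker(lc: Closer, jobs: Channel<ArcRequest>) {
    let closed = lc.has_been_closed();
    loop {
        tokio::select! {
            _ = closed.recv() => break,
            job = jobs.recv() => match job {
                Ok(_req) => { /* apply one replayed request */ }
                Err(_) => break, // channel closed and drained
            },
        }
    }
    lc.done();
}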
+ replay_closer.signal_and_wait().await; Ok(xout) } From 3d5889f4a25732dd40b0e6518d403f501c0803a1 Mon Sep 17 00:00:00 2001 From: Rg Date: Fri, 17 Feb 2023 01:12:08 +0800 Subject: [PATCH 29/77] :card: --- src/kv.rs | 86 +++++++++++++++++++++++++++++++++++---------------- src/levels.rs | 2 +- src/y/mod.rs | 6 ++++ 3 files changed, 67 insertions(+), 27 deletions(-) diff --git a/src/kv.rs b/src/kv.rs index 4e07c00..a13e1e6 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -3,9 +3,12 @@ use crate::manifest::{open_or_create_manifest_file, Manifest, ManifestFile}; use crate::options::Options; use crate::table::builder::Builder; use crate::table::iterator::IteratorImpl; +use crate::table::table::{new_file_name, TableCore}; use crate::types::{Channel, Closer, XArc, XWeak}; use crate::value_log::{ArcRequest, Request, ValueLogCore, ValuePointer}; -use crate::y::{Encode, Result, ValueStruct}; +use crate::y::{ + async_sync_directory, create_synced_file, sync_directory, Encode, Result, ValueStruct, +}; use crate::Error::Unexpected; use crate::{Decode, Error, Node, SkipList}; use bytes::BufMut; @@ -17,9 +20,11 @@ use std::borrow::BorrowMut; use std::fs::{read_dir, File}; use std::fs::{try_exists, OpenOptions}; use std::io::{Cursor, Write}; +use std::ops::Deref; use std::path::{Path, PathBuf}; use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; use std::sync::{Arc, Weak}; +use tabled::Table; use tokio::fs::create_dir_all; use tokio::io::{AsyncReadExt, AsyncWriteExt}; use tokio::sync::{RwLock, RwLockWriteGuard}; @@ -41,11 +46,17 @@ struct FlushTask { vptr: ValuePointer, } +impl FlushTask { + fn must_mt(&self) -> &SkipList { + self.mt.as_ref().unwrap() + } +} + pub struct KV { pub opt: Options, pub vlog: Option, pub manifest: Arc>, - // lc: XWeak, + lc: XWeak, flush_chan: Channel, // write_chan: Channel, dir_lock_guard: File, @@ -101,6 +112,7 @@ impl KV { vlog: None, manifest: Arc::new(RwLock::new(manifest_file)), // lc: Default::default(), + lc: XWeak::new(), flush_chan: Channel::new(1), // write_chan: Channel::new(1), dir_lock_guard, @@ -297,35 +309,50 @@ impl KV { Ok(()) } + // async to flush memory table into zero level async fn flush_mem_table(&self, lc: Closer) -> Result<()> { while let Ok(task) = self.flush_chan.recv().await { if task.mt.is_none() { break; } - if task.vptr.is_zero() { - continue; + // TODO if is zero? + if !task.vptr.is_zero() { + info!("Storing offset: {:?}", task.vptr); + let mut offset = vec![0u8; ValuePointer::value_pointer_encoded_size()]; + task.vptr.enc(&mut offset).unwrap(); + // CAS counter is needed and is desirable -- it's the first value log entry + // we replay, so to speak, perhaps the only, and we use it to re-initialize + // the CAS counter. + // + // The write loop generates CAS counter values _before_ it sets vptr. It + // is crucial that we read the cas counter here _after_ reading vptr. That + // way, our value here is guaranteed to be >= the CASCounter values written + // before vptr (because they don't get replayed). + let value = ValueStruct { + meta: 0, + user_meta: 0, + cas_counter: self.last_used_cas_counter.load(Ordering::Acquire), + value: offset, + }; + task.must_mt().put(_HEAD, value); } - - info!("Storing offset: {:?}", task.vptr); - let mut offset = vec![0u8; ValuePointer::value_pointer_encoded_size()]; - task.vptr.enc(&mut offset).unwrap(); - // CAS counter is needed and is desirable -- it's the first value log entry - // we replay, so to speak, perhaps the only, and we use it to re-initialize - // the CAS counter. 
- // - // The write loop generates CAS counter values _before_ it sets vptr. It - // is crucial that we read the cas counter here _after_ reading vptr. That - // way, our value here is guaranteed to be >= the CASCounter values written - // before vptr (because they don't get replayed). - let value = ValueStruct { - meta: 0, - user_meta: 0, - cas_counter: self.last_used_cas_counter.load(Ordering::Acquire), - value: offset, - }; - // todo - task.mt.as_ref().unwrap().put(_HEAD, value); + let fid = self.must_lc().reserve_file_id(); + let f_name = new_file_name(fid, &self.opt.dir); + let fp = create_synced_file(&f_name, true)?; + // Don't block just to sync the directory entry. + // TODO use currency + async_sync_directory(self.opt.dir.clone().to_string()).await?; + let mut fp = tokio::fs::File::from_std(fp); + write_level0_table( + &task.mt.as_ref().unwrap(), + &mut fp, + ) + .await?; + + let table = TableCore::open_table(fp.into_std(), &f_name, self.opt.table_loading_mode)?; + // We own a ref on tbl. + // task.must_mt()..try_add_level0_table(Table::from(table)); } Ok(()) @@ -337,6 +364,13 @@ impl KV { } } +impl KV { + fn must_lc(&self) -> XArc { + let lc = self.lc.upgrade().unwrap(); + lc + } +} + pub type WeakKV = XWeak; pub type ArcKV = XArc; @@ -448,7 +482,7 @@ impl Clone for WeakKV { } } -fn write_level0_table(st: &SkipList, mut f: &File) -> Result<()> { +async fn write_level0_table(st: &SkipList, f: &mut tokio::fs::File) -> Result<()> { let cur = st.new_cursor(); let mut builder = Builder::default(); while let Some(_) = cur.next() { @@ -456,7 +490,7 @@ fn write_level0_table(st: &SkipList, mut f: &File) -> Result<()> { let value = cur.value(); builder.add(key, &value)?; } - f.write_all(&builder.finish())?; + f.write_all(&builder.finish()).await?; Ok(()) } diff --git a/src/levels.rs b/src/levels.rs index ab4be3f..1e1ad4e 100644 --- a/src/levels.rs +++ b/src/levels.rs @@ -614,7 +614,7 @@ impl LevelsController { self.levels[0].num_tables() >= self.opt.num_level_zero_tables } - fn reserve_file_id(&self) -> u64 { + pub(crate) fn reserve_file_id(&self) -> u64 { let id = self.next_file_id.fetch_add(1, Ordering::Relaxed); id } diff --git a/src/y/mod.rs b/src/y/mod.rs index 965fbce..7edf1fd 100644 --- a/src/y/mod.rs +++ b/src/y/mod.rs @@ -274,6 +274,12 @@ pub(crate) fn sync_directory(d: &str) -> Result<()> { fp.sync_all().map_err(|err| err.into()) } +pub(crate) async fn async_sync_directory(d: String) -> Result<()> { + let fp = tokio::fs::File::open(d).await?; + fp.sync_all().await?; + Ok(()) +} + #[test] fn it_cpu() { println!("{:?}", num_cpu()); From 0c0776211576062437287c8798bb126a23a5a16c Mon Sep 17 00:00:00 2001 From: Rg Date: Fri, 17 Feb 2023 22:11:24 +0800 Subject: [PATCH 30/77] :coffee: --- src/compaction.rs | 8 +++- src/kv.rs | 5 ++- src/level_handler.rs | 2 +- src/levels.rs | 46 +++++++++++++++++++++++ src/lib.rs | 1 + src/test_util.rs | 89 ++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 147 insertions(+), 4 deletions(-) diff --git a/src/compaction.rs b/src/compaction.rs index 40d1909..2a9af24 100644 --- a/src/compaction.rs +++ b/src/compaction.rs @@ -105,6 +105,12 @@ impl Default for LevelCompactStatus { } } +impl Display for LevelCompactStatus { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + todo!() + } +} + impl LevelCompactStatus { // returns true if self.ranges and dst has overlap, otherwise returns false fn overlaps_with(&self, dst: &KeyRange) -> bool { @@ -119,7 +125,7 @@ impl LevelCompactStatus { len > rlock.len() } - fn get_del_size(&self) -> u64 { 
+ pub(crate) fn get_del_size(&self) -> u64 { self.del_size.load(Ordering::Relaxed) } diff --git a/src/kv.rs b/src/kv.rs index a13e1e6..85c4bd6 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -350,9 +350,10 @@ impl KV { ) .await?; - let table = TableCore::open_table(fp.into_std(), &f_name, self.opt.table_loading_mode)?; + let fp = fp.into_std().await; + let table = TableCore::open_table(fp, &f_name, self.opt.table_loading_mode)?; // We own a ref on tbl. - // task.must_mt()..try_add_level0_table(Table::from(table)); + // task.must_mt().try_add_level0_table(Table::from(table)); } Ok(()) diff --git a/src/level_handler.rs b/src/level_handler.rs index fc3bd1b..d4e9e95 100644 --- a/src/level_handler.rs +++ b/src/level_handler.rs @@ -201,7 +201,7 @@ impl LevelHandler { } // Return true if ok and no stalling. - pub(crate) fn try_add_level0_table(&self, t: Table) -> bool { + pub(crate) async fn try_add_level0_table(&self, t: Table) -> bool { assert_eq!(self.x.level.load(Ordering::Relaxed), 0); let tw = self.tables_wl(); if tw.len() >= self.opt.num_level_zero_tables_stall { diff --git a/src/levels.rs b/src/levels.rs index 1e1ad4e..47cc90f 100644 --- a/src/levels.rs +++ b/src/levels.rs @@ -42,6 +42,7 @@ pub(crate) struct LevelsController { c_status: Arc, manifest: Arc>, opt: Options, + last_unstalled: Arc>, } pub(crate) type XLevelsController = XArc; @@ -120,6 +121,7 @@ impl LevelsController { c_status: Arc::new(cstatus), manifest, opt: opt.clone(), + last_unstalled: Arc::new(SystemTime::now()), }; if let Err(err) = level_controller.validate() { let _ = level_controller.cleanup_levels(); @@ -319,6 +321,50 @@ impl LevelsController { Ok(()) } + async fn add_level0_table(&self, table: Table) -> Result<()> { + // We update the manifest _before_ the table becomes part of a levelHandler, because at that + // point it could get used in some compaction. This ensures the manifest file gets updated in + // the proper order. (That means this update happens before that of some compaction which + // deletes the table.) + self.manifest + .write() + .await + .add_changes(vec![ManifestChangeBuilder::new(table.id()) + .with_level(0) + .build()]) + .await?; + while !self.levels[0].try_add_level0_table(table.clone()).await { + // Stall. Make sure all levels are healthy before we unstall. + let mut time = SystemTime::now(); + { + info!( + "STALLED STALLED STALLED STALLED STALLED STALLED STALLED STALLED: {}ms", + self.last_unstalled.read().await.elapsed().unwrap().as_millis() + ); + let c_status = self.c_status.levels.write(); + for i in 0..self.opt.max_levels { + info!( + "level={}, status={}, size={}", + i, + c_status[i], + c_status[i].get_del_size() + ) + } + time = SystemTime::now(); + } + // Before we unstall, we need to make sure that level 0 and 1 are healthy. Otherwise, we + // will very quickly fill up level 0 again and if the compaction strategy favors level 0, + // then level 1 is going to super full. + loop { + // Passing 0 for delSize to compactable means we're treating incomplete compactions as + // not having finished -- we wait for them to finish. Also, it's crucial this behavior + // replicates pickCompactLevels' behavior in computing compactability in order to + // guarantee progress. + } + } + Ok(()) + } + // Merge top tables and bot tables to from a List of new tables. 
pub(crate) async fn compact_build_tables( &self, diff --git a/src/lib.rs b/src/lib.rs index 7af0a36..078338e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -14,6 +14,7 @@ #![feature(slice_pattern)] #![feature(slice_take)] #![feature(arc_into_inner)] +#![feature(async_closure)] use std::io; use std::mem::align_of; diff --git a/src/test_util.rs b/src/test_util.rs index f0e8324..5ae4b39 100644 --- a/src/test_util.rs +++ b/src/test_util.rs @@ -5,6 +5,11 @@ use std::collections::HashMap; use std::env::temp_dir; use std::fs::create_dir_all; use std::io; +use std::sync::Arc; +use std::sync::atomic::{AtomicI32, AtomicU64, Ordering}; +use std::time::Duration; +use atomic::Atomic; +use tokio::runtime::Handle; use tracing_subscriber::fmt::format::Writer; use tracing_subscriber::fmt::time::FormatTime; @@ -109,3 +114,87 @@ fn itwork() { my_function(1000); info!("Hello Body"); } + +#[tokio::test] +async fn runtime_tk() { + use tokio::{sync::RwLock, task::JoinHandle}; + + pub type Future = JoinHandle; + pub type SafeFn = Arc Option + Sync + Send>>; + pub struct SafeFnWrapper { + fn_mut: SafeFn, + } + + impl SafeFnWrapper { + pub fn new(fn_mut: impl FnMut(A) -> Option + Send + Sync + 'static) -> SafeFnWrapper { + SafeFnWrapper::set(Arc::new(RwLock::new(fn_mut))) + } + + pub fn set(fn_mut: SafeFn) -> Self { + Self { fn_mut } + } + + /// Get a clone of the `fn_mut` field (which holds a thread safe `FnMut`). + pub fn get(&self) -> SafeFn { + self.fn_mut.clone() + } + + /// This is an `async` function. Make sure to use `await` on the return value. + pub fn spawn(&self, action: A) -> Future> { + let arc_lock_fn_mut = self.get(); + tokio::spawn(async move { + // Delay before calling the function. + // let delay_ms = rand::thread_rng().gen_range(100..1_000) as u64; + tokio::time::sleep(tokio::time::Duration::from_millis(10)).await; + let mut fn_mut = arc_lock_fn_mut.write().await; // 👀 `unwrap()` for blocking. 
+ fn_mut(action) + }) + } + } + // + fn load() -> impl FnMut(i32) -> Option { + |_| { + println!("HeLoo"); + Some(299) + } + } + + // let (tx, rx) = std::sync::mpsc::sync_channel(1); + // tokio::spawn(async move { + // let ft = SafeFnWrapper::::new(load()); + // let s1 = ft.spawn(200).await.unwrap(); + // tx.send(s1).unwrap(); + // println!("send ok"); + // }); + // tokio::runtime::Handle::spawn() + // let item = rx.recv().unwrap(); + // println!("ret: {:?}", item); + // tokio::time::sleep(Duration::from_millis(200)).await; +} + +#[test] +fn tk2() { + let rt = tokio::runtime::Runtime::new().unwrap(); + let a = Arc::new(std::sync::atomic::AtomicI32::new(10000000)); + let ac = a.clone(); + rt.block_on(async move { + for i in 0..10000 { + let ac = ac.clone(); + tokio::spawn(async move { + ac.fetch_sub(1, Ordering::Relaxed); + }); + } + fn add() -> i32 { + let f = async move {100}; + let r = tokio::task::block_in_place( move || { + tokio::runtime::Handle::current().block_on(f) + }); + r + } + + let ret = add(); + println!("return {}", ret); + }); + + println!("{}", a.load(Ordering::Relaxed)); +} \ No newline at end of file From 4c8e6952942433f5b142499f221fa7a0ede09793 Mon Sep 17 00:00:00 2001 From: Rg Date: Sat, 18 Feb 2023 21:01:12 +0800 Subject: [PATCH 31/77] :coffee: --- src/kv.rs | 77 ++++++++++++++++++++++++++++++++++++------------- src/levels.rs | 35 +++++++++++++++++----- src/skl/skip.rs | 7 +++-- src/types.rs | 3 ++ 4 files changed, 93 insertions(+), 29 deletions(-) diff --git a/src/kv.rs b/src/kv.rs index 85c4bd6..4668a91 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -3,9 +3,9 @@ use crate::manifest::{open_or_create_manifest_file, Manifest, ManifestFile}; use crate::options::Options; use crate::table::builder::Builder; use crate::table::iterator::IteratorImpl; -use crate::table::table::{new_file_name, TableCore}; +use crate::table::table::{new_file_name, Table, TableCore}; use crate::types::{Channel, Closer, XArc, XWeak}; -use crate::value_log::{ArcRequest, Request, ValueLogCore, ValuePointer}; +use crate::value_log::{ArcRequest, Entry, MetaBit, Request, ValueLogCore, ValuePointer}; use crate::y::{ async_sync_directory, create_synced_file, sync_directory, Encode, Result, ValueStruct, }; @@ -24,7 +24,6 @@ use std::ops::Deref; use std::path::{Path, PathBuf}; use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; use std::sync::{Arc, Weak}; -use tabled::Table; use tokio::fs::create_dir_all; use tokio::io::{AsyncReadExt, AsyncWriteExt}; use tokio::sync::{RwLock, RwLockWriteGuard}; @@ -63,9 +62,9 @@ pub struct KV { value_dir_guard: File, closers: Closers, // Our latest (actively written) in-memory table. - mt: SkipList, + mt: Option>, // Add here only AFTER pushing to flush_ch - imm: Vec, + imm: Vec>, write_ch: Channel, // Incremented in the non-concurrently accessed write loop. But also accessed outside. So // we use an atomic op. 
@@ -106,7 +105,6 @@ impl KV { value_gc: Closer::new(), }; // go out.updateSize(out.closers.updateSize) - let mt = SkipList::new(arena_size(&opt)); let mut out = KV { opt: opt.clone(), vlog: None, @@ -118,7 +116,7 @@ impl KV { dir_lock_guard, value_dir_guard, closers, - mt, + mt: None, imm: Vec::new(), write_ch: Channel::new(1), last_used_cas_counter: Default::default(), @@ -142,7 +140,7 @@ impl KV { _out.spawn_update_size().await; }); } - // memtable closer + // mem_table closer { let _out = xout.clone(); tokio::spawn(async move { @@ -198,6 +196,22 @@ impl KV { return Ok(true); } } + let mut nv = vec![]; + let mut meta = entry.meta; + if xout.should_write_value_to_lsm(entry) { + nv = entry.value.clone(); + } else { + nv = Vec::with_capacity(ValuePointer::value_pointer_encoded_size()); + vptr.enc(&mut nv).unwrap(); + meta = meta | MetaBit::BIT_VALUE_POINTER.bits(); + } + let v = ValueStruct { + meta, + user_meta: entry.user_meta, + cas_counter: entry.cas_counter, + value: nv, + }; + while let Err(err) = xout.ensure_room_for_write() {} todo!() })?; // Wait for replay to be applied first. @@ -262,17 +276,17 @@ impl KV { } // Returns the current `mem_tables` and get references. - fn get_mem_tables(&self) -> Vec<&SkipList> { + fn get_mem_tables(&self) -> Vec> { // TODO add kv lock let mut tables = Vec::with_capacity(self.imm.len() + 1); // Get mutable `mem_tables`. - tables.push(&self.mt); + tables.push(self.mt.as_ref().unwrap().clone()); tables[0].incr_ref(); // Get immutable `mem_tables`. for tb in self.imm.iter().rev() { tb.incr_ref(); - tables.push(tb); + tables.push(tb.clone()); } tables } @@ -315,7 +329,6 @@ impl KV { if task.mt.is_none() { break; } - // TODO if is zero? if !task.vptr.is_zero() { info!("Storing offset: {:?}", task.vptr); @@ -344,18 +357,21 @@ impl KV { // TODO use currency async_sync_directory(self.opt.dir.clone().to_string()).await?; let mut fp = tokio::fs::File::from_std(fp); - write_level0_table( - &task.mt.as_ref().unwrap(), - &mut fp, - ) - .await?; + write_level0_table(&task.mt.as_ref().unwrap(), &mut fp).await?; let fp = fp.into_std().await; - let table = TableCore::open_table(fp, &f_name, self.opt.table_loading_mode)?; + + let tc = TableCore::open_table(fp, &f_name, self.opt.table_loading_mode)?; + let tb = Table::new(tc); // We own a ref on tbl. - // task.must_mt().try_add_level0_table(Table::from(table)); - } + self.must_lc().add_level0_table(tb.clone()).await?; // This will incr_ref (if we don't error, sure) + tb.decr_ref(); // releases our ref. + // Update s.imm, need a lock. + // assert!(task.mt.as_ref().unwrap(), "{}", self.imm[0]); + // TODO + task.mt.as_ref().unwrap().decr_ref(); // Return memory + } Ok(()) } @@ -370,6 +386,10 @@ impl KV { let lc = self.lc.upgrade().unwrap(); lc } + + fn must_mt(&self) -> &Arc { + self.mt.as_ref().unwrap() + } } pub type WeakKV = XWeak; @@ -475,6 +495,23 @@ impl ArcKV { } } } + + fn should_write_value_to_lsm(&self, entry: &Entry) -> bool { + entry.value.len() < self.opt.value_threshold + } + + // Always called serially. + fn ensure_room_for_write(&self) -> Result<()> { + // TODO a special global lock for this function + if self.must_mt().mem_size() < self.opt.max_table_size as u32 { + return Ok(()); + } + + // A nil mt indicates that KV is being closed. 
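+        // (This point is only reached once mem_size() >= max_table_size, so the
+        // active memtable cannot be empty.)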
+ assert!(!self.must_mt().empty()); + // let flush_task = FlushTask { mt: Some(self.mt), vptr: self.vlog.as_ref().unwrap() } + todo!() + } } impl Clone for WeakKV { diff --git a/src/levels.rs b/src/levels.rs index 47cc90f..43da190 100644 --- a/src/levels.rs +++ b/src/levels.rs @@ -9,7 +9,9 @@ use crate::table::builder::Builder; use crate::table::iterator::{ConcatIterator, IteratorImpl, IteratorItem}; use crate::table::table::{get_id_map, new_file_name, Table, TableCore}; use crate::types::{Closer, XArc, XWeak}; -use crate::y::{create_synced_file, open_existing_synced_file, sync_directory}; +use crate::y::{ + async_sync_directory, create_synced_file, open_existing_synced_file, sync_directory, +}; use crate::Result; use crate::Xiterator; use crate::{MergeIterOverBuilder, MergeIterOverIterator}; @@ -30,6 +32,7 @@ use std::sync::Arc; use std::time::{Duration, SystemTime}; use std::vec; use tokio::macros::support::thread_rng_n; +use tokio::time::sleep; #[derive(Clone)] pub(crate) struct LevelsController { @@ -121,14 +124,14 @@ impl LevelsController { c_status: Arc::new(cstatus), manifest, opt: opt.clone(), - last_unstalled: Arc::new(SystemTime::now()), + last_unstalled: Arc::new(tokio::sync::RwLock::new(SystemTime::now())), }; if let Err(err) = level_controller.validate() { let _ = level_controller.cleanup_levels(); return Err(format!("Level validation, err:{}", err).into()); } // Sync directory (because we have at least removed some files, or previously created the manifest file). - if let Err(err) = sync_directory(opt.dir.as_str()) { + if let Err(err) = async_sync_directory(*opt.dir.clone()).await { let _ = level_controller.close(); return Err(err); } @@ -321,7 +324,8 @@ impl LevelsController { Ok(()) } - async fn add_level0_table(&self, table: Table) -> Result<()> { + // async to add level0 table + pub(crate) async fn add_level0_table(&self, table: Table) -> Result<()> { // We update the manifest _before_ the table becomes part of a levelHandler, because at that // point it could get used in some compaction. This ensures the manifest file gets updated in // the proper order. (That means this update happens before that of some compaction which @@ -331,15 +335,21 @@ impl LevelsController { .await .add_changes(vec![ManifestChangeBuilder::new(table.id()) .with_level(0) + .with_op(CREATE) .build()]) .await?; while !self.levels[0].try_add_level0_table(table.clone()).await { // Stall. Make sure all levels are healthy before we unstall. - let mut time = SystemTime::now(); + let mut start_time = SystemTime::now(); { info!( "STALLED STALLED STALLED STALLED STALLED STALLED STALLED STALLED: {}ms", - self.last_unstalled.read().await.elapsed().unwrap().as_millis() + self.last_unstalled + .read() + .await + .elapsed() + .unwrap() + .as_millis() ); let c_status = self.c_status.levels.write(); for i in 0..self.opt.max_levels { @@ -350,7 +360,7 @@ impl LevelsController { c_status[i].get_del_size() ) } - time = SystemTime::now(); + start_time = SystemTime::now(); } // Before we unstall, we need to make sure that level 0 and 1 are healthy. Otherwise, we // will very quickly fill up level 0 again and if the compaction strategy favors level 0, @@ -360,7 +370,18 @@ impl LevelsController { // not having finished -- we wait for them to finish. Also, it's crucial this behavior // replicates pickCompactLevels' behavior in computing compactability in order to // guarantee progress. 
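+                // (Concretely: break out and unstall only once neither level 0 nor
+                // level 1 reports itself compactable with delSize = 0.)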
+ if !self.is_level0_compactable() && !self.levels[1].is_compactable(0) { + break; + } + // sleep millis, try it again + sleep(Duration::from_millis(10)).await; } + + info!( + "UNSTALLED UNSTALLED UNSTALLED UNSTALLED UNSTALLED UNSTALLED: {}ms", + start_time.elapsed().unwrap().as_millis() + ); + *self.last_unstalled.write().await = SystemTime::now(); } Ok(()) } diff --git a/src/skl/skip.rs b/src/skl/skip.rs index f9bac1e..eba9373 100644 --- a/src/skl/skip.rs +++ b/src/skl/skip.rs @@ -8,6 +8,7 @@ use std::ops::Deref; use std::sync::atomic::Ordering; use std::sync::Arc; use std::{cmp, ptr, ptr::NonNull, sync::atomic::AtomicI32}; +use log::info; use crate::y::ValueStruct; @@ -42,7 +43,9 @@ impl SkipList { pub fn incr_ref(&self) { self._ref.fetch_add(1, Ordering::Relaxed); } - // Sub crease the reference count + + // Sub crease the reference count, deallocating the skiplist when done using it + // TODO pub fn decr_ref(&self) { self._ref.fetch_sub(1, Ordering::Relaxed); } @@ -380,7 +383,7 @@ impl SkipList { impl Drop for SkipList { fn drop(&mut self) { let _ref = self._ref.load(Ordering::Relaxed); - println!("SkipList reference: {:p} => {:?}", &self, _ref); + info!("Drop SkipList, reference: {}", _ref); self.arena_mut_ref().reset(); } } diff --git a/src/types.rs b/src/types.rs index 713dc08..5c09095 100644 --- a/src/types.rs +++ b/src/types.rs @@ -16,6 +16,9 @@ use range_lock::{VecRangeLock, VecRangeLockGuard}; use tokio::sync::mpsc::{UnboundedSender, WeakUnboundedSender}; use tokio::time::sleep; +type TArcMx = Arc>; +type TArcRW = Arc>; + // Channel like to go's channel #[derive(Clone)] pub struct Channel { From ec47844a64fbb714d5bfd3a4325b62dd77927114 Mon Sep 17 00:00:00 2001 From: Rg Date: Wed, 22 Feb 2023 09:21:26 +0800 Subject: [PATCH 32/77] :card: --- src/doc/write.md | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 src/doc/write.md diff --git a/src/doc/write.md b/src/doc/write.md new file mode 100644 index 0000000..2635829 --- /dev/null +++ b/src/doc/write.md @@ -0,0 +1,28 @@ +Put Key + +```mermaid +%% Example of sequence diagram + sequenceDiagram +actor KV +participant WriteCh +actor FlushCh +KV-->>WriteCh: Async Send Req +activate WriteCh +alt Inner Data Transfer +WriteCh-->>WriteCh: 1. Call writeRequests[Mult Reqs] +WriteCh -->>WriteCh: 2. Write Into Vlog, Fill Ptrs +WriteCh -)WriteCh: 3. Check ensureRoomForWrite +WriteCh -->>FlushCh: 4. Send flushTask{s.mt, s.vptr} to FlushCh +Note right of WriteCh: 1) vlog.sync(): Ensure value log is synced to disk so this memtable's contents wouldn't be lost.
2) s.imm = append(s.imm, s.mt): We manage to push this task. Let's modify imm.
3) s.mt = skl.NewSkiplist(arenaSize(&s.opt)): New memtable is empty. We certainly have room. +WriteCh -->>WriteCh: 5. If not pass 3, writeToLSM +WriteCh-->>WriteCh: 6. updateOffset [update lasted Ptr] +end +WriteCh-->> KV: Async Return Req +deactivate WriteCh +activate FlushCh +FlushCh -->> FlushCh: Receive FlushTask From 4 +FlushCh -->> FlushCh: ft.mt is nil ? and ft.vptr.IsZero()? Put Offset for replay +FlushCh -->> FlushCh: Create a new table, writeLevel0Table and addLevel0Table +deactivate FlushCh +``` + From 7221d94d274df764d94418f637b390ac3ec9ae58 Mon Sep 17 00:00:00 2001 From: Rg Date: Thu, 23 Feb 2023 22:00:25 +0800 Subject: [PATCH 33/77] :coffee: --- Cargo.toml | 1 + src/kv.rs | 8 +----- src/level_handler.rs | 33 +++++++++++------------ src/lib.rs | 1 + src/skl/skip.rs | 64 +++++++++++++++++++++++++++++++++++++++++--- src/st_manager.rs | 28 +++++++++++++++++++ src/table/table.rs | 25 ++++++++--------- src/types.rs | 24 +++++++---------- 8 files changed, 129 insertions(+), 55 deletions(-) create mode 100644 src/st_manager.rs diff --git a/Cargo.toml b/Cargo.toml index d2ebaca..3fd90f5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -40,6 +40,7 @@ range-lock = "0.2.2" tracing = "0.1.37" drop_cell = "0.0.0" walkdir = "2.3.2" +crossbeam-epoch = "0.9.13" [dev-dependencies] tracing-subscriber = "0.3.16" tracing-log = "0.1.3" diff --git a/src/kv.rs b/src/kv.rs index 4668a91..5f9c858 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -364,7 +364,7 @@ impl KV { let tc = TableCore::open_table(fp, &f_name, self.opt.table_loading_mode)?; let tb = Table::new(tc); // We own a ref on tbl. - self.must_lc().add_level0_table(tb.clone()).await?; // This will incr_ref (if we don't error, sure) + // self.must_lc().add_level0_table(tb.clone()).await?; // This will incr_ref (if we don't error, sure) tb.decr_ref(); // releases our ref. // Update s.imm, need a lock. @@ -514,12 +514,6 @@ impl ArcKV { } } -impl Clone for WeakKV { - fn clone(&self) -> Self { - XWeak { x: self.x.clone() } - } -} - async fn write_level0_table(st: &SkipList, f: &mut tokio::fs::File) -> Result<()> { let cur = st.new_cursor(); let mut builder = Builder::default(); diff --git a/src/level_handler.rs b/src/level_handler.rs index d4e9e95..4e7492e 100644 --- a/src/level_handler.rs +++ b/src/level_handler.rs @@ -84,11 +84,11 @@ impl LevelHandler { } pub(crate) fn get_total_size(&self) -> u64 { - self.x.total_size.load(Ordering::Relaxed) + self.total_size.load(Ordering::Relaxed) } pub(crate) fn get_max_total_size(&self) -> u64 { - self.x.max_total_size.load(Ordering::Relaxed) + self.max_total_size.load(Ordering::Relaxed) } // delete current level's tables of to_del @@ -96,7 +96,7 @@ impl LevelHandler { let to_del = to_del.iter().map(|id| *id).collect::>(); let mut tb_wl = self.tables_wl(); tb_wl.retain_mut(|tb| { - if to_del.contains(&tb.x.id()) { + if to_del.contains(&tb.id()) { // delete table reference tb.decr_ref(); return false; @@ -108,15 +108,14 @@ impl LevelHandler { // init with tables pub(crate) fn init_tables(&self, tables: Vec
<Table>) {
         let total_size = tables.iter().fold(0, |acc, table| acc + table.size());
-        self.x
-            .total_size
+        self.total_size
             .store(total_size as u64, Ordering::Relaxed);
         let mut tb_wl = self.tables_wl();
         (*tb_wl) = tables;
-        if self.x.level.load(Ordering::Relaxed) == 0 {
+        if self.level.load(Ordering::Relaxed) == 0 {
             // key range will overlap. Just sort by file_id in ascending order
             // because newer tables are at the end of level 0.
-            tb_wl.sort_by_key(|tb| tb.x.id());
+            tb_wl.sort_by_key(|tb| tb.id());
         } else {
             // Sort tables by keys.
             tb_wl.sort_by_key(|tb| tb.smallest().to_vec());
         }
     }
 
     // Get table write lock guards.
     fn tables_wl(&self) -> RwLockWriteGuard<'_, RawRwLock, Vec<Table>
> {
-        self.x.tables.write()
+        self.tables.write()
     }
 
     // Get table read lock guards
     fn tables_rd(&self) -> RwLockReadGuard<'_, RawRwLock, Vec<Table>
> { - self.x.tables.read() + self.tables.read() } // Returns the tables that intersect with key range. Returns a half-interval [left, right]. @@ -148,7 +147,7 @@ impl LevelHandler { } pub(crate) fn get_total_siz(&self) -> u64 { - self.x.total_size.load(Ordering::Relaxed) + self.total_size.load(Ordering::Relaxed) } // Replace tables[left:right] with new_tables, Note this EXCLUDES tables[right]. @@ -163,8 +162,7 @@ impl LevelHandler { // TODO Add lock (think of level's sharing lock) // Increase total_size first. for tb in &new_tables { - self.x - .total_size + self.total_size .fetch_add(tb.size() as u64, Ordering::Relaxed); // add table reference tb.incr_ref(); @@ -187,8 +185,7 @@ impl LevelHandler { // TODO it should be not a good idea decr reference here, slow lock // decr table reference tb.decr_ref(); - self.x - .total_size + self.total_size .fetch_sub(tb.size() as u64, Ordering::Relaxed); false } @@ -202,13 +199,13 @@ impl LevelHandler { // Return true if ok and no stalling. pub(crate) async fn try_add_level0_table(&self, t: Table) -> bool { - assert_eq!(self.x.level.load(Ordering::Relaxed), 0); + assert_eq!(self.level.load(Ordering::Relaxed), 0); let tw = self.tables_wl(); if tw.len() >= self.opt.num_level_zero_tables_stall { return false; } t.incr_ref(); - self.x + self .total_size .fetch_add(t.size() as u64, Ordering::Relaxed); self.tables_wl().push(t); @@ -228,7 +225,7 @@ impl LevelHandler { // Acquires a read-lock to access s.tables. It returns a list of table_handlers. pub(crate) fn get_table_for_key(&self, key: &[u8]) -> Option { - return if self.x.level.load(Ordering::Relaxed) == 0 { + return if self.level.load(Ordering::Relaxed) == 0 { let tw = self.tables_rd(); for tb in tw.iter().rev() { tb.incr_ref(); @@ -263,7 +260,7 @@ impl LevelHandler { // returns current level pub(crate) fn level(&self) -> usize { - self.x.level.load(Ordering::Relaxed) as usize + self.level.load(Ordering::Relaxed) as usize } } diff --git a/src/lib.rs b/src/lib.rs index 078338e..4fb63ce 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -38,6 +38,7 @@ mod levels; mod pb; #[cfg(test)] mod test_util; +mod st_manager; pub use skl::*; pub use y::*; diff --git a/src/skl/skip.rs b/src/skl/skip.rs index eba9373..794ff17 100644 --- a/src/skl/skip.rs +++ b/src/skl/skip.rs @@ -1,14 +1,15 @@ use crate::skl::{Cursor, HEIGHT_INCREASE, MAX_HEIGHT}; +use log::info; use rand::random; use serde_json::Value; use std::borrow::Cow; use std::collections::HashMap; use std::fmt::{write, Display, Formatter}; use std::ops::Deref; -use std::sync::atomic::Ordering; +use std::ptr::null_mut; +use std::sync::atomic::{AtomicPtr, Ordering}; use std::sync::Arc; use std::{cmp, ptr, ptr::NonNull, sync::atomic::AtomicI32}; -use log::info; use crate::y::ValueStruct; @@ -19,13 +20,24 @@ pub struct SkipList { height: Arc, head: NonNull, _ref: Arc, - pub(crate) arena: Arena, + pub(crate) arena: Arc, } unsafe impl Send for SkipList {} unsafe impl Sync for SkipList {} +impl Clone for SkipList { + fn clone(&self) -> Self { + SkipList { + height: self.height.clone(), + head: NonNull::new(self.head.as_ptr()).unwrap(), + _ref: self._ref.clone(), + arena: self.arena.clone(), + } + } +} + impl SkipList { pub fn new(arena_size: usize) -> Self { let mut arena = Arena::new(arena_size); @@ -35,7 +47,7 @@ impl SkipList { height: Arc::new(AtomicI32::new(1)), head: NonNull::new(node).unwrap(), _ref: Arc::new(AtomicI32::new(1)), - arena, + arena: Arc::new(arena), } } @@ -536,6 +548,39 @@ mod tests { } } + fn t_concurrent_basic2() { + use rand::{thread_rng, Rng}; + + 
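+        // (SkipList::clone() only copies the shared Arc handles, so every spawned
+        // thread below writes into the same arena-backed list.)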
let st = SkipList::new(ARENA_SIZE); + let mut kv = vec![]; + for i in 0..10000 { + kv.push(( + Alphanumeric.sample_string(&mut rand::thread_rng(), 10), + Alphanumeric.sample_string(&mut rand::thread_rng(), 20), + )); + } + + let mut waits = vec![]; + for (key, value) in kv.clone() { + let st_ptr = st.clone(); + waits.push(spawn(move || { + // let st = unsafe {st.as_ref()}; + st_ptr.put( + key.as_bytes(), + ValueStruct::new(value.as_bytes().to_vec(), 0, 0, 0), + ) + })); + } + for join in waits { + join.join().unwrap(); + } + + for (key, value) in kv { + let got = st.get(key.as_bytes()).unwrap(); + assert_eq!(got.value, value.as_bytes().to_vec()); + } + } + #[test] fn t_one_key() { let key = "thekey"; @@ -805,3 +850,14 @@ mod tests { cur.close(); } } + +mod tests2 { + use crate::SkipList; + + const ARENA_SIZE: usize = 1 << 20; + + #[test] + fn atomic_swap_skip_list() { + let mut st = SkipList::new(ARENA_SIZE); + } +} diff --git a/src/st_manager.rs b/src/st_manager.rs new file mode 100644 index 0000000..c7b32fa --- /dev/null +++ b/src/st_manager.rs @@ -0,0 +1,28 @@ +use crate::SkipList; +use crossbeam_epoch::{Atomic, Shared}; +use std::borrow::Borrow; +use std::sync::atomic::Ordering; +use std::sync::Arc; +use tokio::sync::RwLock; + +pub struct SkipListManager { + mt: Atomic>, + imm: Arc>>, +} + +impl Default for SkipListManager { + fn default() -> Self { + todo!() + } +} + +// impl SkipListManager { +// pub unsafe fn must_mt(&self) -> Option<&'_ Option> { +// let p = &crossbeam_epoch::pin(); +// let mt = self.mt.load(Ordering::SeqCst, p); +// mt.as_ref() +// } +// } + +#[test] +fn ti() {} diff --git a/src/table/table.rs b/src/table/table.rs index c1d8c2f..aaa9bc5 100644 --- a/src/table/table.rs +++ b/src/table/table.rs @@ -7,6 +7,7 @@ use byteorder::{BigEndian, ReadBytesExt}; use filename::file_name; use growable_bloom_filter::GrowableBloom; use memmap::{Mmap, MmapMut}; +use std::cell::RefCell; use std::collections::HashSet; use std::fmt::{Display, Formatter}; use std::fs::{read_dir, remove_file, File}; @@ -54,23 +55,23 @@ pub type WeakTable = XWeak; impl Table { pub fn incr_ref(&self) { - self.x.incr_ref() + self.to_ref().incr_ref() } pub fn decr_ref(&self) { - self.x.decr_ref() + self.to_ref().decr_ref() } pub fn size(&self) -> usize { - self.x.size() + self.to_ref().size() } pub fn biggest(&self) -> &[u8] { - &self.x.biggest + &self.biggest } pub fn smallest(&self) -> &[u8] { - &self.x.smallest + &self.smallest } } @@ -85,7 +86,7 @@ pub struct TableCore { _mmap: Option, // Memory mapped. // The following are initialized once and const. smallest: Vec, // smallest keys. - biggest: Vec, // biggest keys. + biggest: Vec, // biggest keys. 
id: u64, bf: GrowableBloom, } @@ -108,7 +109,7 @@ impl TableCore { loading_mode, _mmap: None, smallest: vec![], - biggest: vec![], + biggest: vec![], id, bf: GrowableBloom::new(0.01, 1), }; @@ -142,11 +143,11 @@ impl TableCore { .or_else(|| Some(vec![])) } .unwrap(); - let mut table = Arc::into_inner(table_ref.x).unwrap(); - table.biggest = biggest; - table.smallest = smallest; - println!("open table ==> {}", table); - Ok(table) + let mut tc = table_ref.to_inner().unwrap(); + tc.biggest = biggest; + tc.smallest = smallest; + println!("open table ==> {}", tc); + Ok(tc) } // increments the refcount (having to do with whether the file should be deleted) diff --git a/src/types.rs b/src/types.rs index 5c09095..cf13449 100644 --- a/src/types.rs +++ b/src/types.rs @@ -188,55 +188,51 @@ impl Closer { } #[derive(Debug, Clone)] -pub struct XWeak { - pub(crate) x: Weak, -} +pub struct XWeak(Weak); #[derive(Debug)] -pub struct XArc { - pub(crate) x: Arc, -} +pub struct XArc(Arc); impl Deref for XArc { type Target = T; fn deref(&self) -> &Self::Target { - self.x.deref() + self.0.deref() } } impl Clone for XArc { fn clone(&self) -> Self { - XArc { x: self.x.clone() } + XArc(self.0.clone()) } } impl XArc { pub fn new(x: T) -> XArc { - XArc { x: Arc::new(x) } + XArc(Arc::new(x)) } pub fn to_ref(&self) -> &T { - self.x.as_ref() + self.0.as_ref() } pub fn to_inner(self) -> Option { - Arc::into_inner(self.x) + Arc::into_inner(self.0) } } impl XWeak { pub fn new() -> Self { - Self { x: Weak::new() } + Self { 0: Weak::new() } } pub fn upgrade(&self) -> Option> { - self.x.upgrade().map(|x| XArc { x }) + self.0.upgrade().map(XArc) } pub fn from(xarc: &XArc) -> Self { Self { - x: Arc::downgrade(&xarc.x), + 0: Arc::downgrade(&xarc.0), } } } From b37a7257dc354785cf03449fdc9e2d3abc1726a1 Mon Sep 17 00:00:00 2001 From: Rg Date: Fri, 24 Feb 2023 09:38:41 +0800 Subject: [PATCH 34/77] :card: --- src/kv.rs | 4 +++- src/lib.rs | 1 + src/st_manager.rs | 61 +++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 61 insertions(+), 5 deletions(-) diff --git a/src/kv.rs b/src/kv.rs index 5f9c858..9011897 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -10,7 +10,7 @@ use crate::y::{ async_sync_directory, create_synced_file, sync_directory, Encode, Result, ValueStruct, }; use crate::Error::Unexpected; -use crate::{Decode, Error, Node, SkipList}; +use crate::{Decode, Error, Node, SkipList, SkipListManager}; use bytes::BufMut; use drop_cell::defer; use fs2::FileExt; @@ -63,6 +63,7 @@ pub struct KV { closers: Closers, // Our latest (actively written) in-memory table. 
mt: Option>, + mem_st_manger: Arc, // Add here only AFTER pushing to flush_ch imm: Vec>, write_ch: Channel, @@ -120,6 +121,7 @@ impl KV { imm: Vec::new(), write_ch: Channel::new(1), last_used_cas_counter: Default::default(), + mem_st_manger: Arc::new(SkipListManager::default()), }; let manifest = out.manifest.clone(); diff --git a/src/lib.rs b/src/lib.rs index 4fb63ce..ec15a59 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -42,6 +42,7 @@ mod st_manager; pub use skl::*; pub use y::*; +pub use st_manager::*; #[allow(dead_code)] #[inline] diff --git a/src/st_manager.rs b/src/st_manager.rs index c7b32fa..f0c073f 100644 --- a/src/st_manager.rs +++ b/src/st_manager.rs @@ -1,13 +1,18 @@ use crate::SkipList; -use crossbeam_epoch::{Atomic, Shared}; +use atomic::Atomic; +use drop_cell::defer; +use parking_lot::lock_api::RwLockWriteGuard; +use parking_lot::RawRwLock; use std::borrow::Borrow; +use std::ptr; +use std::ptr::NonNull; use std::sync::atomic::Ordering; use std::sync::Arc; -use tokio::sync::RwLock; pub struct SkipListManager { - mt: Atomic>, - imm: Arc>>, + share_lock: parking_lot::RwLock<()>, + mt: Atomic>, + imm: Arc>>>, } impl Default for SkipListManager { @@ -16,6 +21,54 @@ impl Default for SkipListManager { } } +impl SkipListManager { + pub fn take(&self) -> NonNull { + self.mt.swap(NonNull::dangling(), Ordering::Relaxed) + } + + pub fn set(&self, st: NonNull) { + self.mt.store(st, Ordering::Relaxed); + } + + pub fn mt(&self) -> NonNull { + self.mt.load(Ordering::Relaxed) + } + + pub unsafe fn mt_ref(&self) -> &'_ SkipList { + let st = self.mt(); + st.as_ref() + } + + pub fn imm(&self) -> RwLockWriteGuard<'_, RawRwLock, Vec>> { + self.imm.write() + } + + pub fn append_imm(&self, st: NonNull) { + let mut imm = self.imm(); + imm.push(st); + } + + pub fn swap_st(&self) { + self.lock_exclusive(); + defer! 
{self.unlock_exclusive()} + let st = self.take(); + self.append_imm(st); + let st = Box::new(SkipList::new(1000)); + let ptr = st.as_ref() as *const SkipList as *mut SkipList; + self.set(NonNull::new(ptr).unwrap()); + } + + pub fn lock_exclusive(&self) { + use parking_lot::lock_api::RawRwLock; + unsafe { self.share_lock.raw().lock_exclusive() } + } + + pub fn unlock_exclusive(&self) { + use parking_lot::lock_api::RawRwLock; + unsafe { self.share_lock.raw().unlock_exclusive() } + } +} + // impl SkipListManager { // pub unsafe fn must_mt(&self) -> Option<&'_ Option> { // let p = &crossbeam_epoch::pin(); From 502c7ac1c79b2a4fdc553cc9399c28bef20b1a1a Mon Sep 17 00:00:00 2001 From: Rg Date: Fri, 24 Feb 2023 19:22:03 +0800 Subject: [PATCH 35/77] :coffee: --- src/kv.rs | 177 ++++++++++++++++++++++++++++--------------- src/level_handler.rs | 2 +- src/log_file.rs | 11 ++- src/st_manager.rs | 17 ++++- src/table/table.rs | 6 ++ src/types.rs | 17 ++++- src/value_log.rs | 13 ++-- src/y/mod.rs | 6 ++ 8 files changed, 174 insertions(+), 75 deletions(-) diff --git a/src/kv.rs b/src/kv.rs index 9011897..7fa0a9a 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -11,6 +11,7 @@ use crate::y::{ }; use crate::Error::Unexpected; use crate::{Decode, Error, Node, SkipList, SkipListManager}; +use atomic::Atomic; use bytes::BufMut; use drop_cell::defer; use fs2::FileExt; @@ -19,11 +20,16 @@ use parking_lot::Mutex; use std::borrow::BorrowMut; use std::fs::{read_dir, File}; use std::fs::{try_exists, OpenOptions}; +use std::future::Future; use std::io::{Cursor, Write}; use std::ops::Deref; use std::path::{Path, PathBuf}; -use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; +use std::pin::Pin; +use std::ptr::NonNull; +use std::string; +use std::sync::atomic::{AtomicPtr, AtomicU64, AtomicUsize, Ordering}; use std::sync::{Arc, Weak}; +use std::time::Duration; use tokio::fs::create_dir_all; use tokio::io::{AsyncReadExt, AsyncWriteExt}; use tokio::sync::{RwLock, RwLockWriteGuard}; @@ -54,6 +60,7 @@ impl FlushTask { pub struct KV { pub opt: Options, pub vlog: Option, + pub vptr: crossbeam_epoch::Atomic, pub manifest: Arc>, lc: XWeak, flush_chan: Channel, @@ -62,14 +69,13 @@ pub struct KV { value_dir_guard: File, closers: Closers, // Our latest (actively written) in-memory table. - mt: Option>, mem_st_manger: Arc, // Add here only AFTER pushing to flush_ch - imm: Vec>, write_ch: Channel, // Incremented in the non-concurrently accessed write loop. But also accessed outside. So // we use an atomic op. 
last_used_cas_counter: AtomicU64, + share_lock: tokio::sync::RwLock<()>, } // TODO not add bellow lines @@ -109,6 +115,7 @@ impl KV { let mut out = KV { opt: opt.clone(), vlog: None, + vptr: crossbeam_epoch::Atomic::null(), manifest: Arc::new(RwLock::new(manifest_file)), // lc: Default::default(), lc: XWeak::new(), @@ -117,11 +124,10 @@ impl KV { dir_lock_guard, value_dir_guard, closers, - mt: None, - imm: Vec::new(), write_ch: Channel::new(1), last_used_cas_counter: Default::default(), mem_st_manger: Arc::new(SkipListManager::default()), + share_lock: tokio::sync::RwLock::new(()), }; let manifest = out.manifest.clone(); @@ -183,39 +189,52 @@ impl KV { } let mut first = true; - xout.vlog.as_ref().unwrap().replay(&vptr, |entry, vptr| { - if first { - info!("First key={}", String::from_utf8_lossy(&entry.key)); - } - first = false; - if xout.last_used_cas_counter.load(Ordering::Relaxed) < entry.cas_counter { - xout.last_used_cas_counter - .store(entry.cas_counter, Ordering::Relaxed); - } - if entry.cas_counter_check != 0 { - let old_value = xout.get(&entry.key)?; - if old_value.cas_counter != entry.cas_counter_check { - return Ok(true); - } - } - let mut nv = vec![]; - let mut meta = entry.meta; - if xout.should_write_value_to_lsm(entry) { - nv = entry.value.clone(); - } else { - nv = Vec::with_capacity(ValuePointer::value_pointer_encoded_size()); - vptr.enc(&mut nv).unwrap(); - meta = meta | MetaBit::BIT_VALUE_POINTER.bits(); - } - let v = ValueStruct { - meta, - user_meta: entry.user_meta, - cas_counter: entry.cas_counter, - value: nv, - }; - while let Err(err) = xout.ensure_room_for_write() {} - todo!() - })?; + xout.vlog + .as_ref() + .unwrap() + .replay(&vptr, |entry, vptr| { + let xout = xout.clone(); + Box::pin(async move { + if first { + info!("First key={}", string::String::from_utf8_lossy(&entry.key)); + } + first = false; + if xout.last_used_cas_counter.load(Ordering::Relaxed) < entry.cas_counter { + xout.last_used_cas_counter + .store(entry.cas_counter, Ordering::Relaxed); + } + + // TODO why? + if entry.cas_counter_check != 0 { + let old_value = xout.get(&entry.key)?; + if old_value.cas_counter != entry.cas_counter_check { + return Ok(true); + } + } + let mut nv = vec![]; + let mut meta = entry.meta; + if xout.should_write_value_to_lsm(entry) { + // TODO OPZ memory copy + nv = entry.value.clone(); + } else { + nv = Vec::with_capacity(ValuePointer::value_pointer_encoded_size()); + vptr.enc(&mut nv).unwrap(); + meta = meta | MetaBit::BIT_VALUE_POINTER.bits(); + } + let v = ValueStruct { + meta, + user_meta: entry.user_meta, + cas_counter: entry.cas_counter, + value: nv, + }; + while let Err(err) = xout.ensure_room_for_write().await { + tokio::time::sleep(Duration::from_millis(10)).await; + } + xout.must_mt().put(&entry.key, v); + Ok(true) + }) + }) + .await?; // Wait for replay to be applied first. replay_closer.signal_and_wait().await; Ok(xout) @@ -278,17 +297,20 @@ impl KV { } // Returns the current `mem_tables` and get references. - fn get_mem_tables(&self) -> Vec> { - // TODO add kv lock - let mut tables = Vec::with_capacity(self.imm.len() + 1); + fn get_mem_tables(&self) -> Vec<&SkipList> { + self.mem_st_manger.lock_exclusive(); + defer! {self.mem_st_manger.unlock_exclusive()} + + let mt = self.mem_st_manger.mt_ref(); + let mut tables = Vec::with_capacity(self.mem_st_manger.imm().len() + 1); // Get mutable `mem_tables`. - tables.push(self.mt.as_ref().unwrap().clone()); + tables.push(mt); tables[0].incr_ref(); - // Get immutable `mem_tables`. 
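         // (Newer immutable memtables sit at the back of `imm`, so the reverse walk
         // consults the freshest data first.)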
- for tb in self.imm.iter().rev() { + for tb in self.mem_st_manger.imm().iter().rev() { + let tb = unsafe { tb.as_ref() }; tb.incr_ref(); - tables.push(tb.clone()); + tables.push(tb); } tables } @@ -327,7 +349,9 @@ impl KV { // async to flush memory table into zero level async fn flush_mem_table(&self, lc: Closer) -> Result<()> { + defer! {lc.done()} while let Ok(task) = self.flush_chan.recv().await { + // after kv send empty mt, it will close flush_chan, so we should return the job. if task.mt.is_none() { break; } @@ -356,23 +380,22 @@ impl KV { let f_name = new_file_name(fid, &self.opt.dir); let fp = create_synced_file(&f_name, true)?; // Don't block just to sync the directory entry. - // TODO use currency - async_sync_directory(self.opt.dir.clone().to_string()).await?; + let task1 = async_sync_directory(self.opt.dir.clone().to_string()); let mut fp = tokio::fs::File::from_std(fp); - write_level0_table(&task.mt.as_ref().unwrap(), &mut fp).await?; + let task2 = write_level0_table(&task.mt.as_ref().unwrap(), &mut fp); + let (task1_res, task2_res) = tokio::join!(task1, task2); + task1_res?; + task2_res?; let fp = fp.into_std().await; - let tc = TableCore::open_table(fp, &f_name, self.opt.table_loading_mode)?; - let tb = Table::new(tc); + let tb = Table::from(tc); // We own a ref on tbl. - // self.must_lc().add_level0_table(tb.clone()).await?; // This will incr_ref (if we don't error, sure) + self.must_lc().add_level0_table(tb.clone()).await?; + // This will incr_ref (if we don't error, sure) tb.decr_ref(); // releases our ref. - - // Update s.imm, need a lock. - // assert!(task.mt.as_ref().unwrap(), "{}", self.imm[0]); - // TODO - task.mt.as_ref().unwrap().decr_ref(); // Return memory + self.mem_st_manger.advance_imm(task.must_mt()); // Update s.imm, need a lock. + task.must_mt().decr_ref(); // Return memory } Ok(()) } @@ -388,9 +411,21 @@ impl KV { let lc = self.lc.upgrade().unwrap(); lc } + fn must_mt(&self) -> &SkipList { + self.mem_st_manger.mt_ref() + } - fn must_mt(&self) -> &Arc { - self.mt.as_ref().unwrap() + fn must_vlog(&self) -> &ValueLogCore { + self.vlog.as_ref().unwrap() + } + + fn must_vptr(&self) -> ValuePointer { + let p = crossbeam_epoch::pin(); + let ptr = self.vptr.load(Ordering::Relaxed, &p); + if ptr.is_null() { + return ValuePointer::default(); + } + unsafe { ptr.as_ref().unwrap().clone() } } } @@ -503,16 +538,36 @@ impl ArcKV { } // Always called serially. - fn ensure_room_for_write(&self) -> Result<()> { + async fn ensure_room_for_write(&self) -> Result<()> { // TODO a special global lock for this function + let _ = self.share_lock.write().await; if self.must_mt().mem_size() < self.opt.max_table_size as u32 { return Ok(()); } // A nil mt indicates that KV is being closed. assert!(!self.must_mt().empty()); - // let flush_task = FlushTask { mt: Some(self.mt), vptr: self.vlog.as_ref().unwrap() } - todo!() + + let flush_task = FlushTask { + mt: Some(self.must_mt().clone()), + vptr: self.must_vptr(), + }; + if let Ok(_) = self.flush_chan.try_send(flush_task) { + info!("Flushing value log to disk if async mode."); + // Ensure value log is synced to disk so this memtable's contents wouldn't be lost. + self.must_vlog().sync()?; + info!( + "Flushing memtable, mt.size={} size of flushChan: {}", + self.must_mt().mem_size(), + self.flush_chan.tx().len() + ); + // We manage to push this task. Let's modify imm. + self.mem_st_manger.swap_st(); + // New memtable is empty. We certainly have room. 
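+            // (swap_st parks the full memtable on `imm` for the flush worker and
+            // installs a fresh SkipList as the active mt.)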
+ Ok(()) + } else { + Err(Unexpected("No room for write".into())) + } } } diff --git a/src/level_handler.rs b/src/level_handler.rs index 4e7492e..fe12761 100644 --- a/src/level_handler.rs +++ b/src/level_handler.rs @@ -197,7 +197,7 @@ impl LevelHandler { Ok(()) } - // Return true if ok and no stalling. + // Return true if ok and no stalling that will hold a new table reference pub(crate) async fn try_add_level0_table(&self, t: Table) -> bool { assert_eq!(self.level.load(Ordering::Relaxed), 0); let tw = self.tables_wl(); diff --git a/src/log_file.rs b/src/log_file.rs index 0e36e0e..0603cc7 100644 --- a/src/log_file.rs +++ b/src/log_file.rs @@ -7,7 +7,9 @@ use memmap::MmapMut; use parking_lot::lock_api::{RwLockReadGuard, RwLockWriteGuard}; use parking_lot::{RawRwLock, RwLock}; use std::fs::File; +use std::future::Future; use std::io::{Read, Seek, SeekFrom}; +use std::pin::Pin; #[derive(Debug)] pub(crate) struct LogFile { @@ -79,10 +81,13 @@ impl LogFile { Ok(()) } - pub(crate) fn iterate( + pub(crate) async fn iterate( &mut self, offset: u32, - f: &mut impl FnMut(&Entry, &ValuePointer) -> Result, + f: &mut impl for<'a> FnMut( + &'a Entry, + &'a ValuePointer, + ) -> Pin> + 'a>>, ) -> Result<()> { let mut fd = self.fd.as_mut().unwrap(); fd.seek(SeekFrom::Start(offset as u64))?; @@ -135,7 +140,7 @@ impl LogFile { vp.offset = entry.offset; vp.fid = self.fid; - let _continue = f(&entry, &vp)?; + let _continue = f(&entry, &vp).await?; if !_continue { break; } diff --git a/src/st_manager.rs b/src/st_manager.rs index f0c073f..fa4a109 100644 --- a/src/st_manager.rs +++ b/src/st_manager.rs @@ -34,9 +34,11 @@ impl SkipListManager { self.mt.load(Ordering::Relaxed) } - pub unsafe fn mt_ref(&self) -> &'_ SkipList { - let st = self.mt(); - st.as_ref() + pub fn mt_ref(&self) -> &'_ SkipList { + unsafe { + let st = self.mt(); + st.as_ref() + } } pub fn imm(&self) -> RwLockWriteGuard<'_, RawRwLock, Vec>> { @@ -48,6 +50,7 @@ impl SkipListManager { imm.push(st); } + // TODO pub fn swap_st(&self) { self.lock_exclusive(); defer! {self.unlock_exclusive()} @@ -58,6 +61,14 @@ impl SkipListManager { self.set(NonNull::new(ptr).unwrap()); } + pub fn advance_imm(&self, mt: &SkipList) { + self.lock_exclusive(); + defer! 
{self.unlock_exclusive()}; + let mut imm = self.imm(); + assert!(ptr::eq(imm[0].as_ptr(), mt)); + imm.remove(0); + } + pub fn lock_exclusive(&self) { use parking_lot::lock_api::RawRwLock; unsafe { self.share_lock.raw().lock_exclusive() } diff --git a/src/table/table.rs b/src/table/table.rs index aaa9bc5..d384d3c 100644 --- a/src/table/table.rs +++ b/src/table/table.rs @@ -53,6 +53,12 @@ impl Display for KeyOffset { pub type Table = XArc; pub type WeakTable = XWeak; +impl From for Table { + fn from(value: TableCore) -> Self { + Table::new(value) + } +} + impl Table { pub fn incr_ref(&self) { self.to_ref().incr_ref() diff --git a/src/types.rs b/src/types.rs index cf13449..d03e781 100644 --- a/src/types.rs +++ b/src/types.rs @@ -10,8 +10,11 @@ use std::{hint, mem, thread}; use async_channel::{ bounded, unbounded, Receiver, Recv, RecvError, SendError, Sender, TryRecvError, TrySendError, }; +use atomic::Atomic; +use crossbeam_epoch::Owned; use log::info; +use crate::value_log::ValuePointer; use range_lock::{VecRangeLock, VecRangeLockGuard}; use tokio::sync::mpsc::{UnboundedSender, WeakUnboundedSender}; use tokio::time::sleep; @@ -297,4 +300,16 @@ fn it_closer() { } #[tokio::test] -async fn lck() {} +async fn lck() { + use crossbeam_epoch::{self as epoch, Atomic, Shared}; + use std::sync::atomic::Ordering::SeqCst; + + let a = Atomic::new(1234); + let guard = &epoch::pin(); + // let p = a.swap(Shared::null(), SeqCst, guard); + // println!("{:?}", unsafe { p.as_ref().unwrap()}); + let p = a.swap(Owned::new(200), SeqCst, guard); + let p = a.swap(Owned::new(200), SeqCst, guard); + + println!("{:?}", unsafe { p.as_ref().unwrap()}); +} diff --git a/src/value_log.rs b/src/value_log.rs index 4c66108..33d66c4 100644 --- a/src/value_log.rs +++ b/src/value_log.rs @@ -15,12 +15,14 @@ use std::cell::{Ref, RefCell, RefMut}; use std::collections::{HashMap, HashSet}; use std::fmt::Formatter; use std::fs::{read_dir, File, OpenOptions}; +use std::future::Future; use std::io::{BufWriter, Cursor, Read, Seek, SeekFrom, Write}; use std::marker::PhantomData; use std::mem::size_of; use std::ops::{Deref, Index}; use std::path::Path; -use std::process::id; +use std::pin::Pin; +use std::process::{id, Output}; use std::sync::atomic::{AtomicI32, AtomicU32, AtomicU64, Ordering}; use std::sync::Arc; use std::{fmt, fs, thread}; @@ -440,10 +442,10 @@ impl ValueLogCore { } /// Replays the value log. The kv provide is only valid for the lifetime of function call. - pub fn replay( + pub async fn replay( &self, vp: &ValuePointer, - mut f: impl FnMut(&Entry, &ValuePointer) -> Result, + mut f: impl for<'a> FnMut(&'a Entry, &'a ValuePointer) -> Pin>+'a>>, ) -> Result<()> { let vlogs = self.pick_log_guard(); info!("Seeking at value pointer: {:?}", vp); @@ -457,7 +459,7 @@ impl ValueLogCore { of = 0; } let log_file = vlogs.vlogs.get(&id).unwrap(); - log_file.write().iterate(of, &mut f)?; + log_file.write().iterate(of, &mut f).await?; } // Seek to the end to start writing. let last_file = vlogs @@ -480,7 +482,7 @@ impl ValueLogCore { } // sync is thread-unsafe and should not be called concurrently with write. 
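     // (With opt.sync_writes enabled every append is synced as it happens, which is
     // why the body below may return Ok immediately in that mode.)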
- fn sync(&self) -> Result<()> { + pub(crate) fn sync(&self) -> Result<()> { if self.opt.sync_writes { return Ok(()); } @@ -761,7 +763,6 @@ fn it() { #[tokio::test] async fn lock1() { - let req: RwLock>> = RwLock::new(Vec::new()); tokio::spawn(async move { diff --git a/src/y/mod.rs b/src/y/mod.rs index 7edf1fd..46ac979 100644 --- a/src/y/mod.rs +++ b/src/y/mod.rs @@ -13,6 +13,7 @@ use std::error::Error as _; use std::fs::{File, OpenOptions, Permissions}; use std::hash::Hasher; use std::io::{ErrorKind, Write}; +use std::sync::mpsc::sync_channel; use std::{cmp, io}; use thiserror::Error; @@ -269,6 +270,11 @@ pub(crate) fn create_synced_file(file_name: &str, synce: bool) -> Result { .map_err(|err| err.into()) } +pub(crate) fn async_create_synced_file(file_name: &str, synced: bool) -> Result { + let fp = create_synced_file(file_name, synced)?; + Ok(tokio::fs::File::from_std(fp)) +} + pub(crate) fn sync_directory(d: &str) -> Result<()> { let mut fp = File::open(d)?; fp.sync_all().map_err(|err| err.into()) From fd5d06f78d804d593f47164e721258395a6c7aab Mon Sep 17 00:00:00 2001 From: Rg Date: Sat, 25 Feb 2023 15:36:44 +0800 Subject: [PATCH 36/77] :card: --- src/kv.rs | 28 ++++++++++++------- src/level_handler.rs | 6 +++- src/levels.rs | 18 +++++++++++- src/st_manager.rs | 65 ++++++++++++++++++-------------------------- 4 files changed, 67 insertions(+), 50 deletions(-) diff --git a/src/kv.rs b/src/kv.rs index 7fa0a9a..f65c7d8 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -13,6 +13,7 @@ use crate::Error::Unexpected; use crate::{Decode, Error, Node, SkipList, SkipListManager}; use atomic::Atomic; use bytes::BufMut; +use crossbeam_epoch::Shared; use drop_cell::defer; use fs2::FileExt; use log::{info, Log}; @@ -278,11 +279,12 @@ impl KV { // get returns the value in `mem_table` or disk for given key. // Note that value will include meta byte. pub(crate) fn get(&self, key: &[u8]) -> Result { - let tables = self.get_mem_tables(); + let p = crossbeam_epoch::pin(); + let tables = self.get_mem_tables(&p); // TODO add metrics for tb in tables { - let vs = tb.get(key); + let vs = unsafe { tb.as_ref().unwrap().get(key) }; if vs.is_none() { continue; } @@ -293,23 +295,26 @@ impl KV { } } - todo!() + self.must_lc() + .get(key) + .ok_or("Not found".into()) } // Returns the current `mem_tables` and get references. - fn get_mem_tables(&self) -> Vec<&SkipList> { + fn get_mem_tables<'a>(&'a self, p: &'a crossbeam_epoch::Guard) -> Vec> { self.mem_st_manger.lock_exclusive(); defer! {self.mem_st_manger.unlock_exclusive()} - let mt = self.mem_st_manger.mt_ref(); + let mt = self.mem_st_manger.mt_ref(p); let mut tables = Vec::with_capacity(self.mem_st_manger.imm().len() + 1); // Get mutable `mem_tables`. tables.push(mt); - tables[0].incr_ref(); + // TODO + unsafe { tables[0].as_ref().unwrap().incr_ref() }; // Get immutable `mem_tables`. for tb in self.mem_st_manger.imm().iter().rev() { - let tb = unsafe { tb.as_ref() }; - tb.incr_ref(); + let tb = tb.load(Ordering::Relaxed, &p); + unsafe { tb.as_ref().unwrap().incr_ref() }; tables.push(tb); } tables @@ -411,8 +416,11 @@ impl KV { let lc = self.lc.upgrade().unwrap(); lc } + fn must_mt(&self) -> &SkipList { - self.mem_st_manger.mt_ref() + let p = crossbeam_epoch::pin(); + let st = self.mem_st_manger.mt_ref(&p).as_raw(); + unsafe { &*st } } fn must_vlog(&self) -> &ValueLogCore { @@ -562,7 +570,7 @@ impl ArcKV { self.flush_chan.tx().len() ); // We manage to push this task. Let's modify imm. 
- self.mem_st_manger.swap_st(); + self.mem_st_manger.swap_st(self.opt.clone()); // New memtable is empty. We certainly have room. Ok(()) } else { diff --git a/src/level_handler.rs b/src/level_handler.rs index fe12761..d45a527 100644 --- a/src/level_handler.rs +++ b/src/level_handler.rs @@ -4,7 +4,7 @@ use crate::table::iterator::{ConcatIterator, IteratorImpl, IteratorItem}; use crate::table::table::{Table, TableCore}; use crate::types::{Channel, XArc, XWeak}; use crate::y::merge_iterator::MergeIterOverBuilder; -use crate::Result; +use crate::{Result, ValueStruct}; use core::slice::SlicePattern; use crate::levels::CompactDef; @@ -258,6 +258,10 @@ impl LevelHandler { }; } + pub(crate) fn get(&self, key: &[u8]) -> Option{ + self.get_table_for_key(key) + } + // returns current level pub(crate) fn level(&self) -> usize { self.level.load(Ordering::Relaxed) as usize diff --git a/src/levels.rs b/src/levels.rs index 43da190..bf98de7 100644 --- a/src/levels.rs +++ b/src/levels.rs @@ -12,9 +12,9 @@ use crate::types::{Closer, XArc, XWeak}; use crate::y::{ async_sync_directory, create_synced_file, open_existing_synced_file, sync_directory, }; -use crate::Result; use crate::Xiterator; use crate::{MergeIterOverBuilder, MergeIterOverIterator}; +use crate::{Result, ValueStruct}; use atomic::Ordering; use awaitgroup::WaitGroup; use drop_cell::defer; @@ -150,6 +150,22 @@ impl LevelsController { self.cleanup_levels() } + // returns the found value if any. If not found, we return nil. + pub(crate) fn get(&self, key: &[u8]) -> Option { + // It's important that we iterate the levels from 0 on upward. The reason is, if we iterated + // in opposite order, or in parallel (naively calling all the h.RLock() in some order) we could + // read level L's tables post-compaction and level L+1's tables pre-compaction. (If we do + // parallelize this, we will need to call the h.RLock() function by increasing order of level + // number.) 
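+        // (Scanning from level 0 upward and returning the first hit therefore yields
+        // the newest version of the key.)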
+ for h in self.levels.iter() { + let item = h.get(key); + if item.is_some() { + return Some(item.unwrap().value().clone()); + } + } + None + } + // cleanup all level's handler fn cleanup_levels(&self) -> Result<()> { for level in self.levels.iter() { diff --git a/src/st_manager.rs b/src/st_manager.rs index fa4a109..a949c0d 100644 --- a/src/st_manager.rs +++ b/src/st_manager.rs @@ -1,18 +1,24 @@ +use crate::options::Options; use crate::SkipList; use atomic::Atomic; +use crossbeam_epoch::Shared; use drop_cell::defer; use parking_lot::lock_api::RwLockWriteGuard; use parking_lot::RawRwLock; use std::borrow::Borrow; +use std::borrow::Cow::Owned; +use std::ops::Deref; use std::ptr; use std::ptr::NonNull; use std::sync::atomic::Ordering; use std::sync::Arc; +type SkipListItem = crossbeam_epoch::Atomic; + pub struct SkipListManager { share_lock: parking_lot::RwLock<()>, - mt: Atomic>, - imm: Arc>>>, + mt: SkipListItem, + imm: Arc>>, } impl Default for SkipListManager { @@ -22,50 +28,41 @@ impl Default for SkipListManager { } impl SkipListManager { - pub fn take(&self) -> NonNull { - self.mt.swap(NonNull::dangling(), Ordering::Relaxed) - } - - pub fn set(&self, st: NonNull) { - self.mt.store(st, Ordering::Relaxed); - } - - pub fn mt(&self) -> NonNull { - self.mt.load(Ordering::Relaxed) + pub fn take<'a>(&'a self, p: &'a crossbeam_epoch::Guard) -> Shared<'a, SkipList> { + self.mt.load_consume(p) } - pub fn mt_ref(&self) -> &'_ SkipList { - unsafe { - let st = self.mt(); - st.as_ref() - } + pub fn mt_ref<'a>(&'a self, p: &'a crossbeam_epoch::Guard) -> Shared<'a, SkipList> { + let st = self.mt.load(Ordering::Relaxed, &p); + st } - pub fn imm(&self) -> RwLockWriteGuard<'_, RawRwLock, Vec>> { + pub fn imm(&self) -> RwLockWriteGuard<'_, RawRwLock, Vec> { self.imm.write() } - pub fn append_imm(&self, st: NonNull) { - let mut imm = self.imm(); - imm.push(st); - } - // TODO - pub fn swap_st(&self) { + pub fn swap_st(&self, opt: Options) { self.lock_exclusive(); defer! {self.unlock_exclusive()} - let st = self.take(); - self.append_imm(st); - let st = Box::new(SkipList::new(1000)); - let ptr = st.as_ref() as *const SkipList as *mut SkipList; - self.set(NonNull::new(ptr).unwrap()); + let p = crossbeam_epoch::pin(); + let st = self.take(&p).into(); + self.imm.write().push(st); + let st = SkipList::new(1000); + self.mt + .store(crossbeam_epoch::Owned::new(st), Ordering::Relaxed); } pub fn advance_imm(&self, mt: &SkipList) { self.lock_exclusive(); defer! 
{self.unlock_exclusive()}; let mut imm = self.imm(); - assert!(ptr::eq(imm[0].as_ptr(), mt)); + let first_imm = imm + .first() + .unwrap() + .load(Ordering::Relaxed, &crossbeam_epoch::pin()) + .as_raw(); + assert!(ptr::eq(first_imm, mt)); imm.remove(0); } @@ -80,13 +77,5 @@ impl SkipListManager { } } -// impl SkipListManager { -// pub unsafe fn must_mt(&self) -> Option<&'_ Option> { -// let p = &crossbeam_epoch::pin(); -// let mt = self.mt.load(Ordering::SeqCst, p); -// mt.as_ref() -// } -// } - #[test] fn ti() {} From f82b9021896ba47e2af5f6f442c1d989a433e56b Mon Sep 17 00:00:00 2001 From: Rg Date: Sat, 25 Feb 2023 21:54:26 +0800 Subject: [PATCH 37/77] :coffee: --- Cargo.toml | 1 + src/compaction.rs | 2 +- src/kv.rs | 32 ++++++++++++++----------- src/levels.rs | 59 ++++++++++++++++++----------------------------- src/value_log.rs | 34 +++++++++++++++++++++------ 5 files changed, 71 insertions(+), 57 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 3fd90f5..1add62e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,6 +41,7 @@ tracing = "0.1.37" drop_cell = "0.0.0" walkdir = "2.3.2" crossbeam-epoch = "0.9.13" +tokio-context = "0.1.3" [dev-dependencies] tracing-subscriber = "0.3.16" tracing-log = "0.1.3" diff --git a/src/compaction.rs b/src/compaction.rs index 2a9af24..154201d 100644 --- a/src/compaction.rs +++ b/src/compaction.rs @@ -139,7 +139,7 @@ impl LevelCompactStatus { } // [left, right], Special inf is range all if it be set `true` -#[derive(Clone, Debug)] +#[derive(Clone, Default, Debug)] pub(crate) struct KeyRange { pub(crate) left: Vec, // TODO zero Copy pub(crate) right: Vec, diff --git a/src/kv.rs b/src/kv.rs index f65c7d8..02bfaf5 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -77,6 +77,8 @@ pub struct KV { // we use an atomic op. last_used_cas_counter: AtomicU64, share_lock: tokio::sync::RwLock<()>, + ctx: tokio_context::context::Context, + ctx_handle: tokio_context::context::Handle, } // TODO not add bellow lines @@ -84,7 +86,7 @@ unsafe impl Send for KV {} unsafe impl Sync for KV {} impl KV { - pub async fn Open(mut opt: Options) -> Result> { + async fn open(mut opt: Options) -> Result> { opt.max_batch_size = (15 * opt.max_table_size) / 100; opt.max_batch_count = opt.max_batch_size / Node::size() as u64; create_dir_all(opt.dir.as_str()).await?; @@ -112,7 +114,7 @@ impl KV { writes: Closer::new(), value_gc: Closer::new(), }; - // go out.updateSize(out.closers.updateSize) + let (ctx, h) = tokio_context::context::Context::new(); let mut out = KV { opt: opt.clone(), vlog: None, @@ -129,6 +131,8 @@ impl KV { last_used_cas_counter: Default::default(), mem_st_manger: Arc::new(SkipListManager::default()), share_lock: tokio::sync::RwLock::new(()), + ctx, + ctx_handle: h, }; let manifest = out.manifest.clone(); @@ -215,7 +219,6 @@ impl KV { let mut nv = vec![]; let mut meta = entry.meta; if xout.should_write_value_to_lsm(entry) { - // TODO OPZ memory copy nv = entry.value.clone(); } else { nv = Vec::with_capacity(ValuePointer::value_pointer_encoded_size()); @@ -238,6 +241,9 @@ impl KV { .await?; // Wait for replay to be applied first. replay_closer.signal_and_wait().await; + + // Mmap writeable log + // let lf = xout.must_vlog().files_log.read()[xout.must_vlog().max_fid.load(Ordering::Relaxed)]; Ok(xout) } @@ -295,9 +301,7 @@ impl KV { } } - self.must_lc() - .get(key) - .ok_or("Not found".into()) + self.must_lc().get(key).ok_or("Not found".into()) } // Returns the current `mem_tables` and get references. 
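     // (The epoch Guard pinned by the caller keeps the returned Shared pointers
     // valid; each skiplist is incr_ref'd before being handed out.)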
@@ -442,14 +446,17 @@ pub type WeakKV = XWeak; pub type ArcKV = XArc; impl ArcKV { + /// Async open a KV db + pub async fn open(op: Options) -> Result { + KV::open(op).await + } + /// data size stats /// TODO pub async fn spawn_update_size(&self) { let lc = self.closers.update_size.spawn(); - defer! { - lc.done(); - info!("exit update size worker"); - } + defer! {lc.done()} + defer! {info!("exit update size worker");} let mut tk = tokio::time::interval(tokio::time::Duration::from_secs(5 * 60)); let dir = self.opt.dir.clone(); @@ -465,9 +472,7 @@ impl ArcKV { let (_, vlog_sz) = KV::walk_dir(dir.as_str()).await.unwrap(); } }, - _ = c.recv() => { - - }, + _ = c.recv() => {return;}, } } } @@ -489,6 +494,7 @@ impl ArcKV { impl ArcKV { async fn do_writes(&self, lc: Closer) { + info!("start do writes task!"); // TODO add metrics let has_been_close = lc.has_been_closed(); let write_ch = self.write_ch.clone(); diff --git a/src/levels.rs b/src/levels.rs index bf98de7..208568d 100644 --- a/src/levels.rs +++ b/src/levels.rs @@ -231,9 +231,7 @@ impl LevelsController { async fn do_compact(&self, p: CompactionPriority) -> Result { let l = p.level; assert!(l + 1 < self.opt.max_levels); // Sanity check. - let mut cd = CompactDef::default(); - cd.this_level = (self.levels[l]).clone(); - cd.next_level = (self.levels[l + 1]).clone(); + let mut cd = CompactDef::new(self.levels[l].clone(), self.levels[l + 1].clone()); info!("Got compaction priority: {:?}", p); // While picking tables to be compacted, both level's tables are expected to // remain unchanged. @@ -744,30 +742,19 @@ impl Display for CompactDef { } } -impl Default for CompactDef { - fn default() -> Self { - // CompactDef { - // this_level: XWeak::new(), - // next_level: XWeak::new(), - // top: RwLockReadGuard::, - // bot: RwLock::new(vec![]), - // this_range: KeyRange { - // left: vec![], - // right: vec![], - // inf: false, - // }, - // next_range: KeyRange { - // left: vec![], - // right: vec![], - // inf: false, - // }, - // this_size: Default::default(), - // } - todo!() +impl CompactDef { + pub(crate) fn new(this_level: LevelHandler, next_level: LevelHandler) -> Self { + CompactDef { + this_level, + next_level, + top: vec![], + bot: vec![], + this_range: KeyRange::default(), + next_range: KeyRange::default(), + this_size: Default::default(), + } } -} -impl CompactDef { #[inline] fn lock_shared_levels(&self) { self.this_level.lock_shared(); @@ -780,17 +767,17 @@ impl CompactDef { self.this_level.unlock_shared(); } - #[inline] - fn lock_exclusive_levels(&self) { - self.this_level.lock_exclusive(); - self.next_level.lock_exclusive(); - } - - #[inline] - fn unlock_exclusive_levels(&self) { - self.next_level.unlock_exclusive(); - self.this_level.unlock_exclusive(); - } + // #[inline] + // fn lock_exclusive_levels(&self) { + // self.this_level.lock_exclusive(); + // self.next_level.lock_exclusive(); + // } + // + // #[inline] + // fn unlock_exclusive_levels(&self) { + // self.next_level.unlock_exclusive(); + // self.this_level.unlock_exclusive(); + // } } // Checks that all necessary table files exist and removes all table files not diff --git a/src/value_log.rs b/src/value_log.rs index 33d66c4..d33af05 100644 --- a/src/value_log.rs +++ b/src/value_log.rs @@ -27,6 +27,7 @@ use std::sync::atomic::{AtomicI32, AtomicU32, AtomicU64, Ordering}; use std::sync::Arc; use std::{fmt, fs, thread}; use tabled::object::Entity::Cell; +use tokio::macros::support::thread_rng_n; use crate::kv::{ArcKV, WeakKV, KV}; use crate::log_file::LogFile; @@ -37,7 +38,7 @@ use 
crate::types::{Channel, Closer, XArc}; use crate::y::{ create_synced_file, is_eof, open_existing_synced_file, read_at, sync_directory, Decode, Encode, }; -use crate::Error::Unexpected; +use crate::Error::{Unexpected, ValueNoRewrite, ValueRejected}; use crate::{Error, Result}; /// Values have their first byte being byteData or byteDelete. This helps us distinguish between @@ -261,10 +262,10 @@ impl From for ArcRequest { pub struct ValueLogCore { dir_path: Box, - max_fid: AtomicU32, + pub(crate) max_fid: AtomicU32, // TODO // guards our view of which files exist, which to be deleted, how many active iterators - files_log: Arc>, + pub(crate) files_log: Arc>, vlogs: Arc>>>>, dirty_vlogs: Arc>>, // TODO why? @@ -337,6 +338,7 @@ impl ValueLogCore { self.kv = kv; self.open_create_files()?; // todo add garbage and metrics + self.garbage_ch = Channel::new(1); Ok(()) } @@ -445,7 +447,10 @@ impl ValueLogCore { pub async fn replay( &self, vp: &ValuePointer, - mut f: impl for<'a> FnMut(&'a Entry, &'a ValuePointer) -> Pin>+'a>>, + mut f: impl for<'a> FnMut( + &'a Entry, + &'a ValuePointer, + ) -> Pin> + 'a>>, ) -> Result<()> { let vlogs = self.pick_log_guard(); info!("Seeking at value pointer: {:?}", vp); @@ -664,9 +669,6 @@ impl ValueLogCore { fn wait_gc(&self) { todo!() } - fn run_gc(&self) -> Result<()> { - todo!() - } } struct PickVlogsGuardsReadLock<'a> { @@ -711,6 +713,24 @@ impl SafeValueLog { } async fn do_run_gcc(&self, gc_threshold: f64) -> Result<()> { + let lf = self.value_log.pick_log(); + if lf.is_none() { + return Err(ValueNoRewrite); + } + #[derive(Debug, Default)] + struct Reason { + total: f64, + keep: f64, + discard: f64, + } + let mut reason = Reason::default(); + let mut window = 100.0; + let mut count = 0; + // Pick a random start point for the log. 
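The sampling strategy the comment above introduces: scan roughly one `window` (100 MB here) of entries starting at a uniformly random megabyte offset, so repeated GC passes do not keep re-reading the head of the file. Note that tokio's `thread_rng_n(n)` returns a random `u32` below `n`, not an RNG, which is what the lines below trip over (later patches in this series call it that way directly). A minimal sketch of the intended computation using the `rand` crate instead, assuming `M` is 1 MiB and the file is at least that large:

    use rand::Rng;

    const M: u64 = 1 << 20; // assumed: 1 MiB

    // Uniformly pick how many megabytes to skip before sampling begins,
    // backing off by one `window` so a full window fits before end-of-file.
    fn pick_skip_first_m(value_log_file_size: u64, window: f64) -> f64 {
        let upper = (value_log_file_size / M) as u32;
        rand::thread_rng().gen_range(0..upper) as f64 - window
    }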
+ let skip_first_m = { + let mut rng = thread_rng_n((self.value_log.opt.value_log_file_size / M) as u32); + let x: u32 = rng.gen() + }; Ok(()) } } From 02e2f14fdd5ebb5b95d476f39ea6b58fad940190 Mon Sep 17 00:00:00 2001 From: Rg Date: Sun, 26 Feb 2023 23:54:30 +0800 Subject: [PATCH 38/77] :card: --- Cargo.toml | 2 ++ src/kv.rs | 6 ++---- src/value_log.rs | 8 ++++---- src/y/mod.rs | 37 +++++++++++++++++++++++++++++++++++-- 4 files changed, 43 insertions(+), 10 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 1add62e..1bdf5c1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -42,6 +42,8 @@ drop_cell = "0.0.0" walkdir = "2.3.2" crossbeam-epoch = "0.9.13" tokio-context = "0.1.3" +dyn-clone = "1.0.10" +eieio = "1.0.0" [dev-dependencies] tracing-subscriber = "0.3.16" tracing-log = "0.1.3" diff --git a/src/kv.rs b/src/kv.rs index 02bfaf5..ad34a5d 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -518,8 +518,7 @@ impl ArcKV { .collect::>(); let to_reqs = Arc::new(to_reqs); if let Err(err) = self.write_requests(to_reqs).await { - let ret = Arc::new(Err(err)); - reqs.lock().iter().for_each(|req| req.set_err(ret.clone())); + reqs.lock().iter().for_each(|req| req.set_err(Err(err.clone()))); } reqs.lock().clear(); } @@ -541,8 +540,7 @@ impl ArcKV { .map(|req| req.clone()) .collect::>(); if let Err(err) = self.write_requests(Arc::new(to_reqs)).await { - let ret = Arc::new(Err(err)); - reqs.lock().iter().for_each(|req| req.set_err(ret.clone())); + reqs.lock().iter().for_each(|req| req.set_err(Err(err.clone()))); } } } diff --git a/src/value_log.rs b/src/value_log.rs index d33af05..e3db6f0 100644 --- a/src/value_log.rs +++ b/src/value_log.rs @@ -231,7 +231,7 @@ impl Request { #[derive(Clone)] pub(crate) struct ArcRequest { inner: Arc, - err: Arc>>>, + err: Arc>>, } unsafe impl Send for ArcRequest {} @@ -242,7 +242,7 @@ impl ArcRequest { pub(crate) fn get_req(&self) -> Arc { self.inner.clone() } - pub(crate) fn set_err(&self, err: Arc>) { + pub(crate) fn set_err(&self, err: Result<()>) { *self.err.lock() = err; } @@ -255,7 +255,7 @@ impl From for ArcRequest { fn from(value: Request) -> Self { ArcRequest { inner: Arc::new(value), - err: Arc::new(Mutex::new(Arc::new(Ok(())))), + err: Arc::new(Mutex::new(Ok(()))), } } } @@ -729,7 +729,7 @@ impl SafeValueLog { // Pick a random start point for the log. 
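One thread running through this patch: `set_err` now takes a plain `Result<()>` instead of `Arc<Result<()>>`, which only compiles once `Error` is `Clone`. `std::io::Error` is not cloneable, hence the switch to the `eieio` wrapper in the `y/mod.rs` hunks below. A reduced sketch of the pattern, mirroring the derives used in the diff:

    use std::io;
    use thiserror::Error;

    // `eieio::Error` is a cloneable io-error type, so the enum can derive Clone.
    #[derive(Debug, Error, Clone)]
    pub enum MyError {
        #[error(transparent)]
        StdIO(#[from] eieio::Error),
        #[error("not found")]
        NotFound,
    }

    // Route std::io errors through the cloneable wrapper.
    impl From<io::Error> for MyError {
        fn from(e: io::Error) -> Self {
            MyError::StdIO(eieio::Error::from(e))
        }
    }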
let skip_first_m = { let mut rng = thread_rng_n((self.value_log.opt.value_log_file_size / M) as u32); - let x: u32 = rng.gen() + // let x: u32 = rng.gen() }; Ok(()) } diff --git a/src/y/mod.rs b/src/y/mod.rs index 46ac979..bd7c1a8 100644 --- a/src/y/mod.rs +++ b/src/y/mod.rs @@ -6,6 +6,7 @@ mod metrics; pub use codec::{AsyncEncDec, Decode, Encode}; pub use iterator::*; use libc::{O_DSYNC, O_WRONLY}; +use log::error; use memmap::MmapMut; pub use merge_iterator::*; use std::collections::hash_map::DefaultHasher; @@ -23,10 +24,10 @@ pub const USER_META_SIZE: usize = 1; pub const CAS_SIZE: usize = 8; pub const VALUE_SIZE: usize = 4; -#[derive(Debug, Error)] +#[derive(Debug, Error, Clone)] pub enum Error { #[error(transparent)] - StdIO(#[from] std::io::Error), + StdIO(#[from] eieio::Error), #[error("io error: {0}")] Io(String), @@ -85,6 +86,13 @@ impl Default for Error { } impl Error { + pub fn is_io(&self) -> bool { + match self { + Error::StdIO(err) => true, + _ => false, + } + } + pub fn is_io_eof(&self) -> bool { match self { Error::StdIO(err) if err.kind() == ErrorKind::UnexpectedEof => true, @@ -121,6 +129,12 @@ impl From for Error { } } +impl From for Error { + fn from(value: io::Error) -> Self { + Error::StdIO(eieio::Error::from(value)) + } +} + pub type Result = std::result::Result; #[inline] @@ -310,3 +324,22 @@ fn dsync() { let file = options.open("foo.txt"); println!("{:?}", file.err()); } + +#[test] +fn clone_error() { + #[derive(Debug, Error, Clone)] + pub enum Error { + #[error(transparent)] + StdIO(#[from] eieio::Error), + #[error("Hello")] + Hello, + } + let err = Error::StdIO(eieio::Error::from(io::ErrorKind::AlreadyExists)); + match err { + Error::StdIO(err) => { + let ioerr = io::Error::from(err.kind()); + println!("{}", ioerr); + } + _ => {} + } +} From f0a70e1600c55f855710438a41327371d8e00c5d Mon Sep 17 00:00:00 2001 From: Rg Date: Mon, 27 Feb 2023 15:03:46 +0800 Subject: [PATCH 39/77] :coffee: add ArcLock for valueLog.buf --- src/kv.rs | 4 +-- src/lib.rs | 1 + src/log_file.rs | 10 ++++++ src/types.rs | 6 ++-- src/value_log.rs | 90 ++++++++++++++++++++++++++++++++---------------- 5 files changed, 77 insertions(+), 34 deletions(-) diff --git a/src/kv.rs b/src/kv.rs index ad34a5d..18f5209 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -140,12 +140,10 @@ impl KV { // handle levels_controller let lc = LevelsController::new(manifest.clone(), out.opt.clone()).await?; lc.start_compact(out.closers.compactors.clone()); - let mut vlog = ValueLogCore::default(); - vlog.open(&out, opt)?; out.vlog = Some(vlog); - let xout = XArc::new(out); + // xout.vlog.unwrap().open(&xout, opt)?; // update size { let _out = xout.clone(); diff --git a/src/lib.rs b/src/lib.rs index ec15a59..718c4cf 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -15,6 +15,7 @@ #![feature(slice_take)] #![feature(arc_into_inner)] #![feature(async_closure)] +#![feature(async_iterator)] use std::io; use std::mem::align_of; diff --git a/src/log_file.rs b/src/log_file.rs index 0603cc7..8df5443 100644 --- a/src/log_file.rs +++ b/src/log_file.rs @@ -6,10 +6,12 @@ use byteorder::{BigEndian, ReadBytesExt}; use memmap::MmapMut; use parking_lot::lock_api::{RwLockReadGuard, RwLockWriteGuard}; use parking_lot::{RawRwLock, RwLock}; +use std::async_iter::AsyncIterator; use std::fs::File; use std::future::Future; use std::io::{Read, Seek, SeekFrom}; use std::pin::Pin; +use std::task::{Context, Poll}; #[derive(Debug)] pub(crate) struct LogFile { @@ -34,6 +36,14 @@ impl SafeLogFile { } } +impl AsyncIterator for LogFile { + type Item = (); + + fn 
poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + todo!() + } +} + impl LogFile { pub(crate) fn open_read_only(&mut self) -> Result<()> { let mut fd = std::fs::OpenOptions::new() diff --git a/src/types.rs b/src/types.rs index d03e781..c7ed5ba 100644 --- a/src/types.rs +++ b/src/types.rs @@ -19,8 +19,10 @@ use range_lock::{VecRangeLock, VecRangeLockGuard}; use tokio::sync::mpsc::{UnboundedSender, WeakUnboundedSender}; use tokio::time::sleep; -type TArcMx = Arc>; -type TArcRW = Arc>; +pub type TArcMx = Arc>; +pub type TArcRW = Arc>; +pub type ArcMx = Arc>; +pub type ArcRW = Arc>; // Channel like to go's channel #[derive(Clone)] diff --git a/src/value_log.rs b/src/value_log.rs index e3db6f0..62a307a 100644 --- a/src/value_log.rs +++ b/src/value_log.rs @@ -3,7 +3,7 @@ use awaitgroup::{WaitGroup, Worker}; use bitflags::bitflags; use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use crc32fast::Hasher; -use libc::difftime; +use libc::{difftime, nice}; use log::info; use log::kv::Source; use memmap::MmapMut; @@ -14,7 +14,7 @@ use serde_json::to_vec; use std::cell::{Ref, RefCell, RefMut}; use std::collections::{HashMap, HashSet}; use std::fmt::Formatter; -use std::fs::{read_dir, File, OpenOptions}; +use std::fs::{read_dir, remove_file, File, OpenOptions}; use std::future::Future; use std::io::{BufWriter, Cursor, Read, Seek, SeekFrom, Write}; use std::marker::PhantomData; @@ -25,6 +25,8 @@ use std::pin::Pin; use std::process::{id, Output}; use std::sync::atomic::{AtomicI32, AtomicU32, AtomicU64, Ordering}; use std::sync::Arc; +use std::task::{Context, Poll}; +use std::time::{Duration, SystemTime}; use std::{fmt, fs, thread}; use tabled::object::Entity::Cell; use tokio::macros::support::thread_rng_n; @@ -34,7 +36,7 @@ use crate::log_file::LogFile; use crate::options::Options; use crate::skl::BlockBytes; use crate::table::iterator::BlockSlice; -use crate::types::{Channel, Closer, XArc}; +use crate::types::{ArcRW, Channel, Closer, XArc}; use crate::y::{ create_synced_file, is_eof, open_existing_synced_file, read_at, sync_directory, Decode, Encode, }; @@ -272,9 +274,9 @@ pub struct ValueLogCore { // A refcount of iterators -- when this hits zero, we can delete the files_to_be_deleted. num_active_iterators: AtomicI32, writable_log_offset: AtomicU32, - buf: RefCell>>, + buf: ArcRW>>, opt: Options, - kv: *const KV, + kv: WeakKV, // Only allow one GC at a time. 
garbage_ch: Channel<()>, } @@ -289,9 +291,9 @@ impl Default for ValueLogCore { dirty_vlogs: Arc::new(Default::default()), num_active_iterators: Default::default(), writable_log_offset: Default::default(), - buf: RefCell::new(BufWriter::new(vec![0u8; 0])), + buf: Arc::new(RwLock::new(BufWriter::new(vec![0u8; 0]))), opt: Default::default(), - kv: std::ptr::null(), + kv: WeakKV::new(), garbage_ch: Channel::new(1), } } @@ -331,19 +333,18 @@ impl ValueLogCore { } // TODO Use Arc to replace it - pub(crate) fn open(&mut self, kv: &KV, opt: Options) -> Result<()> { + pub(crate) fn open(&mut self, kv: &ArcKV, opt: Options) -> Result<()> { self.dir_path = opt.value_dir.clone(); self.opt = opt; - let kv = kv as *const KV; - self.kv = kv; + self.kv = WeakKV::from(kv); self.open_create_files()?; // todo add garbage and metrics self.garbage_ch = Channel::new(1); Ok(()) } - fn get_kv(&self) -> &KV { - unsafe { &*(self.kv) } + fn get_kv(&self) -> XArc { + self.kv.upgrade().unwrap() } pub fn close(&self) -> Result<()> { @@ -482,8 +483,15 @@ impl ValueLogCore { Ok(()) } - fn delete_log_file(&mut self, log_file: &LogFile) -> Result<()> { - todo!() + fn delete_log_file(&mut self, mut log_file: LogFile) -> Result<()> { + if let Some(mp) = log_file._mmap.take() { + mp.flush()?; + } + if let Some(fp) = log_file.fd.take() { + fp.sync_all()?; + } + remove_file(self.fpath(log_file.fid))?; + Ok(()) } // sync is thread-unsafe and should not be called concurrently with write. @@ -516,13 +524,13 @@ impl ValueLogCore { pub(crate) fn write(&self, reqs: Arc>) -> Result<()> { let cur_vlog_file = self.pick_log_by_vlog_id(&self.max_fid.load(Ordering::Acquire)); let to_disk = || -> Result<()> { - if self.buf.borrow().buffer().is_empty() { + if self.buf.read().buffer().is_empty() { return Ok(()); } info!( " Flushing {} blocks of total size: {}", reqs.len(), - self.buf.borrow().buffer().len() + self.buf.read().buffer().len() ); let n = cur_vlog_file @@ -530,12 +538,12 @@ impl ValueLogCore { .fd .as_mut() .unwrap() - .write(self.buf.borrow().buffer())?; + .write(self.buf.read().buffer())?; // todo add metrics info!("Done"); self.writable_log_offset .fetch_add(n as u32, Ordering::Release); - self.buf.borrow_mut().get_mut().clear(); + self.buf.write().get_mut().clear(); if self.writable_log_offset.load(Ordering::Acquire) > self.opt.value_log_file_size as u32 @@ -564,8 +572,8 @@ impl ValueLogCore { ptr.fid = cur_vlog_file.read().fid; // Use the offset including buffer length so far. ptr.offset = self.writable_log_offset.load(Ordering::Acquire) - + self.buf.borrow().buffer().len() as u32; - let mut buf = self.buf.borrow_mut(); + + self.buf.read().buffer().len() as u32; + let mut buf = self.buf.write(); entry.borrow_mut().enc(&mut *buf)?; } } @@ -713,10 +721,7 @@ impl SafeValueLog { } async fn do_run_gcc(&self, gc_threshold: f64) -> Result<()> { - let lf = self.value_log.pick_log(); - if lf.is_none() { - return Err(ValueNoRewrite); - } + let lf = self.value_log.pick_log().ok_or(Error::ValueNoRewrite)?; #[derive(Debug, Default)] struct Reason { total: f64, @@ -724,13 +729,40 @@ impl SafeValueLog { discard: f64, } let mut reason = Reason::default(); - let mut window = 100.0; + let mut window = 100.0; // lasted 100M let mut count = 0; // Pick a random start point for the log. 
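`Reason` moves behind `TArcMx` (an `Arc<tokio::sync::Mutex<_>>`) in this hunk because the per-entry callback returns a boxed future: each invocation needs an owned, `Send` handle to the tallies, while `do_run_gcc` still reads the totals once iteration ends. The shape of that pattern in isolation (a sketch detached from the vlog types; the corrected random-start computation resumes below):

    use std::sync::Arc;
    use tokio::sync::Mutex;

    #[derive(Debug, Default)]
    struct Tally {
        total: f64,
        discard: f64,
    }

    #[tokio::main]
    async fn main() {
        let tally = Arc::new(Mutex::new(Tally::default()));
        let handle = tally.clone();
        // The spawned future owns `handle`, so it is 'static and Send.
        tokio::spawn(async move {
            handle.lock().await.total += 1.0;
        })
        .await
        .unwrap();
        // The original Arc observes the update after the task completes.
        assert_eq!(tally.lock().await.total, 1.0);
    }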
- let skip_first_m = { - let mut rng = thread_rng_n((self.value_log.opt.value_log_file_size / M) as u32); - // let x: u32 = rng.gen() - }; + let mut skip_first_m = + thread_rng_n((self.value_log.opt.value_log_file_size / M) as u32) as f64 - window; + let mut skipped = 0.0; + let mut start = SystemTime::now(); + // assert!(!self.value_log.kv.is_null()); + + lf.write() + .iterate(0, &mut |entry, vptr| { + let kv = self.value_log.get_kv(); + Box::pin(async move { + let esz = vptr.len as f64 / (1 << 20) as f64; // in MBs, +4 for the CAS stuff. + skipped += esz; + if skipped < skip_first_m { + return Ok(true); + } + count += 1; + if count % 100 == 0 { + tokio::time::sleep(Duration::from_millis(1)).await; + } + reason.total += esz; + if reason.total > window { + return Err("stop iteration".into()); + } + if start.elapsed().unwrap().as_secs() > 10 { + return Err("stop iteration".into()); + } + let vs = kv.get(&entry.key); + Ok(true) + }) + }) + .await?; Ok(()) } } From b682191fca5eacefc906a03748222330acc111b2 Mon Sep 17 00:00:00 2001 From: Rg Date: Mon, 27 Feb 2023 15:18:56 +0800 Subject: [PATCH 40/77] delete some unsafe code impl --- src/kv.rs | 4 ---- src/lib.rs | 1 + src/skl/skip.rs | 28 ++++++++++++---------------- 3 files changed, 13 insertions(+), 20 deletions(-) diff --git a/src/kv.rs b/src/kv.rs index 18f5209..dad8e48 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -81,10 +81,6 @@ pub struct KV { ctx_handle: tokio_context::context::Handle, } -// TODO not add bellow lines -unsafe impl Send for KV {} -unsafe impl Sync for KV {} - impl KV { async fn open(mut opt: Options) -> Result> { opt.max_batch_size = (15 * opt.max_table_size) / 100; diff --git a/src/lib.rs b/src/lib.rs index 718c4cf..e06ea70 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -16,6 +16,7 @@ #![feature(arc_into_inner)] #![feature(async_closure)] #![feature(async_iterator)] +#![feature(atomic_mut_ptr)] use std::io; use std::mem::align_of; diff --git a/src/skl/skip.rs b/src/skl/skip.rs index 794ff17..1bf966a 100644 --- a/src/skl/skip.rs +++ b/src/skl/skip.rs @@ -18,20 +18,17 @@ use super::{arena::Arena, node::Node}; /// SkipList pub struct SkipList { height: Arc, - head: NonNull, + head: AtomicPtr, _ref: Arc, pub(crate) arena: Arc, } -unsafe impl Send for SkipList {} - -unsafe impl Sync for SkipList {} - impl Clone for SkipList { fn clone(&self) -> Self { + let node = self.head.load(Ordering::Relaxed); SkipList { height: self.height.clone(), - head: NonNull::new(self.head.as_ptr()).unwrap(), + head: AtomicPtr::new(node), _ref: self._ref.clone(), arena: self.arena.clone(), } @@ -45,7 +42,7 @@ impl SkipList { let node = Node::new(&mut arena, "".as_bytes(), &v, MAX_HEIGHT as isize); Self { height: Arc::new(AtomicI32::new(1)), - head: NonNull::new(node).unwrap(), + head: AtomicPtr::new(node), _ref: Arc::new(AtomicI32::new(1)), arena: Arc::new(arena), } @@ -77,12 +74,11 @@ impl SkipList { } pub(crate) fn get_head(&self) -> &Node { - let node = unsafe { self.head.as_ptr() as *const Node }; - unsafe { &*node } + unsafe { &*(self.head.load(Ordering::Relaxed) as *const Node) } } fn get_head_mut(&self) -> &mut Node { - let node = unsafe { self.head.as_ptr() as *mut Node }; + let node = unsafe { self.head.load(Ordering::Relaxed) as *mut Node }; unsafe { &mut *node } } @@ -125,7 +121,7 @@ impl SkipList { return (None, false); } // Try to return x. Make sure it is not a head node. 
- if ptr::eq(x, self.head.as_ptr()) { + if ptr::eq(x, self.head.load(Ordering::Relaxed)) { return (None, false); } return (Some(x), false); @@ -152,7 +148,7 @@ impl SkipList { continue; } // On base level. Return x. - if ptr::eq(x, self.head.as_ptr()) { + if ptr::eq(x, self.get_head()) { return (None, false); } @@ -170,7 +166,7 @@ impl SkipList { return (Some(next), false); } // Try to return x. Make sure it is not a head node. - if ptr::eq(x, self.head.as_ptr()) { + if ptr::eq(x, self.get_head()) { return (None, false); } return (Some(x), false); @@ -328,7 +324,7 @@ impl SkipList { // Returns the last element. If head (empty list), we return nil, All the find functions // will NEVER return the head nodes. pub unsafe fn find_last(&self) -> Option<&Node> { - let mut n = self.head.as_ptr() as *const Node; + let mut n = self.get_head() as *const Node; let mut level = self.get_height() - 1; loop { let next = self.get_next(&*n, level); @@ -337,7 +333,7 @@ impl SkipList { continue; } if level == 0 { - if ptr::eq(n, self.head.as_ptr()) { + if ptr::eq(n, self.get_head()) { return None; } return Some(&*n); @@ -442,7 +438,7 @@ mod tests { let mut st = SkipList::new(ARENA_SIZE); assert_eq!(st.height.load(Ordering::Relaxed), 1); assert_eq!(st._ref.load(Ordering::Relaxed), 1); - let head = unsafe { st.head.as_ref() }; + let head = st.get_head(); assert_eq!(head.height as usize, MAX_HEIGHT); assert_eq!(head.key_offset as usize, 1); } From 53538a26b4c9b35452972a81b8d6fafbea4bc8b4 Mon Sep 17 00:00:00 2001 From: Rg Date: Mon, 27 Feb 2023 21:29:40 +0800 Subject: [PATCH 41/77] :coffee: add ArcLock for valueLog.buf --- src/kv.rs | 11 ++++- src/log_file.rs | 100 ++++++++++++++++++++++++++++++---------- src/types.rs | 2 +- src/value_log.rs | 116 ++++++++++++++++++++++++++++++++++++++++------- 4 files changed, 186 insertions(+), 43 deletions(-) diff --git a/src/kv.rs b/src/kv.rs index dad8e48..e4bb236 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -81,6 +81,9 @@ pub struct KV { ctx_handle: tokio_context::context::Handle, } +unsafe impl Send for KV {} +unsafe impl Sync for KV {} + impl KV { async fn open(mut opt: Options) -> Result> { opt.max_batch_size = (15 * opt.max_table_size) / 100; @@ -512,7 +515,9 @@ impl ArcKV { .collect::>(); let to_reqs = Arc::new(to_reqs); if let Err(err) = self.write_requests(to_reqs).await { - reqs.lock().iter().for_each(|req| req.set_err(Err(err.clone()))); + reqs.lock() + .iter() + .for_each(|req| req.set_err(Err(err.clone()))); } reqs.lock().clear(); } @@ -534,7 +539,9 @@ impl ArcKV { .map(|req| req.clone()) .collect::>(); if let Err(err) = self.write_requests(Arc::new(to_reqs)).await { - reqs.lock().iter().for_each(|req| req.set_err(Err(err.clone()))); + reqs.lock() + .iter() + .for_each(|req| req.set_err(Err(err.clone()))); } } } diff --git a/src/log_file.rs b/src/log_file.rs index 8df5443..7899fac 100644 --- a/src/log_file.rs +++ b/src/log_file.rs @@ -9,7 +9,7 @@ use parking_lot::{RawRwLock, RwLock}; use std::async_iter::AsyncIterator; use std::fs::File; use std::future::Future; -use std::io::{Read, Seek, SeekFrom}; +use std::io::{Cursor, Read, Seek, SeekFrom}; use std::pin::Pin; use std::task::{Context, Poll}; @@ -22,28 +22,6 @@ pub(crate) struct LogFile { pub(crate) sz: u32, } -pub(crate) struct SafeLogFile(RwLock); - -impl SafeLogFile { - pub(crate) fn new(log_file: LogFile) -> Self { - Self(RwLock::new(log_file)) - } - pub(crate) fn rl(&self) -> RwLockReadGuard<'_, RawRwLock, LogFile> { - self.0.read() - } - pub(crate) fn wl(&self) -> RwLockWriteGuard<'_, RawRwLock, LogFile> { - 
self.0.write() - } -} - -impl AsyncIterator for LogFile { - type Item = (); - - fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - todo!() - } -} - impl LogFile { pub(crate) fn open_read_only(&mut self) -> Result<()> { let mut fd = std::fs::OpenOptions::new() @@ -159,4 +137,80 @@ impl LogFile { // todo add truncate Ok(()) } + + pub(crate) async fn iterate2( + &self, + offset: u32, + f: &mut impl for<'a> FnMut( + &'a Entry, + &'a ValuePointer, + ) -> Pin> + 'a>>, + ) -> Result<()> { + // let mut fd = self.fd.as_mut().unwrap(); + // fd.seek(SeekFrom::Start(offset as u64))?; + let fd = self.fd.as_ref().unwrap(); + let mut entry = Entry::default(); + let mut truncate = false; + let mut record_offset = offset; + loop { + let mut h = Header::default(); + let buffer = vec![0u8; Header::encoded_size()]; + let ok = h.dec(&mut Cursor::new(buffer)); + if ok.is_err() && ok.as_ref().unwrap_err().is_io_eof() { + break; + } + // todo add truncate currenct + ok?; + if h.k_len as usize > entry.key.capacity() { + entry.key = vec![0u8; h.k_len as usize]; + } + if h.v_len as usize > entry.value.capacity() { + entry.value = vec![0u8; h.v_len as usize]; + } + entry.key.clear(); + entry.value.clear(); + + let ok = fd.read(&mut entry.key); + if is_eof(&ok) { + break; + } + ok?; + + let ok = fd.read(&mut entry.value); + if is_eof(&ok) { + break; + } + ok?; + entry.offset = record_offset; + entry.meta = h.meta; + entry.user_meta = h.user_mata; + entry.cas_counter = h.cas_counter; + entry.cas_counter_check = h.cas_counter_check; + let ok = fd.read_u32::(); + if is_eof(&ok) { + break; + } + let crc = ok?; + + let mut vp = ValuePointer::default(); + vp.len = Header::encoded_size() as u32 + h.k_len + h.v_len + 4; + record_offset += vp.len; + + vp.offset = entry.offset; + vp.fid = self.fid; + + let _continue = f(&entry, &vp).await?; + if !_continue { + break; + } + } + + // todo add truncate + Ok(()) + } + pub(crate) fn reset_seek_start(&mut self) -> Result<()> { + let fd = self.fd.as_mut().unwrap(); + fd.seek(SeekFrom::Start(0))?; + Ok(()) + } } diff --git a/src/types.rs b/src/types.rs index c7ed5ba..ef2d5fa 100644 --- a/src/types.rs +++ b/src/types.rs @@ -313,5 +313,5 @@ async fn lck() { let p = a.swap(Owned::new(200), SeqCst, guard); let p = a.swap(Owned::new(200), SeqCst, guard); - println!("{:?}", unsafe { p.as_ref().unwrap()}); + println!("{:?}", unsafe { p.as_ref().unwrap() }); } diff --git a/src/value_log.rs b/src/value_log.rs index 62a307a..cc44442 100644 --- a/src/value_log.rs +++ b/src/value_log.rs @@ -23,11 +23,11 @@ use std::ops::{Deref, Index}; use std::path::Path; use std::pin::Pin; use std::process::{id, Output}; -use std::sync::atomic::{AtomicI32, AtomicU32, AtomicU64, Ordering}; +use std::sync::atomic::{AtomicI32, AtomicPtr, AtomicU32, AtomicU64, Ordering}; use std::sync::Arc; use std::task::{Context, Poll}; use std::time::{Duration, SystemTime}; -use std::{fmt, fs, thread}; +use std::{fmt, fs, io, ptr, thread}; use tabled::object::Entity::Cell; use tokio::macros::support::thread_rng_n; @@ -36,7 +36,7 @@ use crate::log_file::LogFile; use crate::options::Options; use crate::skl::BlockBytes; use crate::table::iterator::BlockSlice; -use crate::types::{ArcRW, Channel, Closer, XArc}; +use crate::types::{ArcRW, Channel, Closer, TArcMx, XArc}; use crate::y::{ create_synced_file, is_eof, open_existing_synced_file, read_at, sync_directory, Decode, Encode, }; @@ -74,6 +74,13 @@ impl Header { pub(crate) const fn encoded_size() -> usize { size_of::() } + + // pub(crate) fn from_fd(&mut self, rd: 
&File, offset: u64) -> Result<()> { + // let mut buffer = vec![0u8; Self::encoded_size()]; + // read_at(rd, &mut buffer, offset)?; + // self.enc(&mut Cursor::new(buffer))?; + // Ok(()) + // } } impl Encode for Header { @@ -148,6 +155,12 @@ impl Encode for Entry { } } +impl Decode for Entry { + fn dec(&mut self, rd: &mut dyn Read) -> Result<()> { + todo!() + } +} + impl fmt::Display for Entry { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { write!( @@ -268,7 +281,7 @@ pub struct ValueLogCore { // TODO // guards our view of which files exist, which to be deleted, how many active iterators pub(crate) files_log: Arc>, - vlogs: Arc>>>>, + vlogs: Arc>>>>, // TODO It is not good idea that use raw lock for Arc>, it maybe lock AsyncRuntime thread. dirty_vlogs: Arc>>, // TODO why? // A refcount of iterators -- when this hits zero, we can delete the files_to_be_deleted. @@ -276,7 +289,7 @@ pub struct ValueLogCore { writable_log_offset: AtomicU32, buf: ArcRW>>, opt: Options, - kv: WeakKV, + kv: *const KV, // Only allow one GC at a time. garbage_ch: Channel<()>, } @@ -293,7 +306,7 @@ impl Default for ValueLogCore { writable_log_offset: Default::default(), buf: Arc::new(RwLock::new(BufWriter::new(vec![0u8; 0]))), opt: Default::default(), - kv: WeakKV::new(), + kv: ptr::null_mut(), garbage_ch: Channel::new(1), } } @@ -333,18 +346,18 @@ impl ValueLogCore { } // TODO Use Arc to replace it - pub(crate) fn open(&mut self, kv: &ArcKV, opt: Options) -> Result<()> { + pub(crate) fn open(&mut self, kv: *const KV, opt: Options) -> Result<()> { self.dir_path = opt.value_dir.clone(); self.opt = opt; - self.kv = WeakKV::from(kv); + self.kv = kv; self.open_create_files()?; // todo add garbage and metrics self.garbage_ch = Channel::new(1); Ok(()) } - fn get_kv(&self) -> XArc { - self.kv.upgrade().unwrap() + fn get_kv(&self) -> &KV { + unsafe { &*self.kv } } pub fn close(&self) -> Result<()> { @@ -464,8 +477,9 @@ impl ValueLogCore { if id > vp.fid { of = 0; } - let log_file = vlogs.vlogs.get(&id).unwrap(); - log_file.write().iterate(of, &mut f).await?; + let mut log_file = vlogs.vlogs.get(&id).unwrap().write(); + // log_file.reset_seek_start()?; + log_file.iterate(of, &mut f).await?; } // Seek to the end to start writing. let last_file = vlogs @@ -728,7 +742,7 @@ impl SafeValueLog { keep: f64, discard: f64, } - let mut reason = Reason::default(); + let mut reason: TArcMx = TArcMx::default(); let mut window = 100.0; // lasted 100M let mut count = 0; // Pick a random start point for the log. @@ -738,10 +752,16 @@ impl SafeValueLog { let mut start = SystemTime::now(); // assert!(!self.value_log.kv.is_null()); - lf.write() + let err = lf + .clone() + .write() .iterate(0, &mut |entry, vptr| { - let kv = self.value_log.get_kv(); + let vlg = self.value_log.clone(); + let reason = reason.clone(); + let lfc = lf.clone(); Box::pin(async move { + let kv = vlg.get_kv(); + let mut reason = reason.lock().await; let esz = vptr.len as f64 / (1 << 20) as f64; // in MBs, +4 for the CAS stuff. skipped += esz; if skipped < skip_first_m { @@ -758,11 +778,73 @@ impl SafeValueLog { if start.elapsed().unwrap().as_secs() > 10 { return Err("stop iteration".into()); } - let vs = kv.get(&entry.key); + let vs = kv.get(&entry.key)?; + if (vs.meta & MetaBit::BIT_DELETE.bits()) > 0 { + // Key has been deleted. Discard. + reason.discard += esz; + return Ok(true); // Continue + } + if (vs.meta & MetaBit::BIT_VALUE_POINTER.bits()) == 0 { + // Value is stored alongside key. Discard. 
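The pointer checks in the remainder of this closure reduce to one staleness rule: a scanned entry is garbage when the pointer the LSM currently holds for its key refers to a newer log file, or to a later offset in the same file. A self-contained sketch of that rule (stand-in struct, rule inferred from the surrounding branches); the hunk resumes below with the `reason.discard` bookkeeping for the two early-out cases above:

    // Stand-in for the crate's ValuePointer; only the fields the rule needs.
    pub struct ValuePointer {
        pub fid: u32,
        pub offset: u32,
    }

    // True when the scanned log entry is stale and its bytes can be reclaimed.
    fn is_discardable(lsm: &ValuePointer, entry: &ValuePointer) -> bool {
        lsm.fid > entry.fid // key was rewritten into a newer log file
            || (lsm.fid == entry.fid && lsm.offset > entry.offset) // or later in this file
    }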
+ reason.discard += esz; + return Ok(true); + } + // Value is still present in value log. + assert!(!vs.value.is_empty()); + let mut vptr = vptr.clone(); // TODO avoid copy + vptr.dec(&mut io::Cursor::new(vs.value))?; + if vptr.fid > lfc.read().fid { + // Value is present in a later log. Discard. + reason.discard += esz; + return Ok(true); + } + if vptr.offset > entry.offset { + // Value is present in a later offset, but in the same log. + reason.discard += esz; + return Ok(true); + } + if vptr.fid == lfc.read().fid && vptr.offset == entry.offset { + // This is still the active entry, This would need to be rewritten. + reason.keep += esz; + } else { + info!("Reason={:?}", reason); + let err = vlg.read_value_bytes(&vptr, |buf| { + let mut unexpect_entry = Entry::default(); + unexpect_entry.dec(&mut io::Cursor::new(buf))?; + unexpect_entry.offset = vptr.offset; + if unexpect_entry.cas_counter == entry.cas_counter { + info!("Latest Entry Header in LSM: {}", unexpect_entry); + info!("Latest Entry in Log: {}", entry); + } + Ok(()) + }); + if err.is_err() { + return Err("Stop iteration".into()); + } + } Ok(true) }) }) - .await?; + .await; + + if err.is_err() { + info!( + "Error while iterating for RunGC: {}", + err.as_ref().unwrap_err() + ); + return err; + } + + info!("Fid: {} Data status={:?}", lf.read().fid, reason); + if reason.lock().await.total < 10.0 + || reason.lock().await.discard < gc_threshold * reason.lock().await.total + { + info!("Skipping GC on fid: {}", lf.read().fid); + return Err(Error::ValueNoRewrite); + } + + info!("REWRITING VLOG {}", lf.read().fid); + self.value_log.rewrite(&lf.read())?; Ok(()) } } From fb7e6d2a5fc0d9844c55e192d96b1f88c2808af9 Mon Sep 17 00:00:00 2001 From: Rg Date: Thu, 2 Mar 2023 21:55:24 +0800 Subject: [PATCH 42/77] :coffee: add ArcLock for valueLog.buf --- src/lib.rs | 5 +- src/log_file.rs | 190 ++++++++++++++++++++++++++++++------------------ src/mmap.rs | 56 ++++++++++++++ 3 files changed, 177 insertions(+), 74 deletions(-) create mode 100644 src/mmap.rs diff --git a/src/lib.rs b/src/lib.rs index e06ea70..464fb4d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -37,14 +37,15 @@ mod y; mod compaction; mod level_handler; mod levels; +mod mmap; mod pb; +mod st_manager; #[cfg(test)] mod test_util; -mod st_manager; pub use skl::*; -pub use y::*; pub use st_manager::*; +pub use y::*; #[allow(dead_code)] #[inline] diff --git a/src/log_file.rs b/src/log_file.rs index 7899fac..96873bf 100644 --- a/src/log_file.rs +++ b/src/log_file.rs @@ -3,6 +3,7 @@ use crate::y::Result; use crate::y::{is_eof, read_at, Decode}; use crate::Error; use byteorder::{BigEndian, ReadBytesExt}; +use core::slice::SlicePattern; use memmap::MmapMut; use parking_lot::lock_api::{RwLockReadGuard, RwLockWriteGuard}; use parking_lot::{RawRwLock, RwLock}; @@ -12,6 +13,7 @@ use std::future::Future; use std::io::{Cursor, Read, Seek, SeekFrom}; use std::pin::Pin; use std::task::{Context, Poll}; +// use crate::mmap::AsyncMMAP; #[derive(Debug)] pub(crate) struct LogFile { @@ -23,14 +25,58 @@ pub(crate) struct LogFile { } impl LogFile { + pub(crate) async fn read_entries( + &self, + offset: u32, + n: usize, + ) -> Result> { + let m = self._mmap.as_ref().unwrap().as_slice(); + let mut cursor_offset = offset; + let mut v = vec![]; + while cursor_offset < m.len() as u32 && v.len() < n { + let mut entry = Entry::default(); + let mut h = Header::default(); + h.dec(&mut Cursor::new( + &m[cursor_offset as usize..cursor_offset as usize + Header::encoded_size()], + ))?; + entry.key = vec![0u8; h.k_len as usize]; 
+ entry.value = vec![0u8; h.v_len as usize]; + entry.meta = h.meta; + entry.offset = cursor_offset as u32; + entry.cas_counter = h.cas_counter; + entry.user_meta = h.user_mata; + entry.cas_counter_check = h.cas_counter_check; + let mut start = cursor_offset + Header::encoded_size(); + entry.key.extend_from_slice(m[start..start + h.k_len]); + start = start + h.k_len; + entry.value.extend_from_slice(m[start..start + h.v_len]); + v.push((entry, )) + } + Ok(vec![]) + } +} + +impl LogFile { + pub(crate) fn new(path: &str) -> Result { + let mut lf = LogFile { + _path: Box::new(path.to_string()), + fd: None, + fid: 0, + _mmap: None, + sz: 0, + }; + lf.open_read_only()?; + Ok(lf) + } + pub(crate) fn open_read_only(&mut self) -> Result<()> { let mut fd = std::fs::OpenOptions::new() .read(true) .open(self._path.as_ref())?; let meta = fd.metadata()?; let file_sz = meta.len(); - let mut _mmap = MmapMut::map_anon(file_sz as usize).unwrap(); - let read = read_at(&fd, &mut _mmap, 0)?; + let mut _mmap = unsafe { MmapMut::map_mut(&fd)? }; + // let mut _mmap = _mmap.make_read_only()?; self._mmap.replace(_mmap); self.fd.replace(fd); self.sz = file_sz as u32; @@ -138,76 +184,76 @@ impl LogFile { Ok(()) } - pub(crate) async fn iterate2( - &self, - offset: u32, - f: &mut impl for<'a> FnMut( - &'a Entry, - &'a ValuePointer, - ) -> Pin> + 'a>>, - ) -> Result<()> { - // let mut fd = self.fd.as_mut().unwrap(); - // fd.seek(SeekFrom::Start(offset as u64))?; - let fd = self.fd.as_ref().unwrap(); - let mut entry = Entry::default(); - let mut truncate = false; - let mut record_offset = offset; - loop { - let mut h = Header::default(); - let buffer = vec![0u8; Header::encoded_size()]; - let ok = h.dec(&mut Cursor::new(buffer)); - if ok.is_err() && ok.as_ref().unwrap_err().is_io_eof() { - break; - } - // todo add truncate currenct - ok?; - if h.k_len as usize > entry.key.capacity() { - entry.key = vec![0u8; h.k_len as usize]; - } - if h.v_len as usize > entry.value.capacity() { - entry.value = vec![0u8; h.v_len as usize]; - } - entry.key.clear(); - entry.value.clear(); - - let ok = fd.read(&mut entry.key); - if is_eof(&ok) { - break; - } - ok?; - - let ok = fd.read(&mut entry.value); - if is_eof(&ok) { - break; - } - ok?; - entry.offset = record_offset; - entry.meta = h.meta; - entry.user_meta = h.user_mata; - entry.cas_counter = h.cas_counter; - entry.cas_counter_check = h.cas_counter_check; - let ok = fd.read_u32::(); - if is_eof(&ok) { - break; - } - let crc = ok?; - - let mut vp = ValuePointer::default(); - vp.len = Header::encoded_size() as u32 + h.k_len + h.v_len + 4; - record_offset += vp.len; - - vp.offset = entry.offset; - vp.fid = self.fid; - - let _continue = f(&entry, &vp).await?; - if !_continue { - break; - } - } - - // todo add truncate - Ok(()) - } + // pub(crate) async fn iterate2( + // &self, + // offset: u32, + // f: &mut impl for<'a> FnMut( + // &'a Entry, + // &'a ValuePointer, + // ) -> Pin> + 'a>>, + // ) -> Result<()> { + // let mut fd = self.fd.as_mut().unwrap(); + // fd.seek(SeekFrom::Start(offset as u64))?; + // // let fd = self.fd.as_ref().unwrap(); + // let mut entry = Entry::default(); + // let mut truncate = false; + // let mut record_offset = offset; + // loop { + // let mut h = Header::default(); + // let buffer = vec![0u8; Header::encoded_size()]; + // let ok = h.dec(&mut Cursor::new(buffer)); + // if ok.is_err() && ok.as_ref().unwrap_err().is_io_eof() { + // break; + // } + // // todo add truncate currenct + // ok?; + // if h.k_len as usize > entry.key.capacity() { + // entry.key = 
vec![0u8; h.k_len as usize]; + // } + // if h.v_len as usize > entry.value.capacity() { + // entry.value = vec![0u8; h.v_len as usize]; + // } + // entry.key.clear(); + // entry.value.clear(); + // + // let ok = fd.read(&mut entry.key); + // if is_eof(&ok) { + // break; + // } + // ok?; + // + // let ok = fd.read(&mut entry.value); + // if is_eof(&ok) { + // break; + // } + // ok?; + // entry.offset = record_offset; + // entry.meta = h.meta; + // entry.user_meta = h.user_mata; + // entry.cas_counter = h.cas_counter; + // entry.cas_counter_check = h.cas_counter_check; + // let ok = fd.read_u32::(); + // if is_eof(&ok) { + // break; + // } + // let crc = ok?; + // + // let mut vp = ValuePointer::default(); + // vp.len = Header::encoded_size() as u32 + h.k_len + h.v_len + 4; + // record_offset += vp.len; + // + // vp.offset = entry.offset; + // vp.fid = self.fid; + // + // let _continue = f(&entry, &vp).await?; + // if !_continue { + // break; + // } + // } + // + // // todo add truncate + // Ok(()) + // } pub(crate) fn reset_seek_start(&mut self) -> Result<()> { let fd = self.fd.as_mut().unwrap(); fd.seek(SeekFrom::Start(0))?; diff --git a/src/mmap.rs b/src/mmap.rs new file mode 100644 index 0000000..d6f6675 --- /dev/null +++ b/src/mmap.rs @@ -0,0 +1,56 @@ +use crate::Error::Unexpected; +use crate::Result; +use fmmap::raw::tokio::AsyncDiskMmapFile; +use fmmap::tokio::{AsyncMmapFile, AsyncMmapFileExt, AsyncOptions}; +use memmap::MmapMut; +use tokio::fs::File; + +pub(crate) struct AsyncLogFile { + path: Box, + sz: usize, + fp: Option, + m: Option, +} + +// impl AsyncMMAP { +// async fn new(path: &str) -> Result { +// let fp = std::fs::File::options().read(true).open(path)?; +// let sz = fp.metadata()?.len(); +// Ok(AsyncMMAP { +// fp: None, +// m: Some(m), +// }) +// } +// +// pub(crate) fn mut_fp(&mut self) -> &mut File { +// self.fp.as_mut().unwrap() +// } +// +// pub(crate) fn fp(&self) -> &File { +// self.fp.as_ref().unwrap() +// } +// +// pub(crate) async fn set_len(&self, offset: u64) -> Result<()>{ +// if let Some(m) = &self.m { +// m.flush()?; +// } +// self.fp().set_len(offset)?; +// Ok(()) +// } +// } +// +// impl Drop for AsyncMMAP { +// fn drop(&mut self) { +// if let Some(m) = self.m.take() {} +// } +// } + +// #[tokio::test] +// async fn it() { +// let mut m = AsyncMMAP::new("LICENSE").await; +// assert!(m.is_ok()); +// println!("{:?}", m.as_ref().unwrap().m.as_ref().unwrap().as_slice()); +// let m = m.as_mut().unwrap().m.as_mut().unwrap(); +// let mut buffer = vec![0u8; 100]; +// m.read_exact(&mut buffer, 0).await; +// } From 57d2b888954ce9bbb86c1c29a712f02302b65dca Mon Sep 17 00:00:00 2001 From: Rg Date: Fri, 3 Mar 2023 01:43:48 +0800 Subject: [PATCH 43/77] :card: --- src/log_file.rs | 68 +++++++++++++++++++++++++++++++---- src/test_data/vlog_file.text | 6 ++++ src/value_log.rs | 70 +++++++++++++++++++++++++++--------- 3 files changed, 121 insertions(+), 23 deletions(-) create mode 100644 src/test_data/vlog_file.text diff --git a/src/log_file.rs b/src/log_file.rs index 96873bf..df31c20 100644 --- a/src/log_file.rs +++ b/src/log_file.rs @@ -25,11 +25,47 @@ pub(crate) struct LogFile { } impl LogFile { + // pub(crate) async fn iterate_by_offset1( + // _self: &Self, + // mut offset: u32, + // f: &mut impl for<'a> FnMut( + // &'a Entry, + // &'a ValuePointer, + // ) -> Pin> + 'a>>, + // ) -> Result<()> { + // + // Ok(()) + // } + + pub(crate) async fn iterate_by_offset( + &self, + mut offset: u32, + f: &mut impl for<'a> FnMut( + &'a Entry, + &'a ValuePointer, + ) -> Pin> + 'a>>, + ) 
-> Result<()> { + loop { + let (v, next) = self.read_entries(offset, 1).await?; + if v.is_empty() { + return Ok(()); + } + + for (entry, vptr) in v.iter() { + let continue_ = f(entry, vptr).await?; + if !continue_ { + return Ok(()); + } + offset = next; + } + } + } + pub(crate) async fn read_entries( &self, offset: u32, n: usize, - ) -> Result> { + ) -> Result<(Vec<(Entry, ValuePointer)>, u32)> { let m = self._mmap.as_ref().unwrap().as_slice(); let mut cursor_offset = offset; let mut v = vec![]; @@ -46,13 +82,24 @@ impl LogFile { entry.cas_counter = h.cas_counter; entry.user_meta = h.user_mata; entry.cas_counter_check = h.cas_counter_check; - let mut start = cursor_offset + Header::encoded_size(); - entry.key.extend_from_slice(m[start..start + h.k_len]); - start = start + h.k_len; - entry.value.extend_from_slice(m[start..start + h.v_len]); - v.push((entry, )) + let mut start = cursor_offset as usize + Header::encoded_size(); + entry + .key + .extend_from_slice(&m[start..start + h.k_len as usize]); + start += h.k_len as usize; + entry + .value + .extend_from_slice(&m[start..start + h.v_len as usize]); + start += h.v_len as usize; + let crc32 = Cursor::new(&m[start..start + 4]).read_u32::()?; + let mut vpt = ValuePointer::default(); + vpt.fid = self.fid; + vpt.len = Header::encoded_size() as u32 + h.k_len + h.v_len + 4; + vpt.offset = cursor_offset; + cursor_offset += vpt.len; + v.push((entry, vpt)) } - Ok(vec![]) + Ok((v, cursor_offset)) } } @@ -75,6 +122,7 @@ impl LogFile { .open(self._path.as_ref())?; let meta = fd.metadata()?; let file_sz = meta.len(); + println!("file sz {}", file_sz); let mut _mmap = unsafe { MmapMut::map_mut(&fd)? }; // let mut _mmap = _mmap.make_read_only()?; self._mmap.replace(_mmap); @@ -260,3 +308,9 @@ impl LogFile { Ok(()) } } + +#[test] +fn concurrency() { + let mut lf = LogFile::new("src/test_data/vlog_file.text"); + assert!(lf.is_ok(), "{}", lf.unwrap_err().to_string()); +} diff --git a/src/test_data/vlog_file.text b/src/test_data/vlog_file.text new file mode 100644 index 0000000..1abd6ef --- /dev/null +++ b/src/test_data/vlog_file.text @@ -0,0 +1,6 @@ +This is a test data! +warning: `badger-rs` (lib test) generated 569 warnings (run `cargo fix --lib -p badger-rs --tests` to apply 289 suggestions) + Finished test [unoptimized + debuginfo] target(s) in 2.53s +warning: the following packages contain code that will be rejected by a future version of Rust: console_log v0.2.0 +note: to see what the problems were, use the option `--future-incompat-report`, or run `cargo report future-incompatibilities --id 277` + Running unittests src/lib.rs (target/debug/deps/badger_rs-fef6c93f1008e2ae) \ No newline at end of file diff --git a/src/value_log.rs b/src/value_log.rs index cc44442..44ebfa2 100644 --- a/src/value_log.rs +++ b/src/value_log.rs @@ -594,28 +594,67 @@ impl ValueLogCore { to_disk() } - fn rewrite(&self, lf: &LogFile) -> Result<()> { + async fn rewrite(&self, lf: Arc>) -> Result<()> { let max_fid = self.max_fid.load(Ordering::Relaxed); assert!( - lf.fid < max_fid, + lf.read().fid < max_fid, "fid to move: {}. 
Current max fid: {}", - lf.fid, + lf.read().fid, max_fid ); // TODO add metrics // let mut wb = Vec::with_capacity(1000); let mut size = 0i64; - let mut count = 0; - let fe = |e: &Entry| -> Result<()> { - count += 1; - if count % 1000 == 0 { - info!("Processing entry {}", count); - } - let vs = self.get_kv().get(&e.key); - Ok(()) - }; + let mut count = Arc::new(AtomicU32::new(0)); + lf.clone().read() + .iterate_by_offset(0, &mut |entry, _| { + let count = count.clone(); + let kv = self.get_kv(); + Box::pin(async move { + count.fetch_add(1, Ordering::Relaxed); + if count.load(Ordering::Relaxed) % 1000 == 0 { + info!("Processing entry {}", count.load(Ordering::Relaxed)); + } + let vs = kv.get(&entry.key)?; + // if (vs.meta & MetaBit::BIT_DELETE.bits()) > 0 { + // return Ok(true); + // } + // if (vs.meta & MetaBit::BIT_VALUE_POINTER.bits()) <= 0 { + // return Ok(true); + // } + Ok(true) + }) + }) + .await?; + + // let err = lf.clone().read() + // .iterate_by_offset(0, &mut |entry, vptr| { + // // let vlg = self.value_log.clone(); + // // let lfc = lf.clone(); + // Box::pin(async move { + // count += 1; + // if count % 1000 == 0 { + // info!("Processing entry {}", count); + // } + // let vs = self.get_kv().get(&entry.key)?; + // if (vs.meta & MetaBit::BIT_DELETE.bits()) > 0 { + // return Ok(true); + // } + // if (vs.meta & MetaBit::BIT_VALUE_POINTER.bits()) <= 0 { + // return Ok(true); + // } + // // Value is still present in value log. + // if vs.value.is_empty() { + // return Err(format!("Empty value: {:?}", vs).into()); + // } + // + // let mut vptr = ValuePointer::default(); + // vptr.dec(&mut Cursor::new(vs.value.as_slice())).unwrap(); + // Ok(true) + // }) + // }); Ok(()) } @@ -751,11 +790,10 @@ impl SafeValueLog { let mut skipped = 0.0; let mut start = SystemTime::now(); // assert!(!self.value_log.kv.is_null()); - let err = lf .clone() - .write() - .iterate(0, &mut |entry, vptr| { + .read() + .iterate_by_offset(0, &mut |entry, vptr| { let vlg = self.value_log.clone(); let reason = reason.clone(); let lfc = lf.clone(); @@ -844,7 +882,7 @@ impl SafeValueLog { } info!("REWRITING VLOG {}", lf.read().fid); - self.value_log.rewrite(&lf.read())?; + self.value_log.rewrite(lf).await?; Ok(()) } } From 14738b08a482a6f882cd18bcb54e27901264bfba Mon Sep 17 00:00:00 2001 From: Rg Date: Sat, 4 Mar 2023 17:28:43 +0800 Subject: [PATCH 44/77] :coffee: add ArcLock for valueLog.buf --- src/kv.rs | 15 ++++- src/value_log.rs | 167 ++++++++++++++++++++++++++++++++--------------- src/y/mod.rs | 9 +++ 3 files changed, 137 insertions(+), 54 deletions(-) diff --git a/src/kv.rs b/src/kv.rs index e4bb236..42f45ce 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -9,7 +9,7 @@ use crate::value_log::{ArcRequest, Entry, MetaBit, Request, ValueLogCore, ValueP use crate::y::{ async_sync_directory, create_synced_file, sync_directory, Encode, Result, ValueStruct, }; -use crate::Error::Unexpected; +use crate::Error::{NotFound, Unexpected}; use crate::{Decode, Error, Node, SkipList, SkipListManager}; use atomic::Atomic; use bytes::BufMut; @@ -298,7 +298,7 @@ impl KV { } } - self.must_lc().get(key).ok_or("Not found".into()) + self.must_lc().get(key).ok_or(NotFound) } // Returns the current `mem_tables` and get references. @@ -406,6 +406,17 @@ impl KV { Ok(()) } + // Applies a list of `badger.entries`. If a request level error occurs it will be returned. Errors are also set on each + // `Entry` and must be checked individually. 
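The usage notes that follow in this comment block are carried over from Go badger's documentation. In Rust the same contract might read as the sketch below, where `Entry::error` is a hypothetical accessor, the clone is only there because the current signature takes ownership, and the method body itself is still `todo!()` at this point in the series:

    // Hypothetical caller-side pattern for the batch_set contract.
    async fn write_all(kv: &KV, entries: Vec<Entry>) -> Result<()> {
        // A request-level failure means none of the batch was applied.
        kv.batch_set(entries.clone()).await?;
        // Per-entry failures must still be checked one by one.
        for e in &entries {
            if let Some(err) = e.error() {
                return Err(err);
            }
        }
        Ok(())
    }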
+ // Check(kv.batch_set(entries)) + // for e in entries { + // Check(e.Error); + // } + // TODO + pub(crate) async fn batch_set(&self, entries: Vec) -> Result<()> { + todo!() + } + fn new_cas_counter(&self, how_many: u64) -> u64 { self.last_used_cas_counter .fetch_add(how_many, Ordering::Relaxed) diff --git a/src/value_log.rs b/src/value_log.rs index 44ebfa2..16d757f 100644 --- a/src/value_log.rs +++ b/src/value_log.rs @@ -30,6 +30,7 @@ use std::time::{Duration, SystemTime}; use std::{fmt, fs, io, ptr, thread}; use tabled::object::Entity::Cell; use tokio::macros::support::thread_rng_n; +use tracing_subscriber::fmt::format; use crate::kv::{ArcKV, WeakKV, KV}; use crate::log_file::LogFile; @@ -41,7 +42,7 @@ use crate::y::{ create_synced_file, is_eof, open_existing_synced_file, read_at, sync_directory, Decode, Encode, }; use crate::Error::{Unexpected, ValueNoRewrite, ValueRejected}; -use crate::{Error, Result}; +use crate::{Error, Result, META_SIZE}; /// Values have their first byte being byteData or byteDelete. This helps us distinguish between /// a key that has never been seen and a key that has been explicitly deleted. @@ -284,7 +285,7 @@ pub struct ValueLogCore { vlogs: Arc>>>>, // TODO It is not good idea that use raw lock for Arc>, it maybe lock AsyncRuntime thread. dirty_vlogs: Arc>>, // TODO why? - // A refcount of iterators -- when this hits zero, we can delete the files_to_be_deleted. + // A refcount of iterators -- when this hits zero, we can delete the files_to_be_deleted. Why? num_active_iterators: AtomicI32, writable_log_offset: AtomicU32, buf: ArcRW>>, @@ -604,57 +605,119 @@ impl ValueLogCore { ); // TODO add metrics - // let mut wb = Vec::with_capacity(1000); - let mut size = 0i64; - let mut count = Arc::new(AtomicU32::new(0)); - lf.clone().read() - .iterate_by_offset(0, &mut |entry, _| { - let count = count.clone(); - let kv = self.get_kv(); + let mut offset = 0; + let mut count = 0; + let kv = self.get_kv(); + let mut write_batch = Vec::with_capacity(1000); + loop { + let (mut entries, next) = lf.read().read_entries(offset, 1).await?; + if entries.is_empty() { + info!("not anything need to rewrite"); + break; + } + offset += next; + count += 1; + if count % 1000 == 0 { + info!("Processing entry {}", count); + } + // TODO don't need decode vptr + let entry = &mut entries[0].0; + let vs = kv.get(&entry.key); + if let Err(ref err) = vs { + if err.is_not_found() { + info!( + "REWRITE=> not found the value, {}", + String::from_utf8_lossy(&entry.key) + ); + continue; + } + return Err(err.clone()); + } + let vs = vs.unwrap(); + if (vs.meta & MetaBit::BIT_DELETE.bits()) > 0 { + info!( + "REWRITE=> {} has been deleted", + String::from_utf8_lossy(&entry.key) + ); + continue; + } + if (vs.meta & MetaBit::BIT_VALUE_POINTER.bits()) < 0 { + info!( + "REWRITE=> {} has been skipped, meta: {}", + String::from_utf8_lossy(&entry.key), + entry.meta, + ); + continue; + } + if vs.value.is_empty() { + info!( + "REWRITE=> {} is empty value", + String::from_utf8_lossy(&entry.key) + ); + return Err(format!("Empty value: {:?}", vs).into()); + } + // the lasted vptr + let mut vptr = ValuePointer::default(); + vptr.dec(&mut Cursor::new(&vs.value)).unwrap(); + if vptr.fid > lf.read().fid { + continue; + } + if vptr.offset > entry.offset { + continue; + } + assert_eq!(vptr.fid, lf.read().fid); + assert_eq!(vptr.offset, entry.offset); + { + // This new entry only contains the key, and a pointer to the value. 
+ let mut ne = Entry::default(); + if entry.meta == MetaBit::BIT_SET_IF_ABSENT.bits() { + // If we rewrite this entry without removing BitSetIfAbsent, LSM would see that + // the key is already present, which would be this same entry and won't update + // the vptr to point to the new file. + entry.meta = 0; + } + assert_eq!(entry.meta, 0, "Got meta: 0"); + ne.meta = entry.meta; + ne.user_meta = entry.user_meta; + ne.key = entry.key.clone(); // TODO avoid copy + ne.value = entry.value.clone(); + // CAS counter check. Do not rewrite if key has a newer value. + ne.cas_counter_check = vs.cas_counter; + write_batch.push(ne); + } + } + if write_batch.is_empty() { + info!("REWRITE: nothing to rewrite."); + return Ok(()); + } + info!( + "REWRITE: request has {} entries, size {}", + write_batch.len(), + count + ); + info!("REWRITE: Removing fid: {}", lf.read().fid); + kv.batch_set(write_batch).await?; + info!("REWRITE: Processed {} entries in total", count); + info!("REWRITE: Removing fid: {}", lf.read().fid); + let mut deleted_file_now = false; + // Entries written to LSM. Remove the older file now. + { + // Just a sanity-check. + let mut vlogs = self.vlogs.write(); + if !vlogs.contains_key(&lf.read().fid) { + return Err(format!("Unable to find fid: {}", lf.read().fid).into()); + } + // TODO Why? + if self.num_active_iterators.load(Ordering::Relaxed) == 0 { + vlogs.remove(&lf.read().fid); + deleted_file_now = true; + }else { + self.dirty_vlogs.write().insert(lf.read().fid.clone()); + } + } + if deleted_file_now { - Box::pin(async move { - count.fetch_add(1, Ordering::Relaxed); - if count.load(Ordering::Relaxed) % 1000 == 0 { - info!("Processing entry {}", count.load(Ordering::Relaxed)); - } - let vs = kv.get(&entry.key)?; - // if (vs.meta & MetaBit::BIT_DELETE.bits()) > 0 { - // return Ok(true); - // } - // if (vs.meta & MetaBit::BIT_VALUE_POINTER.bits()) <= 0 { - // return Ok(true); - // } - Ok(true) - }) - }) - .await?; - - // let err = lf.clone().read() - // .iterate_by_offset(0, &mut |entry, vptr| { - // // let vlg = self.value_log.clone(); - // // let lfc = lf.clone(); - // Box::pin(async move { - // count += 1; - // if count % 1000 == 0 { - // info!("Processing entry {}", count); - // } - // let vs = self.get_kv().get(&entry.key)?; - // if (vs.meta & MetaBit::BIT_DELETE.bits()) > 0 { - // return Ok(true); - // } - // if (vs.meta & MetaBit::BIT_VALUE_POINTER.bits()) <= 0 { - // return Ok(true); - // } - // // Value is still present in value log. 
- // if vs.value.is_empty() { - // return Err(format!("Empty value: {:?}", vs).into()); - // } - // - // let mut vptr = ValuePointer::default(); - // vptr.dec(&mut Cursor::new(vs.value.as_slice())).unwrap(); - // Ok(true) - // }) - // }); + } Ok(()) } diff --git a/src/y/mod.rs b/src/y/mod.rs index bd7c1a8..96e30b1 100644 --- a/src/y/mod.rs +++ b/src/y/mod.rs @@ -77,6 +77,8 @@ pub enum Error { #[error("Manifest has bad magic")] BadMagic, ///////////////////////////////// + #[error("Not found")] + NotFound, } impl Default for Error { @@ -113,6 +115,13 @@ impl Error { _ => false, } } + + pub fn is_not_found(&self) -> bool { + match self { + Error::NotFound => true, + _ => false, + } + } } impl From<&'static str> for Error { From ee0a2d5c538745d18bee7eae78ebd08e5bed6b29 Mon Sep 17 00:00:00 2001 From: rg Date: Mon, 6 Mar 2023 01:35:02 +0800 Subject: [PATCH 45/77] :dog: --- Cargo.toml | 1 + src/log_file.rs | 53 ++++++++++++++++++++++++++++++++++++++++++++---- src/value_log.rs | 15 +++++++------- 3 files changed, 57 insertions(+), 12 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 1bdf5c1..ac90f07 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -44,6 +44,7 @@ crossbeam-epoch = "0.9.13" tokio-context = "0.1.3" dyn-clone = "1.0.10" eieio = "1.0.0" +either = "1.8.1" [dev-dependencies] tracing-subscriber = "0.3.16" tracing-log = "0.1.3" diff --git a/src/log_file.rs b/src/log_file.rs index df31c20..69cf9fd 100644 --- a/src/log_file.rs +++ b/src/log_file.rs @@ -4,23 +4,37 @@ use crate::y::{is_eof, read_at, Decode}; use crate::Error; use byteorder::{BigEndian, ReadBytesExt}; use core::slice::SlicePattern; -use memmap::MmapMut; +use memmap::{MmapMut, Mmap}; use parking_lot::lock_api::{RwLockReadGuard, RwLockWriteGuard}; use parking_lot::{RawRwLock, RwLock}; use std::async_iter::AsyncIterator; use std::fs::File; use std::future::Future; use std::io::{Cursor, Read, Seek, SeekFrom}; +use std::ops::Deref; use std::pin::Pin; use std::task::{Context, Poll}; +use either::Either; // use crate::mmap::AsyncMMAP; +struct MmapType(Either); + +impl Deref for MmapType { + type Target = (); + + fn deref(&self) -> &Self::Target { + todo!() + } +} + #[derive(Debug)] pub(crate) struct LogFile { pub(crate) _path: Box, pub(crate) fd: Option, pub(crate) fid: u32, - pub(crate) _mmap: Option, + // pub(crate) _mmap: Option, + pub(crate) _mut_mmap: Option, + pub(crate) _mmap: Option, pub(crate) sz: u32, } @@ -61,6 +75,15 @@ impl LogFile { } } + fn mmap_slice(&self) { + match self._mmap.as_ref().unwrap() { + Either::Left(_mmap) => { + + }, + Either::Right(_mut_mmap) => {}, + } + } + pub(crate) async fn read_entries( &self, offset: u32, @@ -123,7 +146,7 @@ impl LogFile { let meta = fd.metadata()?; let file_sz = meta.len(); println!("file sz {}", file_sz); - let mut _mmap = unsafe { MmapMut::map_mut(&fd)? }; + let mut _mmap = unsafe { Mmap::map(&fd)? 
}; // let mut _mmap = _mmap.make_read_only()?; self._mmap.replace(_mmap); self.fd.replace(fd); @@ -147,7 +170,8 @@ impl LogFile { // todo opz pub(crate) fn done_writing(&mut self, offset: u32) -> Result<()> { self.sync()?; - self._mmap.as_mut().unwrap().flush_async()?; + let mut_mmap =self.mut_mmap(); + mut_mmap.flush_async()?; self.fd.as_mut().unwrap().set_len(offset as u64)?; self.fd.as_mut().unwrap().sync_all()?; { @@ -157,6 +181,15 @@ impl LogFile { self.open_read_only() } + fn mut_mmap(&self) -> &mut MmapMut{ + let _mmap = self._mmap.as_ref().unwrap(); + _mmap.make_mut().as_mut().unwrap() + } + + fn mmap_ref(&self) -> &Mmap { + self._mmap.as_ref().unwrap() + } + // You must hold lf.lock to sync() fn sync(&mut self) -> Result<()> { self.fd.as_mut().unwrap().sync_all()?; @@ -314,3 +347,15 @@ fn concurrency() { let mut lf = LogFile::new("src/test_data/vlog_file.text"); assert!(lf.is_ok(), "{}", lf.unwrap_err().to_string()); } + +#[test] +fn test_mmap () { + let mut fd = std::fs::OpenOptions::new() + .read(true) + .write(true) + .open("src/test_data/vlog_file.text").unwrap(); + + let _mmap = unsafe {Mmap::map(&fd).unwrap()}; + println!("{}", _mmap.len()); + println!("{}", _mmap.make_mut().is_err()); +} diff --git a/src/value_log.rs b/src/value_log.rs index 16d757f..9dc9211 100644 --- a/src/value_log.rs +++ b/src/value_log.rs @@ -6,7 +6,7 @@ use crc32fast::Hasher; use libc::{difftime, nice}; use log::info; use log::kv::Source; -use memmap::MmapMut; +use memmap::{MmapMut, Mmap}; use parking_lot::*; use protobuf::well_known_types::api::Mixin; use rand::random; @@ -30,7 +30,6 @@ use std::time::{Duration, SystemTime}; use std::{fmt, fs, io, ptr, thread}; use tabled::object::Entity::Cell; use tokio::macros::support::thread_rng_n; -use tracing_subscriber::fmt::format; use crate::kv::{ArcKV, WeakKV, KV}; use crate::log_file::LogFile; @@ -341,7 +340,7 @@ impl ValueLogCore { fn create_mmap_vlog_file(&self, fid: u32, offset: u64) -> Result { let mut vlog_file = self.create_vlog_file(fid)?; vlog_file.fd.as_mut().unwrap().set_len(offset)?; - let mut _mmap = unsafe { MmapMut::map_mut(vlog_file.fd.as_ref().unwrap())? }; + let mut _mmap = unsafe { Mmap::map(vlog_file.fd.as_ref().unwrap())? 
}; vlog_file._mmap.replace(_mmap); Ok(vlog_file) } @@ -365,11 +364,11 @@ impl ValueLogCore { info!("Stopping garbage collection of values."); let mut vlogs = self.vlogs.write(); for vlog in vlogs.iter() { - vlog.1.write()._mmap.as_mut().unwrap().flush()?; + let mut lf = vlog.1.write(); if *vlog.0 == self.max_fid.load(Ordering::Acquire) { - vlog.1 - .write() - .fd + let _mmap = lf._mmap.take().unwrap(); + _mmap.make_mut().unwrap().flush()?; + lf.fd .as_mut() .unwrap() .set_len(self.writable_log_offset.load(Ordering::Acquire) as u64)?; @@ -500,7 +499,7 @@ impl ValueLogCore { fn delete_log_file(&mut self, mut log_file: LogFile) -> Result<()> { if let Some(mp) = log_file._mmap.take() { - mp.flush()?; + mp.make_mut()?.flush()?; } if let Some(fp) = log_file.fd.take() { fp.sync_all()?; From 3944783ab68db6de5f05cadf05ab05c07cae31b1 Mon Sep 17 00:00:00 2001 From: Rg Date: Mon, 6 Mar 2023 01:56:44 +0800 Subject: [PATCH 46/77] compiled --- src/log_file.rs | 82 +++++++++++++++++++++++++++++++++--------------- src/value_log.rs | 16 +++++----- 2 files changed, 64 insertions(+), 34 deletions(-) diff --git a/src/log_file.rs b/src/log_file.rs index 69cf9fd..ca8ac24 100644 --- a/src/log_file.rs +++ b/src/log_file.rs @@ -4,40 +4,73 @@ use crate::y::{is_eof, read_at, Decode}; use crate::Error; use byteorder::{BigEndian, ReadBytesExt}; use core::slice::SlicePattern; -use memmap::{MmapMut, Mmap}; +use either::Either; +use memmap::{Mmap, MmapMut}; use parking_lot::lock_api::{RwLockReadGuard, RwLockWriteGuard}; use parking_lot::{RawRwLock, RwLock}; use std::async_iter::AsyncIterator; +use std::f32::consts::E; +use std::fmt::{Debug, Display, Formatter}; use std::fs::File; use std::future::Future; use std::io::{Cursor, Read, Seek, SeekFrom}; use std::ops::Deref; use std::pin::Pin; use std::task::{Context, Poll}; -use either::Either; // use crate::mmap::AsyncMMAP; -struct MmapType(Either); +pub(crate) struct MmapType(Either); + +impl MmapType { + fn get_mmap(&self) -> &Mmap { + match self.0 { + Either::Left(ref _mmap) => _mmap, + _ => panic!("It should be not happen"), + } + } + + pub(crate) fn get_mut_mmap(&self) -> &MmapMut { + match self.0 { + Either::Right(ref m) => m, + _ => panic!("It should be not happen"), + } + } +} impl Deref for MmapType { - type Target = (); + type Target = Either; fn deref(&self) -> &Self::Target { - todo!() + &self.0 + } +} + +impl From for MmapType { + fn from(value: Mmap) -> Self { + Self(Either::Left(value)) + } +} + +impl From for MmapType { + fn from(value: MmapMut) -> Self { + Self(Either::Right(value)) } } -#[derive(Debug)] pub(crate) struct LogFile { pub(crate) _path: Box, pub(crate) fd: Option, pub(crate) fid: u32, - // pub(crate) _mmap: Option, - pub(crate) _mut_mmap: Option, pub(crate) _mmap: Option, pub(crate) sz: u32, } +impl Debug for LogFile { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + todo!() + } +} + impl LogFile { // pub(crate) async fn iterate_by_offset1( // _self: &Self, @@ -75,12 +108,11 @@ impl LogFile { } } - fn mmap_slice(&self) { - match self._mmap.as_ref().unwrap() { - Either::Left(_mmap) => { - - }, - Either::Right(_mut_mmap) => {}, + fn mmap_slice(&self) -> &[u8] { + let mmap = self._mmap.as_ref().unwrap(); + match mmap.0 { + Either::Left(ref _mmap) => _mmap.as_ref(), + Either::Right(ref _mmap) => _mmap.as_ref(), } } @@ -89,7 +121,7 @@ impl LogFile { offset: u32, n: usize, ) -> Result<(Vec<(Entry, ValuePointer)>, u32)> { - let m = self._mmap.as_ref().unwrap().as_slice(); + let m = self.mmap_slice(); let mut cursor_offset = offset; let mut 
v = vec![]; while cursor_offset < m.len() as u32 && v.len() < n { @@ -148,7 +180,7 @@ impl LogFile { println!("file sz {}", file_sz); let mut _mmap = unsafe { Mmap::map(&fd)? }; // let mut _mmap = _mmap.make_read_only()?; - self._mmap.replace(_mmap); + self._mmap.replace(_mmap.into()); self.fd.replace(fd); self.sz = file_sz as u32; Ok(()) @@ -170,7 +202,7 @@ impl LogFile { // todo opz pub(crate) fn done_writing(&mut self, offset: u32) -> Result<()> { self.sync()?; - let mut_mmap =self.mut_mmap(); + let mut_mmap = self.mut_mmap(); mut_mmap.flush_async()?; self.fd.as_mut().unwrap().set_len(offset as u64)?; self.fd.as_mut().unwrap().sync_all()?; @@ -181,13 +213,12 @@ impl LogFile { self.open_read_only() } - fn mut_mmap(&self) -> &mut MmapMut{ - let _mmap = self._mmap.as_ref().unwrap(); - _mmap.make_mut().as_mut().unwrap() + fn mut_mmap(&self) -> &MmapMut { + self._mmap.as_ref().unwrap().get_mut_mmap() } fn mmap_ref(&self) -> &Mmap { - self._mmap.as_ref().unwrap() + self._mmap.as_ref().unwrap().get_mmap() } // You must hold lf.lock to sync() @@ -345,17 +376,18 @@ impl LogFile { #[test] fn concurrency() { let mut lf = LogFile::new("src/test_data/vlog_file.text"); - assert!(lf.is_ok(), "{}", lf.unwrap_err().to_string()); + assert!(lf.is_ok(), "{:?}", lf.unwrap_err().to_string()); } #[test] -fn test_mmap () { +fn test_mmap() { let mut fd = std::fs::OpenOptions::new() .read(true) .write(true) - .open("src/test_data/vlog_file.text").unwrap(); + .open("src/test_data/vlog_file.text") + .unwrap(); - let _mmap = unsafe {Mmap::map(&fd).unwrap()}; + let _mmap = unsafe { Mmap::map(&fd).unwrap() }; println!("{}", _mmap.len()); println!("{}", _mmap.make_mut().is_err()); } diff --git a/src/value_log.rs b/src/value_log.rs index 9dc9211..01e968c 100644 --- a/src/value_log.rs +++ b/src/value_log.rs @@ -6,7 +6,7 @@ use crc32fast::Hasher; use libc::{difftime, nice}; use log::info; use log::kv::Source; -use memmap::{MmapMut, Mmap}; +use memmap::{Mmap, MmapMut}; use parking_lot::*; use protobuf::well_known_types::api::Mixin; use rand::random; @@ -341,7 +341,7 @@ impl ValueLogCore { let mut vlog_file = self.create_vlog_file(fid)?; vlog_file.fd.as_mut().unwrap().set_len(offset)?; let mut _mmap = unsafe { Mmap::map(vlog_file.fd.as_ref().unwrap())? 
}; - vlog_file._mmap.replace(_mmap); + vlog_file._mmap.replace(_mmap.into()); Ok(vlog_file) } @@ -367,8 +367,8 @@ impl ValueLogCore { let mut lf = vlog.1.write(); if *vlog.0 == self.max_fid.load(Ordering::Acquire) { let _mmap = lf._mmap.take().unwrap(); - _mmap.make_mut().unwrap().flush()?; - lf.fd + _mmap.get_mut_mmap().flush()?; + lf.fd .as_mut() .unwrap() .set_len(self.writable_log_offset.load(Ordering::Acquire) as u64)?; @@ -499,7 +499,7 @@ impl ValueLogCore { fn delete_log_file(&mut self, mut log_file: LogFile) -> Result<()> { if let Some(mp) = log_file._mmap.take() { - mp.make_mut()?.flush()?; + mp.get_mut_mmap().flush()?; } if let Some(fp) = log_file.fd.take() { fp.sync_all()?; @@ -710,13 +710,11 @@ impl ValueLogCore { if self.num_active_iterators.load(Ordering::Relaxed) == 0 { vlogs.remove(&lf.read().fid); deleted_file_now = true; - }else { + } else { self.dirty_vlogs.write().insert(lf.read().fid.clone()); } } - if deleted_file_now { - - } + if deleted_file_now {} Ok(()) } From d7da34fbd538c02eab52f3fb17a36fd0cb898240 Mon Sep 17 00:00:00 2001 From: Rg Date: Mon, 6 Mar 2023 13:50:50 +0800 Subject: [PATCH 47/77] compiled --- src/level_handler.rs | 8 +- src/log_file.rs | 298 ++++++++++++++++--------------------------- src/mmap.rs | 2 +- src/table/table.rs | 10 +- src/test_util.rs | 13 +- src/value_log.rs | 1 - 6 files changed, 127 insertions(+), 205 deletions(-) diff --git a/src/level_handler.rs b/src/level_handler.rs index d45a527..d412c4f 100644 --- a/src/level_handler.rs +++ b/src/level_handler.rs @@ -108,8 +108,7 @@ impl LevelHandler { // init with tables pub(crate) fn init_tables(&self, tables: Vec
) { let total_size = tables.iter().fold(0, |acc, table| acc + table.size()); - self.total_size - .store(total_size as u64, Ordering::Relaxed); + self.total_size.store(total_size as u64, Ordering::Relaxed); let mut tb_wl = self.tables_wl(); (*tb_wl) = tables; if self.level.load(Ordering::Relaxed) == 0 { @@ -205,8 +204,7 @@ impl LevelHandler { return false; } t.incr_ref(); - self - .total_size + self.total_size .fetch_add(t.size() as u64, Ordering::Relaxed); self.tables_wl().push(t); true @@ -258,7 +256,7 @@ impl LevelHandler { }; } - pub(crate) fn get(&self, key: &[u8]) -> Option{ + pub(crate) fn get(&self, key: &[u8]) -> Option { self.get_table_for_key(key) } diff --git a/src/log_file.rs b/src/log_file.rs index ca8ac24..c359b24 100644 --- a/src/log_file.rs +++ b/src/log_file.rs @@ -1,5 +1,5 @@ use crate::value_log::{Entry, Header, ValuePointer}; -use crate::y::Result; +use crate::y::{create_synced_file, Result}; use crate::y::{is_eof, read_at, Decode}; use crate::Error; use byteorder::{BigEndian, ReadBytesExt}; @@ -9,6 +9,7 @@ use memmap::{Mmap, MmapMut}; use parking_lot::lock_api::{RwLockReadGuard, RwLockWriteGuard}; use parking_lot::{RawRwLock, RwLock}; use std::async_iter::AsyncIterator; +use std::env::temp_dir; use std::f32::consts::E; use std::fmt::{Debug, Display, Formatter}; use std::fs::File; @@ -17,7 +18,7 @@ use std::io::{Cursor, Read, Seek, SeekFrom}; use std::ops::Deref; use std::pin::Pin; use std::task::{Context, Poll}; -// use crate::mmap::AsyncMMAP; +use std::time::SystemTime; pub(crate) struct MmapType(Either); @@ -67,55 +68,15 @@ pub(crate) struct LogFile { impl Debug for LogFile { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - todo!() + f.debug_struct("LogFile") + .field("path", self._path.as_ref()) + .field("fd", &self.fid) + .field("size", &self.sz) + .finish() } } impl LogFile { - // pub(crate) async fn iterate_by_offset1( - // _self: &Self, - // mut offset: u32, - // f: &mut impl for<'a> FnMut( - // &'a Entry, - // &'a ValuePointer, - // ) -> Pin> + 'a>>, - // ) -> Result<()> { - // - // Ok(()) - // } - - pub(crate) async fn iterate_by_offset( - &self, - mut offset: u32, - f: &mut impl for<'a> FnMut( - &'a Entry, - &'a ValuePointer, - ) -> Pin> + 'a>>, - ) -> Result<()> { - loop { - let (v, next) = self.read_entries(offset, 1).await?; - if v.is_empty() { - return Ok(()); - } - - for (entry, vptr) in v.iter() { - let continue_ = f(entry, vptr).await?; - if !continue_ { - return Ok(()); - } - offset = next; - } - } - } - - fn mmap_slice(&self) -> &[u8] { - let mmap = self._mmap.as_ref().unwrap(); - match mmap.0 { - Either::Left(ref _mmap) => _mmap.as_ref(), - Either::Right(ref _mmap) => _mmap.as_ref(), - } - } - pub(crate) async fn read_entries( &self, offset: u32, @@ -156,77 +117,32 @@ impl LogFile { } Ok((v, cursor_offset)) } -} - -impl LogFile { - pub(crate) fn new(path: &str) -> Result { - let mut lf = LogFile { - _path: Box::new(path.to_string()), - fd: None, - fid: 0, - _mmap: None, - sz: 0, - }; - lf.open_read_only()?; - Ok(lf) - } - - pub(crate) fn open_read_only(&mut self) -> Result<()> { - let mut fd = std::fs::OpenOptions::new() - .read(true) - .open(self._path.as_ref())?; - let meta = fd.metadata()?; - let file_sz = meta.len(); - println!("file sz {}", file_sz); - let mut _mmap = unsafe { Mmap::map(&fd)? }; - // let mut _mmap = _mmap.make_read_only()?; - self._mmap.replace(_mmap.into()); - self.fd.replace(fd); - self.sz = file_sz as u32; - Ok(()) - } - // Acquire lock on mmap if you are calling this. 
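A note on the read path in the hunk below: it slices the mapped region after checking offset >= sz || offset + value_sz > sz. Both operands are u32, so the second comparison can wrap around for a corrupt ValuePointer before the check fires. A minimal sketch of the same guard written with checked_add; Ptr is a hypothetical stand-in for ValuePointer (only offset and len matter here):

// Sketch only: bounds check for reading a value out of an mmap without
// risking u32 wrap-around. `Ptr` is a hypothetical stand-in for ValuePointer.
struct Ptr {
    offset: u32,
    len: u32,
}

fn read_at<'a>(mmap: &'a [u8], p: &Ptr) -> Option<&'a [u8]> {
    let end = p.offset.checked_add(p.len)?; // None on overflow instead of wrapping
    if end as usize > mmap.len() {
        return None; // would read past the mapped region
    }
    Some(&mmap[p.offset as usize..end as usize])
}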
- pub(crate) fn read(&self, p: &ValuePointer) -> Result<&[u8]> { - let offset = p.offset; - let sz = self._mmap.as_ref().unwrap().len(); - let value_sz = p.len; - return if offset >= sz as u32 || offset + value_sz > sz as u32 { - Err(Error::EOF) - } else { - Ok(&self._mmap.as_ref().unwrap()[offset as usize..(offset + value_sz) as usize]) - }; - // todo add metrics - } + pub(crate) async fn iterate_by_offset( + &self, + mut offset: u32, + f: &mut impl for<'a> FnMut( + &'a Entry, + &'a ValuePointer, + ) -> Pin> + 'a>>, + ) -> Result<()> { + loop { + let (v, next) = self.read_entries(offset, 1).await?; + if v.is_empty() { + return Ok(()); + } - // todo opz - pub(crate) fn done_writing(&mut self, offset: u32) -> Result<()> { - self.sync()?; - let mut_mmap = self.mut_mmap(); - mut_mmap.flush_async()?; - self.fd.as_mut().unwrap().set_len(offset as u64)?; - self.fd.as_mut().unwrap().sync_all()?; - { - self._mmap.take(); - self.fd.take(); + for (entry, vptr) in v.iter() { + let continue_ = f(entry, vptr).await?; + if !continue_ { + return Ok(()); + } + offset = next; + } } - self.open_read_only() - } - - fn mut_mmap(&self) -> &MmapMut { - self._mmap.as_ref().unwrap().get_mut_mmap() - } - - fn mmap_ref(&self) -> &Mmap { - self._mmap.as_ref().unwrap().get_mmap() - } - - // You must hold lf.lock to sync() - fn sync(&mut self) -> Result<()> { - self.fd.as_mut().unwrap().sync_all()?; - Ok(()) } + // It should be call by one thread. pub(crate) async fn iterate( &mut self, offset: u32, @@ -295,80 +211,84 @@ impl LogFile { // todo add truncate Ok(()) } +} - // pub(crate) async fn iterate2( - // &self, - // offset: u32, - // f: &mut impl for<'a> FnMut( - // &'a Entry, - // &'a ValuePointer, - // ) -> Pin> + 'a>>, - // ) -> Result<()> { - // let mut fd = self.fd.as_mut().unwrap(); - // fd.seek(SeekFrom::Start(offset as u64))?; - // // let fd = self.fd.as_ref().unwrap(); - // let mut entry = Entry::default(); - // let mut truncate = false; - // let mut record_offset = offset; - // loop { - // let mut h = Header::default(); - // let buffer = vec![0u8; Header::encoded_size()]; - // let ok = h.dec(&mut Cursor::new(buffer)); - // if ok.is_err() && ok.as_ref().unwrap_err().is_io_eof() { - // break; - // } - // // todo add truncate currenct - // ok?; - // if h.k_len as usize > entry.key.capacity() { - // entry.key = vec![0u8; h.k_len as usize]; - // } - // if h.v_len as usize > entry.value.capacity() { - // entry.value = vec![0u8; h.v_len as usize]; - // } - // entry.key.clear(); - // entry.value.clear(); - // - // let ok = fd.read(&mut entry.key); - // if is_eof(&ok) { - // break; - // } - // ok?; - // - // let ok = fd.read(&mut entry.value); - // if is_eof(&ok) { - // break; - // } - // ok?; - // entry.offset = record_offset; - // entry.meta = h.meta; - // entry.user_meta = h.user_mata; - // entry.cas_counter = h.cas_counter; - // entry.cas_counter_check = h.cas_counter_check; - // let ok = fd.read_u32::(); - // if is_eof(&ok) { - // break; - // } - // let crc = ok?; - // - // let mut vp = ValuePointer::default(); - // vp.len = Header::encoded_size() as u32 + h.k_len + h.v_len + 4; - // record_offset += vp.len; - // - // vp.offset = entry.offset; - // vp.fid = self.fid; - // - // let _continue = f(&entry, &vp).await?; - // if !_continue { - // break; - // } - // } - // - // // todo add truncate - // Ok(()) - // } - pub(crate) fn reset_seek_start(&mut self) -> Result<()> { - let fd = self.fd.as_mut().unwrap(); - fd.seek(SeekFrom::Start(0))?; +impl LogFile { + pub(crate) fn new(path: &str) -> Result { + let mut lf 
= LogFile { + _path: Box::new(path.to_string()), + fd: None, + fid: 0, + _mmap: None, + sz: 0, + }; + lf.open_read_only()?; + Ok(lf) + } + + pub(crate) fn open_read_only(&mut self) -> Result<()> { + let mut fd = std::fs::OpenOptions::new() + .read(true) + .open(self._path.as_ref())?; + let meta = fd.metadata()?; + let file_sz = meta.len(); + let mut _mmap = unsafe { Mmap::map(&fd)? }; + self._mmap.replace(_mmap.into()); + self.fd.replace(fd); + self.sz = file_sz as u32; + Ok(()) + } + + fn mmap_slice(&self) -> &[u8] { + let mmap = self._mmap.as_ref().unwrap(); + match mmap.0 { + Either::Left(ref _mmap) => _mmap.as_ref(), + Either::Right(ref _mmap) => _mmap.as_ref(), + } + } + + fn file_ref(&self) -> &File { + self.fd.as_ref().unwrap() + } + + // Acquire lock on mmap if you are calling this. + pub(crate) fn read(&self, p: &ValuePointer) -> Result<&[u8]> { + let offset = p.offset; + let sz = self._mmap.as_ref().unwrap().len(); + let value_sz = p.len; + return if offset >= sz as u32 || offset + value_sz > sz as u32 { + Err(Error::EOF) + } else { + Ok(&self._mmap.as_ref().unwrap()[offset as usize..(offset + value_sz) as usize]) + }; + // todo add metrics + } + + // todo opz + pub(crate) fn done_writing(&mut self, offset: u32) -> Result<()> { + self.sync()?; + let mut_mmap = self.mut_mmap(); + mut_mmap.flush_async()?; + self.fd.as_mut().unwrap().set_len(offset as u64)?; + self.fd.as_mut().unwrap().sync_all()?; + { + self._mmap.take(); + self.fd.take(); + } + self.open_read_only() + } + + fn mut_mmap(&self) -> &MmapMut { + self._mmap.as_ref().unwrap().get_mut_mmap() + } + + fn mmap_ref(&self) -> &Mmap { + self._mmap.as_ref().unwrap().get_mmap() + } + + // You must hold lf.lock to sync() + fn sync(&mut self) -> Result<()> { + self.fd.as_mut().unwrap().sync_all()?; Ok(()) } } @@ -391,3 +311,9 @@ fn test_mmap() { println!("{}", _mmap.len()); println!("{}", _mmap.make_mut().is_err()); } + +#[test] +fn test_write_file() { + let _path = temp_dir().join("badger-".to_owned() + &*rand::random::().to_string()); + let lf = create_synced_file(_path.to_str().unwrap(), true).unwrap(); +} diff --git a/src/mmap.rs b/src/mmap.rs index d6f6675..f3f56a0 100644 --- a/src/mmap.rs +++ b/src/mmap.rs @@ -9,7 +9,7 @@ pub(crate) struct AsyncLogFile { path: Box, sz: usize, fp: Option, - m: Option, + m: Option, } // impl AsyncMMAP { diff --git a/src/table/table.rs b/src/table/table.rs index d384d3c..87e5037 100644 --- a/src/table/table.rs +++ b/src/table/table.rs @@ -55,7 +55,7 @@ pub type WeakTable = XWeak; impl From for Table { fn from(value: TableCore) -> Self { - Table::new(value) + Table::new(value) } } @@ -73,11 +73,11 @@ impl Table { } pub fn biggest(&self) -> &[u8] { - &self.biggest + &self.biggest } pub fn smallest(&self) -> &[u8] { - &self.smallest + &self.smallest } } @@ -92,7 +92,7 @@ pub struct TableCore { _mmap: Option, // Memory mapped. // The following are initialized once and const. smallest: Vec, // smallest keys. - biggest: Vec, // biggest keys. + biggest: Vec, // biggest keys. 
id: u64, bf: GrowableBloom, } @@ -115,7 +115,7 @@ impl TableCore { loading_mode, _mmap: None, smallest: vec![], - biggest: vec![], + biggest: vec![], id, bf: GrowableBloom::new(0.01, 1), }; diff --git a/src/test_util.rs b/src/test_util.rs index 5ae4b39..c0c3082 100644 --- a/src/test_util.rs +++ b/src/test_util.rs @@ -1,3 +1,4 @@ +use atomic::Atomic; use chrono::Local; use log::{info, kv::source::as_map, kv::Source, Level}; use rand::random; @@ -5,10 +6,9 @@ use std::collections::HashMap; use std::env::temp_dir; use std::fs::create_dir_all; use std::io; -use std::sync::Arc; use std::sync::atomic::{AtomicI32, AtomicU64, Ordering}; +use std::sync::Arc; use std::time::Duration; -use atomic::Atomic; use tokio::runtime::Handle; use tracing_subscriber::fmt::format::Writer; use tracing_subscriber::fmt::time::FormatTime; @@ -185,10 +185,9 @@ fn tk2() { }); } fn add() -> i32 { - let f = async move {100}; - let r = tokio::task::block_in_place( move || { - tokio::runtime::Handle::current().block_on(f) - }); + let f = async move { 100 }; + let r = + tokio::task::block_in_place(move || tokio::runtime::Handle::current().block_on(f)); r } @@ -197,4 +196,4 @@ fn tk2() { }); println!("{}", a.load(Ordering::Relaxed)); -} \ No newline at end of file +} diff --git a/src/value_log.rs b/src/value_log.rs index 01e968c..3182513 100644 --- a/src/value_log.rs +++ b/src/value_log.rs @@ -478,7 +478,6 @@ impl ValueLogCore { of = 0; } let mut log_file = vlogs.vlogs.get(&id).unwrap().write(); - // log_file.reset_seek_start()?; log_file.iterate(of, &mut f).await?; } // Seek to the end to start writing. From 0b932738368022133b7a363d35b633fd7962145a Mon Sep 17 00:00:00 2001 From: Rg Date: Tue, 7 Mar 2023 01:35:41 +0800 Subject: [PATCH 48/77] :dog: --- src/log_file.rs | 26 ++--------------- src/value_log.rs | 73 +++++++++++++++++++++++++++++++++++------------- 2 files changed, 57 insertions(+), 42 deletions(-) diff --git a/src/log_file.rs b/src/log_file.rs index c359b24..4a058a7 100644 --- a/src/log_file.rs +++ b/src/log_file.rs @@ -86,31 +86,11 @@ impl LogFile { let mut cursor_offset = offset; let mut v = vec![]; while cursor_offset < m.len() as u32 && v.len() < n { - let mut entry = Entry::default(); - let mut h = Header::default(); - h.dec(&mut Cursor::new( - &m[cursor_offset as usize..cursor_offset as usize + Header::encoded_size()], - ))?; - entry.key = vec![0u8; h.k_len as usize]; - entry.value = vec![0u8; h.v_len as usize]; - entry.meta = h.meta; - entry.offset = cursor_offset as u32; - entry.cas_counter = h.cas_counter; - entry.user_meta = h.user_mata; - entry.cas_counter_check = h.cas_counter_check; - let mut start = cursor_offset as usize + Header::encoded_size(); - entry - .key - .extend_from_slice(&m[start..start + h.k_len as usize]); - start += h.k_len as usize; - entry - .value - .extend_from_slice(&m[start..start + h.v_len as usize]); - start += h.v_len as usize; - let crc32 = Cursor::new(&m[start..start + 4]).read_u32::()?; + let mut entry = Entry::from_slice(cursor_offset, m)?; let mut vpt = ValuePointer::default(); vpt.fid = self.fid; - vpt.len = Header::encoded_size() as u32 + h.k_len + h.v_len + 4; + vpt.len = + Header::encoded_size() as u32 + (entry.key.len() + entry.value.len()) as u32 + 4; vpt.offset = cursor_offset; cursor_offset += vpt.len; v.push((entry, vpt)) diff --git a/src/value_log.rs b/src/value_log.rs index 3182513..64bde70 100644 --- a/src/value_log.rs +++ b/src/value_log.rs @@ -74,13 +74,6 @@ impl Header { pub(crate) const fn encoded_size() -> usize { size_of::() } - - // pub(crate) fn 
from_fd(&mut self, rd: &File, offset: u64) -> Result<()> { - // let mut buffer = vec![0u8; Self::encoded_size()]; - // read_at(rd, &mut buffer, offset)?; - // self.enc(&mut Cursor::new(buffer))?; - // Ok(()) - // } } impl Encode for Header { @@ -124,6 +117,32 @@ pub struct Entry { } impl Entry { + pub(crate) fn from_slice(cursor_offset: u32, m: &[u8]) -> Result { + let mut entry = Entry::default(); + let mut h = Header::default(); + h.dec(&mut Cursor::new( + &m[cursor_offset as usize..cursor_offset as usize + Header::encoded_size()], + ))?; + entry.key = vec![0u8; h.k_len as usize]; + entry.value = vec![0u8; h.v_len as usize]; + entry.meta = h.meta; + entry.offset = cursor_offset as u32; + entry.cas_counter = h.cas_counter; + entry.user_meta = h.user_mata; + entry.cas_counter_check = h.cas_counter_check; + let mut start = cursor_offset as usize + Header::encoded_size(); + entry + .key + .extend_from_slice(&m[start..start + h.k_len as usize]); + start += h.k_len as usize; + entry + .value + .extend_from_slice(&m[start..start + h.v_len as usize]); + start += h.v_len as usize; + let crc32 = Cursor::new(&m[start..start + 4]).read_u32::()?; + Ok(entry) + } + fn to_string(&self, prefix: &str) -> String { format!("{} {}", prefix, self) } @@ -157,23 +176,39 @@ impl Encode for Entry { impl Decode for Entry { fn dec(&mut self, rd: &mut dyn Read) -> Result<()> { - todo!() + let mut h = Header::default(); + let mut buffer = vec![0u8; Header::encoded_size()]; + let sz = rd.read(&mut buffer)?; + assert_eq!(sz, buffer.len()); + h.dec(&mut Cursor::new(&buffer))?; + self.key = vec![0u8; h.k_len as usize]; + self.value = vec![0u8; h.v_len as usize]; + self.meta = h.meta; + // self.offset = cursor_offset as u32; + self.cas_counter = h.cas_counter; + self.user_meta = h.user_mata; + self.cas_counter_check = h.cas_counter_check; + let sz = rd.read(&mut self.key)?; + assert_eq!(sz, h.k_len as usize); + let sz = rd.read(&mut self.value)?; + assert_eq!(sz, h.v_len as usize); + // TODO check crc32 + let crc32 = rd.read_u32::()?; + Ok(()) } } impl fmt::Display for Entry { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { - write!( - f, - "key: {:?} meta: {} usermeta: {} offset: {} len={} cas={} check={}", - self.key, - self.meta, - self.user_meta, - self.offset, - self.value.len(), - self.cas_counter, - self.cas_counter_check - ) + f.debug_struct("Entry") + .field("key", &String::from_utf8_lossy(&self.key).to_string()) + .field("meta", &self.meta) + .field("user_meta", &self.user_meta) + .field("offset", &self.offset) + .field("value", &self.value) + .field("case=", &self.cas_counter) + .field("check", &self.cas_counter_check) + .finish() } } From af4a39a9eccd8fbea7dc7de5b07453db2ec249a9 Mon Sep 17 00:00:00 2001 From: Rg Date: Wed, 8 Mar 2023 01:34:25 +0800 Subject: [PATCH 49/77] :dog: --- src/kv.rs | 55 ++++++++++++++++++++++++++++++++++++++++------ src/options/mod.rs | 2 +- src/value_log.rs | 28 ++++++++++++++++++----- 3 files changed, 71 insertions(+), 14 deletions(-) diff --git a/src/kv.rs b/src/kv.rs index 42f45ce..414cc0f 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -5,7 +5,9 @@ use crate::table::builder::Builder; use crate::table::iterator::IteratorImpl; use crate::table::table::{new_file_name, Table, TableCore}; use crate::types::{Channel, Closer, XArc, XWeak}; -use crate::value_log::{ArcRequest, Entry, MetaBit, Request, ValueLogCore, ValuePointer}; +use crate::value_log::{ + ArcRequest, Entry, MetaBit, Request, ValueLogCore, ValuePointer, MAX_KEY_SIZE, +}; use crate::y::{ async_sync_directory, 
create_synced_file, sync_directory, Encode, Result, ValueStruct, }; @@ -19,6 +21,7 @@ use fs2::FileExt; use log::{info, Log}; use parking_lot::Mutex; use std::borrow::BorrowMut; +use std::cell::RefCell; use std::fs::{read_dir, File}; use std::fs::{try_exists, OpenOptions}; use std::future::Future; @@ -27,10 +30,10 @@ use std::ops::Deref; use std::path::{Path, PathBuf}; use std::pin::Pin; use std::ptr::NonNull; -use std::string; use std::sync::atomic::{AtomicPtr, AtomicU64, AtomicUsize, Ordering}; use std::sync::{Arc, Weak}; use std::time::Duration; +use std::{string, vec}; use tokio::fs::create_dir_all; use tokio::io::{AsyncReadExt, AsyncWriteExt}; use tokio::sync::{RwLock, RwLockWriteGuard}; @@ -413,8 +416,46 @@ impl KV { // Check(e.Error); // } // TODO - pub(crate) async fn batch_set(&self, entries: Vec) -> Result<()> { - todo!() + pub(crate) async fn batch_set(&self, entries: Vec) -> Result> { + let mut bad = vec![]; + let mut b = vec![Request::default()]; + let mut count = 0; + let mut sz = 0u64; + for mut entry in entries { + if entry.key.len() > MAX_KEY_SIZE { + bad.push(entry); + continue; + } + if entry.value.len() as u64 > self.opt.value_log_file_size { + bad.push(entry); + continue; + } + count += 1; + sz += self.opt.estimate_size(&entry) as u64; + let req = b.last_mut().unwrap(); + req.entries.write().push(RefCell::new(entry)); + if count >= self.opt.max_batch_count || sz >= self.opt.max_batch_count { + b.push(Request::default()); + } + } + let mut reqs = vec![]; + for req in b { + if req.entries.read().is_empty() { + break; + } + let arc_req = ArcRequest::from(req); + reqs.push(arc_req.clone()); + self.write_ch.send(arc_req).await.unwrap(); + } + if !bad.is_empty() { + let mut req = Request::default(); + *req.entries.write() = + Vec::from_iter(bad.into_iter().map(|bad| RefCell::new(bad)).into_iter()); + let arc_req = ArcRequest::from(req); + arc_req.set_err(Err("key too big or value to big".into())).await; + reqs.push(arc_req); + } + Ok(reqs) } fn new_cas_counter(&self, how_many: u64) -> u64 { @@ -526,9 +567,9 @@ impl ArcKV { .collect::>(); let to_reqs = Arc::new(to_reqs); if let Err(err) = self.write_requests(to_reqs).await { - reqs.lock() - .iter() - .for_each(|req| req.set_err(Err(err.clone()))); + for req in reqs.lock().iter() { + req.set_err(Err(err.clone())).await; + } } reqs.lock().clear(); } diff --git a/src/options/mod.rs b/src/options/mod.rs index 0784dd4..1af61e9 100644 --- a/src/options/mod.rs +++ b/src/options/mod.rs @@ -71,7 +71,7 @@ pub struct Options { } impl Options { - fn estimate_size(&self, entry: Entry) -> usize { + pub fn estimate_size(&self, entry: &Entry) -> usize { if entry.value.len() < self.value_threshold { return entry.key.len() + entry.value.len() + META_SIZE + USER_META_SIZE + CAS_SIZE; } diff --git a/src/value_log.rs b/src/value_log.rs index 64bde70..1fc98c8 100644 --- a/src/value_log.rs +++ b/src/value_log.rs @@ -59,6 +59,8 @@ bitflags! 
{ const M: u64 = 1 << 20; +pub(crate) const MAX_KEY_SIZE: usize = 1 << 20; + #[derive(Debug, Default)] #[repr(C)] pub(crate) struct Header { @@ -268,6 +270,16 @@ pub(crate) struct Request { pub(crate) res: Channel>, } +impl Default for Request { + fn default() -> Self { + Request{ + entries: Default::default(), + ptrs: Default::default(), + res: Channel::new(1), + } + } +} + impl Request { pub(crate) async fn get_resp(&self) -> Result<()> { self.res.recv().await.unwrap() @@ -292,8 +304,9 @@ impl ArcRequest { pub(crate) fn get_req(&self) -> Arc { self.inner.clone() } - pub(crate) fn set_err(&self, err: Result<()>) { - *self.err.lock() = err; + pub(crate) async fn set_err(&self, err: Result<()>) { + *self.err.lock() = err.clone(); + self.inner.res.send(err).await; } pub(crate) fn to_inner(self) -> Request { @@ -628,7 +641,8 @@ impl ValueLogCore { to_disk() } - async fn rewrite(&self, lf: Arc>) -> Result<()> { + // rewrite the log_file + async fn rewrite(&self, lf: Arc>, x: &KV) -> Result<()> { let max_fid = self.max_fid.load(Ordering::Relaxed); assert!( lf.read().fid < max_fid, @@ -667,14 +681,15 @@ impl ValueLogCore { return Err(err.clone()); } let vs = vs.unwrap(); - if (vs.meta & MetaBit::BIT_DELETE.bits()) > 0 { + // It should be not happen, if the value is deleted + if vs.meta & MetaBit::BIT_DELETE.bits() > 0 { info!( "REWRITE=> {} has been deleted", String::from_utf8_lossy(&entry.key) ); continue; } - if (vs.meta & MetaBit::BIT_VALUE_POINTER.bits()) < 0 { + if vs.meta & MetaBit::BIT_VALUE_POINTER.bits() < 0 { info!( "REWRITE=> {} has been skipped, meta: {}", String::from_utf8_lossy(&entry.key), @@ -703,6 +718,7 @@ impl ValueLogCore { { // This new entry only contains the key, and a pointer to the value. let mut ne = Entry::default(); + // TODO why? if entry.meta == MetaBit::BIT_SET_IF_ABSENT.bits() { // If we rewrite this entry without removing BitSetIfAbsent, LSM would see that // the key is already present, which would be this same entry and won't update @@ -976,7 +992,7 @@ impl SafeValueLog { } info!("REWRITING VLOG {}", lf.read().fid); - self.value_log.rewrite(lf).await?; + self.value_log.rewrite(lf, self.value_log.get_kv()).await?; Ok(()) } } From 872f66323d2154ad31ffd7bfca6c02ea85d84972 Mon Sep 17 00:00:00 2001 From: Rg Date: Fri, 10 Mar 2023 01:30:48 +0800 Subject: [PATCH 50/77] :dog: --- src/kv.rs | 28 ++++++++++++++-------------- src/value_log.rs | 5 +++++ 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/src/kv.rs b/src/kv.rs index 414cc0f..9571e3b 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -4,7 +4,7 @@ use crate::options::Options; use crate::table::builder::Builder; use crate::table::iterator::IteratorImpl; use crate::table::table::{new_file_name, Table, TableCore}; -use crate::types::{Channel, Closer, XArc, XWeak}; +use crate::types::{ArcMx, Channel, Closer, TArcMx, XArc, XWeak}; use crate::value_log::{ ArcRequest, Entry, MetaBit, Request, ValueLogCore, ValuePointer, MAX_KEY_SIZE, }; @@ -338,8 +338,9 @@ impl KV { // There is code (in flush_mem_table) whose correctness depends on us generating CAS Counter // values _before_ we modify s.vptr here. 
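The write loop that follows stamps each entry with counter_base + idx after reserving a whole block of CAS values in one atomic step, so replayed operations see the same ordering. The reservation in isolation, assuming the counter is a single AtomicU64 like last_used_cas_counter:

use std::sync::atomic::{AtomicU64, Ordering};

// Reserve `how_many` consecutive CAS values and return the first of them.
fn new_cas_counter(last_used: &AtomicU64, how_many: u64) -> u64 {
    // fetch_add returns the value *before* the add, so this caller exclusively
    // owns the block (prev, prev + how_many]; no other writer can reuse it.
    last_used.fetch_add(how_many, Ordering::SeqCst) + 1
}

// With a fresh counter, a three-entry request gets CAS values 1, 2 and 3:
//   let base = new_cas_counter(&AtomicU64::new(0), 3);
//   entry.cas_counter = base + idx as u64; // idx = 0, 1, 2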
for req in reqs.iter() { - let counter_base = self.new_cas_counter(req.get_req().entries.read().len() as u64); - for (idx, entry) in req.get_req().entries.read().iter().enumerate() { + let entries = req.req_ref().entries.write(); + let counter_base = self.new_cas_counter(entries.len() as u64); + for (idx, entry) in entries.iter().enumerate() { entry.borrow_mut().cas_counter = counter_base + idx as u64; } } @@ -348,9 +349,6 @@ impl KV { info!("Writing to memory table"); let mut count = 0; for req in reqs.iter() { - if req.get_req().entries.read().is_empty() { - continue; - } count += req.get_req().entries.read().len(); } Ok(()) @@ -452,7 +450,9 @@ impl KV { *req.entries.write() = Vec::from_iter(bad.into_iter().map(|bad| RefCell::new(bad)).into_iter()); let arc_req = ArcRequest::from(req); - arc_req.set_err(Err("key too big or value to big".into())).await; + arc_req + .set_err(Err("key too big or value to big".into())) + .await; reqs.push(arc_req); } Ok(reqs) @@ -547,7 +547,7 @@ impl ArcKV { // TODO add metrics let has_been_close = lc.has_been_closed(); let write_ch = self.write_ch.clone(); - let reqs = Arc::new(Mutex::new(vec![])); + let reqs = ArcMx::>::new(Mutex::new(vec![])); loop { tokio::select! { _ = has_been_close.recv() => { @@ -567,9 +567,9 @@ impl ArcKV { .collect::>(); let to_reqs = Arc::new(to_reqs); if let Err(err) = self.write_requests(to_reqs).await { - for req in reqs.lock().iter() { - req.set_err(Err(err.clone())).await; - } + // for req in reqs.lock().iter() { + // req.set_err(Err(err.clone())).await; + // } } reqs.lock().clear(); } @@ -591,9 +591,9 @@ impl ArcKV { .map(|req| req.clone()) .collect::>(); if let Err(err) = self.write_requests(Arc::new(to_reqs)).await { - reqs.lock() - .iter() - .for_each(|req| req.set_err(Err(err.clone()))); + // for req in reqs.lock().iter() { + // req.set_err(Err(err.clone())).await; + // } } } } diff --git a/src/value_log.rs b/src/value_log.rs index 1fc98c8..7560cc6 100644 --- a/src/value_log.rs +++ b/src/value_log.rs @@ -304,6 +304,11 @@ impl ArcRequest { pub(crate) fn get_req(&self) -> Arc { self.inner.clone() } + + pub(crate) fn req_ref(&self) -> &Arc { + &self.inner + } + pub(crate) async fn set_err(&self, err: Result<()>) { *self.err.lock() = err.clone(); self.inner.res.send(err).await; From 2ca8d3daa9df0c655ef9a3e0f5b594d964a358ea Mon Sep 17 00:00:00 2001 From: Rg Date: Mon, 13 Mar 2023 02:24:23 +0800 Subject: [PATCH 51/77] :dog: --- src/log_file.rs | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/src/log_file.rs b/src/log_file.rs index 4a058a7..ef6a63a 100644 --- a/src/log_file.rs +++ b/src/log_file.rs @@ -3,7 +3,6 @@ use crate::y::{create_synced_file, Result}; use crate::y::{is_eof, read_at, Decode}; use crate::Error; use byteorder::{BigEndian, ReadBytesExt}; -use core::slice::SlicePattern; use either::Either; use memmap::{Mmap, MmapMut}; use parking_lot::lock_api::{RwLockReadGuard, RwLockWriteGuard}; @@ -20,10 +19,11 @@ use std::pin::Pin; use std::task::{Context, Poll}; use std::time::SystemTime; +// MmapType is a Mmap and MmapMut tule pub(crate) struct MmapType(Either); impl MmapType { - fn get_mmap(&self) -> &Mmap { + pub(crate) fn get_mmap(&self) -> &Mmap { match self.0 { Either::Left(ref _mmap) => _mmap, _ => panic!("It should be not happen"), @@ -77,6 +77,7 @@ impl Debug for LogFile { } impl LogFile { + // async read *n* entries pub(crate) async fn read_entries( &self, offset: u32, @@ -86,7 +87,7 @@ impl LogFile { let mut cursor_offset = offset; let mut v = 
vec![]; while cursor_offset < m.len() as u32 && v.len() < n { - let mut entry = Entry::from_slice(cursor_offset, m)?; + let entry = Entry::from_slice(cursor_offset, m)?; let mut vpt = ValuePointer::default(); vpt.fid = self.fid; vpt.len = @@ -98,6 +99,7 @@ impl LogFile { Ok((v, cursor_offset)) } + // async iterate from offset that must be call with thread safty pub(crate) async fn iterate_by_offset( &self, mut offset: u32, @@ -113,8 +115,7 @@ impl LogFile { } for (entry, vptr) in v.iter() { - let continue_ = f(entry, vptr).await?; - if !continue_ { + if !f(entry, vptr).await? { return Ok(()); } offset = next; @@ -194,6 +195,7 @@ impl LogFile { } impl LogFile { + // new LogFile with special path. pub(crate) fn new(path: &str) -> Result { let mut lf = LogFile { _path: Box::new(path.to_string()), @@ -206,8 +208,9 @@ impl LogFile { Ok(lf) } + // open only read permission pub(crate) fn open_read_only(&mut self) -> Result<()> { - let mut fd = std::fs::OpenOptions::new() + let fd = std::fs::OpenOptions::new() .read(true) .open(self._path.as_ref())?; let meta = fd.metadata()?; @@ -219,18 +222,6 @@ impl LogFile { Ok(()) } - fn mmap_slice(&self) -> &[u8] { - let mmap = self._mmap.as_ref().unwrap(); - match mmap.0 { - Either::Left(ref _mmap) => _mmap.as_ref(), - Either::Right(ref _mmap) => _mmap.as_ref(), - } - } - - fn file_ref(&self) -> &File { - self.fd.as_ref().unwrap() - } - // Acquire lock on mmap if you are calling this. pub(crate) fn read(&self, p: &ValuePointer) -> Result<&[u8]> { let offset = p.offset; @@ -244,7 +235,7 @@ impl LogFile { // todo add metrics } - // todo opz + // Done written, reopen with read only permisson for file and mmap. pub(crate) fn done_writing(&mut self, offset: u32) -> Result<()> { self.sync()?; let mut_mmap = self.mut_mmap(); @@ -258,6 +249,20 @@ impl LogFile { self.open_read_only() } + // return mmap slice + fn mmap_slice(&self) -> &[u8] { + let mmap = self._mmap.as_ref().unwrap(); + match mmap.0 { + Either::Left(ref _mmap) => _mmap.as_ref(), + Either::Right(ref _mmap) => _mmap.as_ref(), + } + } + + // return file reference + fn file_ref(&self) -> &File { + self.fd.as_ref().unwrap() + } + fn mut_mmap(&self) -> &MmapMut { self._mmap.as_ref().unwrap().get_mut_mmap() } From 23164b1e13314ca77775971f61a2cdada3b3193d Mon Sep 17 00:00:00 2001 From: Rg Date: Wed, 15 Mar 2023 01:23:48 +0800 Subject: [PATCH 52/77] :dog: --- src/log_file.rs | 7 ++++--- src/value_log.rs | 3 ++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/log_file.rs b/src/log_file.rs index ef6a63a..10bdfa6 100644 --- a/src/log_file.rs +++ b/src/log_file.rs @@ -135,7 +135,7 @@ impl LogFile { let mut fd = self.fd.as_mut().unwrap(); fd.seek(SeekFrom::Start(offset as u64))?; let mut entry = Entry::default(); - let mut truncate = false; + let mut truncate = false; // because maybe abort before write let mut record_offset = offset; loop { let mut h = Header::default(); @@ -299,6 +299,7 @@ fn test_mmap() { #[test] fn test_write_file() { - let _path = temp_dir().join("badger-".to_owned() + &*rand::random::().to_string()); - let lf = create_synced_file(_path.to_str().unwrap(), true).unwrap(); + //let lf = create_synced_file(_path.to_str().unwrap(), true).unwrap(); + let mut vlog = LogFile::new("src/test_data/vlog_file.text").unwrap(); + println!("{}", vlog.unwrap_err()); } diff --git a/src/value_log.rs b/src/value_log.rs index 7560cc6..bd115ec 100644 --- a/src/value_log.rs +++ b/src/value_log.rs @@ -522,12 +522,13 @@ impl ValueLogCore { let vlogs = self.pick_log_guard(); info!("Seeking at value 
pointer: {:?}", vp); let offset = vp.offset + vp.len; + // Find the max file to recover for id in vlogs.fids { if id < vp.fid { continue; } let mut of = offset; - if id > vp.fid { + if id > vp.fid { // It is very import that maybe the lasted memory table are not persistent at disk. of = 0; } let mut log_file = vlogs.vlogs.get(&id).unwrap().write(); From 65c0936f6286086c8200f8a571c914bc0bd6c16e Mon Sep 17 00:00:00 2001 From: Rg Date: Thu, 16 Mar 2023 09:34:56 +0800 Subject: [PATCH 53/77] :dog: --- src/iterator.rs | 90 +++++++++++++++++++++++++++++++++++++++++++++++++ src/kv.rs | 18 +++++++--- src/lib.rs | 1 + src/log_file.rs | 2 +- 4 files changed, 106 insertions(+), 5 deletions(-) diff --git a/src/iterator.rs b/src/iterator.rs index e69de29..ede55b8 100644 --- a/src/iterator.rs +++ b/src/iterator.rs @@ -0,0 +1,90 @@ +use std::{io::Cursor, sync::atomic::AtomicU64}; + +use awaitgroup::WaitGroup; + +use crate::{ + kv::KV, + types::XArc, + value_log::{MetaBit, ValuePointer}, + Decode, Result, +}; + +#[derive(Debug, PartialEq)] +pub(crate) enum PreFetchStatus { + Empty, + Prefetced, +} + +// Retuned during iteration. Both the key() and value() output is only valid until +// iterator.next() is called. +pub(crate) struct KVItemInner { + status: PreFetchStatus, + kv: XArc, + key: Vec, + value: Vec, + vptr: Vec, + meta: u8, + user_meta: u8, + cas_counter: AtomicU64, + wg: WaitGroup, + err: Result<()>, +} + +impl KVItemInner { + // Returns the key. Remember to copy if you need to access it outside the iteration loop. + pub(crate) fn key(&self) -> &[u8] { + &self.value + } + + pub(crate) async fn value(&self, consumer: &mut impl FnMut(&[u8]) -> Result<()>) -> Result<()> { + self.wg.wait().await; + if self.status == PreFetchStatus::Prefetced { + if self.err.is_err() { + return self.err.clone(); + } + + return consumer(&self.value); + } + + Ok(()) + } + + pub(crate) fn has_value(&self) -> bool { + if self.meta == 0 && self.vptr.is_empty() { + return false; + } + if self.meta & MetaBit::BIT_DELETE.bits() > 0 { + return false; + } + + true + } + + // Returns approximate size of the key-value pair. + // + // This can be called while iterating through a store to quickly estimate the + // size of a range of key-value pairs (without fetching the corresponding) + // values). + pub(crate) fn estimated_size(&self) -> u64 { + if !self.has_value() { + return 0; + } + if self.meta & MetaBit::BIT_VALUE_POINTER.bits() == 0 { + return (self.key.len() + self.vptr.len()) as u64; + } + let mut vpt = ValuePointer::default(); + vpt.dec(&mut Cursor::new(&self.vptr)).unwrap(); + vpt.len as u64 // includes key length + } + + // Returns the CAS counter associated with the value. + pub(crate) fn counter(&self) -> u64 { + self.cas_counter.load(atomic::Ordering::Relaxed) + } + + // Returns the user_meta set by the user. Typically, this byte, optionally set by the user + // is used to interpret the value. 
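Note on the accessor added below: it returns self.meta even though the comment above (and the struct, which carries both fields) describes the user-supplied byte. By patch 57 the accessor returns self.user_meta and a separate meta() getter exists. The intended shape, within the existing impl:

pub(crate) fn user_meta(&self) -> u8 {
    // surface the caller-supplied byte, not the internal meta bit flags
    self.user_meta
}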
+ pub(crate) fn user_meta(&self) -> u8 { + self.meta + } +} diff --git a/src/kv.rs b/src/kv.rs index 9571e3b..bdb14f6 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -1,3 +1,4 @@ +use crate::iterator::KVItemInner; use crate::levels::{LevelsController, XLevelsController}; use crate::manifest::{open_or_create_manifest_file, Manifest, ManifestFile}; use crate::options::Options; @@ -122,10 +123,8 @@ impl KV { vlog: None, vptr: crossbeam_epoch::Atomic::null(), manifest: Arc::new(RwLock::new(manifest_file)), - // lc: Default::default(), lc: XWeak::new(), flush_chan: Channel::new(1), - // write_chan: Channel::new(1), dir_lock_guard, value_dir_guard, closers, @@ -143,9 +142,13 @@ impl KV { let lc = LevelsController::new(manifest.clone(), out.opt.clone()).await?; lc.start_compact(out.closers.compactors.clone()); let mut vlog = ValueLogCore::default(); + { + let kv = unsafe { &out as *const KV }; + vlog.open(kv, opt)?; + } out.vlog = Some(vlog); let xout = XArc::new(out); - // xout.vlog.unwrap().open(&xout, opt)?; + // update size { let _out = xout.clone(); @@ -163,12 +166,12 @@ impl KV { }); } + // Get the lasted ValueLog Recover Pointer let item = xout.get(_HEAD); if item.is_err() { return Err("Retrieving head".into()); } let item = item.unwrap(); - let value = &item.value; if value != _HEAD { return Err("Retrieving head".into()); @@ -634,6 +637,13 @@ impl ArcKV { Err(Unexpected("No room for write".into())) } } + + async fn yield_item_value( + &self, + item: &KVItemInner, + consume: impl FnMut(&[u8]) -> Result<()>, + ) -> Result<()> { + } } async fn write_level0_table(st: &SkipList, f: &mut tokio::fs::File) -> Result<()> { diff --git a/src/lib.rs b/src/lib.rs index 464fb4d..b81ca60 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -22,6 +22,7 @@ use std::io; use std::mem::align_of; mod event; +mod iterator; mod kv; mod log_file; mod manifest; diff --git a/src/log_file.rs b/src/log_file.rs index 10bdfa6..a7ecde9 100644 --- a/src/log_file.rs +++ b/src/log_file.rs @@ -301,5 +301,5 @@ fn test_mmap() { fn test_write_file() { //let lf = create_synced_file(_path.to_str().unwrap(), true).unwrap(); let mut vlog = LogFile::new("src/test_data/vlog_file.text").unwrap(); - println!("{}", vlog.unwrap_err()); + // println!("{}", vlog.unwrap_err()); } From 7f3b7b279ade57d43c1675ad44bb742971e98ea7 Mon Sep 17 00:00:00 2001 From: Rg Date: Thu, 16 Mar 2023 09:35:02 +0800 Subject: [PATCH 54/77] :dog: --- Cargo.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/Cargo.toml b/Cargo.toml index ac90f07..705a5af 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -45,6 +45,7 @@ tokio-context = "0.1.3" dyn-clone = "1.0.10" eieio = "1.0.0" either = "1.8.1" +enum-unitary = "0.5.0" [dev-dependencies] tracing-subscriber = "0.3.16" tracing-log = "0.1.3" From 645b2f6e66bae2157b9037169de738aa473eb55c Mon Sep 17 00:00:00 2001 From: Rg Date: Thu, 16 Mar 2023 22:56:56 +0800 Subject: [PATCH 55/77] :coffee: add ArcLock for valueLog.buf --- src/iterator.rs | 58 +++++++++++++++++++++++++++++++++++++++++++++++- src/value_log.rs | 25 +++++++++++++++++++-- 2 files changed, 80 insertions(+), 3 deletions(-) diff --git a/src/iterator.rs b/src/iterator.rs index ede55b8..c678605 100644 --- a/src/iterator.rs +++ b/src/iterator.rs @@ -1,7 +1,9 @@ -use std::{io::Cursor, sync::atomic::AtomicU64}; +use std::{collections::VecDeque, io::Cursor, sync::atomic::AtomicU64}; +use std::ops::Deref; use awaitgroup::WaitGroup; +use crate::types::{ArcMx, ArcRW}; use crate::{ kv::KV, types::XArc, @@ -88,3 +90,57 @@ impl KVItemInner { self.meta } } + +// Used to set options when 
iterating over Badger key-value stores. +pub(crate) struct IteratorOptions { + // Indicates whether we should prefetch values during iteration and store them. + pre_fetch_values: bool, + // How may KV pairs to prefetch while iterating. Valid only if PrefetchValues is true. + pre_fetch_size: isize, + // Direction of iteration. False is forward, true is backward. + reverse: bool, +} + +pub(crate) const DEF_ITERATOR_OPTIONS: IteratorOptions = IteratorOptions { + pre_fetch_size: 100, + pre_fetch_values: true, + reverse: false, +}; + +// Helps iterating over the KV pairs in a lexicographically sorted order. +struct IteratorExt { + kv: XArc, + opt: IteratorOptions, + item: Option>, + data: ArcRW>>, + waste: ArcRW>>, +} + +impl IteratorExt { + fn new_item(&self) -> Option> { + self.waste.write().pop_front() + } + + // Returns pointer to the current KVItem. + // This item is only valid until it.Next() gets called. + fn item(&self) -> Option> { + self.item.clone() + } + + // Returns false when iteration is done. + fn valid(&self) -> bool { + self.item.is_some() + } + + // Returns false when iteration is done + // or when the current key is not prefixed by the specified prefix. + fn valid_for_prefix(&self, prefix: &[u8]) -> bool { + self.item.is_some() && self.item.as_ref().unwrap().key().starts_with(prefix) + } + + // Close the iterator, It is important to call this when you're done with iteration. + fn close(&self) { + // TODO: We could handle this error. + self.kv.vlog.as_ref().unwrap().deref(); + } +} diff --git a/src/value_log.rs b/src/value_log.rs index bd115ec..74b741c 100644 --- a/src/value_log.rs +++ b/src/value_log.rs @@ -272,7 +272,7 @@ pub(crate) struct Request { impl Default for Request { fn default() -> Self { - Request{ + Request { entries: Default::default(), ptrs: Default::default(), res: Channel::new(1), @@ -528,7 +528,8 @@ impl ValueLogCore { continue; } let mut of = offset; - if id > vp.fid { // It is very import that maybe the lasted memory table are not persistent at disk. + if id > vp.fid { + // It is very import that maybe the lasted memory table are not persistent at disk. of = 0; } let mut log_file = vlogs.vlogs.get(&id).unwrap().write(); @@ -550,6 +551,26 @@ impl ValueLogCore { Ok(()) } + fn decr_iterator_count(&self) -> Result<()> { + // TODO add share lock. 
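The decrement below leans on fetch_sub returning the previous value: only the caller that observes 1 has taken the active-iterator count to zero, so only it runs the deferred dirty-vlog cleanup. The pattern in isolation, std only; the hunk uses Relaxed ordering, while AcqRel below is the conservative choice when the cleanup must see writes made by other decrementers:

use std::sync::atomic::{AtomicI32, Ordering};

fn decr_and_maybe_clean(active: &AtomicI32, clean: impl FnOnce()) {
    // fetch_sub returns the count *before* subtracting; seeing 1 means this
    // caller dropped it to 0 and solely owns the cleanup step.
    if active.fetch_sub(1, Ordering::AcqRel) == 1 {
        clean();
    }
}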
+ let old = self.num_active_iterators.fetch_sub(1, Ordering::Relaxed); + if old != 1 { + // the lasted reference + return Ok(()); + } + let mut lfs = vec![]; + for dirty_vlog in self.dirty_vlogs.read().iter() { + lfs.push(*dirty_vlog); + // TODO + self.vlogs.write().remove(dirty_vlog); + } + self.dirty_vlogs.write().clear(); + for lf in lfs { + // self.delete_log_file() // TODO + } + Ok(()) + } + fn delete_log_file(&mut self, mut log_file: LogFile) -> Result<()> { if let Some(mp) = log_file._mmap.take() { mp.get_mut_mmap().flush()?; From 89a7b33c17d4eb5055ca0ede6ae272db6fabe944 Mon Sep 17 00:00:00 2001 From: Rg Date: Fri, 17 Mar 2023 02:29:51 +0800 Subject: [PATCH 56/77] :dog: --- .idea/.gitignore | 8 ++++++ .idea/badger-rs.iml | 11 ++++++++ .idea/modules.xml | 8 ++++++ .idea/vcs.xml | 6 ++++ lock.txt | 0 src/iterator.rs | 69 +++++++++++++++++++++++++++++++++++---------- src/kv.rs | 2 +- src/lock.txt | 0 src/types.rs | 3 +- src/value_log.rs | 16 +++++------ 10 files changed, 97 insertions(+), 26 deletions(-) create mode 100644 .idea/.gitignore create mode 100644 .idea/badger-rs.iml create mode 100644 .idea/modules.xml create mode 100644 .idea/vcs.xml create mode 100644 lock.txt create mode 100644 src/lock.txt diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/badger-rs.iml b/.idea/badger-rs.iml new file mode 100644 index 0000000..c254557 --- /dev/null +++ b/.idea/badger-rs.iml @@ -0,0 +1,11 @@ + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..1da5144 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..35eb1dd --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/lock.txt b/lock.txt new file mode 100644 index 0000000..e69de29 diff --git a/src/iterator.rs b/src/iterator.rs index c678605..b3eec84 100644 --- a/src/iterator.rs +++ b/src/iterator.rs @@ -1,23 +1,27 @@ -use std::{collections::VecDeque, io::Cursor, sync::atomic::AtomicU64}; -use std::ops::Deref; +use parking_lot::RwLock; +use std::sync::atomic::{AtomicU8, Ordering}; +use std::sync::Arc; +use std::{io::Cursor, sync::atomic::AtomicU64}; -use awaitgroup::WaitGroup; - -use crate::types::{ArcMx, ArcRW}; +use crate::types::{ArcMx, ArcRW, Closer, TArcMx, TArcRW}; +use crate::MergeIterOverIterator; use crate::{ kv::KV, types::XArc, value_log::{MetaBit, ValuePointer}, - Decode, Result, + Decode, Result, Xiterator, }; +use crate::iterator::PreFetchStatus::Prefetched; #[derive(Debug, PartialEq)] pub(crate) enum PreFetchStatus { Empty, - Prefetced, + Prefetched, } -// Retuned during iteration. Both the key() and value() output is only valid until +type KVItem = TArcRW; + +// Returned during iteration. Both the key() and value() output is only valid until // iterator.next() is called. pub(crate) struct KVItemInner { status: PreFetchStatus, @@ -28,26 +32,25 @@ pub(crate) struct KVItemInner { meta: u8, user_meta: u8, cas_counter: AtomicU64, - wg: WaitGroup, + wg: Closer, err: Result<()>, } impl KVItemInner { // Returns the key. Remember to copy if you need to access it outside the iteration loop. 
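The accessor under the comment above flips between a borrowing and an owning shape over the next few patches (a borrowed &[u8] at first, Vec<u8> in this hunk, and finally &[u8] over &self.key). The usual compromise is to keep the borrow and let callers copy explicitly when a key must outlive next(); a small sketch of that call-site discipline:

fn keep_key(collected: &mut Vec<Vec<u8>>, borrowed_key: &[u8]) {
    // an owned copy survives past the iterator's next() call
    collected.push(borrowed_key.to_vec());
}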
- pub(crate) fn key(&self) -> &[u8] { - &self.value + pub(crate) fn key(&self) -> Vec { + self.value } pub(crate) async fn value(&self, consumer: &mut impl FnMut(&[u8]) -> Result<()>) -> Result<()> { self.wg.wait().await; - if self.status == PreFetchStatus::Prefetced { + if self.status == PreFetchStatus::Prefetch { if self.err.is_err() { return self.err.clone(); } return consumer(&self.value); } - Ok(()) } @@ -62,6 +65,18 @@ impl KVItemInner { true } + pub(crate) async fn pre_fetch_value(&self) { + self.kv + .yield_item_value(&self, |value| -> Result<()> { + if value.is_empty() { + self.status = Prefetched; + return Ok(()); + } + self.value = value.clone(); + self.status = Prefetched; + Ok(()) + }); + } // Returns approximate size of the key-value pair. // // This can be called while iterating through a store to quickly estimate the @@ -110,6 +125,7 @@ pub(crate) const DEF_ITERATOR_OPTIONS: IteratorOptions = IteratorOptions { // Helps iterating over the KV pairs in a lexicographically sorted order. struct IteratorExt { kv: XArc, + itr: MergeIterOverIterator, opt: IteratorOptions, item: Option>, data: ArcRW>>, @@ -139,8 +155,31 @@ impl IteratorExt { } // Close the iterator, It is important to call this when you're done with iteration. - fn close(&self) { + fn close(&self) -> Result<()> { // TODO: We could handle this error. - self.kv.vlog.as_ref().unwrap().deref(); + self.kv.vlog.as_ref().unwrap().decr_iterator_count()?; + Ok(()) + } + + async fn fill(&self, item: KVItem) { + let mut vs = self.itr.peek().as_ref().unwrap().value(); + { + let mut item = item.write().await; + item.meta = vs.meta; + item.user_meta = vs.user_meta; + item.cas_counter.store(vs.cas_counter, Ordering::Relaxed); + item.key.extend(self.itr.peek().as_ref().unwrap().key()); + item.vptr.extend(&vs.value); + item.value.clear(); + } + + if self.opt.pre_fetch_values { + item.wg.add_running(1); + tokio::spawn(async move { + // FIXME we are not handling errors here. + item.read().pre_fetch_value()?; + item.read().await.wg.done(); + }); + } } } diff --git a/src/kv.rs b/src/kv.rs index bdb14f6..34ec4e3 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -638,7 +638,7 @@ impl ArcKV { } } - async fn yield_item_value( + pub(crate) async fn yield_item_value( &self, item: &KVItemInner, consume: impl FnMut(&[u8]) -> Result<()>, diff --git a/src/lock.txt b/src/lock.txt new file mode 100644 index 0000000..e69de29 diff --git a/src/types.rs b/src/types.rs index ef2d5fa..55f3513 100644 --- a/src/types.rs +++ b/src/types.rs @@ -180,8 +180,7 @@ impl Closer { if self.wait.load(Ordering::Relaxed) <= 0 { break; } - // hint::spin_loop(); - sleep(Duration::from_millis(10)).await; + sleep(Duration::from_millis(1)).await; } } diff --git a/src/value_log.rs b/src/value_log.rs index 74b741c..8feafed 100644 --- a/src/value_log.rs +++ b/src/value_log.rs @@ -551,7 +551,7 @@ impl ValueLogCore { Ok(()) } - fn decr_iterator_count(&self) -> Result<()> { + pub(crate) fn decr_iterator_count(&self) -> Result<()> { // TODO add share lock. 
let old = self.num_active_iterators.fetch_sub(1, Ordering::Relaxed); if old != 1 { @@ -560,25 +560,25 @@ impl ValueLogCore { } let mut lfs = vec![]; for dirty_vlog in self.dirty_vlogs.read().iter() { - lfs.push(*dirty_vlog); // TODO - self.vlogs.write().remove(dirty_vlog); + let vlog = self.vlogs.write().remove(dirty_vlog).unwrap(); + lfs.push(vlog); } self.dirty_vlogs.write().clear(); for lf in lfs { - // self.delete_log_file() // TODO + self.delete_log_file_by_fid(lf)?; } Ok(()) } - fn delete_log_file(&mut self, mut log_file: LogFile) -> Result<()> { - if let Some(mp) = log_file._mmap.take() { + fn delete_log_file_by_fid(&self, log_file: Arc>) -> Result<()> { + if let Some(mp) = log_file.write()._mmap.take() { mp.get_mut_mmap().flush()?; } - if let Some(fp) = log_file.fd.take() { + if let Some(fp) = log_file.read().fd.take() { fp.sync_all()?; } - remove_file(self.fpath(log_file.fid))?; + remove_file(self.fpath(log_file.read().fid))?; Ok(()) } From ab13455a15f71f822dad56bfb7e39cc471e12534 Mon Sep 17 00:00:00 2001 From: Rg Date: Sat, 18 Mar 2023 18:08:20 +0800 Subject: [PATCH 57/77] :dog: --- src/iterator.rs | 83 +++++++++++++++++++++++++++++++++--------------- src/kv.rs | 35 ++++++++++++++++++-- src/value_log.rs | 41 +++++++++++++++++++++++- 3 files changed, 130 insertions(+), 29 deletions(-) diff --git a/src/iterator.rs b/src/iterator.rs index b3eec84..034d32d 100644 --- a/src/iterator.rs +++ b/src/iterator.rs @@ -1,8 +1,4 @@ -use parking_lot::RwLock; -use std::sync::atomic::{AtomicU8, Ordering}; -use std::sync::Arc; -use std::{io::Cursor, sync::atomic::AtomicU64}; - +use crate::iterator::PreFetchStatus::Prefetched; use crate::types::{ArcMx, ArcRW, Closer, TArcMx, TArcRW}; use crate::MergeIterOverIterator; use crate::{ @@ -11,7 +7,12 @@ use crate::{ value_log::{MetaBit, ValuePointer}, Decode, Result, Xiterator, }; -use crate::iterator::PreFetchStatus::Prefetched; +use parking_lot::RwLock; +use std::future::Future; +use std::pin::Pin; +use std::sync::atomic::{AtomicU8, Ordering}; +use std::sync::Arc; +use std::{io::Cursor, sync::atomic::AtomicU64}; #[derive(Debug, PartialEq)] pub(crate) enum PreFetchStatus { @@ -24,10 +25,10 @@ type KVItem = TArcRW; // Returned during iteration. Both the key() and value() output is only valid until // iterator.next() is called. pub(crate) struct KVItemInner { - status: PreFetchStatus, + status: TArcRW, kv: XArc, key: Vec, - value: Vec, + value: TArcRW>, vptr: Vec, meta: u8, user_meta: u8, @@ -38,20 +39,28 @@ pub(crate) struct KVItemInner { impl KVItemInner { // Returns the key. Remember to copy if you need to access it outside the iteration loop. - pub(crate) fn key(&self) -> Vec { - self.value + pub(crate) fn key(&self) -> &[u8] { + &self.key } - pub(crate) async fn value(&self, consumer: &mut impl FnMut(&[u8]) -> Result<()>) -> Result<()> { + // Value retrieves the value of the item from the value log. It calls the + // consumer function with a slice argument representing the value. In case + // of error, the consumer function is not called. + // + // Note that the call to the consumer func happens synchronously. 
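The value() signature in the hunk that follows threads an async callback through the API by having the closure return a pinned, boxed future. A self-contained sketch of that calling convention; String stands in for the crate's error type:

use std::future::Future;
use std::pin::Pin;

async fn drive(
    mut consumer: impl FnMut(Vec<u8>) -> Pin<Box<dyn Future<Output = Result<(), String>> + Send>>,
) -> Result<(), String> {
    consumer(b"value bytes".to_vec()).await
}

// Call sites wrap an async block in Box::pin, which coerces to the trait object:
//   drive(|v| Box::pin(async move { assert_eq!(v.len(), 11); Ok(()) })).await?;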
+ pub(crate) async fn value( + &self, + mut consumer: impl for<'a> FnMut(&'a [u8]) -> Pin> + 'a>>, + ) -> Result<()> { + // Wait result self.wg.wait().await; - if self.status == PreFetchStatus::Prefetch { + if *self.status.read().await == PreFetchStatus::Prefetched { if self.err.is_err() { return self.err.clone(); } - - return consumer(&self.value); + return consumer(&self.value.read().await).await; } - Ok(()) + return self.kv.yield_item_value(&self, consumer).await; } pub(crate) fn has_value(&self) -> bool { @@ -65,17 +74,24 @@ impl KVItemInner { true } + // async fetch value from value_log. pub(crate) async fn pre_fetch_value(&self) { - self.kv - .yield_item_value(&self, |value| -> Result<()> { + let kv = self.kv.clone(); + kv.yield_item_value(&self, |value| { + let ref_value = self.value.clone(); + let ref_status = self.status.clone(); + Box::pin(async move { if value.is_empty() { - self.status = Prefetched; + *ref_status.write().await = PreFetchStatus::Prefetched; return Ok(()); } - self.value = value.clone(); - self.status = Prefetched; + ref_value.write().await.extend(value); + *ref_status.write().await = PreFetchStatus::Prefetched; Ok(()) - }); + }) + }) + .await + .unwrap(); } // Returns approximate size of the key-value pair. // @@ -102,8 +118,16 @@ impl KVItemInner { // Returns the user_meta set by the user. Typically, this byte, optionally set by the user // is used to interpret the value. pub(crate) fn user_meta(&self) -> u8 { + self.user_meta + } + + pub(crate) fn meta(&self) -> u8 { self.meta } + + pub(crate) fn vptr(&self) -> &[u8] { + &self.vptr + } } // Used to set options when iterating over Badger key-value stores. @@ -162,7 +186,8 @@ impl IteratorExt { } async fn fill(&self, item: KVItem) { - let mut vs = self.itr.peek().as_ref().unwrap().value(); + let vs = self.itr.peek().unwrap(); + let vs = vs.value(); { let mut item = item.write().await; item.meta = vs.meta; @@ -170,14 +195,22 @@ impl IteratorExt { item.cas_counter.store(vs.cas_counter, Ordering::Relaxed); item.key.extend(self.itr.peek().as_ref().unwrap().key()); item.vptr.extend(&vs.value); - item.value.clear(); + item.value.write().await.clear(); } + // need fetch value, use new coroutine to load value. if self.opt.pre_fetch_values { - item.wg.add_running(1); + item.read().await.wg.add_running(1); tokio::spawn(async move { // FIXME we are not handling errors here. 
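On the FIXME above: one low-ceremony way to stop dropping prefetch errors is to park them in a shared slot next to the wait-group and have readers check the slot once wg.wait() returns. A sketch under those assumptions (tokio runtime assumed at the call site, String as the error type):

use std::future::Future;
use std::sync::{Arc, Mutex};

fn spawn_prefetch<F>(fetch: F, err_slot: Arc<Mutex<Option<String>>>)
where
    F: Future<Output = Result<(), String>> + Send + 'static,
{
    tokio::spawn(async move {
        if let Err(e) = fetch.await {
            // the reader inspects this slot after the wait-group releases it
            *err_slot.lock().unwrap() = Some(e);
        }
    });
}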
- item.read().pre_fetch_value()?; + { + let item = item.read().await; + item.pre_fetch_value().await; + } + // { + // let rd = item.read().await; + // rd.pre_fetch_value().await; + // } item.read().await.wg.done(); }); } diff --git a/src/kv.rs b/src/kv.rs index 34ec4e3..296b88e 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -21,9 +21,8 @@ use drop_cell::defer; use fs2::FileExt; use log::{info, Log}; use parking_lot::Mutex; -use std::borrow::BorrowMut; use std::cell::RefCell; -use std::fs::{read_dir, File}; +use std::fs::File; use std::fs::{try_exists, OpenOptions}; use std::future::Future; use std::io::{Cursor, Write}; @@ -88,6 +87,20 @@ pub struct KV { unsafe impl Send for KV {} unsafe impl Sync for KV {} +pub struct BoxKV { + kv: *const KV, +} + +unsafe impl Send for BoxKV {} + +unsafe impl Sync for BoxKV{} + +impl BoxKV { + fn new(kv: *const KV) -> BoxKV { + BoxKV { kv } + } +} + impl KV { async fn open(mut opt: Options) -> Result> { opt.max_batch_size = (15 * opt.max_table_size) / 100; @@ -638,11 +651,27 @@ impl ArcKV { } } + // asyn yield item value from ValueLog pub(crate) async fn yield_item_value( &self, item: &KVItemInner, - consume: impl FnMut(&[u8]) -> Result<()>, + mut consumer: impl for<'a> FnMut(&'a [u8]) -> Pin> + 'a>>, ) -> Result<()> { + // no value + if !item.has_value() { + return consumer(&[0u8]).await; + } + + // TODO What is this + if (item.meta() & MetaBit::BIT_VALUE_POINTER.bits()) == 0 { + return consumer(item.vptr()).await; + } + + let mut vptr = ValuePointer::default(); + vptr.dec(&mut Cursor::new(item.vptr()))?; + let vlog = self.must_vlog(); + vlog.async_read(&vptr, consumer).await?; + Ok(()) } } diff --git a/src/value_log.rs b/src/value_log.rs index 8feafed..2e56c37 100644 --- a/src/value_log.rs +++ b/src/value_log.rs @@ -510,6 +510,26 @@ impl ValueLogCore { }) } + pub async fn async_read( + &self, + vp: &ValuePointer, + consumer: impl for<'a> FnMut(&'a [u8]) -> Pin> + 'a>>, + ) -> Result<()> { + // Check for valid offset if we are reading to writable log. + if vp.fid == self.max_fid.load(Ordering::Acquire) + && vp.offset >= self.writable_log_offset.load(Ordering::Acquire) + { + return Err(format!( + "Invalid value pointer offset: {} greater than current offset: {}", + vp.offset, + self.writable_log_offset.load(Ordering::Acquire) + ) + .into()); + } + self.async_read_bytes(vp, consumer).await?; + Ok(()) + } + /// Replays the value log. The kv provide is only valid for the lifetime of function call. 
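replay() below restarts from the persisted head pointer: files with fid below the head are skipped outright, the head's own file resumes at vp.offset + vp.len, and any newer file is re-read from byte 0 because its contents may never have reached a persisted memtable. The selection rule in isolation:

fn replay_start(file_id: u32, head_fid: u32, head_end: u32) -> Option<u32> {
    if file_id < head_fid {
        None // already reflected in the persisted LSM state
    } else if file_id == head_fid {
        Some(head_end) // resume just past the last persisted value pointer
    } else {
        Some(0) // newer file: every entry must be replayed
    }
}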
pub async fn replay( &self, @@ -571,11 +591,12 @@ impl ValueLogCore { Ok(()) } + // Delete log file after no refernece of LogFile fn delete_log_file_by_fid(&self, log_file: Arc>) -> Result<()> { if let Some(mp) = log_file.write()._mmap.take() { mp.get_mut_mmap().flush()?; } - if let Some(fp) = log_file.read().fd.take() { + if let Some(fp) = log_file.write().fd.take() { fp.sync_all()?; } remove_file(self.fpath(log_file.read().fid))?; @@ -608,6 +629,24 @@ impl ValueLogCore { consumer(buffer) } + async fn async_read_bytes( + &self, + vp: &ValuePointer, + mut consumer: impl for<'a> FnMut(&'a [u8]) -> Pin> + 'a>>, + ) -> Result<()> { + let log_file = self.pick_log_by_vlog_id(&vp.fid); + let lf = log_file.read(); + let buffer = lf.read(vp)?; + let mut h = Header::default(); + h.dec(&mut Cursor::new(buffer))?; + if (h.meta & MetaBit::BIT_DELETE.bits) != 0 { + // Tombstone key + return consumer(&[0u8]).await; + } + let n = Header::encoded_size() + h.k_len as usize; + consumer(&buffer[n..n + (h.v_len as usize)]).await + } + // write is thread-unsafe by design and should not be called concurrently. pub(crate) fn write(&self, reqs: Arc>) -> Result<()> { let cur_vlog_file = self.pick_log_by_vlog_id(&self.max_fid.load(Ordering::Acquire)); From fc44d4f7dae801687d1e6bfcb3cfbbc47418f43b Mon Sep 17 00:00:00 2001 From: Rg Date: Sun, 19 Mar 2023 14:44:50 +0800 Subject: [PATCH 58/77] :dog: --- Cargo.toml | 1 + src/iterator.rs | 166 ++++++++++++++++++++++++++++++++++++++++------- src/kv.rs | 45 +++++++++---- src/skl/skip.rs | 36 ++++++++++ src/value_log.rs | 28 ++++---- 5 files changed, 225 insertions(+), 51 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 705a5af..3c21d40 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -46,6 +46,7 @@ dyn-clone = "1.0.10" eieio = "1.0.0" either = "1.8.1" enum-unitary = "0.5.0" +atom_box = "0.1.2" [dev-dependencies] tracing-subscriber = "0.3.16" tracing-log = "0.1.3" diff --git a/src/iterator.rs b/src/iterator.rs index 034d32d..6ade34a 100644 --- a/src/iterator.rs +++ b/src/iterator.rs @@ -1,4 +1,5 @@ use crate::iterator::PreFetchStatus::Prefetched; +use crate::kv::_BADGER_PREFIX; use crate::types::{ArcMx, ArcRW, Closer, TArcMx, TArcRW}; use crate::MergeIterOverIterator; use crate::{ @@ -7,6 +8,7 @@ use crate::{ value_log::{MetaBit, ValuePointer}, Decode, Result, Xiterator, }; +use log::Metadata; use parking_lot::RwLock; use std::future::Future; use std::pin::Pin; @@ -24,6 +26,7 @@ type KVItem = TArcRW; // Returned during iteration. Both the key() and value() output is only valid until // iterator.next() is called. +#[derive(Clone)] pub(crate) struct KVItemInner { status: TArcRW, kv: XArc, @@ -32,7 +35,7 @@ pub(crate) struct KVItemInner { vptr: Vec, meta: u8, user_meta: u8, - cas_counter: AtomicU64, + cas_counter: Arc, wg: Closer, err: Result<()>, } @@ -50,7 +53,7 @@ impl KVItemInner { // Note that the call to the consumer func happens synchronously. 
pub(crate) async fn value(
         &self,
-        mut consumer: impl for<'a> FnMut(&'a [u8]) -> Pin<Box<dyn Future<Output = Result<()>> + 'a>>,
+        mut consumer: impl FnMut(Vec<u8>) -> Pin<Box<dyn Future<Output = Result<()>> + Send>>,
     ) -> Result<()> {
         // Wait for the result
         self.wg.wait().await;
@@ -58,9 +61,9 @@ impl KVItemInner {
             if self.err.is_err() {
                 return self.err.clone();
             }
-            return consumer(&self.value.read().await).await;
+            return consumer(self.value.read().await.clone()).await;
         }
-        return self.kv.yield_item_value(&self, consumer).await;
+        return self.kv.yield_item_value(self.clone(), consumer).await;
     }
 
     pub(crate) fn has_value(&self) -> bool {
@@ -75,11 +78,11 @@ impl KVItemInner {
     }
 
     // Asynchronously fetch the value from the value log.
-    pub(crate) async fn pre_fetch_value(&self) {
+    pub(crate) async fn pre_fetch_value(&self) -> Result<()> {
         let kv = self.kv.clone();
-        kv.yield_item_value(&self, |value| {
-            let ref_value = self.value.clone();
+        kv.yield_item_value(self.clone(), |value| {
             let ref_status = self.status.clone();
+            let ref_value = self.value.clone();
             Box::pin(async move {
                 if value.is_empty() {
                     *ref_status.write().await = PreFetchStatus::Prefetched;
@@ -91,8 +94,8 @@ impl KVItemInner {
             })
         })
         .await
-        .unwrap();
     }
+
     // Returns approximate size of the key-value pair.
     //
     // This can be called while iterating through a store to quickly estimate the
@@ -147,35 +150,123 @@ pub(crate) const DEF_ITERATOR_OPTIONS: IteratorOptions = IteratorOptions {
 };
 
 // Helps iterating over the KV pairs in a lexicographically sorted order.
-struct IteratorExt {
+pub(crate) struct IteratorExt {
     kv: XArc<KV>,
     itr: MergeIterOverIterator,
     opt: IteratorOptions,
-    item: Option>,
-    data: ArcRW>>,
-    waste: ArcRW>>,
+    item: ArcRW<Option<KVItem>>,
+    data: ArcRW<VecDeque<KVItem>>,
+    waste: ArcRW<VecDeque<KVItem>>,
+}
+
+impl IteratorExt {
+    // Seek to the provided key if present. If absent, it would seek to the next smallest key
+    // greater than the provided key if iterating in the forward direction. Behavior would be
+    // reversed if iterating backwards.
+    pub(crate) async fn seek(&self, key: &[u8]) -> Option<KVItem> {
+        while let Some(el) = self.data.write().pop_front() {
+            el.read().await.wg.wait().await;
+        }
+        while let Some(el) = self.itr.seek(key) {
+            if el.key().starts_with(_BADGER_PREFIX) {
+                continue;
+            }
+        }
+        self.pre_fetch().await;
+        self.item.read().clone()
+    }
+
+    // Rewind the iterator cursor all the way to the zero-th position, which would be the
+    // smallest key if iterating forward, and the largest if iterating backward. It does not
+    // keep track of whether the cursor started with a `seek`.
+    pub(crate) async fn rewind(&self) -> Option<KVItem> {
+        while let Some(el) = self.data.write().pop_front() {
+            // Just cleaner to wait before pushing. No ref counting needed.
+            el.read().await.wg.wait().await;
+        }
+
+        let mut item = self.itr.rewind();
+        while item.is_some() && item.as_ref().unwrap().key().starts_with(_BADGER_PREFIX) {
+            item = self.itr.next();
+        }
+        self.pre_fetch().await;
+        self.item.read().clone()
+    }
+
+    // Advance the iterator by one. Always check it.valid() after a next()
+    // to ensure you have access to a valid it.item().
+    pub(crate) async fn next(&self) -> Option<KVItem> {
+        // Ensure the current item is settled
+        if let Some(el) = self.item.write().take() {
+            el.read().await.wg.wait().await; // Just cleaner to wait before pushing to avoid doing ref counting.
+ } + // Set next item to current + if let Some(el) = self.data.write().pop_front() { + self.item.write().replace(el); + } + // Advance internal iterator until entry is not deleted + while let Some(el) = self.itr.next() { + if el.key().starts_with(_BADGER_PREFIX) { + continue; + } + if el.value().meta & MetaBit::BIT_DELETE.bits() == 0 { + // Not deleted + break; + } + } + let item = self.itr.peek(); + if item.is_none() { + return None; + } + let mut xitem = self.new_item(); + self.fill(xitem.clone()).await; + self.data.write().push_back(xitem.clone()); + Some(xitem) + } } impl IteratorExt { - fn new_item(&self) -> Option> { - self.waste.write().pop_front() + fn new_item(&self) -> KVItem { + let inner_item = KVItemInner { + status: Arc::new(tokio::sync::RwLock::new(PreFetchStatus::Empty)), + kv: self.kv.clone(), + key: vec![], + value: Arc::new(Default::default()), + vptr: vec![], + meta: 0, + user_meta: 0, + cas_counter: Arc::new(Default::default()), + wg: Closer::new(), + err: Ok(()), + }; + return KVItem::new(tokio::sync::RwLock::new(inner_item)); } // Returns pointer to the current KVItem. // This item is only valid until it.Next() gets called. - fn item(&self) -> Option> { - self.item.clone() + fn item(&self) -> Option { + todo!() + //self.item.clone() } // Returns false when iteration is done. fn valid(&self) -> bool { - self.item.is_some() + self.item.read().is_some() } // Returns false when iteration is done // or when the current key is not prefixed by the specified prefix. - fn valid_for_prefix(&self, prefix: &[u8]) -> bool { - self.item.is_some() && self.item.as_ref().unwrap().key().starts_with(prefix) + async fn valid_for_prefix(&self, prefix: &[u8]) -> bool { + self.item.read().is_some() + && self + .item + .read() + .as_ref() + .unwrap() + .read() + .await + .key() + .starts_with(prefix) } // Close the iterator, It is important to call this when you're done with iteration. @@ -205,14 +296,41 @@ impl IteratorExt { // FIXME we are not handling errors here. 
{ let item = item.read().await; - item.pre_fetch_value().await; + if let Err(err) = item.pre_fetch_value().await { + log::error!("Failed to fetch value, {}", err); + } } - // { - // let rd = item.read().await; - // rd.pre_fetch_value().await; - // } item.read().await.wg.done(); }); } } + + async fn pre_fetch(&self) { + let mut pre_fetch_size = 2; + if self.opt.pre_fetch_values && self.opt.pre_fetch_size > 1 { + pre_fetch_size = self.opt.pre_fetch_size; + } + + let itr = &self.itr; + let mut count = 0; + while let Some(item) = itr.next() { + if item.key().starts_with(crate::kv::_BADGER_PREFIX) { + continue; + } + if item.value().meta & MetaBit::BIT_DELETE.bits() > 0 { + continue; + } + count += 1; + let xitem = self.new_item(); + self.fill(xitem.clone()).await; + if self.item.read().is_none() { + self.item.write().replace(xitem); + } else { + self.data.write().push_back(xitem); + } + if count == pre_fetch_size { + break; + } + } + } } diff --git a/src/kv.rs b/src/kv.rs index 296b88e..4818bc8 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -1,9 +1,9 @@ -use crate::iterator::KVItemInner; +use crate::iterator::{IteratorExt, KVItemInner}; use crate::levels::{LevelsController, XLevelsController}; use crate::manifest::{open_or_create_manifest_file, Manifest, ManifestFile}; use crate::options::Options; use crate::table::builder::Builder; -use crate::table::iterator::IteratorImpl; +use crate::table::iterator::{IteratorImpl, IteratorItem}; use crate::table::table::{new_file_name, Table, TableCore}; use crate::types::{ArcMx, Channel, Closer, TArcMx, XArc, XWeak}; use crate::value_log::{ @@ -13,7 +13,7 @@ use crate::y::{ async_sync_directory, create_synced_file, sync_directory, Encode, Result, ValueStruct, }; use crate::Error::{NotFound, Unexpected}; -use crate::{Decode, Error, Node, SkipList, SkipListManager}; +use crate::{Decode, Error, Node, SkipList, SkipListManager, Xiterator}; use atomic::Atomic; use bytes::BufMut; use crossbeam_epoch::Shared; @@ -38,9 +38,9 @@ use tokio::fs::create_dir_all; use tokio::io::{AsyncReadExt, AsyncWriteExt}; use tokio::sync::{RwLock, RwLockWriteGuard}; -const _BADGER_PREFIX: &[u8; 8] = b"!badger!"; +pub const _BADGER_PREFIX: &[u8; 8] = b"!badger!"; // Prefix for internal keys used by badger. -const _HEAD: &[u8; 11] = b"!bager!head"; // For Storing value offset for replay. +pub const _HEAD: &[u8; 11] = b"!bager!head"; // For Storing value offset for replay. struct Closers { update_size: Closer, @@ -88,15 +88,15 @@ unsafe impl Send for KV {} unsafe impl Sync for KV {} pub struct BoxKV { - kv: *const KV, + pub kv: *const KV, } unsafe impl Send for BoxKV {} -unsafe impl Sync for BoxKV{} +unsafe impl Sync for BoxKV {} impl BoxKV { - fn new(kv: *const KV) -> BoxKV { + pub(crate) fn new(kv: *const KV) -> BoxKV { BoxKV { kv } } } @@ -435,7 +435,7 @@ impl KV { let mut b = vec![Request::default()]; let mut count = 0; let mut sz = 0u64; - for mut entry in entries { + for entry in entries { if entry.key.len() > MAX_KEY_SIZE { bad.push(entry); continue; @@ -462,7 +462,7 @@ impl KV { self.write_ch.send(arc_req).await.unwrap(); } if !bad.is_empty() { - let mut req = Request::default(); + let req = Request::default(); *req.entries.write() = Vec::from_iter(bad.into_iter().map(|bad| RefCell::new(bad)).into_iter()); let arc_req = ArcRequest::from(req); @@ -555,6 +555,23 @@ impl ArcKV { Ok(()) } + + pub async fn new_iterator(&self) -> IteratorExt { + let p = crossbeam_epoch::pin(); + let tables = self.get_mem_tables(&p); + defer! 
{ + tables.iter().for_each(|table| unsafe {table.as_ref().unwrap().decr_ref()}); + } + self.must_vlog().incr_iterator_count(); + + // Create iterators across all the tables involved first. + let mut itrs: Vec>> = vec![]; + // for tb in tables { + // let iter = Box::new(IteratorImpl::new(tb, false)); + // itr.push(iter); + // } + todo!() + } } impl ArcKV { @@ -654,17 +671,17 @@ impl ArcKV { // asyn yield item value from ValueLog pub(crate) async fn yield_item_value( &self, - item: &KVItemInner, - mut consumer: impl for<'a> FnMut(&'a [u8]) -> Pin> + 'a>>, + item: KVItemInner, + mut consumer: impl FnMut(Vec) -> Pin> + Send>>, ) -> Result<()> { // no value if !item.has_value() { - return consumer(&[0u8]).await; + return consumer(vec![]).await; } // TODO What is this if (item.meta() & MetaBit::BIT_VALUE_POINTER.bits()) == 0 { - return consumer(item.vptr()).await; + return consumer(item.vptr().to_vec()).await; } let mut vptr = ValuePointer::default(); diff --git a/src/skl/skip.rs b/src/skl/skip.rs index 1bf966a..9f55434 100644 --- a/src/skl/skip.rs +++ b/src/skl/skip.rs @@ -1,4 +1,5 @@ use crate::skl::{Cursor, HEIGHT_INCREASE, MAX_HEIGHT}; +use crate::Xiterator; use log::info; use rand::random; use serde_json::Value; @@ -418,6 +419,41 @@ impl Display for SkipList { } } +// A unidirectional memetable iterator. It is a thin wrapper around +// `Iterator`. We like to keep `Iterator` as before, because it is more powerful and +// we might support bidirectional iterations in the future. +pub struct UniIterator {} + +// An iterator over SkipList object. for new objects, you just +// need to initialize Iterator.list. +pub struct SkipIterator { + st: SkipList, + node: NonNull, +} + +impl<'a> Xiterator for SkipIterator<'a> { + type Output = &'a Node; + fn next(&self) -> Option { + todo!() + } + + fn rewind(&self) -> Option { + todo!() + } + + fn seek(&self, key: &[u8]) -> Option { + todo!() + } + + fn peek(&self) -> Option { + todo!() + } + + fn close(&self) { + self.st.decr_ref(); + } +} + mod tests { use crate::skl::node::Node; use crate::skl::skip::SkipList; diff --git a/src/value_log.rs b/src/value_log.rs index 2e56c37..fb61cee 100644 --- a/src/value_log.rs +++ b/src/value_log.rs @@ -31,7 +31,7 @@ use std::{fmt, fs, io, ptr, thread}; use tabled::object::Entity::Cell; use tokio::macros::support::thread_rng_n; -use crate::kv::{ArcKV, WeakKV, KV}; +use crate::kv::{ArcKV, BoxKV, WeakKV, KV}; use crate::log_file::LogFile; use crate::options::Options; use crate::skl::BlockBytes; @@ -342,7 +342,7 @@ pub struct ValueLogCore { writable_log_offset: AtomicU32, buf: ArcRW>>, opt: Options, - kv: *const KV, + kv: BoxKV, // Only allow one GC at a time. 
garbage_ch: Channel<()>, } @@ -359,7 +359,7 @@ impl Default for ValueLogCore { writable_log_offset: Default::default(), buf: Arc::new(RwLock::new(BufWriter::new(vec![0u8; 0]))), opt: Default::default(), - kv: ptr::null_mut(), + kv: BoxKV::new(ptr::null_mut()), garbage_ch: Channel::new(1), } } @@ -402,7 +402,7 @@ impl ValueLogCore { pub(crate) fn open(&mut self, kv: *const KV, opt: Options) -> Result<()> { self.dir_path = opt.value_dir.clone(); self.opt = opt; - self.kv = kv; + self.kv = BoxKV::new(kv); self.open_create_files()?; // todo add garbage and metrics self.garbage_ch = Channel::new(1); @@ -410,7 +410,7 @@ impl ValueLogCore { } fn get_kv(&self) -> &KV { - unsafe { &*self.kv } + unsafe { &*self.kv.kv } } pub fn close(&self) -> Result<()> { @@ -513,7 +513,7 @@ impl ValueLogCore { pub async fn async_read( &self, vp: &ValuePointer, - consumer: impl for<'a> FnMut(&'a [u8]) -> Pin> + 'a>>, + consumer: impl FnMut(Vec) -> Pin> + Send>>, ) -> Result<()> { // Check for valid offset if we are reading to writable log. if vp.fid == self.max_fid.load(Ordering::Acquire) @@ -571,6 +571,10 @@ impl ValueLogCore { Ok(()) } + pub(crate) fn incr_iterator_count(&self) { + self.num_active_iterators.fetch_add(1, Ordering::Relaxed); + } + pub(crate) fn decr_iterator_count(&self) -> Result<()> { // TODO add share lock. let old = self.num_active_iterators.fetch_sub(1, Ordering::Relaxed); @@ -632,19 +636,17 @@ impl ValueLogCore { async fn async_read_bytes( &self, vp: &ValuePointer, - mut consumer: impl for<'a> FnMut(&'a [u8]) -> Pin> + 'a>>, + mut consumer: impl FnMut(Vec) -> Pin> + Send>>, ) -> Result<()> { - let log_file = self.pick_log_by_vlog_id(&vp.fid); - let lf = log_file.read(); - let buffer = lf.read(vp)?; + let mut buffer = self.pick_log_by_vlog_id(&vp.fid).read().read(&vp)?.to_vec(); + let value_buffer = buffer.split_off(Header::encoded_size()); let mut h = Header::default(); h.dec(&mut Cursor::new(buffer))?; if (h.meta & MetaBit::BIT_DELETE.bits) != 0 { // Tombstone key - return consumer(&[0u8]).await; + return consumer(vec![]).await; } - let n = Header::encoded_size() + h.k_len as usize; - consumer(&buffer[n..n + (h.v_len as usize)]).await + consumer(value_buffer).await } // write is thread-unsafe by design and should not be called concurrently. From 3f9ba1e65ed5b6644469fdd68b41d5e8b515b91b Mon Sep 17 00:00:00 2001 From: Rg Date: Mon, 20 Mar 2023 02:15:54 +0800 Subject: [PATCH 59/77] :dog: --- src/kv.rs | 2 +- src/skl/skip.rs | 115 ++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 97 insertions(+), 20 deletions(-) diff --git a/src/kv.rs b/src/kv.rs index 4818bc8..f85b620 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -556,7 +556,7 @@ impl ArcKV { Ok(()) } - pub async fn new_iterator(&self) -> IteratorExt { + pub(crate) async fn new_iterator(&self) -> IteratorExt { let p = crossbeam_epoch::pin(); let tables = self.get_mem_tables(&p); defer! { diff --git a/src/skl/skip.rs b/src/skl/skip.rs index 9f55434..c2539fa 100644 --- a/src/skl/skip.rs +++ b/src/skl/skip.rs @@ -1,5 +1,7 @@ use crate::skl::{Cursor, HEIGHT_INCREASE, MAX_HEIGHT}; +use crate::table::iterator::IteratorItem; use crate::Xiterator; +use atom_box::AtomBox; use log::info; use rand::random; use serde_json::Value; @@ -426,34 +428,91 @@ pub struct UniIterator {} // An iterator over SkipList object. for new objects, you just // need to initialize Iterator.list. 
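// A note on BoxKV above: a bare `*const KV` is neither Send nor Sync, so
// ValueLogCore could not be shared across tokio tasks while holding a raw
// back-pointer to its owning KV. The wrapper asserts those bounds manually;
// the unchecked contract is that the KV outlives its ValueLogCore, which the
// open()/close() pairing maintains. The shape, reduced to its essentials
// (BackPtr is a hypothetical stand-in, not part of this patch):
struct BackPtr(*const KV);
unsafe impl Send for BackPtr {}
unsafe impl Sync for BackPtr {}
impl BackPtr {
    // SAFETY: the caller must guarantee the KV is still alive.
    unsafe fn get(&self) -> &KV {
        &*self.0
    }
}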
+// Try GAT lifetime pub struct SkipIterator { st: SkipList, - node: NonNull, + node: AtomicPtr, } -impl<'a> Xiterator for SkipIterator<'a> { - type Output = &'a Node; - fn next(&self) -> Option { - todo!() - } - - fn rewind(&self) -> Option { - todo!() - } - - fn seek(&self, key: &[u8]) -> Option { - todo!() - } - - fn peek(&self) -> Option { - todo!() +impl SkipIterator { + fn peek(&self) {} + fn close(&self) { + self.st.decr_ref() } - fn close(&self) { - self.st.decr_ref(); + // Advance to the next position + fn next(&self) -> Option { + let node = self.node.load(Ordering::Relaxed); + if node.is_null() { + return None; + } + let next = self.st.get_next(unsafe { node.as_ref().unwrap() }, 0); + let next = next.unwrap() as *const Node as *mut Node; + self.node.store(next, Ordering::Relaxed); + let key = next.key().to_vec(); + let (value_offset, val_size) = next.get_value_offset(); + let value = self.st.arena_ref().get_val(value_offset, val_size); + Some(IteratorItem) } } +// impl Xiterator for SkipIterator { +// type Output = IteratorItem; +// +// fn next(&self) -> Option { +// todo!() +// } +// +// fn rewind(&self) -> Option { +// todo!() +// } +// +// fn seek(&self, key: &[u8]) -> Option { +// if self.node.load(Ordering::Relaxed).is_null() { +// return None; +// } +// +// let node = self.node.load(Ordering::Relaxed); +// if node.is_null() { +// return None; +// } +// let key = node.key(self.st.arena_ref()).to_vec(); +// let value = node.value.load(Ordering::Relaxed); +// Some(IteratorItem{ key: node.key(self.st.arena_ref()).to_vec(), value: Default::default() }) +// } +// +// fn peek(&self) -> Option { +// todo!() +// } +// +// fn close(&self) { +// todo!() +// } +// } + +// impl<'a> Xiterator for SkipIterator<'a> { +// type Output = &'a Node; +// fn next(&self) -> Option { +// todo!() +// } +// +// fn rewind(&self) -> Option { +// todo!() +// } +// +// fn seek(&self, key: &[u8]) -> Option { +// todo!() +// } +// +// fn peek(&self) -> Option { +// todo!() +// } +// +// fn close(&self) { +// self.st.decr_ref(); +// } +// } + mod tests { use crate::skl::node::Node; use crate::skl::skip::SkipList; @@ -892,4 +951,22 @@ mod tests2 { fn atomic_swap_skip_list() { let mut st = SkipList::new(ARENA_SIZE); } + + #[test] + fn gat() { + // #![allow(unused)] + + // trait IterableTypes { + // type Item<'me>; + // type Iterator<'me>: Iterator>; + // } + + // trait Iterable: IterableTypes { + // fn iter<'a>(&'a self) -> Self::Iterator<'a>; + // } + + // struct GatSimple {} + + // impl GatSimple {} + } } From 96369072541f34577ad9f1ebb4eb27b2d91cc55c Mon Sep 17 00:00:00 2001 From: Rg Date: Tue, 21 Mar 2023 01:01:26 +0800 Subject: [PATCH 60/77] :dog: --- src/skl/skip.rs | 87 +++++++++++++++++++++++++++++++++++++++---- src/table/iterator.rs | 4 +- 2 files changed, 82 insertions(+), 9 deletions(-) diff --git a/src/skl/skip.rs b/src/skl/skip.rs index c2539fa..f018632 100644 --- a/src/skl/skip.rs +++ b/src/skl/skip.rs @@ -435,13 +435,35 @@ pub struct SkipIterator { } impl SkipIterator { - fn peek(&self) {} - fn close(&self) { + pub fn get_item_by_node(&self, node: &Node) -> Option { + if ptr::eq(node, self.st.get_head()) { + return None; + } + let key = node.key(&self.st.arena); + let (value_offset, val_size) = node.get_value_offset(); + let value = self.st.arena_ref().get_val(value_offset, val_size); + Some(IteratorItem { + key: key.to_vec(), + value, + }) + } + + pub fn peek(&self) -> Option { + let node = unsafe { self.node.load(Ordering::Relaxed).as_ref().unwrap() }; + self.get_item_by_node(&node) + } + + 
// returns true iff the iterator is positioned at a valid node. + pub fn valid(&self) -> bool { + self.peek().is_some() + } + + pub fn close(&self) { self.st.decr_ref() } // Advance to the next position - fn next(&self) -> Option { + pub fn next(&self) -> Option { let node = self.node.load(Ordering::Relaxed); if node.is_null() { return None; @@ -449,10 +471,61 @@ impl SkipIterator { let next = self.st.get_next(unsafe { node.as_ref().unwrap() }, 0); let next = next.unwrap() as *const Node as *mut Node; self.node.store(next, Ordering::Relaxed); - let key = next.key().to_vec(); - let (value_offset, val_size) = next.get_value_offset(); - let value = self.st.arena_ref().get_val(value_offset, val_size); - Some(IteratorItem) + self.get_item_by_node(unsafe { next.as_ref().unwrap() }) + } + + // Advances to the previous position. + pub fn prev(&self) -> Option { + assert!(self.peek().is_some()); + let (node, _) = self.st.find_near(self.peek().unwrap().key(), true, false); + if node.is_none() { + self.set_node(self.st.get_head()); + return None; + } + self.set_node(node.unwrap()); + self.get_item_by_node(node.unwrap()) + } + + // finds an entry with key <= target. + pub fn seek_to_prev(&self, target: &[u8]) -> Option { + let (node, _) = self.st.find_near(target, true, true); // find <=1 + if node.is_none() { + self.set_node(self.st.get_head()); + return None; + } + self.node + .store(node.unwrap() as *const Node as *mut Node, Ordering::Relaxed); + self.get_item_by_node(node.unwrap()) + } + + // Seeks position at the first entry in list. + // Final state of iterator is valid() iff list is not empty. + pub fn seek_to_first(&self) -> Option { + let node = self.st.get_next(self.st.get_head(), 0); + if node.is_none() { + self.set_node(self.st.get_head()); + return None; + } + + self.node + .store(node.unwrap() as *const Node as *mut Node, Ordering::Relaxed); + self.get_item_by_node(node.unwrap()) + } + + pub fn seek_to_last(&self) -> Option { + let node = unsafe { self.st.find_last() }; + if node.is_none() { + self.set_node(self.st.get_head()); + return None; + } + self.node + .store(node.unwrap() as *const Node as *mut Node, Ordering::Relaxed); + self.get_item_by_node(node.unwrap()) + } + + fn set_node(&self, node: &Node) { + let node = node as *const Node as *mut Node; + self.node.store(node, Ordering::Relaxed); } } diff --git a/src/table/iterator.rs b/src/table/iterator.rs index 58e6da3..ae9bd76 100644 --- a/src/table/iterator.rs +++ b/src/table/iterator.rs @@ -245,8 +245,8 @@ impl BlockIterator { #[derive(Debug, Clone)] pub struct IteratorItem { - key: Vec, - value: ValueStruct, + pub key: Vec, + pub value: ValueStruct, } impl fmt::Display for IteratorItem { From 8a6f423758e564818b7958fc99c78558d38cba86 Mon Sep 17 00:00:00 2001 From: Rg Date: Sun, 26 Mar 2023 11:10:21 +0800 Subject: [PATCH 61/77] :dog: --- src/db.rs | 18 ++++++ src/iterator.rs | 23 +++++-- src/kv.rs | 150 ++++++++++++++++++++++++++++++++++----------- src/levels.rs | 23 +++++++ src/lib.rs | 5 +- src/options/mod.rs | 5 ++ src/skl/mod.rs | 2 +- src/skl/skip.rs | 83 +++++++++++++++++++++++-- src/st_manager.rs | 22 +++++-- src/table/table.rs | 6 +- src/types.rs | 26 +++++--- src/value_log.rs | 20 ++++-- src/y/mod.rs | 10 ++- 13 files changed, 325 insertions(+), 68 deletions(-) create mode 100644 src/db.rs diff --git a/src/db.rs b/src/db.rs new file mode 100644 index 0000000..00f7a9c --- /dev/null +++ b/src/db.rs @@ -0,0 +1,18 @@ +use crate::kv::KV; +use crate::options::Options; +use crate::types::{XArc, XWeak}; + +pub struct DataBase { + 
kv: XArc, + VL: Option, +} + +impl DataBase { + async fn new() { + let kv = KV::open(Options::default()).await; + } +} + +pub struct VL { + kv: XWeak, +} diff --git a/src/iterator.rs b/src/iterator.rs index 6ade34a..07e731f 100644 --- a/src/iterator.rs +++ b/src/iterator.rs @@ -136,11 +136,11 @@ impl KVItemInner { // Used to set options when iterating over Badger key-value stores. pub(crate) struct IteratorOptions { // Indicates whether we should prefetch values during iteration and store them. - pre_fetch_values: bool, + pub(crate) pre_fetch_values: bool, // How may KV pairs to prefetch while iterating. Valid only if PrefetchValues is true. - pre_fetch_size: isize, + pub(crate) pre_fetch_size: isize, // Direction of iteration. False is forward, true is backward. - reverse: bool, + pub(crate) reverse: bool, } pub(crate) const DEF_ITERATOR_OPTIONS: IteratorOptions = IteratorOptions { @@ -156,10 +156,23 @@ pub(crate) struct IteratorExt { opt: IteratorOptions, item: ArcRW>, data: ArcRW>, - waste: ArcRW>, } impl IteratorExt { + pub(crate) fn new( + kv: XArc, + itr: MergeIterOverIterator, + opt: IteratorOptions, + ) -> IteratorExt { + IteratorExt { + kv, + opt, + itr, + data: ArcRW::default(), + item: Arc::new(Default::default()), + } + } + // Seek to the provided key if present. If absent, if would seek to the next smallest key // greater than provided if iterating in the forward direction. Behavior would be reversed is // iterating backwards. @@ -236,7 +249,7 @@ impl IteratorExt { meta: 0, user_meta: 0, cas_counter: Arc::new(Default::default()), - wg: Closer::new(), + wg: Closer::new("IteratorExt".to_owned()), err: Ok(()), }; return KVItem::new(tokio::sync::RwLock::new(inner_item)); diff --git a/src/kv.rs b/src/kv.rs index f85b620..68a080a 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -1,4 +1,4 @@ -use crate::iterator::{IteratorExt, KVItemInner}; +use crate::iterator::{IteratorExt, IteratorOptions, KVItemInner}; use crate::levels::{LevelsController, XLevelsController}; use crate::manifest::{open_or_create_manifest_file, Manifest, ManifestFile}; use crate::options::Options; @@ -13,7 +13,10 @@ use crate::y::{ async_sync_directory, create_synced_file, sync_directory, Encode, Result, ValueStruct, }; use crate::Error::{NotFound, Unexpected}; -use crate::{Decode, Error, Node, SkipList, SkipListManager, Xiterator}; +use crate::{ + Decode, Error, MergeIterOverBuilder, MergeIterOverIterator, Node, SkipList, SkipListManager, + UniIterator, Xiterator, +}; use atomic::Atomic; use bytes::BufMut; use crossbeam_epoch::Shared; @@ -66,7 +69,7 @@ pub struct KV { pub vlog: Option, pub vptr: crossbeam_epoch::Atomic, pub manifest: Arc>, - lc: XWeak, + lc: Option, flush_chan: Channel, // write_chan: Channel, dir_lock_guard: File, @@ -80,6 +83,7 @@ pub struct KV { // we use an atomic op. 
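// Sketch of overriding the iteration defaults introduced above (field names are
// from this patch; the literal itself is illustrative):
let opt = IteratorOptions {
    pre_fetch_values: true,
    pre_fetch_size: 100, // prefetch window while iterating
    reverse: false,      // forward, lexicographic order
};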
last_used_cas_counter: AtomicU64, share_lock: tokio::sync::RwLock<()>, + // TODO user ctx replace closer ctx: tokio_context::context::Context, ctx_handle: tokio_context::context::Handle, } @@ -102,7 +106,7 @@ impl BoxKV { } impl KV { - async fn open(mut opt: Options) -> Result> { + pub async fn open(mut opt: Options) -> Result> { opt.max_batch_size = (15 * opt.max_table_size) / 100; opt.max_batch_count = opt.max_batch_size / Node::size() as u64; create_dir_all(opt.dir.as_str()).await?; @@ -116,34 +120,37 @@ impl KV { .append(true) .create(true) .open(Path::new(opt.dir.as_str()).join("dir_lock_guard.lock"))?; + dir_lock_guard.lock_exclusive()?; let value_dir_guard = OpenOptions::new() .write(true) .append(true) .create(true) .open(Path::new(opt.value_dir.as_str()).join("value_dir_guard.lock"))?; + value_dir_guard.lock_exclusive()?; let closers = Closers { - update_size: Closer::new(), - compactors: Closer::new(), - mem_table: Closer::new(), - writes: Closer::new(), - value_gc: Closer::new(), + update_size: Closer::new("update_size".to_owned()), + compactors: Closer::new("compactors".to_owned()), + mem_table: Closer::new("mem_table".to_owned()), + writes: Closer::new("writes".to_owned()), + value_gc: Closer::new("value_gc".to_owned()), }; + let (ctx, h) = tokio_context::context::Context::new(); let mut out = KV { opt: opt.clone(), vlog: None, vptr: crossbeam_epoch::Atomic::null(), manifest: Arc::new(RwLock::new(manifest_file)), - lc: XWeak::new(), + lc: None, flush_chan: Channel::new(1), dir_lock_guard, value_dir_guard, closers, write_ch: Channel::new(1), last_used_cas_counter: Default::default(), - mem_st_manger: Arc::new(SkipListManager::default()), + mem_st_manger: Arc::new(SkipListManager::new(opt.arena_size() as usize)), share_lock: tokio::sync::RwLock::new(()), ctx, ctx_handle: h, @@ -154,12 +161,14 @@ impl KV { // handle levels_controller let lc = LevelsController::new(manifest.clone(), out.opt.clone()).await?; lc.start_compact(out.closers.compactors.clone()); + out.lc.replace(lc); let mut vlog = ValueLogCore::default(); { let kv = unsafe { &out as *const KV }; vlog.open(kv, opt)?; } - out.vlog = Some(vlog); + out.vlog.replace(vlog); + let xout = XArc::new(out); // update size @@ -173,23 +182,20 @@ impl KV { { let _out = xout.clone(); tokio::spawn(async move { - _out.flush_mem_table(_out.closers.mem_table.clone()) + _out.flush_mem_table(_out.closers.mem_table.spawn()) .await .expect("TODO: panic message"); }); } // Get the lasted ValueLog Recover Pointer - let item = xout.get(_HEAD); - if item.is_err() { - return Err("Retrieving head".into()); - } - let item = item.unwrap(); + let item = match xout.get(_HEAD) { + Err(NotFound) => ValueStruct::default(), // Give it a default value + Err(_) => return Err("Retrieving head".into()), + Ok(item) => item, + }; let value = &item.value; - if value != _HEAD { - return Err("Retrieving head".into()); - } - + assert!(item.value.is_empty() || item.value == _HEAD.to_vec()); // lastUsedCasCounter will either be the value stored in !badger!head, or some subsequently // written value log entry that we replay. 
(Subsequent value log entries might be _less_ // than lastUsedCasCounter, if there was value log gc so we have to max() values while @@ -198,12 +204,13 @@ impl KV { .store(item.cas_counter, Ordering::Relaxed); let mut vptr = ValuePointer::default(); - vptr.dec(&mut Cursor::new(value))?; - - let replay_closer = Closer::new(); + if !item.value.is_empty() { + vptr.dec(&mut Cursor::new(value))?; + } + let replay_closer = Closer::new("tmp_replay".to_owned()); { let _out = xout.clone(); - let replay_closer = replay_closer.clone(); + let replay_closer = replay_closer.spawn(); tokio::spawn(async move { _out.do_writes(replay_closer).await; }); @@ -257,9 +264,27 @@ impl KV { .await?; // Wait for replay to be applied first. replay_closer.signal_and_wait().await; - // Mmap writeable log - // let lf = xout.must_vlog().files_log.read()[xout.must_vlog().max_fid.load(Ordering::Relaxed)]; + // let max_fid = xout.must_vlog().max_fid.load(Ordering::Relaxed); + // let lf = xout.must_vlog().pick_log_by_vlog_id(&max_fid); + // TODO + + { + let closer = xout.closers.writes.spawn(); + let _out = xout.clone(); + tokio::spawn(async move { + _out.do_writes(closer).await; + }); + } + + { + let closer = xout.closers.value_gc.spawn(); + let _out = xout.clone(); + tokio::spawn(async move { + _out.must_vlog().wait_on_gc(closer).await; + }); + } + Ok(xout) } @@ -373,6 +398,7 @@ impl KV { // async to flush memory table into zero level async fn flush_mem_table(&self, lc: Closer) -> Result<()> { defer! {lc.done()} + defer! {info!("exit flush table worker")}; while let Ok(task) = self.flush_chan.recv().await { // after kv send empty mt, it will close flush_chan, so we should return the job. if task.mt.is_none() { @@ -481,8 +507,8 @@ impl KV { } impl KV { - fn must_lc(&self) -> XArc { - let lc = self.lc.upgrade().unwrap(); + pub(crate) fn must_lc(&self) -> &LevelsController { + let lc = self.lc.as_ref().unwrap(); lc } @@ -553,30 +579,63 @@ impl ArcKV { // Stop writes next. self.to_ref().closers.writes.signal_and_wait().await; + self.flush_chan + .send(FlushTask { + mt: None, + vptr: ValuePointer::default(), + }) + .await + .unwrap(); Ok(()) } - pub(crate) async fn new_iterator(&self) -> IteratorExt { + // NewIterator returns a new iterator. Depending upon the options, either only keys, or both + // key-value pairs would be fetched. The keys are returned in lexicographically sorted order. + // Usage: + // opt := badger.DefaultIteratorOptions + // itr := kv.NewIterator(opt) + // for itr.Rewind(); itr.Valid(); itr.Next() { + // item := itr.Item() + // key := item.Key() + // var val []byte + // err = item.Value(func(v []byte) { + // val = make([]byte, len(v)) + // copy(val, v) + // }) // This could block while value is fetched from value log. + // // For key only iteration, set opt.PrefetchValues to false, and don't call + // // item.Value(func(v []byte)). + // + // // Remember that both key, val would become invalid in the next iteration of the loop. + // // So, if you need access to them outside, copy them or parse them. + // } + // itr.Close() + pub(crate) async fn new_iterator(&self, opt: IteratorOptions) -> IteratorExt { let p = crossbeam_epoch::pin(); let tables = self.get_mem_tables(&p); defer! { tables.iter().for_each(|table| unsafe {table.as_ref().unwrap().decr_ref()}); } + // add vlog reference. self.must_vlog().incr_iterator_count(); // Create iterators across all the tables involved first. 
let mut itrs: Vec>> = vec![]; - // for tb in tables { - // let iter = Box::new(IteratorImpl::new(tb, false)); - // itr.push(iter); - // } - todo!() + for tb in tables.clone() { + let st = unsafe { tb.as_ref().unwrap().clone() }; + let iter = Box::new(UniIterator::new(st, opt.reverse)); + itrs.push(iter); + } + itrs.extend(self.must_lc().as_iterator(opt.reverse)); + let mitr = MergeIterOverBuilder::default().add_batch(itrs).build(); + IteratorExt::new(self.clone(), mitr, opt) } } impl ArcKV { async fn do_writes(&self, lc: Closer) { info!("start do writes task!"); + defer! {lc.done();}; + defer! {info!("exit do write woker")}; // TODO add metrics let has_been_close = lc.has_been_closed(); let write_ch = self.write_ch.clone(); @@ -707,3 +766,22 @@ async fn write_level0_table(st: &SkipList, f: &mut tokio::fs::File) -> Result<() fn arena_size(opt: &Options) -> usize { (opt.max_table_size + opt.max_batch_size + opt.max_batch_count * Node::size() as u64) as usize } + +#[test] +fn t_pointer() { + struct Ext { + v: Vec, + name: String, + } + + let t = Ext { + v: vec![], + name: "Name".to_owned(), + }; + + let p = unsafe { &t as *const Ext }; + + let arc_p = Arc::new(t); + + print!("==> {:?}", p); +} diff --git a/src/levels.rs b/src/levels.rs index 208568d..7cb545c 100644 --- a/src/levels.rs +++ b/src/levels.rs @@ -191,6 +191,8 @@ impl LevelsController { lc.done(); return; } + defer! {lc.done()}; + defer! {info!("Exit level controller worker");}; // random sleep avoid all worker compact at same time { let duration = thread_rng_n(1000); @@ -400,6 +402,27 @@ impl LevelsController { Ok(()) } + pub(crate) fn as_iterator( + &self, + reverse: bool, + ) -> Vec>> { + let mut itrs: Vec>> = vec![]; + for level in self.levels.iter() { + if level.level() == 0 { + for table in level.tables.read().iter().rev() { + let itr = Box::new(IteratorImpl::new(table.clone(), reverse)); + itrs.push(itr); + } + } else { + for table in level.tables.read().iter() { + let itr = Box::new(IteratorImpl::new(table.clone(), reverse)); + itrs.push(itr); + } + } + } + itrs + } + // Merge top tables and bot tables to from a List of new tables. pub(crate) async fn compact_build_tables( &self, diff --git a/src/lib.rs b/src/lib.rs index b81ca60..fd33485 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -16,7 +16,6 @@ #![feature(arc_into_inner)] #![feature(async_closure)] #![feature(async_iterator)] -#![feature(atomic_mut_ptr)] use std::io; use std::mem::align_of; @@ -43,6 +42,10 @@ mod pb; mod st_manager; #[cfg(test)] mod test_util; +#[cfg(test)] +mod kv_test; + +mod db; pub use skl::*; pub use st_manager::*; diff --git a/src/options/mod.rs b/src/options/mod.rs index 1af61e9..a9e658d 100644 --- a/src/options/mod.rs +++ b/src/options/mod.rs @@ -1,5 +1,6 @@ use crate::value_log::Entry; use crate::y::{CAS_SIZE, META_SIZE, USER_META_SIZE}; +use crate::Node; /// Specifies how data in LSM table files and value log files should /// be loaded. 
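// A Rust rendering of the usage pattern documented on new_iterator above
// (a sketch against this patch's async API; `kv` is an ArcKV and error
// handling is elided):
let itr = kv.new_iterator(DEF_ITERATOR_OPTIONS).await;
let mut cur = itr.rewind().await;
while let Some(item) = cur {
    let inner = item.read().await;
    inner
        .value(|v| {
            Box::pin(async move {
                println!("got {} value bytes", v.len()); // v: Vec<u8>
                Ok(())
            })
        })
        .await?;
    drop(inner); // release the read guard before advancing
    cur = itr.next().await;
}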
@@ -77,6 +78,10 @@ impl Options { } entry.key.len() + 16 + META_SIZE + USER_META_SIZE + CAS_SIZE } + + pub fn arena_size(&self) -> u64 { + self.max_table_size + self.max_batch_size + self.max_batch_count * (Node::size() as u64) + } } impl Default for Options { diff --git a/src/skl/mod.rs b/src/skl/mod.rs index 2959a44..bc9303e 100644 --- a/src/skl/mod.rs +++ b/src/skl/mod.rs @@ -8,7 +8,7 @@ pub use alloc::{Allocate, BlockBytes, Chunk, SmartAllocate}; pub use arena::Arena; pub use cursor::Cursor; pub use node::Node; -pub use skip::SkipList; +pub use skip::*; const MAX_HEIGHT: usize = 20; const HEIGHT_INCREASE: u32 = u32::MAX / 3; diff --git a/src/skl/skip.rs b/src/skl/skip.rs index f018632..9b376dc 100644 --- a/src/skl/skip.rs +++ b/src/skl/skip.rs @@ -424,7 +424,56 @@ impl Display for SkipList { // A unidirectional memetable iterator. It is a thin wrapper around // `Iterator`. We like to keep `Iterator` as before, because it is more powerful and // we might support bidirectional iterations in the future. -pub struct UniIterator {} +pub struct UniIterator { + iter: SkipIterator, + reversed: bool, +} + +impl UniIterator { + pub fn new(st: SkipList, reversed: bool) -> UniIterator { + let itr = SkipIterator::new(st); + UniIterator { + iter: itr, + reversed, + } + } +} + +impl Xiterator for UniIterator { + type Output = IteratorItem; + + fn next(&self) -> Option { + if !self.reversed { + self.iter.prev() + } else { + self.iter.next() + } + } + + fn rewind(&self) -> Option { + if !self.reversed { + self.iter.seek_to_first() + } else { + self.iter.seek_to_first() + } + } + + fn seek(&self, key: &[u8]) -> Option { + if !self.reversed { + self.iter.seek(key) + } else { + self.iter.seek_to_prev(key) + } + } + + fn peek(&self) -> Option { + self.iter.peek() + } + + fn close(&self) { + self.iter.close() + } +} // An iterator over SkipList object. for new objects, you just // need to initialize Iterator.list. @@ -435,6 +484,13 @@ pub struct SkipIterator { } impl SkipIterator { + pub fn new(st: SkipList) -> SkipIterator { + SkipIterator { + st, + node: AtomicPtr::new(ptr::null_mut()), + } + } + pub fn get_item_by_node(&self, node: &Node) -> Option { if ptr::eq(node, self.st.get_head()) { return None; @@ -449,8 +505,13 @@ impl SkipIterator { } pub fn peek(&self) -> Option { - let node = unsafe { self.node.load(Ordering::Relaxed).as_ref().unwrap() }; - self.get_item_by_node(&node) + unsafe { + self.node + .load(Ordering::Relaxed) + .as_ref() + .map(|node| self.get_item_by_node(node)) + .unwrap_or_else(|| None) + } } // returns true iff the iterator is positioned at a valid node. @@ -486,9 +547,21 @@ impl SkipIterator { self.get_item_by_node(node.unwrap()) } + // Advances to the first entry with a key >= target. + pub fn seek(&self, key: &[u8]) -> Option { + let (node, _) = self.st.find_near(key, false, true); // find >=. + if node.is_none() { + self.set_node(self.st.get_head()); + return None; + } + self.node + .store(node.unwrap() as *const Node as *mut Node, Ordering::Relaxed); + self.get_item_by_node(node.unwrap()) + } + // finds an entry with key <= target. 
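// Worked example for arena_size() above, with illustrative numbers
// (Node::size() is assumed to be 96 bytes here; the real value depends on
// the node layout and MAX_HEIGHT):
let max_table_size: u64 = 64 << 20; // 64 MiB
let max_batch_size = (15 * max_table_size) / 100; // ~9.6 MiB, as set in open()
let max_batch_count = max_batch_size / 96; // batches are counted in nodes
let arena = max_table_size + max_batch_size + max_batch_count * 96; // ~83 MiB per memtable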
- pub fn seek_to_prev(&self, target: &[u8]) -> Option { - let (node, _) = self.st.find_near(target, true, true); // find <=1 + pub fn seek_to_prev(&self, key: &[u8]) -> Option { + let (node, _) = self.st.find_near(key, true, true); // find <=1 if node.is_none() { self.set_node(self.st.get_head()); return None; diff --git a/src/st_manager.rs b/src/st_manager.rs index a949c0d..5b70d98 100644 --- a/src/st_manager.rs +++ b/src/st_manager.rs @@ -17,23 +17,35 @@ type SkipListItem = crossbeam_epoch::Atomic; pub struct SkipListManager { share_lock: parking_lot::RwLock<()>, - mt: SkipListItem, + mt: Option, imm: Arc>>, } impl Default for SkipListManager { fn default() -> Self { - todo!() + SkipListManager { + share_lock: parking_lot::RwLock::new(()), + mt: None, + imm: Arc::new(parking_lot::RwLock::new(vec![])), + } } } impl SkipListManager { + pub fn new(sz: usize) -> SkipListManager { + SkipListManager { + share_lock: parking_lot::RwLock::new(()), + mt: Some(SkipListItem::new(SkipList::new(sz))), + imm: Arc::new(parking_lot::RwLock::new(vec![])), + } + } + pub fn take<'a>(&'a self, p: &'a crossbeam_epoch::Guard) -> Shared<'a, SkipList> { - self.mt.load_consume(p) + self.mt.as_ref().unwrap().load_consume(p) } pub fn mt_ref<'a>(&'a self, p: &'a crossbeam_epoch::Guard) -> Shared<'a, SkipList> { - let st = self.mt.load(Ordering::Relaxed, &p); + let st = self.mt.as_ref().unwrap().load(Ordering::Relaxed, &p); st } @@ -50,6 +62,8 @@ impl SkipListManager { self.imm.write().push(st); let st = SkipList::new(1000); self.mt + .as_ref() + .unwrap() .store(crossbeam_epoch::Owned::new(st), Ordering::Relaxed); } diff --git a/src/table/table.rs b/src/table/table.rs index 87e5037..93148fa 100644 --- a/src/table/table.rs +++ b/src/table/table.rs @@ -21,6 +21,8 @@ use std::os::unix::fs::FileExt; use crate::types::{XArc, XWeak}; use crate::y::iterator::Xiterator; +use crate::Error::Unexpected; +use log::{debug, info}; use serde_json::to_vec; #[cfg(target_os = "windows")] use std::os::windows::fs::FileExt; @@ -387,8 +389,10 @@ pub fn get_id_map(dir: &str) -> HashSet { } let fid = parse_file_id(dir_el.file_name().to_str().unwrap()); if fid.is_err() { + debug!("Skip file, {:?}", fid.unwrap_err()); continue; } + info!("What dir : {:?} {:?}", fid, dir_el.file_name()); ids.insert(fid.unwrap()); } ids @@ -399,7 +403,7 @@ pub fn parse_file_id(name: &str) -> Result { let path = Path::new(name); let filename = path.file_name().unwrap().to_str().unwrap(); if !FILE_SUFFIX.is_suffix_of(filename) { - return Ok(0); + return Err(format!("invalid file {}", name).into()); } let name = filename.trim_end_matches(FILE_SUFFIX); name.parse::() diff --git a/src/types.rs b/src/types.rs index 55f3513..6cc56e1 100644 --- a/src/types.rs +++ b/src/types.rs @@ -12,7 +12,7 @@ use async_channel::{ }; use atomic::Atomic; use crossbeam_epoch::Owned; -use log::info; +use log::{info, warn}; use crate::value_log::ValuePointer; use range_lock::{VecRangeLock, VecRangeLockGuard}; @@ -126,6 +126,7 @@ impl UnChannel { /// down. 
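// Quick reference for the find_near-based lookups above (`st` is a hypothetical
// populated SkipList; IteratorItem's fields were made public earlier in this series):
let itr = SkipIterator::new(st.clone());
let _first = itr.seek_to_first(); // smallest key, or None if the list is empty
let ge = itr.seek(b"user/42"); // find_near(key, false, true): first entry >= target
let _le = itr.seek_to_prev(b"user/42"); // find_near(key, true, true): last entry <= target
if let Some(item) = ge {
    println!("first key >= user/42: {:?}", item.key);
}
itr.close();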
#[derive(Clone)] pub struct Closer { + name: String, closed: Channel<()>, wait: Arc, } @@ -138,8 +139,9 @@ impl Drop for Closer { impl Closer { /// create a Closer with *initial* cap Workers - pub fn new() -> Self { + pub fn new(name: String) -> Self { let mut close = Closer { + name, closed: Channel::new(1), wait: Arc::from(AtomicIsize::new(0)), }; @@ -176,6 +178,18 @@ impl Closer { /// Waiting until done pub async fn wait(&self) { + // loop { + // if self.wait.load(Ordering::Relaxed) <= 0 { + // break; + // } + // sleep(Duration::from_millis(1)).await; + // } + self.has_been_closed().recv().await; + } + + /// Send a close signal and waiting util done + pub async fn signal_and_wait(&self) { + self.signal(); loop { if self.wait.load(Ordering::Relaxed) <= 0 { break; @@ -183,12 +197,6 @@ impl Closer { sleep(Duration::from_millis(1)).await; } } - - /// Send a close signal and waiting util done - pub async fn signal_and_wait(&self) { - self.signal(); - self.wait().await; - } } #[derive(Debug, Clone)] @@ -284,7 +292,7 @@ impl Deref for XVec { fn it_closer() { let runtime = tokio::runtime::Runtime::new().unwrap(); runtime.block_on(async { - let closer = Closer::new(); + let closer = Closer::new("test".to_owned()); let count = Arc::new(AtomicUsize::new(100)); for i in 0..count.load(Ordering::Relaxed) { let c = closer.spawn(); diff --git a/src/value_log.rs b/src/value_log.rs index fb61cee..eac99e8 100644 --- a/src/value_log.rs +++ b/src/value_log.rs @@ -3,12 +3,11 @@ use awaitgroup::{WaitGroup, Worker}; use bitflags::bitflags; use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use crc32fast::Hasher; -use libc::{difftime, nice}; +use drop_cell::defer; use log::info; use log::kv::Source; use memmap::{Mmap, MmapMut}; use parking_lot::*; -use protobuf::well_known_types::api::Mixin; use rand::random; use serde_json::to_vec; use std::cell::{Ref, RefCell, RefMut}; @@ -474,6 +473,7 @@ impl ValueLogCore { // If no files are found, creating a new file. if vlogs.is_empty() { let log_file = self.create_vlog_file(0)?; + info!("Create zero vlog {}!!", log_file._path.as_ref()); vlogs.insert(0, Arc::new(RwLock::new(log_file))); } Ok(()) @@ -568,6 +568,11 @@ impl ValueLogCore { .seek(SeekFrom::End(0))?; self.writable_log_offset .store(last_offset as u32, Ordering::Release); + info!( + "After recover, max_id:{}, last_offset:{}", + self.max_fid.load(Ordering::Relaxed), + last_offset + ); Ok(()) } @@ -852,7 +857,7 @@ impl ValueLogCore { Some(vlog.clone()) } - fn pick_log_by_vlog_id(&self, id: &u32) -> Arc> { + pub(crate) fn pick_log_by_vlog_id(&self, id: &u32) -> Arc> { let pick_vlogs = self.pick_log_guard(); let vlogs = pick_vlogs.vlogs.get(id); let vlog = vlogs.unwrap(); @@ -905,8 +910,13 @@ impl ValueLogCore { Ok(data_file_ids) } - fn wait_gc(&self) { - todo!() + pub(crate) async fn wait_on_gc(&self, lc: Closer) { + defer! {lc.done()}; + defer! {info!("exit gc worker")}; + lc.wait().await; // wait for lc to be closed. + // Block any GC in progress to finish, and don't allow any more writes to runGC by filling up + // the channel of size 1. 
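// Worker-lifecycle sketch for the named Closer above (API from this patch):
let closer = Closer::new("example_worker".to_owned());
let worker = closer.spawn(); // registers one running worker
tokio::spawn(async move {
    // ... do work until asked to stop ...
    let _ = worker.has_been_closed().recv().await; // block on the shutdown signal
    worker.done(); // report this worker as exited
});
closer.signal_and_wait().await; // broadcast shutdown, then spin until done()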
+ self.garbage_ch.send(()).await.unwrap(); } } diff --git a/src/y/mod.rs b/src/y/mod.rs index 96e30b1..a45aab7 100644 --- a/src/y/mod.rs +++ b/src/y/mod.rs @@ -104,7 +104,15 @@ impl Error { pub fn is_io_existing(&self) -> bool { match self { - Error::StdIO(err) if err.kind() == ErrorKind::AlreadyExists => true, + Error::StdIO(err) => { + if err.kind() == io::ErrorKind::AlreadyExists { + return true; + } + if let Some(code) = err.raw_os_error() { + return code == 2; + } + false + } _ => false, } } From 0617b7f0a760145d32cd767a9e0bfe1323cf69f2 Mon Sep 17 00:00:00 2001 From: Rg Date: Mon, 27 Mar 2023 02:11:53 +0800 Subject: [PATCH 62/77] :dog: add close for kv --- src/kv.rs | 77 +++++++++++++++++++++++++++++++++-------------- src/levels.rs | 3 +- src/log_file.rs | 8 +++++ src/manifest.rs | 4 +++ src/st_manager.rs | 6 ++++ src/types.rs | 27 +++++++++++------ src/value_log.rs | 2 +- 7 files changed, 93 insertions(+), 34 deletions(-) diff --git a/src/kv.rs b/src/kv.rs index 68a080a..55275b0 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -48,7 +48,7 @@ pub const _HEAD: &[u8; 11] = b"!bager!head"; // For Storing value offset for rep struct Closers { update_size: Closer, compactors: Closer, - mem_table: Closer, + mem_table: Closer, // Wait flush job exit writes: Closer, value_gc: Closer, } @@ -165,7 +165,7 @@ impl KV { let mut vlog = ValueLogCore::default(); { let kv = unsafe { &out as *const KV }; - vlog.open(kv, opt)?; + vlog.open(kv, opt.clone())?; } out.vlog.replace(vlog); @@ -265,8 +265,9 @@ impl KV { // Wait for replay to be applied first. replay_closer.signal_and_wait().await; // Mmap writeable log - // let max_fid = xout.must_vlog().max_fid.load(Ordering::Relaxed); - // let lf = xout.must_vlog().pick_log_by_vlog_id(&max_fid); + let max_fid = xout.must_vlog().max_fid.load(Ordering::Relaxed); + let lf = xout.must_vlog().pick_log_by_vlog_id(&max_fid); + lf.write().set_write(opt.clone().value_log_file_size * 2)?; // TODO { @@ -287,21 +288,6 @@ impl KV { Ok(xout) } - - /// close kv, should be call only once - pub async fn close(&self) -> Result<()> { - self.dir_lock_guard - .unlock() - .map_err(|err| Unexpected(err.to_string()))?; - self.value_dir_guard - .unlock() - .map_err(|err| Unexpected(err.to_string()))?; - self.closers.compactors.signal_and_wait().await; - self.closers.mem_table.signal_and_wait().await; - self.closers.writes.signal_and_wait().await; - self.closers.update_size.signal_and_wait().await; - Ok(()) - } } impl KV { @@ -398,7 +384,6 @@ impl KV { // async to flush memory table into zero level async fn flush_mem_table(&self, lc: Closer) -> Result<()> { defer! {lc.done()} - defer! {info!("exit flush table worker")}; while let Ok(task) = self.flush_chan.recv().await { // after kv send empty mt, it will close flush_chan, so we should return the job. if task.mt.is_none() { @@ -547,7 +532,6 @@ impl ArcKV { pub async fn spawn_update_size(&self) { let lc = self.closers.update_size.spawn(); defer! {lc.done()} - defer! {info!("exit update size worker");} let mut tk = tokio::time::interval(tokio::time::Duration::from_secs(5 * 60)); let dir = self.opt.dir.clone(); @@ -572,6 +556,8 @@ impl ArcKV { self.manifest.write().await } + /// Closes a KV. It's crucial to call it to ensure all the pending updates + /// make their way to disk. pub async fn close(&self) -> Result<()> { info!("Closing database"); // Stop value GC first; @@ -579,6 +565,34 @@ impl ArcKV { // Stop writes next. self.to_ref().closers.writes.signal_and_wait().await; + // Now close the value log. 
+ self.must_vlog().close()?; + + // Make sure that block writer is done pushing stuff into memtable! + // Otherwise, you will have a race condition: we are trying to flush memtables + // and remove them completely, while the block / memtable writer is still trying + // to push stuff into the memtable. This will also resolve the value + // offset problem: as we push into memtable, we update value offsets there. + if !self.must_mt().empty() { + info!("Flushing memtable!"); + let vptr = unsafe { + self.vptr + .load(Ordering::Relaxed, &crossbeam_epoch::pin()) + .as_ref() + .clone() + .unwrap() + .clone() + }; + self.flush_chan + .send(FlushTask { + mt: Some(self.mem_st_manger.mt_clone()), + vptr, + }) + .await + .unwrap(); + } + + // Tell flusher to quit. self.flush_chan .send(FlushTask { mt: None, @@ -586,6 +600,26 @@ impl ArcKV { }) .await .unwrap(); + self.closers.mem_table.signal_and_wait().await; + info!("Memtable flushed!"); + + self.closers.compactors.signal_and_wait().await; + info!("Compaction finished!"); + + self.must_lc().close()?; + + info!("Waiting for closer"); + self.closers.update_size.signal_and_wait().await; + + self.dir_lock_guard.unlock()?; + self.value_dir_guard.unlock()?; + + self.manifest.write().await.close(); + // Fsync directions to ensure that lock file, and any other removed files whose directory + // we haven't specifically fsynced, are guaranteed to have their directory entry removal + // persisted to disk. + async_sync_directory(self.opt.dir.clone().to_string()).await?; + async_sync_directory(self.opt.value_dir.clone().to_string()).await?; Ok(()) } @@ -635,7 +669,6 @@ impl ArcKV { async fn do_writes(&self, lc: Closer) { info!("start do writes task!"); defer! {lc.done();}; - defer! {info!("exit do write woker")}; // TODO add metrics let has_been_close = lc.has_been_closed(); let write_ch = self.write_ch.clone(); diff --git a/src/levels.rs b/src/levels.rs index 7cb545c..3a971d0 100644 --- a/src/levels.rs +++ b/src/levels.rs @@ -146,7 +146,7 @@ impl LevelsController { Ok(()) } - fn close(&self) -> Result<()> { + pub(crate) fn close(&self) -> Result<()> { self.cleanup_levels() } @@ -192,7 +192,6 @@ impl LevelsController { return; } defer! {lc.done()}; - defer! {info!("Exit level controller worker");}; // random sleep avoid all worker compact at same time { let duration = thread_rng_n(1000); diff --git a/src/log_file.rs b/src/log_file.rs index a7ecde9..359b553 100644 --- a/src/log_file.rs +++ b/src/log_file.rs @@ -249,6 +249,14 @@ impl LogFile { self.open_read_only() } + pub(crate) fn set_write(&mut self, sz: u64) -> Result<()> { + self.fd.as_mut().unwrap().set_len(sz as u64)?; + let mut _mmap = unsafe { Mmap::map(&self.fd.as_ref().unwrap())?.make_mut()? }; + self._mmap.replace(MmapType(Either::Right(_mmap))); + self.sz = sz as u32; + Ok(()) + } + // return mmap slice fn mmap_slice(&self) -> &[u8] { let mmap = self._mmap.as_ref().unwrap(); diff --git a/src/manifest.rs b/src/manifest.rs index df860c8..ebf0c6f 100644 --- a/src/manifest.rs +++ b/src/manifest.rs @@ -179,6 +179,10 @@ impl ManifestFile { Err(err) => Err(err), }; } + + pub(crate) fn close(&mut self) { + self.fp.take(); + } } /// Manifest represents the contents of the MANIFEST file in a Badger store. 
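// The shutdown path above hinges on a sentinel FlushTask: a task whose `mt` is
// None tells the flusher to exit. Both ends of that contract, condensed from
// close() and flush_mem_table() in this patch:
// producer side, in close():
flush_chan
    .send(FlushTask { mt: None, vptr: ValuePointer::default() })
    .await
    .unwrap();
// consumer side, in flush_mem_table():
while let Ok(task) = flush_chan.recv().await {
    if task.mt.is_none() {
        break; // sentinel received: no further memtables will arrive
    }
    // ... write task.mt out as a level-0 table ...
}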
diff --git a/src/st_manager.rs b/src/st_manager.rs index 5b70d98..def789d 100644 --- a/src/st_manager.rs +++ b/src/st_manager.rs @@ -49,6 +49,12 @@ impl SkipListManager { st } + pub fn mt_clone(&self) -> SkipList { + let p = crossbeam_epoch::pin(); + let mt = self.mt_ref(&p); + unsafe { mt.as_ref().unwrap().clone() } + } + pub fn imm(&self) -> RwLockWriteGuard<'_, RawRwLock, Vec> { self.imm.write() } diff --git a/src/types.rs b/src/types.rs index 6cc56e1..ead46f6 100644 --- a/src/types.rs +++ b/src/types.rs @@ -12,6 +12,7 @@ use async_channel::{ }; use atomic::Atomic; use crossbeam_epoch::Owned; +use libc::regoff_t; use log::{info, warn}; use crate::value_log::ValuePointer; @@ -134,6 +135,11 @@ pub struct Closer { impl Drop for Closer { fn drop(&mut self) { assert!(self.wait.load(Ordering::Relaxed) >= 0, "Sanity check!"); + info!( + "Worker-{}-{} exited", + self.name, + self.wait.load(Ordering::Relaxed) + ); } } @@ -178,13 +184,17 @@ impl Closer { /// Waiting until done pub async fn wait(&self) { - // loop { - // if self.wait.load(Ordering::Relaxed) <= 0 { - // break; - // } - // sleep(Duration::from_millis(1)).await; - // } - self.has_been_closed().recv().await; + loop { + if self.wait.load(Ordering::Relaxed) <= 0 { + break; + } + match self.has_been_closed().try_recv() { + Err(err) if err.is_closed() => return, + Err(_) => {} + Ok(()) => return, + } + tokio::time::sleep(Duration::from_micros(1)).await; + } } /// Send a close signal and waiting util done @@ -194,7 +204,7 @@ impl Closer { if self.wait.load(Ordering::Relaxed) <= 0 { break; } - sleep(Duration::from_millis(1)).await; + sleep(Duration::from_nanos(1000)).await; } } } @@ -298,7 +308,6 @@ fn it_closer() { let c = closer.spawn(); let n = count.clone(); tokio::spawn(async move { - sleep(Duration::from_millis(10000)).await; n.fetch_sub(1, Ordering::Relaxed); c.done(); }); diff --git a/src/value_log.rs b/src/value_log.rs index eac99e8..0aa3f5d 100644 --- a/src/value_log.rs +++ b/src/value_log.rs @@ -418,6 +418,7 @@ impl ValueLogCore { for vlog in vlogs.iter() { let mut lf = vlog.1.write(); if *vlog.0 == self.max_fid.load(Ordering::Acquire) { + info!("close vlog: {}", vlog.0); let _mmap = lf._mmap.take().unwrap(); _mmap.get_mut_mmap().flush()?; lf.fd @@ -912,7 +913,6 @@ impl ValueLogCore { pub(crate) async fn wait_on_gc(&self, lc: Closer) { defer! {lc.done()}; - defer! {info!("exit gc worker")}; lc.wait().await; // wait for lc to be closed. // Block any GC in progress to finish, and don't allow any more writes to runGC by filling up // the channel of size 1. From 28f8186589c7cf9a2b230f95e8507ddb1156fa17 Mon Sep 17 00:00:00 2001 From: Rg Date: Mon, 27 Mar 2023 09:45:35 +0800 Subject: [PATCH 63/77] :dog: add close for kv --- src/kv.rs | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/kv.rs b/src/kv.rs index 55275b0..52862bd 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -37,6 +37,8 @@ use std::sync::atomic::{AtomicPtr, AtomicU64, AtomicUsize, Ordering}; use std::sync::{Arc, Weak}; use std::time::Duration; use std::{string, vec}; +use anyhow::__private::kind::TraitKind; +use async_channel::RecvError; use tokio::fs::create_dir_all; use tokio::io::{AsyncReadExt, AsyncWriteExt}; use tokio::sync::{RwLock, RwLockWriteGuard}; @@ -331,6 +333,22 @@ impl KV { self.must_lc().get(key).ok_or(NotFound) } + // Sets the provided value for a given key. If key is not present, it is created. If it is + // present, the existing value is overwritten with the one provided. 
+ // Along with key and value, Set can also take an optional userMeta byte. This byte is stored + // alongside the key, and can be used as an aid to interpret the value or store other contextual + // bits corresponding to the key-value pair. + pub(crate) async fn set(&self, key: Vec, value: Vec, user_meta: u8) -> Result<()> { + let mut entry = Entry::default(); + entry.key = key; + entry.value = value; + entry.user_meta = user_meta; + let res = self.batch_set(vec![entry]).await?; + assert_eq!(res.len(), 1); + let first = res.first().unwrap().get_req(); + first.get_resp().await + } + // Returns the current `mem_tables` and get references. fn get_mem_tables<'a>(&'a self, p: &'a crossbeam_epoch::Guard) -> Vec> { self.mem_st_manger.lock_exclusive(); @@ -668,6 +686,7 @@ impl ArcKV { impl ArcKV { async fn do_writes(&self, lc: Closer) { info!("start do writes task!"); + defer! {info!("exit writes task!")}; defer! {lc.done();}; // TODO add metrics let has_been_close = lc.has_been_closed(); @@ -675,7 +694,9 @@ impl ArcKV { let reqs = ArcMx::>::new(Mutex::new(vec![])); loop { tokio::select! { - _ = has_been_close.recv() => { + ret = has_been_close.recv() => { + info!("receive ==> {:?}", ret.unwrap_err()); + let err = RecvError::fr break; }, req = write_ch.recv() => { From c4f1dd6bbc6d34d42b1a10151302fddcec13c19b Mon Sep 17 00:00:00 2001 From: Rg Date: Mon, 3 Apr 2023 02:23:26 +0800 Subject: [PATCH 64/77] :dog: add close for kv --- src/db.rs | 12 ++++++------ src/kv.rs | 38 ++++++++++++++++++++++++-------------- src/types.rs | 14 ++++++++++++++ src/value_log.rs | 10 ++++++++-- 4 files changed, 52 insertions(+), 22 deletions(-) diff --git a/src/db.rs b/src/db.rs index 00f7a9c..3d46aaa 100644 --- a/src/db.rs +++ b/src/db.rs @@ -6,12 +6,12 @@ pub struct DataBase { kv: XArc, VL: Option, } - -impl DataBase { - async fn new() { - let kv = KV::open(Options::default()).await; - } -} +// +// impl DataBase { +// async fn new() { +// let kv = KV::open(Options::default()).await; +// } +// } pub struct VL { kv: XWeak, diff --git a/src/kv.rs b/src/kv.rs index 52862bd..d36690a 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -47,12 +47,12 @@ pub const _BADGER_PREFIX: &[u8; 8] = b"!badger!"; // Prefix for internal keys used by badger. pub const _HEAD: &[u8; 11] = b"!bager!head"; // For Storing value offset for replay. -struct Closers { - update_size: Closer, - compactors: Closer, - mem_table: Closer, // Wait flush job exit - writes: Closer, - value_gc: Closer, +pub struct Closers { + pub update_size: Closer, + pub compactors: Closer, + pub mem_table: Closer, // Wait flush job exit + pub writes: Closer, + pub value_gc: Closer, } struct FlushTask { @@ -76,11 +76,11 @@ pub struct KV { // write_chan: Channel, dir_lock_guard: File, value_dir_guard: File, - closers: Closers, + pub closers: Closers, // Our latest (actively written) in-memory table. mem_st_manger: Arc, // Add here only AFTER pushing to flush_ch - write_ch: Channel, + pub write_ch: Channel, // Incremented in the non-concurrently accessed write loop. But also accessed outside. So // we use an atomic op. 
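// Call-site sketch for set() above (crate-internal async API from this patch;
// a user_meta of 0 means "no user metadata"; get() is the LSM lookup shown
// earlier, returning Result<ValueStruct>):
kv.set(b"answer".to_vec(), b"42".to_vec(), 0).await?;
let vs = kv.get(b"answer")?; // resolved through the levels controller
assert_eq!(vs.value, b"42".to_vec());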
last_used_cas_counter: AtomicU64, @@ -93,6 +93,12 @@ pub struct KV { unsafe impl Send for KV {} unsafe impl Sync for KV {} +impl Drop for KV { + fn drop(&mut self) { + info!("Drop kv"); + } +} + pub struct BoxKV { pub kv: *const KV, } @@ -173,6 +179,7 @@ impl KV { let xout = XArc::new(out); + // update size { let _out = xout.clone(); @@ -209,7 +216,7 @@ impl KV { if !item.value.is_empty() { vptr.dec(&mut Cursor::new(value))?; } - let replay_closer = Closer::new("tmp_replay".to_owned()); + let replay_closer = Closer::new("tmp_writer_closer".to_owned()); { let _out = xout.clone(); let replay_closer = replay_closer.spawn(); @@ -266,6 +273,7 @@ impl KV { .await?; // Wait for replay to be applied first. replay_closer.signal_and_wait().await; + // Mmap writeable log let max_fid = xout.must_vlog().max_fid.load(Ordering::Relaxed); let lf = xout.must_vlog().pick_log_by_vlog_id(&max_fid); @@ -287,7 +295,6 @@ impl KV { _out.must_vlog().wait_on_gc(closer).await; }); } - Ok(xout) } } @@ -296,7 +303,7 @@ impl KV { async fn walk_dir(dir: &str) -> Result<(u64, u64)> { let mut lsm_size = 0; let mut vlog_size = 0; - let mut entries = tokio::fs::read_dir("dir").await?; + let mut entries = tokio::fs::read_dir(dir).await?; while let Some(entry) = entries.next_entry().await? { let meta = entry.metadata().await?; if meta.is_dir() { @@ -687,7 +694,7 @@ impl ArcKV { async fn do_writes(&self, lc: Closer) { info!("start do writes task!"); defer! {info!("exit writes task!")}; - defer! {lc.done();}; + defer! {lc.done()}; // TODO add metrics let has_been_close = lc.has_been_closed(); let write_ch = self.write_ch.clone(); @@ -695,11 +702,13 @@ impl ArcKV { loop { tokio::select! { ret = has_been_close.recv() => { - info!("receive ==> {:?}", ret.unwrap_err()); - let err = RecvError::fr break; }, req = write_ch.recv() => { + if req.is_err() { + info!("receive a invalid write task, err: {:?}", req.unwrap_err()); + break; + } reqs.lock().push(req.unwrap()); } } @@ -722,6 +731,7 @@ impl ArcKV { } // clear future requests + info!("close write channel"); write_ch.close(); loop { let req = write_ch.try_recv(); diff --git a/src/types.rs b/src/types.rs index ead46f6..dde6212 100644 --- a/src/types.rs +++ b/src/types.rs @@ -162,6 +162,7 @@ impl Closer { /// Spawn a worker pub fn spawn(&self) -> Self { + info!("spawn a new closer: Worker-{}-{}", self.name, self.wait.load(Ordering::Relaxed)); self.add_running(1); self.clone() } @@ -298,6 +299,19 @@ impl Deref for XVec { } } +#[tokio::test] +async fn it_closer1() { + let closer = Closer::new("test".to_owned()); + let ch = closer.has_been_closed(); + tokio::select! 
{ + ret = ch.recv() => { + println!("{:?}", ret); + } + } + // let err = closer.has_been_closed().recv().await; + // println!("{:?}", err); +} + #[test] fn it_closer() { let runtime = tokio::runtime::Runtime::new().unwrap(); diff --git a/src/value_log.rs b/src/value_log.rs index 0aa3f5d..5150325 100644 --- a/src/value_log.rs +++ b/src/value_log.rs @@ -12,7 +12,7 @@ use rand::random; use serde_json::to_vec; use std::cell::{Ref, RefCell, RefMut}; use std::collections::{HashMap, HashSet}; -use std::fmt::Formatter; +use std::fmt::{Display, Formatter}; use std::fs::{read_dir, remove_file, File, OpenOptions}; use std::future::Future; use std::io::{BufWriter, Cursor, Read, Seek, SeekFrom, Write}; @@ -290,11 +290,17 @@ impl Request { } #[derive(Clone)] -pub(crate) struct ArcRequest { +pub struct ArcRequest { inner: Arc, err: Arc>>, } +impl std::fmt::Debug for ArcRequest { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + f.debug_struct("ArcRequest").finish() + } +} + unsafe impl Send for ArcRequest {} unsafe impl Sync for ArcRequest {} From b75a783ac81a6417ecc285a1f4e98207e5533daa Mon Sep 17 00:00:00 2001 From: Rg Date: Tue, 4 Apr 2023 09:30:26 +0800 Subject: [PATCH 65/77] :dog: add close for kv --- src/kv.rs | 28 ++++++++++++++++++++-------- src/types.rs | 14 +++++++++++++- 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/src/kv.rs b/src/kv.rs index d36690a..1971e3a 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -17,6 +17,8 @@ use crate::{ Decode, Error, MergeIterOverBuilder, MergeIterOverIterator, Node, SkipList, SkipListManager, UniIterator, Xiterator, }; +use anyhow::__private::kind::TraitKind; +use async_channel::RecvError; use atomic::Atomic; use bytes::BufMut; use crossbeam_epoch::Shared; @@ -37,8 +39,6 @@ use std::sync::atomic::{AtomicPtr, AtomicU64, AtomicUsize, Ordering}; use std::sync::{Arc, Weak}; use std::time::Duration; use std::{string, vec}; -use anyhow::__private::kind::TraitKind; -use async_channel::RecvError; use tokio::fs::create_dir_all; use tokio::io::{AsyncReadExt, AsyncWriteExt}; use tokio::sync::{RwLock, RwLockWriteGuard}; @@ -66,6 +66,11 @@ impl FlushTask { } } +pub struct KVBuilder { + opt: Options, + kv: BoxKV, +} + pub struct KV { pub opt: Options, pub vlog: Option, @@ -179,7 +184,6 @@ impl KV { let xout = XArc::new(out); - // update size { let _out = xout.clone(); @@ -187,6 +191,7 @@ impl KV { _out.spawn_update_size().await; }); } + // mem_table closer { let _out = xout.clone(); @@ -221,10 +226,11 @@ impl KV { let _out = xout.clone(); let replay_closer = replay_closer.spawn(); tokio::spawn(async move { - _out.do_writes(replay_closer).await; + _out.do_writes(replay_closer, true).await; }); } + // replay data from vlog let mut first = true; xout.vlog .as_ref() @@ -284,7 +290,7 @@ impl KV { let closer = xout.closers.writes.spawn(); let _out = xout.clone(); tokio::spawn(async move { - _out.do_writes(closer).await; + _out.do_writes(closer, false).await; }); } @@ -495,7 +501,9 @@ impl KV { } let arc_req = ArcRequest::from(req); reqs.push(arc_req.clone()); + assert!(!self.write_ch.is_close()); self.write_ch.send(arc_req).await.unwrap(); + info!("send task to write"); } if !bad.is_empty() { let req = Request::default(); @@ -691,7 +699,7 @@ impl ArcKV { } impl ArcKV { - async fn do_writes(&self, lc: Closer) { + async fn do_writes(&self, lc: Closer, without_close_write_ch: bool) { info!("start do writes task!"); defer! {info!("exit writes task!")}; defer! 
{lc.done()}; @@ -706,6 +714,7 @@ impl ArcKV { }, req = write_ch.recv() => { if req.is_err() { + assert!(write_ch.is_close()); info!("receive a invalid write task, err: {:?}", req.unwrap_err()); break; } @@ -731,11 +740,14 @@ impl ArcKV { } // clear future requests - info!("close write channel"); - write_ch.close(); + if !without_close_write_ch { + assert!(!write_ch.is_close()); + write_ch.close(); + } loop { let req = write_ch.try_recv(); if req.is_err() { + assert!(req.unwrap_err().is_closed()); break; } let req = req.unwrap(); diff --git a/src/types.rs b/src/types.rs index dde6212..982d029 100644 --- a/src/types.rs +++ b/src/types.rs @@ -82,10 +82,18 @@ impl Channel { /// close *Channel*, Sender will be consumed pub fn close(&self) { + info!("close channel"); if let Some(tx) = &self.tx { tx.close(); } } + + pub fn is_close(&self) -> bool { + if let Some(tx) = &self.tx { + return tx.is_closed(); + } + true + } } #[derive(Clone)] @@ -162,7 +170,11 @@ impl Closer { /// Spawn a worker pub fn spawn(&self) -> Self { - info!("spawn a new closer: Worker-{}-{}", self.name, self.wait.load(Ordering::Relaxed)); + info!( + "spawn a new closer: Worker-{}-{}", + self.name, + self.wait.load(Ordering::Relaxed) + ); self.add_running(1); self.clone() } From cec040100dd4ad12794a7831205c429c24efa28f Mon Sep 17 00:00:00 2001 From: Rg Date: Thu, 6 Apr 2023 09:31:59 +0800 Subject: [PATCH 66/77] :dog: --- src/kv.rs | 171 +++++++++++++++++++++++++++++++++++++---------- src/value_log.rs | 99 +++++++++++++++++++++++++-- 2 files changed, 227 insertions(+), 43 deletions(-) diff --git a/src/kv.rs b/src/kv.rs index 1971e3a..90b3ba4 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -7,7 +7,8 @@ use crate::table::iterator::{IteratorImpl, IteratorItem}; use crate::table::table::{new_file_name, Table, TableCore}; use crate::types::{ArcMx, Channel, Closer, TArcMx, XArc, XWeak}; use crate::value_log::{ - ArcRequest, Entry, MetaBit, Request, ValueLogCore, ValuePointer, MAX_KEY_SIZE, + ArcRequest, Entry, EntryPair, EntryType, MetaBit, Request, ValueLogCore, ValuePointer, + MAX_KEY_SIZE, }; use crate::y::{ async_sync_directory, create_synced_file, sync_directory, Encode, Result, ValueStruct, @@ -387,7 +388,10 @@ impl KV { if reqs.is_empty() { return Ok(()); } - info!("write_requests called. Writing to value log"); + info!( + "write_requests called. Writing to value log, count: {}", + reqs.len() + ); // CAS counter for all operations has to go onto value log. Otherwise, if it is just in // memtable for a long time, and following CAS operations use that as a check, when // replaying, we will think that these CAS operations should fail, when they are actually @@ -396,18 +400,26 @@ impl KV { // There is code (in flush_mem_table) whose correctness depends on us generating CAS Counter // values _before_ we modify s.vptr here. 
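The loop below performs that reservation. A sketch of it, under the assumption that the single fetch_add is the whole synchronization story; Entry is trimmed to the one field the sketch touches:

    use std::sync::atomic::{AtomicU64, Ordering};

    struct Entry { cas_counter: u64 }

    // Reserve a contiguous block of counters for the whole batch with one
    // fetch_add (which returns the pre-increment value), then hand them out
    // in entry order so replay observes the same sequence.
    fn assign_cas_counters(last_used: &AtomicU64, entries: &mut [Entry]) {
        let counter_base = last_used.fetch_add(entries.len() as u64, Ordering::Relaxed);
        for (idx, entry) in entries.iter_mut().enumerate() {
            entry.cas_counter = counter_base + idx as u64;
        }
    }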
for req in reqs.iter() { - let entries = req.req_ref().entries.write(); + let entries = req.req_ref().entries.read(); let counter_base = self.new_cas_counter(entries.len() as u64); for (idx, entry) in entries.iter().enumerate() { - entry.borrow_mut().cas_counter = counter_base + idx as u64; + entry.write().mut_entry().cas_counter = counter_base + idx as u64; } } - - self.vlog.as_ref().unwrap().write(reqs.clone())?; info!("Writing to memory table"); + self.vlog.as_ref().unwrap().write(reqs.clone())?; + let mut count = 0; for req in reqs.iter() { + if req.get_req().entries.read().is_empty() { + continue; + } count += req.get_req().entries.read().len(); + // while let Err(err) = xout.ensure_room_for_write().await { + // tokio::time::sleep(Duration::from_millis(10)).await; + // } + + // xout.must_mt().put(&entry.key, v); } Ok(()) } @@ -489,7 +501,10 @@ impl KV { count += 1; sz += self.opt.estimate_size(&entry) as u64; let req = b.last_mut().unwrap(); - req.entries.write().push(RefCell::new(entry)); + req.entries + .write() + .push(parking_lot::RwLock::new(EntryType::from(entry))); + req.ptrs.lock().push(None); if count >= self.opt.max_batch_count || sz >= self.opt.max_batch_count { b.push(Request::default()); } @@ -507,8 +522,11 @@ impl KV { } if !bad.is_empty() { let req = Request::default(); - *req.entries.write() = - Vec::from_iter(bad.into_iter().map(|bad| RefCell::new(bad)).into_iter()); + *req.entries.write() = Vec::from_iter( + bad.into_iter() + .map(|bad| parking_lot::RwLock::new(EntryType::from(bad))) + .into_iter(), + ); let arc_req = ArcRequest::from(req); arc_req .set_err(Err("key too big or value to big".into())) @@ -518,10 +536,84 @@ impl KV { Ok(reqs) } + fn write_to_lsm(&self, req: ArcRequest) -> Result<()> { + let req = req.get_req(); //.entries.read(); + let ptrs = req.ptrs.lock(); + let entries = req.entries.read(); + assert_eq!(entries.len(), ptrs.len()); + + for (i, pair) in entries.iter().enumerate() { + let mut entry_pair = pair.write(); + let entry = entry_pair.entry(); + if entry.cas_counter_check != 0 { + let old_value = self.get(&entry.key)?; + // No need to decode existing value. Just need old CAS counter. + if old_value.cas_counter != entry.cas_counter_check { + entry_pair.set_resp(Err(Error::ValueCasMisMatch)); + continue; + } + } + + if entry.meta == MetaBit::BIT_SET_IF_ABSENT.bits() { + // Someone else might have written a value, so lets check again if key exists. + let exits = self.exists(&entry.key)?; + // Value already exists. don't write. + if exits { + entry_pair.set_resp(Err(Error::ValueKeyExists)); + continue; + } + } + + if self.should_write_value_to_lsm(entry) { + // Will include deletion/tombstone case. 
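The two gates at the top of write_to_lsm above (the CAS counter check, then BIT_SET_IF_ABSENT) can be summarized in one predicate. A sketch under assumed bit values; the real flags live in MetaBit and may differ:

    const BIT_DELETE: u8 = 1;         // assumed bit values, for illustration only
    const BIT_SET_IF_ABSENT: u8 = 4;

    // `stored` is the current (cas_counter, meta) for the key, if any.
    // Returns Ok(()) when the entry may be applied to the LSM.
    fn precheck(entry_meta: u8, cas_check: u64, stored: Option<(u64, u8)>) -> Result<(), &'static str> {
        if let Some((counter, meta)) = stored {
            if cas_check != 0 && counter != cas_check {
                return Err("ValueCasMisMatch"); // counter moved since the caller read it
            }
            if entry_meta == BIT_SET_IF_ABSENT && meta & BIT_DELETE == 0 {
                return Err("ValueKeyExists");   // a live (non-tombstone) value exists
            }
        }
        Ok(())
    }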
+ self.must_mt().put( + &entry.key, + ValueStruct::new( + entry.value.clone(), // TODO avoid value clone + entry.meta, + entry.user_meta, + entry.cas_counter, + ), + ); + } else { + let ptr = ptrs.get(i).unwrap().as_ref().unwrap(); + let mut wt = Cursor::new(vec![0u8; ValuePointer::value_pointer_encoded_size()]); + ptr.enc(&mut wt).unwrap(); + self.must_mt().put( + &entry.key, + ValueStruct::new( + wt.into_inner(), + entry.meta | MetaBit::BIT_VALUE_POINTER.bits(), + entry.user_meta, + entry.cas_counter, + ), + ); + } + } + + todo!() + } + + fn exists(&self, key: &[u8]) -> Result { + let value = self.get(key)?; + if value.value.is_empty() && value.meta == 0 { + return Ok(false); + } + if value.meta & MetaBit::BIT_DELETE.bits() != 0 { + return Ok(false); + } + + Ok(true) + } + fn new_cas_counter(&self, how_many: u64) -> u64 { self.last_used_cas_counter .fetch_add(how_many, Ordering::Relaxed) } + + fn should_write_value_to_lsm(&self, entry: &Entry) -> bool { + entry.value.len() < self.opt.value_threshold + } } impl KV { @@ -707,6 +799,16 @@ impl ArcKV { let has_been_close = lc.has_been_closed(); let write_ch = self.write_ch.clone(); let reqs = ArcMx::>::new(Mutex::new(vec![])); + let to_reqs = || { + let to_reqs = reqs + .lock() + .clone() + .into_iter() + .map(|req| req.clone()) + .collect::>(); + reqs.lock().clear(); + Arc::new(to_reqs) + }; loop { tokio::select! { ret = has_been_close.recv() => { @@ -721,21 +823,23 @@ impl ArcKV { reqs.lock().push(req.unwrap()); } } - // TODO avoid memory allocate again - if reqs.lock().len() == 100 { - let to_reqs = reqs - .lock() - .clone() - .into_iter() - .map(|req| req.clone()) - .collect::>(); - let to_reqs = Arc::new(to_reqs); - if let Err(err) = self.write_requests(to_reqs).await { - // for req in reqs.lock().iter() { - // req.set_err(Err(err.clone())).await; - // } + + let to_reqs = if reqs.lock().len() == 100 { + to_reqs() + } else { + if let Ok(req) = write_ch.try_recv() { + reqs.lock().push(req); + Arc::new(vec![]) + } else { + to_reqs() + } + }; + + if !to_reqs.is_empty() { + let res = self.write_requests(to_reqs.clone()).await; + for req in to_reqs.clone().to_vec() { + req.set_err(res.clone()).await; } - reqs.lock().clear(); } } @@ -746,22 +850,15 @@ impl ArcKV { } loop { let req = write_ch.try_recv(); - if req.is_err() { - assert!(req.unwrap_err().is_closed()); + if let Err(err) = &req { + assert!(err.is_closed() || err.is_empty(), "{:?}", err); break; } - let req = req.unwrap(); - reqs.lock().push(req); - let to_reqs = reqs - .lock() - .clone() - .into_iter() - .map(|req| req.clone()) - .collect::>(); - if let Err(err) = self.write_requests(Arc::new(to_reqs)).await { - // for req in reqs.lock().iter() { - // req.set_err(Err(err.clone())).await; - // } + reqs.lock().push(req.unwrap()); + let to_reqs = to_reqs(); + let res = self.write_requests(to_reqs.clone()).await; + for req in to_reqs.clone().to_vec() { + req.set_err(res.clone()).await; } } } diff --git a/src/value_log.rs b/src/value_log.rs index 5150325..27168a7 100644 --- a/src/value_log.rs +++ b/src/value_log.rs @@ -4,6 +4,7 @@ use bitflags::bitflags; use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use crc32fast::Hasher; use drop_cell::defer; +use either::Either; use log::info; use log::kv::Source; use memmap::{Mmap, MmapMut}; @@ -261,9 +262,88 @@ impl Decode for ValuePointer { } } +pub(crate) struct EntryType(Either>); + +impl EntryType { + pub(crate) fn entry(&self) -> &Entry { + match self.0 { + Either::Left(ref entry) => entry, + _ => panic!("It should be not happen"), + } 
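The branch just shown is the small-value optimisation: values under value_threshold stay inline in the LSM, larger ones are replaced by an encoded pointer into the value log. A sketch of that decision; the 12-byte big-endian (fid, len, offset) layout is an assumption for illustration, not the crate's exact encoding:

    const BIT_VALUE_POINTER: u8 = 2; // assumed flag value, for illustration

    struct ValuePointer { fid: u32, len: u32, offset: u32 }

    // Decide what the memtable stores for one entry: the raw value when it
    // is small, otherwise the encoded pointer into the value log.
    fn memtable_value(value: Vec<u8>, meta: u8, ptr: Option<ValuePointer>, threshold: usize) -> (Vec<u8>, u8) {
        if value.len() < threshold {
            (value, meta)
        } else {
            let p = ptr.expect("large values are written to the vlog first");
            let mut buf = Vec::with_capacity(12);
            buf.extend_from_slice(&p.fid.to_be_bytes());
            buf.extend_from_slice(&p.len.to_be_bytes());
            buf.extend_from_slice(&p.offset.to_be_bytes());
            (buf, meta | BIT_VALUE_POINTER)
        }
    }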
+ } + + pub(crate) fn mut_entry(&mut self) -> &mut Entry { + match self.0 { + Either::Left(ref mut entry) => entry, + _ => panic!("It should be not happen"), + } + } + + pub(crate) fn ret(&self) -> &Result<()> { + match self.0 { + Either::Right(ref m) => m, + _ => panic!("It should be not happen"), + } + } + + pub(crate) fn set_resp(&mut self, ret: Result<()>) { + self.0 = Either::Right(ret); + } +} + +impl Deref for EntryType { + type Target = Either>; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl From for EntryType { + fn from(value: Entry) -> Self { + Self(Either::Left(value)) + } +} + +impl From> for EntryType { + fn from(value: Result<()>) -> Self { + Self(Either::Right(value)) + } +} + +pub(crate) struct EntryPair { + entry: Entry, + ret: RwLock>, +} + +impl EntryPair { + pub(crate) fn new(entry: Entry) -> Self { + EntryPair { + entry, + ret: RwLock::new(Ok(())), + } + } + + pub(crate) fn set_resp(&self, ret: Result<()>) { + *self.ret.write() = ret + } + + pub(crate) fn entry(&self) -> &Entry { + &self.entry + } + + pub(crate) fn mut_entry(&mut self) -> &mut Entry { + &mut self.entry + } + + pub(crate) fn resp(&self) -> Result<()> { + self.ret.read().clone() + } +} + pub(crate) struct Request { // Input values, NOTE: RefCell is called concurrency - pub(crate) entries: RwLock>>, + pub(crate) entries: RwLock>>, // Output Values and wait group stuff below pub(crate) ptrs: Mutex>>, pub(crate) res: Channel>, @@ -316,7 +396,7 @@ impl ArcRequest { pub(crate) async fn set_err(&self, err: Result<()>) { *self.err.lock() = err.clone(); - self.inner.res.send(err).await; + self.inner.res.send(err).await.expect("TODO: panic message"); } pub(crate) fn to_inner(self) -> Request { @@ -663,6 +743,7 @@ impl ValueLogCore { // write is thread-unsafe by design and should not be called concurrently. pub(crate) fn write(&self, reqs: Arc>) -> Result<()> { + defer! {info!("finished write value log");} let cur_vlog_file = self.pick_log_by_vlog_id(&self.max_fid.load(Ordering::Acquire)); let to_disk = || -> Result<()> { if self.buf.read().buffer().is_empty() { @@ -702,10 +783,16 @@ impl ValueLogCore { }; for req in reqs.iter() { - for (idx, entry) in req.get_req().entries.read().iter().enumerate() { - if !self.opt.sync_writes && entry.borrow().value.len() < self.opt.value_threshold { + let req = req.get_req(); + for (idx, entry) in req.entries.read().iter().enumerate() { + if !self.opt.sync_writes + && entry.read().entry().value.len() < self.opt.value_threshold + { // No need to write to value log. 
- req.get_req().ptrs.lock()[idx] = None; + info!("ptrs {}", req.ptrs.lock().len()); + req.ptrs.lock()[idx] = None; + info!("to disk~"); + continue; } @@ -715,7 +802,7 @@ impl ValueLogCore { ptr.offset = self.writable_log_offset.load(Ordering::Acquire) + self.buf.read().buffer().len() as u32; let mut buf = self.buf.write(); - entry.borrow_mut().enc(&mut *buf)?; + entry.write().entry().enc(&mut *buf)?; } } to_disk() From 599a7dd96be36fee0a9a4fc4fa585645fb94fc0a Mon Sep 17 00:00:00 2001 From: Rg Date: Fri, 7 Apr 2023 01:02:27 +0800 Subject: [PATCH 67/77] :dog: sleep --- src/kv.rs | 117 ++++++++++++++++++++++++++++------------------- src/skl/alloc.rs | 8 +++- src/skl/arena.rs | 3 +- src/value_log.rs | 3 -- 4 files changed, 79 insertions(+), 52 deletions(-) diff --git a/src/kv.rs b/src/kv.rs index 90b3ba4..3e09258 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -22,11 +22,12 @@ use anyhow::__private::kind::TraitKind; use async_channel::RecvError; use atomic::Atomic; use bytes::BufMut; -use crossbeam_epoch::Shared; +use crossbeam_epoch::{Owned, Shared}; use drop_cell::defer; use fs2::FileExt; use log::{info, Log}; -use parking_lot::Mutex; +use parking_lot::lock_api::MutexGuard; +use parking_lot::{Mutex, RawMutex}; use std::cell::RefCell; use std::fs::File; use std::fs::{try_exists, OpenOptions}; @@ -392,6 +393,7 @@ impl KV { "write_requests called. Writing to value log, count: {}", reqs.len() ); + // CAS counter for all operations has to go onto value log. Otherwise, if it is just in // memtable for a long time, and following CAS operations use that as a check, when // replaying, we will think that these CAS operations should fail, when they are actually @@ -406,21 +408,26 @@ impl KV { entry.write().mut_entry().cas_counter = counter_base + idx as u64; } } - info!("Writing to memory table"); + + // TODO add error set self.vlog.as_ref().unwrap().write(reqs.clone())?; + info!("Writing to memory table"); let mut count = 0; for req in reqs.iter() { if req.get_req().entries.read().is_empty() { continue; } count += req.get_req().entries.read().len(); - // while let Err(err) = xout.ensure_room_for_write().await { - // tokio::time::sleep(Duration::from_millis(10)).await; - // } - - // xout.must_mt().put(&entry.key, v); + info!("waiting for write"); + while let Err(err) = self.ensure_room_for_write().await { + tokio::time::sleep(Duration::from_millis(10)).await; + } + info!("waiting for write"); + self.write_to_lsm(req.clone())?; + self.update_offset(req.get_req().ptrs.lock()); } + info!("{} entries written", count); Ok(()) } @@ -591,7 +598,7 @@ impl KV { } } - todo!() + Ok(()) } fn exists(&self, key: &[u8]) -> Result { @@ -611,9 +618,62 @@ impl KV { .fetch_add(how_many, Ordering::Relaxed) } + async fn ensure_room_for_write(&self) -> Result<()> { + defer! {info!("exit ensure room for write!")} + // TODO a special global lock for this function + info!("(((((())))))))))))))"); + let _ = self.share_lock.write().await; + info!("====))))))))"); + if self.must_mt().mem_size() < self.opt.max_table_size as u32 { + info!("))))))))"); + return Ok(()); + } + // A nil mt indicates that KV is being closed. + info!(")11)))))))"); + assert!(!self.must_mt().empty()); + info!(")11)))))))"); + let flush_task = FlushTask { + mt: Some(self.must_mt().clone()), + vptr: self.must_vptr(), + }; + if let Ok(_) = self.flush_chan.try_send(flush_task) { + info!("Flushing value log to disk if async mode."); + // Ensure value log is synced to disk so this memtable's contents wouldn't be lost. 
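update_offset, added above, publishes the value-log head by scanning the request's pointers from the back. A functional sketch of that scan; ValuePointer is trimmed to the fields the comparison needs:

    #[derive(Clone, Default, PartialEq)]
    struct ValuePointer { fid: u32, len: u32, offset: u32 }

    impl ValuePointer {
        fn is_zero(&self) -> bool { *self == ValuePointer::default() }
    }

    // Walk the request's pointers from the back and return the last real
    // one; None slots are values that stayed inline in the LSM.
    fn latest_pointer(ptrs: &[Option<ValuePointer>]) -> Option<ValuePointer> {
        ptrs.iter()
            .rev()
            .flatten()
            .find(|p| !p.is_zero())
            .cloned()
    }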
+ self.must_vlog().sync()?; + info!( + "Flushing memtable, mt.size={} size of flushChan: {}", + self.must_mt().mem_size(), + self.flush_chan.tx().len() + ); + // We manage to push this task. Let's modify imm. + self.mem_st_manger.swap_st(self.opt.clone()); + // New memtable is empty. We certainly have room. + Ok(()) + } else { + Err(Unexpected("No room for write".into())) + } + } + fn should_write_value_to_lsm(&self, entry: &Entry) -> bool { entry.value.len() < self.opt.value_threshold } + + fn update_offset(&self, ptrs: MutexGuard>>) { + let mut ptr = &ValuePointer::default(); + for tmp_ptr in ptrs.iter().rev() { + if tmp_ptr.is_none() || tmp_ptr.as_ref().unwrap().is_zero() { + continue; + } + ptr = tmp_ptr.as_ref().unwrap(); + break; + } + + if ptr.is_zero() { + return; + } + + self.vptr.store(Owned::new(ptr.clone()), Ordering::Release); + } } impl KV { @@ -625,6 +685,8 @@ impl KV { fn must_mt(&self) -> &SkipList { let p = crossbeam_epoch::pin(); let st = self.mem_st_manger.mt_ref(&p).as_raw(); + assert!(!st.is_null()); + info!("wat the fuct"); unsafe { &*st } } @@ -863,43 +925,6 @@ impl ArcKV { } } - fn should_write_value_to_lsm(&self, entry: &Entry) -> bool { - entry.value.len() < self.opt.value_threshold - } - - // Always called serially. - async fn ensure_room_for_write(&self) -> Result<()> { - // TODO a special global lock for this function - let _ = self.share_lock.write().await; - if self.must_mt().mem_size() < self.opt.max_table_size as u32 { - return Ok(()); - } - - // A nil mt indicates that KV is being closed. - assert!(!self.must_mt().empty()); - - let flush_task = FlushTask { - mt: Some(self.must_mt().clone()), - vptr: self.must_vptr(), - }; - if let Ok(_) = self.flush_chan.try_send(flush_task) { - info!("Flushing value log to disk if async mode."); - // Ensure value log is synced to disk so this memtable's contents wouldn't be lost. - self.must_vlog().sync()?; - info!( - "Flushing memtable, mt.size={} size of flushChan: {}", - self.must_mt().mem_size(), - self.flush_chan.tx().len() - ); - // We manage to push this task. Let's modify imm. - self.mem_st_manger.swap_st(self.opt.clone()); - // New memtable is empty. We certainly have room. 
- Ok(()) - } else { - Err(Unexpected("No room for write".into())) - } - } - // asyn yield item value from ValueLog pub(crate) async fn yield_item_value( &self, diff --git a/src/skl/alloc.rs b/src/skl/alloc.rs index 5378502..69557fa 100644 --- a/src/skl/alloc.rs +++ b/src/skl/alloc.rs @@ -102,10 +102,14 @@ unsafe impl Send for OnlyLayoutAllocate {} unsafe impl Sync for OnlyLayoutAllocate {} impl OnlyLayoutAllocate { - fn size() -> usize { + pub(crate) fn size() -> usize { size_of::() } + pub(crate) fn len(&self) -> usize { + self.len.load(Ordering::Relaxed) + } + pub fn new(n: usize) -> Self { OnlyLayoutAllocate { cursor: Arc::from(AtomicUsize::new(Self::size())), @@ -215,7 +219,7 @@ unsafe impl Send for SliceAllocate {} unsafe impl Sync for SliceAllocate {} impl SliceAllocate { - fn size(&self) -> usize { + pub(crate) fn size(&self) -> usize { self.len.load(Ordering::Relaxed) } diff --git a/src/skl/arena.rs b/src/skl/arena.rs index 363f76c..b1e5d3a 100644 --- a/src/skl/arena.rs +++ b/src/skl/arena.rs @@ -44,8 +44,9 @@ impl Arena { } } + // TODO pub(crate) fn size(&self) -> u32 { - todo!() + (self.slice.size() + self.node_alloc.len()) as u32 } pub(crate) fn cap(&self) -> usize { diff --git a/src/value_log.rs b/src/value_log.rs index 27168a7..5cea934 100644 --- a/src/value_log.rs +++ b/src/value_log.rs @@ -372,7 +372,6 @@ impl Request { #[derive(Clone)] pub struct ArcRequest { inner: Arc, - err: Arc>>, } impl std::fmt::Debug for ArcRequest { @@ -395,7 +394,6 @@ impl ArcRequest { } pub(crate) async fn set_err(&self, err: Result<()>) { - *self.err.lock() = err.clone(); self.inner.res.send(err).await.expect("TODO: panic message"); } @@ -408,7 +406,6 @@ impl From for ArcRequest { fn from(value: Request) -> Self { ArcRequest { inner: Arc::new(value), - err: Arc::new(Mutex::new(Ok(()))), } } } From 8a1af8427cf7eb000b9e45ba4d41245717b37da2 Mon Sep 17 00:00:00 2001 From: Rg Date: Sun, 9 Apr 2023 15:37:53 +0800 Subject: [PATCH 68/77] :dog: sleep --- src/kv.rs | 2 +- src/lib.rs | 4 +- src/skl/alloc.rs | 166 ++++++++++++++++++++++++++++++++++++++--------- src/skl/arena.rs | 84 ++++++++++++++---------- src/skl/skip.rs | 89 ++----------------------- 5 files changed, 195 insertions(+), 150 deletions(-) diff --git a/src/kv.rs b/src/kv.rs index 3e09258..8dae823 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -625,7 +625,7 @@ impl KV { let _ = self.share_lock.write().await; info!("====))))))))"); if self.must_mt().mem_size() < self.opt.max_table_size as u32 { - info!("))))))))"); + info!(")))))))) {}", self.must_mt().mem_size()); return Ok(()); } // A nil mt indicates that KV is being closed. 
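ensure_room_for_write, moved onto KV in this patch, hinges on a bounded flush queue: try_send either schedules a flush or reports that there is no room, and write_requests backs off for 10ms and retries. A sketch of that flow, assuming an async-channel sender as the queue:

    struct FlushTask; // memtable snapshot plus head vptr, elided here

    // If the active memtable still has room, the write proceeds. Otherwise
    // try to enqueue a flush; a full queue means the flusher is behind, and
    // the caller sleeps and retries.
    fn ensure_room(mem_size: u32, max_table_size: u32,
                   flush_tx: &async_channel::Sender<FlushTask>) -> Result<(), String> {
        if mem_size < max_table_size {
            return Ok(());
        }
        match flush_tx.try_send(FlushTask) {
            Ok(()) => Ok(()), // flush scheduled; caller swaps in a fresh memtable
            Err(_) => Err("No room for write".into()),
        }
    }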
diff --git a/src/lib.rs b/src/lib.rs index fd33485..4e00f24 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -35,6 +35,8 @@ mod value_log_tests; mod y; mod compaction; +#[cfg(test)] +mod kv_test; mod level_handler; mod levels; mod mmap; @@ -42,8 +44,6 @@ mod pb; mod st_manager; #[cfg(test)] mod test_util; -#[cfg(test)] -mod kv_test; mod db; diff --git a/src/skl/alloc.rs b/src/skl/alloc.rs index 69557fa..b588b33 100644 --- a/src/skl/alloc.rs +++ b/src/skl/alloc.rs @@ -5,8 +5,9 @@ use std::fmt::{Debug, Display, Formatter}; use std::marker::PhantomData; use std::mem::{align_of, size_of, ManuallyDrop}; +use log::info; use std::ptr::{slice_from_raw_parts, slice_from_raw_parts_mut, NonNull}; -use std::sync::atomic::{AtomicPtr, AtomicU64, AtomicUsize, Ordering}; +use std::sync::atomic::{AtomicPtr, AtomicU32, AtomicU64, AtomicUsize, Ordering}; use std::sync::Arc; use std::thread::{sleep, spawn}; use std::time::Duration; @@ -14,41 +15,162 @@ use std::{ptr, thread}; pub trait Allocate: Send + Sync { type Block; + #[inline] fn alloc(&self, start: usize, n: usize) -> Self::Block; + #[inline] fn size(&self) -> usize; + #[inline] + fn used_count(&self) -> usize; } pub trait Chunk: Send + Sync { + #[inline] fn get_data(&self) -> &[u8]; + #[inline] fn get_data_mut(&self) -> &mut [u8]; + #[inline] fn size(&self) -> usize; } +/// FixSizeAllocate fixed size memory allocator, WARNING: zero offset not store any T that for wrap Arena +pub struct FixSizeAllocate { + ptr: NonNull, + cap: AtomicUsize, + len: AtomicUsize, +} + +impl Debug for FixSizeAllocate { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("FixSizeAllocate") + .field("cap", &self.cap) + .field("len", &self.cap) + .finish() + } +} + +impl FixSizeAllocate { + pub(crate) fn size() -> usize { + size_of::() + } + + pub fn new(mut sz: usize) -> Self { + let layout = std::alloc::Layout::array::(sz).unwrap(); + let ptr = unsafe { std::alloc::alloc(layout) } as *mut T; + let mut allocate = FixSizeAllocate { + ptr: NonNull::new(ptr).unwrap(), + cap: AtomicUsize::new(sz), + len: AtomicUsize::new(0), + }; + allocate + } + + #[inline] + pub fn alloc(&self) -> (&mut T, usize) { + let offset = self.len.fetch_add(Self::size(), Ordering::Release); + let end = offset + Self::size(); + // println!("{}, {}, {}", offset, end, self.cap.load(Ordering::Acquire)); + assert!(end <= self.cap.load(Ordering::Acquire)); + let ptr = self.get_data_mut_ptr(); + let ref_data = unsafe { &mut *slice_from_raw_parts_mut(ptr.add(offset), end - offset) }; + (&mut ref_data[0], offset) + } + + #[inline] + pub fn alloc_slice(&self, sz: usize) -> (&mut [T], usize) { + let offset = self.len.fetch_add(Self::size() * sz, Ordering::Release); + let end = offset + sz * Self::size(); + assert!(end <= self.cap.load(Ordering::Acquire)); + let ptr = self.get_data_mut_ptr(); + let mut ref_data = + unsafe { &mut *slice_from_raw_parts_mut(ptr.add(offset), (end - offset)) }; + (ref_data, offset) + } + + #[inline] + pub fn get(&self, offset: usize) -> &mut T { + let mut ptr = self.get_data_mut_ptr(); + let mut ref_data = unsafe { &mut *slice_from_raw_parts_mut(ptr.add(offset), Self::size()) }; + &mut ref_data[0] + } + + #[inline] + pub fn get_slice(&self, offset: usize, n: usize) -> &mut [T] { + let ptr = self.get_data_mut_ptr(); + let mut ref_data = unsafe { &mut *slice_from_raw_parts_mut(ptr.add(offset), n) }; + ref_data + } + + #[inline] + pub fn len(&self) -> usize { + self.len.load(Ordering::Acquire) + } + + #[inline] + pub fn empty(&self) -> bool { + 
self.len.load(Ordering::Acquire) == 0 + } + + #[inline] + pub(crate) fn get_data_mut_ptr(&self) -> *mut T { + self.get_data_ptr() as *mut T + } + + #[inline] + pub(crate) fn get_data_ptr(&self) -> *const T { + self.ptr.as_ptr() + } +} + +impl Drop for FixSizeAllocate { + fn drop(&mut self) { + info!( + "Drop fix size allocator, cap:{}, len:{}", + self.cap.load(Ordering::Acquire), + self.len.load(Ordering::Acquire) + ); + if self.cap.load(Ordering::Acquire) != 0 { + let layout = std::alloc::Layout::array::(self.cap.load(Ordering::Acquire)).unwrap(); + unsafe { + std::alloc::dealloc(self.ptr.as_ptr() as *mut u8, layout); + } + } + } +} + #[derive(Debug)] #[repr(C)] pub struct SmartAllocate { pub(crate) ptr: std::mem::ManuallyDrop>, + pub(crate) count: AtomicU64, } impl Allocate for SmartAllocate { type Block = impl Chunk; - + #[inline] fn alloc(&self, start: usize, n: usize) -> Self::Block { + assert!(start + n <= self.size()); + self.count.store((start + n) as u64, Ordering::Release); let ptr = self.get_data_ptr(); let block_ptr = unsafe { ptr.add(start) as *mut u8 }; let block = BlockBytes::new(NonNull::new(block_ptr).unwrap(), n); block } - + #[inline] fn size(&self) -> usize { self.ptr.len() } + #[inline] + fn used_count(&self) -> usize { + self.count.load(Ordering::Acquire) as usize + } } impl SmartAllocate { pub(crate) fn new(m: std::mem::ManuallyDrop>) -> Self { - println!("new a alloc memory, len: {}", m.len()); - SmartAllocate { ptr: m } + SmartAllocate { + ptr: m, + count: Default::default(), + } } #[inline] @@ -57,6 +179,12 @@ impl SmartAllocate { } } +impl Drop for SmartAllocate { + fn drop(&mut self) { + unsafe { std::mem::ManuallyDrop::drop(&mut self.ptr) }; + } +} + #[derive(Clone, Debug)] #[repr(C)] pub struct BlockBytes { @@ -75,14 +203,15 @@ impl BlockBytes { } impl Chunk for BlockBytes { + #[inline] fn get_data(&self) -> &[u8] { unsafe { &*slice_from_raw_parts(self.start.as_ptr(), self.n) } } - + #[inline] fn get_data_mut(&self) -> &mut [u8] { unsafe { &mut *slice_from_raw_parts_mut(self.start.as_ptr(), self.n) } } - + #[inline] fn size(&self) -> usize { self.n } @@ -169,7 +298,6 @@ impl OnlyLayoutAllocate { pub(crate) fn reset(&self) { self.len.store(0, Ordering::Relaxed); self.cursor.store(0, Ordering::Relaxed); - //self.ptr.clear(); } #[inline] @@ -199,9 +327,8 @@ impl Drop for OnlyLayoutAllocate { fn drop(&mut self) { self.cursor.store(0, Ordering::Relaxed); self.cursor.store(0, Ordering::Relaxed); - // TODO: free memory unsafe { - // ManuallyDrop::into_inner(self.ptr); + ManuallyDrop::drop(&mut self.ptr); } } } @@ -235,10 +362,6 @@ impl SliceAllocate { self.borrow_slice(start, size) } - // fn get_mut(&mut self, start: usize, size: usize) -> &mut [u8] { - // self.borrow_mut_slice(start, size) - // } - // Return the start locate offset pub(crate) fn alloc(&self, size: usize) -> &[u8] { let offset = self.cursor.fetch_add(size, Ordering::Relaxed); @@ -295,18 +418,6 @@ impl SliceAllocate { } } -// impl Allocate for SliceAllocate { -// type Block = (); -// -// fn alloc(&self, start: usize, n: usize) -> Self::Block { -// todo!() -// } -// -// fn size(&self) -> usize { -// todo!() -// } -// } - #[test] fn t_onlylayoutalloc() { let mut alloc: OnlyLayoutAllocate = OnlyLayoutAllocate::new(1 << 10); @@ -367,6 +478,3 @@ fn t_block_bytes() { assert_eq!(buffer[datum], datum as u8); } } - -#[test] -fn t_clone() {} diff --git a/src/skl/arena.rs b/src/skl/arena.rs index b1e5d3a..f749b30 100644 --- a/src/skl/arena.rs +++ b/src/skl/arena.rs @@ -3,9 +3,11 @@ use 
crate::skl::alloc::{OnlyLayoutAllocate, SliceAllocate}; use crate::skl::node::Node; use crate::skl::Allocate; use crate::skl::{alloc::Chunk, SmartAllocate}; +use crate::test_util::{mock_log, mock_log_terminal, tracing_log}; use crate::y::ValueStruct; use std::default; use std::fmt::format; +use std::io::Write; use std::marker::PhantomData; use std::mem::{size_of, ManuallyDrop}; use std::ptr::{addr_of, slice_from_raw_parts, slice_from_raw_parts_mut, NonNull}; @@ -15,6 +17,9 @@ use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; use std::sync::{Arc, Mutex}; use std::thread::{sleep, spawn}; use std::time::Duration; +use tracing::info; + +use super::alloc::FixSizeAllocate; const OFFSET_SIZE: usize = size_of::(); // FIXME: i don't know @@ -23,8 +28,8 @@ const PTR_ALIGN: usize = 7; /// `Arena` should be lock-free. #[derive(Debug)] pub struct Arena { - slice: SliceAllocate, - node_alloc: OnlyLayoutAllocate, + slice: FixSizeAllocate, + node_alloc: FixSizeAllocate, } unsafe impl Send for Arena {} @@ -34,34 +39,24 @@ unsafe impl Sync for Arena {} impl Arena { pub(crate) fn new(n: usize) -> Self { assert!(n > 0); - let slice_alloc = SliceAllocate::new(n); - let node_alloc = OnlyLayoutAllocate::new(n); + let slice_alloc = FixSizeAllocate::new(n + 1); + let node_alloc = FixSizeAllocate::new(n + Node::size()); // Don't store data at position 0 in order to reverse offset = 0 as a kind // of nil pointer + slice_alloc.alloc(); + node_alloc.alloc(); Self { slice: slice_alloc, node_alloc, } } - // TODO pub(crate) fn size(&self) -> u32 { - (self.slice.size() + self.node_alloc.len()) as u32 - } - - pub(crate) fn cap(&self) -> usize { - // self.slice.size() - todo!() - } - - // TODO: maybe use MaybeUint instead - pub(crate) fn reset(&self) { - self.slice.reset(); - self.node_alloc.reset(); + (self.slice.len() + self.node_alloc.len()) as u32 } pub(crate) fn valid(&self) -> bool { - !self.slice.ptr.is_empty() + !self.slice.empty() } // Returns a pointer to the node located at offset. If the offset is @@ -77,12 +72,15 @@ impl Arena { if offset == 0 { return None; } - Some(self.node_alloc.get_mut(offset)) + Some(self.node_alloc.get(offset)) } // Returns start location pub(crate) fn put_key(&self, key: &[u8]) -> u32 { - self.slice.append(key) as u32 + let (mut buffer, offset) = self.slice.alloc_slice(key.len()); + buffer.copy_from_slice(key); + println!("==》 {:?}, {:?}", buffer, key); + offset as u32 } // Put will *copy* val into arena. To make better use of this, reuse your input @@ -97,20 +95,19 @@ impl Arena { // Returns byte slice at offset. pub(crate) fn get_key(&self, offset: u32, size: u16) -> &[u8] { - self.slice.get(offset as usize, size as usize) + self.slice.get_slice(offset as usize, size as usize) } // Returns byte slice at offset. The given size should be just the value // size and should NOT include the meta bytes. pub(crate) fn get_val(&self, offset: u32, size: u16) -> ValueStruct { - let buffer = self.slice.get(offset as usize, size as usize); + let buffer = self.slice.get_slice(offset as usize, size as usize); ValueStruct::from(buffer) } // Return byte slice at offset. 
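The arena convention carried through all of these allocators is that offset 0 is reserved as a nil marker, so a u32 offset can stand in for an optional pointer. A safe-Rust analogue of that convention (the diff's implementation is lock-free and pointer-based, which this sketch deliberately avoids):

    struct TypedArena<T: Default> {
        slots: Vec<T>,
    }

    impl<T: Default> TypedArena<T> {
        fn new(cap: usize) -> Self {
            let mut slots = Vec::with_capacity(cap + 1);
            slots.push(T::default()); // burn slot 0 so offset 0 can mean "null"
            TypedArena { slots }
        }
        fn alloc(&mut self, value: T) -> u32 {
            self.slots.push(value);
            (self.slots.len() - 1) as u32 // handles are offsets, not pointers
        }
        fn get(&self, offset: u32) -> Option<&T> {
            if offset == 0 { None } else { self.slots.get(offset as usize) }
        }
    }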
- // FIXME: pub(crate) fn put_node(&self, height: isize) -> u32 { - let (_, offset) = self.node_alloc.alloc_offset(); + let (_, offset) = self.node_alloc.alloc(); offset as u32 } @@ -121,8 +118,9 @@ impl Arena { return 0; } let node = node as *const u8; - let ptr = self.node_alloc.ptr.as_ptr(); + let ptr = self.node_alloc.get_data_ptr() as *const u8; let offset = unsafe { node.offset_from(ptr) }; + info!("node offset {}", offset); offset as usize } @@ -161,32 +159,46 @@ fn t_arena_value() { assert_eq!(value, load_value); } +#[test] +fn t_arena_memory_allocator() { + tracing_log(); + let sz = 1 << 20; + let n = sz / Node::size(); + let arena = Arena::new(sz); + for i in 1..=n { + let start = arena.put_node(0); + let mut node = arena.get_mut_node(start as usize); + assert!(node.is_some()); + } + let len = arena.node_alloc.len(); + // had a zero node + assert_eq!(len, n * 96 + Node::size()); +} + #[test] fn t_arena_store_node() { - let arena = Arena::new(1 << 20); + tracing_log(); + let sz = 1 << 20; + let n = sz / Node::size(); + let arena = Arena::new(sz); let mut starts = vec![]; - for i in 0..5 { - let start = arena.put_node(i); + for i in 1..=n { + let start = arena.put_node(0); let mut node = arena.get_mut_node(start as usize).unwrap(); - node.height = i as u16; - node.value.fetch_add(i as u64, Ordering::Relaxed); + node.value.fetch_add(i as u64, Ordering::Release); starts.push((i, start)); } for (i, start) in starts { let node = arena.get_mut_node(start as usize).unwrap(); - let value = node.value.load(Ordering::Relaxed); - assert_eq!(node.height, i as u16); + let value = node.value.load(Ordering::Acquire); assert_eq!(value, i as u64); } - - let second_node = arena.get_node(Node::size()).unwrap(); - let offset = arena.get_node_offset(second_node); - assert_eq!(offset, Node::size()); } #[test] fn t_arena_currency() { + mock_log(); let arena = Arc::new(Arena::new(1 << 20)); let mut waits = vec![]; for i in 0..100 { diff --git a/src/skl/skip.rs b/src/skl/skip.rs index 9b376dc..d84213e 100644 --- a/src/skl/skip.rs +++ b/src/skl/skip.rs @@ -2,7 +2,7 @@ use crate::skl::{Cursor, HEIGHT_INCREASE, MAX_HEIGHT}; use crate::table::iterator::IteratorItem; use crate::Xiterator; use atom_box::AtomBox; -use log::info; +use log::{debug, info}; use rand::random; use serde_json::Value; use std::borrow::Cow; @@ -13,6 +13,7 @@ use std::ptr::null_mut; use std::sync::atomic::{AtomicPtr, Ordering}; use std::sync::Arc; use std::{cmp, ptr, ptr::NonNull, sync::atomic::AtomicI32}; +use tracing::field::debug; use crate::y::ValueStruct; @@ -67,12 +68,10 @@ impl SkipList { } pub(crate) fn arena_ref(&self) -> &Arena { - // unsafe {self.arena.as_ref()} &self.arena } pub(crate) fn arena_mut_ref(&self) -> &Arena { - // unsafe {self.arena.as_mut()} &self.arena } @@ -108,7 +107,6 @@ impl SkipList { ) -> (Option<&Node>, bool) { let mut x = self.get_head(); let mut level = self.get_height() - 1; - //println!("start to hight: {}", level); loop { // Assume x.key < key let mut next = self.get_next(x, level); @@ -348,11 +346,11 @@ impl SkipList { // gets the value associated with the key. 
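get(), below, is a thin wrapper over find_near(key, less=false, allow_equal=true): take the first node at or after the key and accept it only on an exact match. A sketch of that contract with BTreeMap standing in for the skiplist:

    use std::collections::BTreeMap;

    // range(key..) yields the first entry at or after `key`, mirroring
    // find_near's behaviour; only an exact match produces a value.
    fn get<'a>(list: &'a BTreeMap<Vec<u8>, Vec<u8>>, key: &[u8]) -> Option<&'a Vec<u8>> {
        let (k, v) = list.range(key.to_vec()..).next()?;
        if k.as_slice() == key { Some(v) } else { None }
    }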
// FIXME: maybe return Option<&ValueStruct> pub(crate) fn get(&self, key: &[u8]) -> Option { + info!("find a key: {:?}", key); let (node, found) = self.find_near(key, false, true); if !found { return None; } - // println!("find a key: {:?}", key); let (value_offset, value_size) = node.unwrap().get_value_offset(); Some(self.arena_ref().get_val(value_offset, value_size)) } @@ -395,7 +393,6 @@ impl Drop for SkipList { fn drop(&mut self) { let _ref = self._ref.load(Ordering::Relaxed); info!("Drop SkipList, reference: {}", _ref); - self.arena_mut_ref().reset(); } } @@ -602,63 +599,6 @@ impl SkipIterator { } } -// impl Xiterator for SkipIterator { -// type Output = IteratorItem; -// -// fn next(&self) -> Option { -// todo!() -// } -// -// fn rewind(&self) -> Option { -// todo!() -// } -// -// fn seek(&self, key: &[u8]) -> Option { -// if self.node.load(Ordering::Relaxed).is_null() { -// return None; -// } -// -// let node = self.node.load(Ordering::Relaxed); -// if node.is_null() { -// return None; -// } -// let key = node.key(self.st.arena_ref()).to_vec(); -// let value = node.value.load(Ordering::Relaxed); -// Some(IteratorItem{ key: node.key(self.st.arena_ref()).to_vec(), value: Default::default() }) -// } -// -// fn peek(&self) -> Option { -// todo!() -// } -// -// fn close(&self) { -// todo!() -// } -// } - -// impl<'a> Xiterator for SkipIterator<'a> { -// type Output = &'a Node; -// fn next(&self) -> Option { -// todo!() -// } -// -// fn rewind(&self) -> Option { -// todo!() -// } -// -// fn seek(&self, key: &[u8]) -> Option { -// todo!() -// } -// -// fn peek(&self) -> Option { -// todo!() -// } -// -// fn close(&self) { -// self.st.decr_ref(); -// } -// } - mod tests { use crate::skl::node::Node; use crate::skl::skip::SkipList; @@ -1089,30 +1029,15 @@ mod tests { } mod tests2 { - use crate::SkipList; + use crate::{SkipList, ValueStruct}; const ARENA_SIZE: usize = 1 << 20; #[test] fn atomic_swap_skip_list() { let mut st = SkipList::new(ARENA_SIZE); - } - - #[test] - fn gat() { - // #![allow(unused)] - - // trait IterableTypes { - // type Item<'me>; - // type Iterator<'me>: Iterator>; - // } - - // trait Iterable: IterableTypes { - // fn iter<'a>(&'a self) -> Self::Iterator<'a>; - // } - - // struct GatSimple {} - - // impl GatSimple {} + st.put(b"hello", ValueStruct::new(vec![], 0, 0, 0)); + let got = st.get(b"hello"); + assert!(got.is_some()); } } From ffca0f67fb72848931c8848edabbe2df18d95577 Mon Sep 17 00:00:00 2001 From: Rg Date: Sun, 9 Apr 2023 21:57:55 +0800 Subject: [PATCH 69/77] :dog: sleep --- src/kv.rs | 8 +-- src/skl/alloc.rs | 140 +++++++++++++++++++++++++++++++++++++---------- src/skl/arena.rs | 76 ++++++++++--------------- 3 files changed, 141 insertions(+), 83 deletions(-) diff --git a/src/kv.rs b/src/kv.rs index 8dae823..233ac43 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -419,7 +419,6 @@ impl KV { continue; } count += req.get_req().entries.read().len(); - info!("waiting for write"); while let Err(err) = self.ensure_room_for_write().await { tokio::time::sleep(Duration::from_millis(10)).await; } @@ -621,17 +620,13 @@ impl KV { async fn ensure_room_for_write(&self) -> Result<()> { defer! {info!("exit ensure room for write!")} // TODO a special global lock for this function - info!("(((((())))))))))))))"); let _ = self.share_lock.write().await; - info!("====))))))))"); if self.must_mt().mem_size() < self.opt.max_table_size as u32 { - info!(")))))))) {}", self.must_mt().mem_size()); return Ok(()); } + info!("flush memory table"); // A nil mt indicates that KV is being closed. 
- info!(")11)))))))"); assert!(!self.must_mt().empty()); - info!(")11)))))))"); let flush_task = FlushTask { mt: Some(self.must_mt().clone()), vptr: self.must_vptr(), @@ -686,7 +681,6 @@ impl KV { let p = crossbeam_epoch::pin(); let st = self.mem_st_manger.mt_ref(&p).as_raw(); assert!(!st.is_null()); - info!("wat the fuct"); unsafe { &*st } } diff --git a/src/skl/alloc.rs b/src/skl/alloc.rs index b588b33..fa2a9be 100644 --- a/src/skl/alloc.rs +++ b/src/skl/alloc.rs @@ -5,7 +5,10 @@ use std::fmt::{Debug, Display, Formatter}; use std::marker::PhantomData; use std::mem::{align_of, size_of, ManuallyDrop}; +use either::Either; +use libc::off_t; use log::info; +use std::alloc::alloc; use std::ptr::{slice_from_raw_parts, slice_from_raw_parts_mut, NonNull}; use std::sync::atomic::{AtomicPtr, AtomicU32, AtomicU64, AtomicUsize, Ordering}; use std::sync::Arc; @@ -32,6 +35,53 @@ pub trait Chunk: Send + Sync { fn size(&self) -> usize; } +pub struct EitherAllocate { + pub(crate) alloc: FixSizeAllocate, Node>>, + byte_size: AtomicUsize, +} + +impl EitherAllocate { + pub fn new(n: usize) -> Self { + EitherAllocate { + alloc: FixSizeAllocate::new(n), + byte_size: AtomicUsize::new(0), + } + } + + pub fn alloc_vec(&self, sz: usize) -> (&mut Vec, usize) { + let (either, offset) = self.alloc.alloc(); + *either = Either::Left(vec![0u8; sz]); + self.byte_size.fetch_add(sz, Ordering::Relaxed); + (either.as_mut().left().unwrap(), offset) + } + + pub fn alloc_node(&self) -> (&mut Node, usize) { + let (either, offset) = self.alloc.alloc(); + self.byte_size.fetch_add(Node::size(), Ordering::Relaxed); + *either = Either::Right(Node::default()); + (either.as_mut().right().unwrap(), offset) + } + + pub fn get_vec(&self, offset: usize) -> &mut Vec { + let either = self.alloc.get(offset); + either.as_mut().unwrap_left() + } + + pub fn get_node(&self, offset: usize) -> &mut Node { + let either = self.alloc.get(offset); + either.as_mut().unwrap_right() + } + + pub fn first_node(&self) -> &mut Node { + let either = self.alloc.get(0); + either.as_mut().unwrap_right() + } + + pub fn len(&self) -> usize { + self.byte_size.load(Ordering::Relaxed) + } +} + /// FixSizeAllocate fixed size memory allocator, WARNING: zero offset not store any T that for wrap Arena pub struct FixSizeAllocate { ptr: NonNull, @@ -66,37 +116,34 @@ impl FixSizeAllocate { #[inline] pub fn alloc(&self) -> (&mut T, usize) { - let offset = self.len.fetch_add(Self::size(), Ordering::Release); - let end = offset + Self::size(); - // println!("{}, {}, {}", offset, end, self.cap.load(Ordering::Acquire)); - assert!(end <= self.cap.load(Ordering::Acquire)); + let offset = self.len.fetch_add(1, Ordering::Release); let ptr = self.get_data_mut_ptr(); - let ref_data = unsafe { &mut *slice_from_raw_parts_mut(ptr.add(offset), end - offset) }; + let ref_data = + unsafe { &mut *slice_from_raw_parts_mut(ptr.add(self.cal_offset(offset)), 1) }; (&mut ref_data[0], offset) } #[inline] pub fn alloc_slice(&self, sz: usize) -> (&mut [T], usize) { let offset = self.len.fetch_add(Self::size() * sz, Ordering::Release); - let end = offset + sz * Self::size(); - assert!(end <= self.cap.load(Ordering::Acquire)); let ptr = self.get_data_mut_ptr(); - let mut ref_data = - unsafe { &mut *slice_from_raw_parts_mut(ptr.add(offset), (end - offset)) }; + let mut ref_data = unsafe { &mut *slice_from_raw_parts_mut(ptr.add(offset), sz) }; (ref_data, offset) } #[inline] pub fn get(&self, offset: usize) -> &mut T { let mut ptr = self.get_data_mut_ptr(); - let mut ref_data = unsafe { &mut 
*slice_from_raw_parts_mut(ptr.add(offset), Self::size()) }; + let mut ref_data = + unsafe { &mut *slice_from_raw_parts_mut(ptr.add(self.cal_offset(offset)), 1) }; &mut ref_data[0] } #[inline] pub fn get_slice(&self, offset: usize, n: usize) -> &mut [T] { let ptr = self.get_data_mut_ptr(); - let mut ref_data = unsafe { &mut *slice_from_raw_parts_mut(ptr.add(offset), n) }; + let mut ref_data = + unsafe { &mut *slice_from_raw_parts_mut(ptr.add(self.cal_offset(offset)), n) }; ref_data } @@ -110,6 +157,11 @@ impl FixSizeAllocate { self.len.load(Ordering::Acquire) == 0 } + #[inline] + pub(crate) fn cal_offset(&self, offset: usize) -> usize { + offset * size_of::() + } + #[inline] pub(crate) fn get_data_mut_ptr(&self) -> *mut T { self.get_data_ptr() as *mut T @@ -221,7 +273,7 @@ impl Chunk for BlockBytes { #[derive(Debug, Clone)] pub struct OnlyLayoutAllocate { cursor: Arc, - len: Arc, + cap: Arc, pub(crate) ptr: ManuallyDrop>, _data: PhantomData, } @@ -236,13 +288,13 @@ impl OnlyLayoutAllocate { } pub(crate) fn len(&self) -> usize { - self.len.load(Ordering::Relaxed) + self.cursor.load(Ordering::Relaxed) } pub fn new(n: usize) -> Self { OnlyLayoutAllocate { cursor: Arc::from(AtomicUsize::new(Self::size())), - len: Arc::from(AtomicUsize::new(n)), + cap: Arc::from(AtomicUsize::new(n)), ptr: ManuallyDrop::new(vec![0u8; n]), _data: Default::default(), } @@ -252,7 +304,7 @@ impl OnlyLayoutAllocate { /// **Note** if more than len, it will be panic. pub fn alloc(&self, start: usize) -> &T { let end = self.cursor.fetch_add(Self::size(), Ordering::Acquire); - assert!(end < self.len.load(Ordering::Relaxed)); + assert!(end < self.cap.load(Ordering::Relaxed)); let ptr = self.borrow_slice(start, Self::size()); let (pre, mid, suf) = unsafe { ptr.align_to() }; assert!(pre.is_empty()); @@ -263,7 +315,7 @@ impl OnlyLayoutAllocate { /// **Note** if more than len, it will be panic. 
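alloc and mut_alloc here are bump allocations: one fetch_add on the cursor claims the bytes, and exhaustion panics rather than grows, which is the behaviour the note warns about. A stripped-down sketch:

    use std::sync::atomic::{AtomicUsize, Ordering};

    struct Bump {
        cursor: AtomicUsize,
        cap: usize,
    }

    impl Bump {
        fn new(cap: usize) -> Self {
            Bump { cursor: AtomicUsize::new(0), cap }
        }
        // One fetch_add claims `size` bytes; exhaustion asserts rather than
        // grows, matching the "will be panic" note above.
        fn claim(&self, size: usize) -> usize {
            let start = self.cursor.fetch_add(size, Ordering::SeqCst);
            assert!(start + size <= self.cap, "arena exhausted");
            start
        }
    }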
pub fn mut_alloc(&self, start: usize) -> &mut T { let end = self.cursor.fetch_add(Self::size(), Ordering::Relaxed); - assert!(end < self.len.load(Ordering::Relaxed)); + assert!(end < self.cap.load(Ordering::Relaxed)); let ptr = self.borrow_mut_slice(start, Self::size()); let (pre, mid, _) = unsafe { ptr.align_to_mut() }; assert!(pre.is_empty()); @@ -272,7 +324,7 @@ impl OnlyLayoutAllocate { pub fn alloc_offset(&self) -> (&T, usize) { let offset = self.cursor.fetch_add(Self::size(), Ordering::Relaxed); - assert!(offset + Self::size() < self.len.load(Ordering::Relaxed)); + assert!(offset + Self::size() < self.cap.load(Ordering::Relaxed)); let ptr = self.borrow_slice(offset, Self::size()); let (pre, mid, _) = unsafe { ptr.align_to() }; assert!(pre.is_empty()); @@ -296,7 +348,7 @@ impl OnlyLayoutAllocate { } pub(crate) fn reset(&self) { - self.len.store(0, Ordering::Relaxed); + self.cap.store(0, Ordering::Relaxed); self.cursor.store(0, Ordering::Relaxed); } @@ -325,7 +377,6 @@ impl OnlyLayoutAllocate { impl Drop for OnlyLayoutAllocate { fn drop(&mut self) { - self.cursor.store(0, Ordering::Relaxed); self.cursor.store(0, Ordering::Relaxed); unsafe { ManuallyDrop::drop(&mut self.ptr); @@ -337,23 +388,21 @@ impl Drop for OnlyLayoutAllocate { #[derive(Debug)] pub struct SliceAllocate { cursor: Arc, - len: Arc, + cap: Arc, pub(crate) ptr: ManuallyDrop>, } unsafe impl Send for SliceAllocate {} -unsafe impl Sync for SliceAllocate {} - impl SliceAllocate { - pub(crate) fn size(&self) -> usize { - self.len.load(Ordering::Relaxed) + pub fn len(&self) -> usize { + self.cursor.load(Ordering::Relaxed) } pub(crate) fn new(n: usize) -> Self { SliceAllocate { cursor: Arc::from(AtomicUsize::new(1)), - len: Arc::from(AtomicUsize::new(n)), + cap: Arc::from(AtomicUsize::new(n)), ptr: ManuallyDrop::new(vec![0u8; n]), } } @@ -365,19 +414,19 @@ impl SliceAllocate { // Return the start locate offset pub(crate) fn alloc(&self, size: usize) -> &[u8] { let offset = self.cursor.fetch_add(size, Ordering::Relaxed); - assert!(self.cursor.load(Ordering::Relaxed) < self.len.load(Ordering::Relaxed)); + assert!(self.cursor.load(Ordering::Relaxed) < self.cap.load(Ordering::Relaxed)); self.borrow_slice(offset, size) } fn alloc_mut(&self, size: usize) -> &mut [u8] { let offset = self.cursor.fetch_add(size, Ordering::Relaxed); - assert!(self.cursor.load(Ordering::Relaxed) < self.len.load(Ordering::Relaxed)); + assert!(self.cursor.load(Ordering::Relaxed) < self.cap.load(Ordering::Relaxed)); self.borrow_mut_slice(offset, size) } pub fn append(&self, bytes: &[u8]) -> usize { let offset = self.cursor.fetch_add(bytes.len(), Ordering::Relaxed); - assert!(self.cursor.load(Ordering::Relaxed) < self.len.load(Ordering::Relaxed)); + assert!(self.cursor.load(Ordering::Relaxed) < self.cap.load(Ordering::Relaxed)); let buffer = self.borrow_mut_slice(offset, bytes.len()); buffer.copy_from_slice(bytes); offset @@ -390,7 +439,7 @@ impl SliceAllocate { } pub fn reset(&self) { - self.len.swap(0, Ordering::Relaxed); + self.cap.swap(0, Ordering::Relaxed); self.cursor.store(0, Ordering::Relaxed); //self.ptr.clear(); } @@ -478,3 +527,36 @@ fn t_block_bytes() { assert_eq!(buffer[datum], datum as u8); } } + +#[test] +fn t_enum() { + let mut alloc = EitherAllocate::new(1 << 10); + let mut offsets = vec![]; + for i in 0..1 << 10 { + if i % 2 == 0 { + let (mut slice, offset) = alloc.alloc_vec(i); + slice.fill((i % u8::MAX as usize) as u8); + offsets.push(offset); + } else { + let (mut node, offset) = alloc.alloc_node(); + offsets.push(offset); + 
node.value.store(i as u64, Ordering::Relaxed); + } + } + + for i in 0..1 << 10 { + if i % 2 == 0 { + let slice = alloc.get_vec(offsets[i]); + let value = (i % u8::MAX as usize) as u8; + assert_eq!(slice.len(), i); + let mut v = vec![0u8; slice.len()]; + v.fill(value); + assert_eq!(&mut v, slice); + } else { + let node = alloc.get_node(offsets[i]); + assert_eq!(node.value.load(Ordering::Relaxed), i as u64); + } + } + let len = alloc.len(); + println!("{}", len); +} diff --git a/src/skl/arena.rs b/src/skl/arena.rs index f749b30..8effb15 100644 --- a/src/skl/arena.rs +++ b/src/skl/arena.rs @@ -3,11 +3,9 @@ use crate::skl::alloc::{OnlyLayoutAllocate, SliceAllocate}; use crate::skl::node::Node; use crate::skl::Allocate; use crate::skl::{alloc::Chunk, SmartAllocate}; -use crate::test_util::{mock_log, mock_log_terminal, tracing_log}; use crate::y::ValueStruct; use std::default; use std::fmt::format; -use std::io::Write; use std::marker::PhantomData; use std::mem::{size_of, ManuallyDrop}; use std::ptr::{addr_of, slice_from_raw_parts, slice_from_raw_parts_mut, NonNull}; @@ -17,9 +15,6 @@ use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; use std::sync::{Arc, Mutex}; use std::thread::{sleep, spawn}; use std::time::Duration; -use tracing::info; - -use super::alloc::FixSizeAllocate; const OFFSET_SIZE: usize = size_of::(); // FIXME: i don't know @@ -28,8 +23,8 @@ const PTR_ALIGN: usize = 7; /// `Arena` should be lock-free. #[derive(Debug)] pub struct Arena { - slice: FixSizeAllocate, - node_alloc: FixSizeAllocate, + slice: SliceAllocate, + node_alloc: OnlyLayoutAllocate, } unsafe impl Send for Arena {} @@ -39,12 +34,10 @@ unsafe impl Sync for Arena {} impl Arena { pub(crate) fn new(n: usize) -> Self { assert!(n > 0); - let slice_alloc = FixSizeAllocate::new(n + 1); - let node_alloc = FixSizeAllocate::new(n + Node::size()); + let slice_alloc = SliceAllocate::new(n); + let node_alloc = OnlyLayoutAllocate::new(n); // Don't store data at position 0 in order to reverse offset = 0 as a kind // of nil pointer - slice_alloc.alloc(); - node_alloc.alloc(); Self { slice: slice_alloc, node_alloc, @@ -55,8 +48,14 @@ impl Arena { (self.slice.len() + self.node_alloc.len()) as u32 } + // TODO: maybe use MaybeUint instead + pub(crate) fn reset(&self) { + self.slice.reset(); + self.node_alloc.reset(); + } + pub(crate) fn valid(&self) -> bool { - !self.slice.empty() + !self.slice.ptr.is_empty() } // Returns a pointer to the node located at offset. If the offset is @@ -72,15 +71,12 @@ impl Arena { if offset == 0 { return None; } - Some(self.node_alloc.get(offset)) + Some(self.node_alloc.get_mut(offset)) } // Returns start location pub(crate) fn put_key(&self, key: &[u8]) -> u32 { - let (mut buffer, offset) = self.slice.alloc_slice(key.len()); - buffer.copy_from_slice(key); - println!("==》 {:?}, {:?}", buffer, key); - offset as u32 + self.slice.append(key) as u32 } // Put will *copy* val into arena. To make better use of this, reuse your input @@ -95,19 +91,20 @@ impl Arena { // Returns byte slice at offset. pub(crate) fn get_key(&self, offset: u32, size: u16) -> &[u8] { - self.slice.get_slice(offset as usize, size as usize) + self.slice.get(offset as usize, size as usize) } // Returns byte slice at offset. The given size should be just the value // size and should NOT include the meta bytes. 
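get_val below re-attaches the meta header that the stored size deliberately excludes. A sketch of one plausible slot layout ([meta, user_meta, cas_counter, value...]); the field order and widths are assumptions, not the crate's exact encoding:

    use std::convert::TryInto;

    struct ValueStruct { meta: u8, user_meta: u8, cas_counter: u64, value: Vec<u8> }

    // Re-attach the header that get_val's `size` argument excludes.
    fn decode_value(buf: &[u8]) -> ValueStruct {
        ValueStruct {
            meta: buf[0],
            user_meta: buf[1],
            cas_counter: u64::from_be_bytes(buf[2..10].try_into().unwrap()),
            value: buf[10..].to_vec(),
        }
    }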
pub(crate) fn get_val(&self, offset: u32, size: u16) -> ValueStruct { - let buffer = self.slice.get_slice(offset as usize, size as usize); + let buffer = self.slice.get(offset as usize, size as usize); ValueStruct::from(buffer) } // Return byte slice at offset. + // FIXME: pub(crate) fn put_node(&self, height: isize) -> u32 { - let (_, offset) = self.node_alloc.alloc(); + let (_, offset) = self.node_alloc.alloc_offset(); offset as u32 } @@ -118,9 +115,8 @@ impl Arena { return 0; } let node = node as *const u8; - let ptr = self.node_alloc.get_data_ptr() as *const u8; + let ptr = self.node_alloc.ptr.as_ptr(); let offset = unsafe { node.offset_from(ptr) }; - info!("node offset {}", offset); offset as usize } @@ -159,46 +155,32 @@ fn t_arena_value() { assert_eq!(value, load_value); } -#[test] -fn t_arena_memory_allocator() { - tracing_log(); - let sz = 1 << 20; - let n = sz / Node::size(); - let arena = Arena::new(sz); - for i in 1..=n { - let start = arena.put_node(0); - let mut node = arena.get_mut_node(start as usize); - assert!(node.is_some()); - } - let len = arena.node_alloc.len(); - // had a zero node - assert_eq!(len, n * 96 + Node::size()); -} - #[test] fn t_arena_store_node() { - tracing_log(); - let sz = 1 << 20; - let n = sz / Node::size(); - let arena = Arena::new(sz); + let arena = Arena::new(1 << 20); let mut starts = vec![]; - for i in 1..=n { - let start = arena.put_node(0); + for i in 0..5 { + let start = arena.put_node(i); let mut node = arena.get_mut_node(start as usize).unwrap(); - node.value.fetch_add(i as u64, Ordering::Release); + node.height = i as u16; + node.value.fetch_add(i as u64, Ordering::Relaxed); starts.push((i, start)); } for (i, start) in starts { let node = arena.get_mut_node(start as usize).unwrap(); - let value = node.value.load(Ordering::Acquire); + let value = node.value.load(Ordering::Relaxed); + assert_eq!(node.height, i as u16); assert_eq!(value, i as u64); } + + let second_node = arena.get_node(Node::size()).unwrap(); + let offset = arena.get_node_offset(second_node); + assert_eq!(offset, Node::size()); } #[test] fn t_arena_currency() { - mock_log(); let arena = Arc::new(Arena::new(1 << 20)); let mut waits = vec![]; for i in 0..100 { From 1f198cd0988b377ee45214a9a4d740cfd94c7f2f Mon Sep 17 00:00:00 2001 From: Rg Date: Mon, 10 Apr 2023 09:38:40 +0800 Subject: [PATCH 70/77] :dog: sleep --- src/iterator.rs | 36 ++++++- src/kv.rs | 64 +++++++---- src/options/mod.rs | 3 - src/skl/alloc.rs | 257 +++------------------------------------------ src/skl/arena.rs | 6 +- src/skl/mod.rs | 2 +- src/skl/skip.rs | 2 +- src/types.rs | 13 --- src/value_log.rs | 27 +++-- 9 files changed, 110 insertions(+), 300 deletions(-) diff --git a/src/iterator.rs b/src/iterator.rs index 07e731f..188aedd 100644 --- a/src/iterator.rs +++ b/src/iterator.rs @@ -1,13 +1,13 @@ use crate::iterator::PreFetchStatus::Prefetched; use crate::kv::_BADGER_PREFIX; -use crate::types::{ArcMx, ArcRW, Closer, TArcMx, TArcRW}; -use crate::MergeIterOverIterator; +use crate::types::{ArcMx, ArcRW, Channel, Closer, TArcMx, TArcRW}; use crate::{ kv::KV, types::XArc, value_log::{MetaBit, ValuePointer}, Decode, Result, Xiterator, }; +use crate::{MergeIterOverIterator, ValueStruct}; use log::Metadata; use parking_lot::RwLock; use std::future::Future; @@ -15,6 +15,8 @@ use std::pin::Pin; use std::sync::atomic::{AtomicU8, Ordering}; use std::sync::Arc; use std::{io::Cursor, sync::atomic::AtomicU64}; +use tokio::io::AsyncWriteExt; +use tokio::time::Sleep; #[derive(Debug, PartialEq)] pub(crate) enum 
PreFetchStatus {
@@ -22,7 +24,7 @@ pub(crate) enum PreFetchStatus {
     Prefetched,
 }
 
-type KVItem = TArcRW<KVItemInner>;
+pub(crate) type KVItem = TArcRW<KVItemInner>;
 
 // Returned during iteration. Both the key() and value() output is only valid until
 // iterator.next() is called.
@@ -41,11 +43,39 @@ pub(crate) struct KVItemInner {
 }
 
 impl KVItemInner {
+    pub(crate) fn new(key: Vec<u8>, value: ValueStruct, kv: XArc<KV>) -> KVItemInner {
+        Self {
+            status: Arc::new(tokio::sync::RwLock::new(PreFetchStatus::Empty)),
+            kv,
+            key,
+            value: Arc::new(Default::default()),
+            vptr: value.value,
+            meta: value.meta,
+            user_meta: value.user_meta,
+            cas_counter: Arc::new(AtomicU64::new(value.cas_counter)),
+            wg: Closer::new("kv".to_owned()),
+            err: Ok(()),
+        }
+    }
+
     // Returns the key. Remember to copy if you need to access it outside the iteration loop.
     pub(crate) fn key(&self) -> &[u8] {
         &self.key
     }
 
+    pub async fn get_value(&self) -> Result<Vec<u8>> {
+        let ch = Channel::new(1);
+        self.value(|value| {
+            let tx = ch.tx();
+            Box::pin(async move {
+                tx.send(value).await.unwrap();
+                Ok(())
+            })
+        })
+        .await?;
+        Ok(ch.recv().await.unwrap())
+    }
+
     // Value retrieves the value of the item from the value log. It calls the
     // consumer function with a slice argument representing the value. In case
     // of error, the consumer function is not called.
diff --git a/src/kv.rs b/src/kv.rs
index 233ac43..9bc47b7 100644
--- a/src/kv.rs
+++ b/src/kv.rs
@@ -1,11 +1,11 @@
-use crate::iterator::{IteratorExt, IteratorOptions, KVItemInner};
+use crate::iterator::{IteratorExt, IteratorOptions, KVItem, KVItemInner};
 use crate::levels::{LevelsController, XLevelsController};
 use crate::manifest::{open_or_create_manifest_file, Manifest, ManifestFile};
 use crate::options::Options;
 use crate::table::builder::Builder;
 use crate::table::iterator::{IteratorImpl, IteratorItem};
 use crate::table::table::{new_file_name, Table, TableCore};
-use crate::types::{ArcMx, Channel, Closer, TArcMx, XArc, XWeak};
+use crate::types::{ArcMx, Channel, Closer, TArcMx, TArcRW, XArc, XWeak};
 use crate::value_log::{
     ArcRequest, Entry, EntryPair, EntryType, MetaBit, Request, ValueLogCore, ValuePointer,
     MAX_KEY_SIZE,
@@ -85,7 +85,7 @@ pub struct KV {
     value_dir_guard: File,
     pub closers: Closers,
     // Our latest (actively written) in-memory table.
-    mem_st_manger: Arc,
+    pub mem_st_manger: Arc,
     // Add here only AFTER pushing to flush_ch
     pub write_ch: Channel,
     // Incremented in the non-concurrently accessed write loop. But also accessed outside. So
@@ -205,7 +205,7 @@ impl KV {
     }
 
     // Get the latest ValueLog recovery pointer
-    let item = match xout.get(_HEAD) {
+    let item = match xout._get(_HEAD) {
         Err(NotFound) => ValueStruct::default(), // Give it a default value
         Err(_) => return Err("Retrieving head".into()),
         Ok(item) => item,
@@ -251,7 +251,7 @@
     // TODO why?
     if entry.cas_counter_check != 0 {
-        let old_value = xout.get(&entry.key)?;
+        let old_value = xout._get(&entry.key)?;
         if old_value.cas_counter != entry.cas_counter_check {
             return Ok(true);
         }
@@ -328,10 +328,10 @@ impl KV {
 
     // get returns the value in `mem_table` or disk for given key.
     // Note that value will include meta byte.
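+    // (The underscore-prefixed `_get` below is the internal raw read; external callers
+    // should go through `ArcKV::get` / `ArcKV::exists`, which resolve value pointers
+    // through the value log instead of returning the raw ValueStruct.)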
-    pub(crate) fn get(&self, key: &[u8]) -> Result<ValueStruct> {
+    pub(crate) fn _get(&self, key: &[u8]) -> Result<ValueStruct> {
         let p = crossbeam_epoch::pin();
         let tables = self.get_mem_tables(&p);
-
+        // info!("tables {}", tables.len());
         // TODO add metrics
         for tb in tables {
             let vs = unsafe { tb.as_ref().unwrap().get(key) };
@@ -340,7 +340,7 @@
             }
             let vs = vs.unwrap();
             // TODO why
-            if vs.meta != 0 && !vs.value.is_empty() {
+            if vs.meta != 0 || !vs.value.is_empty() {
                 return Ok(vs);
             }
         }
@@ -492,7 +492,8 @@ impl KV {
     // TODO
     pub(crate) async fn batch_set(&self, entries: Vec<Entry>) -> Result<Vec<ArcRequest>> {
         let mut bad = vec![];
-        let mut b = vec![Request::default()];
+        let mut batch_reqs = vec![];
+        let mut b = Some(Request::default());
         let mut count = 0;
         let mut sz = 0u64;
         for entry in entries {
@@ -506,25 +507,42 @@
             }
             count += 1;
             sz += self.opt.estimate_size(&entry) as u64;
-            let req = b.last_mut().unwrap();
-            req.entries
-                .write()
-                .push(parking_lot::RwLock::new(EntryType::from(entry)));
-            req.ptrs.lock().push(None);
+
+            {
+                b.as_ref()
+                    .unwrap()
+                    .entries
+                    .write()
+                    .push(parking_lot::RwLock::new(EntryType::from(entry)));
+                b.as_ref().unwrap().ptrs.lock().push(None);
+            }
+
             // Seal the current batch once either the entry count or the byte-size budget is hit.
             if count >= self.opt.max_batch_count || sz >= self.opt.max_batch_size {
-                b.push(Request::default());
+                let task_req = b.replace(Request::default());
+                batch_reqs.push(task_req.unwrap());
+                count = 0;
+                sz = 0;
+            }
+        }
+        if let Some(req) = b {
+            if !req.entries.read().is_empty() {
+                batch_reqs.push(req);
             }
         }
+
         let mut reqs = vec![];
-        for req in b {
+        for req in batch_reqs {
             if req.entries.read().is_empty() {
                 break;
             }
             let arc_req = ArcRequest::from(req);
             reqs.push(arc_req.clone());
             assert!(!self.write_ch.is_close());
+            info!(
+                "send tasks to write, entries: {}",
+                arc_req.get_req().entries.read().len()
+            );
             self.write_ch.send(arc_req).await.unwrap();
-            info!("send task to write");
         }
         if !bad.is_empty() {
             let req = Request::default();
@@ -552,7 +570,7 @@
             let mut entry_pair = pair.write();
             let entry = entry_pair.entry();
             if entry.cas_counter_check != 0 {
-                let old_value = self.get(&entry.key)?;
+                let old_value = self._get(&entry.key)?;
                 // No need to decode existing value. Just need old CAS counter.
                 if old_value.cas_counter != entry.cas_counter_check {
                     entry_pair.set_resp(Err(Error::ValueCasMisMatch));
@@ -601,7 +619,7 @@
     }
 
     fn exists(&self, key: &[u8]) -> Result<bool> {
-        let value = self.get(key)?;
+        let value = self._get(key)?;
         if value.value.is_empty() && value.meta == 0 {
             return Ok(false);
         }
@@ -677,7 +695,7 @@
         lc
     }
 
-    fn must_mt(&self) -> &SkipList {
+    pub(crate) fn must_mt(&self) -> &SkipList {
         let p = crossbeam_epoch::pin();
         let st = self.mem_st_manger.mt_ref(&p).as_raw();
         assert!(!st.is_null());
@@ -737,6 +755,12 @@ impl ArcKV {
         self.manifest.write().await
     }
 
+    pub async fn get(&self, key: &[u8]) -> Result<Vec<u8>> {
+        let got = self._get(key)?;
+        let inner = KVItemInner::new(key.to_vec(), got, self.clone());
+        inner.get_value().await
+    }
+
     /// Closes a KV. It's crucial to call it to ensure all the pending updates
     /// make their way to disk.
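+    ///
+    /// A minimal lifecycle sketch (the key and options are illustrative only):
+    ///
+    /// ```ignore
+    /// let kv = KV::open(Options::default()).await?;
+    /// let value = kv.get(b"key").await?; // resolves value pointers via the value log
+    /// kv.close().await?;                 // flush pending updates before dropping
+    /// ```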
pub async fn close(&self) -> Result<()> { diff --git a/src/options/mod.rs b/src/options/mod.rs index a9e658d..79d970a 100644 --- a/src/options/mod.rs +++ b/src/options/mod.rs @@ -107,6 +107,3 @@ impl Default for Options { } } } - -#[test] -fn it() {} diff --git a/src/skl/alloc.rs b/src/skl/alloc.rs index fa2a9be..0777510 100644 --- a/src/skl/alloc.rs +++ b/src/skl/alloc.rs @@ -5,6 +5,7 @@ use std::fmt::{Debug, Display, Formatter}; use std::marker::PhantomData; use std::mem::{align_of, size_of, ManuallyDrop}; +use atom_box::AtomBox; use either::Either; use libc::off_t; use log::info; @@ -35,234 +36,33 @@ pub trait Chunk: Send + Sync { fn size(&self) -> usize; } -pub struct EitherAllocate { - pub(crate) alloc: FixSizeAllocate, Node>>, - byte_size: AtomicUsize, -} - -impl EitherAllocate { - pub fn new(n: usize) -> Self { - EitherAllocate { - alloc: FixSizeAllocate::new(n), - byte_size: AtomicUsize::new(0), - } - } - - pub fn alloc_vec(&self, sz: usize) -> (&mut Vec, usize) { - let (either, offset) = self.alloc.alloc(); - *either = Either::Left(vec![0u8; sz]); - self.byte_size.fetch_add(sz, Ordering::Relaxed); - (either.as_mut().left().unwrap(), offset) - } - - pub fn alloc_node(&self) -> (&mut Node, usize) { - let (either, offset) = self.alloc.alloc(); - self.byte_size.fetch_add(Node::size(), Ordering::Relaxed); - *either = Either::Right(Node::default()); - (either.as_mut().right().unwrap(), offset) - } - - pub fn get_vec(&self, offset: usize) -> &mut Vec { - let either = self.alloc.get(offset); - either.as_mut().unwrap_left() - } - - pub fn get_node(&self, offset: usize) -> &mut Node { - let either = self.alloc.get(offset); - either.as_mut().unwrap_right() - } - - pub fn first_node(&self) -> &mut Node { - let either = self.alloc.get(0); - either.as_mut().unwrap_right() - } - - pub fn len(&self) -> usize { - self.byte_size.load(Ordering::Relaxed) - } -} - -/// FixSizeAllocate fixed size memory allocator, WARNING: zero offset not store any T that for wrap Arena -pub struct FixSizeAllocate { - ptr: NonNull, - cap: AtomicUsize, - len: AtomicUsize, -} - -impl Debug for FixSizeAllocate { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - f.debug_struct("FixSizeAllocate") - .field("cap", &self.cap) - .field("len", &self.cap) - .finish() - } -} - -impl FixSizeAllocate { - pub(crate) fn size() -> usize { - size_of::() - } - - pub fn new(mut sz: usize) -> Self { - let layout = std::alloc::Layout::array::(sz).unwrap(); - let ptr = unsafe { std::alloc::alloc(layout) } as *mut T; - let mut allocate = FixSizeAllocate { - ptr: NonNull::new(ptr).unwrap(), - cap: AtomicUsize::new(sz), - len: AtomicUsize::new(0), - }; - allocate - } - - #[inline] - pub fn alloc(&self) -> (&mut T, usize) { - let offset = self.len.fetch_add(1, Ordering::Release); - let ptr = self.get_data_mut_ptr(); - let ref_data = - unsafe { &mut *slice_from_raw_parts_mut(ptr.add(self.cal_offset(offset)), 1) }; - (&mut ref_data[0], offset) - } - - #[inline] - pub fn alloc_slice(&self, sz: usize) -> (&mut [T], usize) { - let offset = self.len.fetch_add(Self::size() * sz, Ordering::Release); - let ptr = self.get_data_mut_ptr(); - let mut ref_data = unsafe { &mut *slice_from_raw_parts_mut(ptr.add(offset), sz) }; - (ref_data, offset) - } - - #[inline] - pub fn get(&self, offset: usize) -> &mut T { - let mut ptr = self.get_data_mut_ptr(); - let mut ref_data = - unsafe { &mut *slice_from_raw_parts_mut(ptr.add(self.cal_offset(offset)), 1) }; - &mut ref_data[0] - } - - #[inline] - pub fn get_slice(&self, offset: usize, n: usize) -> &mut 
[T] { - let ptr = self.get_data_mut_ptr(); - let mut ref_data = - unsafe { &mut *slice_from_raw_parts_mut(ptr.add(self.cal_offset(offset)), n) }; - ref_data - } - - #[inline] - pub fn len(&self) -> usize { - self.len.load(Ordering::Acquire) - } - - #[inline] - pub fn empty(&self) -> bool { - self.len.load(Ordering::Acquire) == 0 - } - - #[inline] - pub(crate) fn cal_offset(&self, offset: usize) -> usize { - offset * size_of::() - } - - #[inline] - pub(crate) fn get_data_mut_ptr(&self) -> *mut T { - self.get_data_ptr() as *mut T - } - - #[inline] - pub(crate) fn get_data_ptr(&self) -> *const T { - self.ptr.as_ptr() - } -} - -impl Drop for FixSizeAllocate { - fn drop(&mut self) { - info!( - "Drop fix size allocator, cap:{}, len:{}", - self.cap.load(Ordering::Acquire), - self.len.load(Ordering::Acquire) - ); - if self.cap.load(Ordering::Acquire) != 0 { - let layout = std::alloc::Layout::array::(self.cap.load(Ordering::Acquire)).unwrap(); - unsafe { - std::alloc::dealloc(self.ptr.as_ptr() as *mut u8, layout); - } - } - } -} - #[derive(Debug)] #[repr(C)] -pub struct SmartAllocate { - pub(crate) ptr: std::mem::ManuallyDrop>, - pub(crate) count: AtomicU64, -} - -impl Allocate for SmartAllocate { - type Block = impl Chunk; - #[inline] - fn alloc(&self, start: usize, n: usize) -> Self::Block { - assert!(start + n <= self.size()); - self.count.store((start + n) as u64, Ordering::Release); - let ptr = self.get_data_ptr(); - let block_ptr = unsafe { ptr.add(start) as *mut u8 }; - let block = BlockBytes::new(NonNull::new(block_ptr).unwrap(), n); - block - } - #[inline] - fn size(&self) -> usize { - self.ptr.len() - } - #[inline] - fn used_count(&self) -> usize { - self.count.load(Ordering::Acquire) as usize - } -} - -impl SmartAllocate { - pub(crate) fn new(m: std::mem::ManuallyDrop>) -> Self { - SmartAllocate { - ptr: m, - count: Default::default(), - } - } - - #[inline] - pub(crate) fn get_data_ptr(&self) -> *const u8 { - self.ptr.as_ptr() - } -} - -impl Drop for SmartAllocate { - fn drop(&mut self) { - unsafe { std::mem::ManuallyDrop::drop(&mut self.ptr) }; - } -} - -#[derive(Clone, Debug)] -#[repr(C)] pub struct BlockBytes { - start: NonNull, + start: AtomicPtr, n: usize, } -unsafe impl Send for BlockBytes {} - -unsafe impl Sync for BlockBytes {} - impl BlockBytes { - pub(crate) fn new(start: NonNull, n: usize) -> Self { - BlockBytes { start, n } + pub(crate) fn new(start: *mut u8, n: usize) -> Self { + BlockBytes { + start: AtomicPtr::new(start), + n, + } } } impl Chunk for BlockBytes { #[inline] fn get_data(&self) -> &[u8] { - unsafe { &*slice_from_raw_parts(self.start.as_ptr(), self.n) } + unsafe { &*slice_from_raw_parts(self.start.load(Ordering::Relaxed), self.n) } } + #[inline] fn get_data_mut(&self) -> &mut [u8] { - unsafe { &mut *slice_from_raw_parts_mut(self.start.as_ptr(), self.n) } + unsafe { &mut *slice_from_raw_parts_mut(self.start.load(Ordering::Relaxed), self.n) } } + #[inline] fn size(&self) -> usize { self.n @@ -516,7 +316,7 @@ fn t_onlylayoutalloc_slice() { #[test] fn t_block_bytes() { let mut buffer = vec![0u8; 1024]; - let block = BlockBytes::new(NonNull::new(buffer.as_mut_ptr()).unwrap(), 10); + let block = BlockBytes::new(buffer.as_mut_ptr(), 10); { let data = block.get_data_mut(); for datum in 0..data.len() { @@ -527,36 +327,3 @@ fn t_block_bytes() { assert_eq!(buffer[datum], datum as u8); } } - -#[test] -fn t_enum() { - let mut alloc = EitherAllocate::new(1 << 10); - let mut offsets = vec![]; - for i in 0..1 << 10 { - if i % 2 == 0 { - let (mut slice, offset) = 
alloc.alloc_vec(i); - slice.fill((i % u8::MAX as usize) as u8); - offsets.push(offset); - } else { - let (mut node, offset) = alloc.alloc_node(); - offsets.push(offset); - node.value.store(i as u64, Ordering::Relaxed); - } - } - - for i in 0..1 << 10 { - if i % 2 == 0 { - let slice = alloc.get_vec(offsets[i]); - let value = (i % u8::MAX as usize) as u8; - assert_eq!(slice.len(), i); - let mut v = vec![0u8; slice.len()]; - v.fill(value); - assert_eq!(&mut v, slice); - } else { - let node = alloc.get_node(offsets[i]); - assert_eq!(node.value.load(Ordering::Relaxed), i as u64); - } - } - let len = alloc.len(); - println!("{}", len); -} diff --git a/src/skl/arena.rs b/src/skl/arena.rs index 8effb15..2fdfb67 100644 --- a/src/skl/arena.rs +++ b/src/skl/arena.rs @@ -1,8 +1,8 @@ // use crate::skl::{Node, OwnedNode, MAX_HEIGHT, MAX_NODE_SIZE}; +use crate::skl::alloc::Chunk; use crate::skl::alloc::{OnlyLayoutAllocate, SliceAllocate}; use crate::skl::node::Node; use crate::skl::Allocate; -use crate::skl::{alloc::Chunk, SmartAllocate}; use crate::y::ValueStruct; use std::default; use std::fmt::format; @@ -27,10 +27,6 @@ pub struct Arena { node_alloc: OnlyLayoutAllocate, } -unsafe impl Send for Arena {} - -unsafe impl Sync for Arena {} - impl Arena { pub(crate) fn new(n: usize) -> Self { assert!(n > 0); diff --git a/src/skl/mod.rs b/src/skl/mod.rs index bc9303e..7cda973 100644 --- a/src/skl/mod.rs +++ b/src/skl/mod.rs @@ -4,7 +4,7 @@ mod cursor; mod node; mod skip; -pub use alloc::{Allocate, BlockBytes, Chunk, SmartAllocate}; +pub use alloc::{Allocate, BlockBytes, Chunk}; pub use arena::Arena; pub use cursor::Cursor; pub use node::Node; diff --git a/src/skl/skip.rs b/src/skl/skip.rs index d84213e..9df4c13 100644 --- a/src/skl/skip.rs +++ b/src/skl/skip.rs @@ -346,7 +346,7 @@ impl SkipList { // gets the value associated with the key. // FIXME: maybe return Option<&ValueStruct> pub(crate) fn get(&self, key: &[u8]) -> Option { - info!("find a key: {:?}", key); + // info!("find a key: {:?}", key); let (node, found) = self.find_near(key, false, true); if !found { return None; diff --git a/src/types.rs b/src/types.rs index 982d029..3fea7a8 100644 --- a/src/types.rs +++ b/src/types.rs @@ -311,19 +311,6 @@ impl Deref for XVec { } } -#[tokio::test] -async fn it_closer1() { - let closer = Closer::new("test".to_owned()); - let ch = closer.has_been_closed(); - tokio::select! 
{ - ret = ch.recv() => { - println!("{:?}", ret); - } - } - // let err = closer.has_been_closed().recv().await; - // println!("{:?}", err); -} - #[test] fn it_closer() { let runtime = tokio::runtime::Runtime::new().unwrap(); diff --git a/src/value_log.rs b/src/value_log.rs index 5cea934..4f6e69a 100644 --- a/src/value_log.rs +++ b/src/value_log.rs @@ -341,7 +341,7 @@ impl EntryPair { } } -pub(crate) struct Request { +pub struct Request { // Input values, NOTE: RefCell is called concurrency pub(crate) entries: RwLock>>, // Output Values and wait group stuff below @@ -385,20 +385,29 @@ unsafe impl Send for ArcRequest {} unsafe impl Sync for ArcRequest {} impl ArcRequest { - pub(crate) fn get_req(&self) -> Arc { - self.inner.clone() + pub fn req_ref(&self) -> &Arc { + &self.inner } - pub(crate) fn req_ref(&self) -> &Arc { - &self.inner + pub fn to_inner(self) -> Request { + Arc::into_inner(self.inner).unwrap() + } + + pub async fn is_ok(&self) -> bool { + let resp = self.get_req().get_resp().await; + resp.is_ok() + } + + pub async fn get_resp(&self) -> Result<()> { + self.get_req().get_resp().await } pub(crate) async fn set_err(&self, err: Result<()>) { self.inner.res.send(err).await.expect("TODO: panic message"); } - pub(crate) fn to_inner(self) -> Request { - Arc::into_inner(self.inner).unwrap() + pub(crate) fn get_req(&self) -> Arc { + self.inner.clone() } } @@ -833,7 +842,7 @@ impl ValueLogCore { } // TODO don't need decode vptr let entry = &mut entries[0].0; - let vs = kv.get(&entry.key); + let vs = kv._get(&entry.key); if let Err(ref err) = vs { if err.is_not_found() { info!( @@ -1094,7 +1103,7 @@ impl SafeValueLog { if start.elapsed().unwrap().as_secs() > 10 { return Err("stop iteration".into()); } - let vs = kv.get(&entry.key)?; + let vs = kv._get(&entry.key)?; if (vs.meta & MetaBit::BIT_DELETE.bits()) > 0 { // Key has been deleted. Discard. reason.discard += esz; From de96f5d98abf14a99f182abde220db5010a90a2b Mon Sep 17 00:00:00 2001 From: Rg Date: Mon, 10 Apr 2023 19:28:51 +0800 Subject: [PATCH 71/77] :dog: --- src/db.rs | 18 -- src/kv.rs | 114 ++++++++++++- src/levels.rs | 5 + src/lib.rs | 7 +- src/value_log.rs | 418 +++++++++++++++++++++++++++++++---------------- 5 files changed, 400 insertions(+), 162 deletions(-) delete mode 100644 src/db.rs diff --git a/src/db.rs b/src/db.rs deleted file mode 100644 index 3d46aaa..0000000 --- a/src/db.rs +++ /dev/null @@ -1,18 +0,0 @@ -use crate::kv::KV; -use crate::options::Options; -use crate::types::{XArc, XWeak}; - -pub struct DataBase { - kv: XArc, - VL: Option, -} -// -// impl DataBase { -// async fn new() { -// let kv = KV::open(Options::default()).await; -// } -// } - -pub struct VL { - kv: XWeak, -} diff --git a/src/kv.rs b/src/kv.rs index 9bc47b7..52df73a 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -52,7 +52,8 @@ pub const _HEAD: &[u8; 11] = b"!bager!head"; // For Storing value offset for rep pub struct Closers { pub update_size: Closer, pub compactors: Closer, - pub mem_table: Closer, // Wait flush job exit + pub mem_table: Closer, + // Wait flush job exit pub writes: Closer, pub value_gc: Closer, } @@ -98,6 +99,7 @@ pub struct KV { } unsafe impl Send for KV {} + unsafe impl Sync for KV {} impl Drop for KV { @@ -580,7 +582,7 @@ impl KV { if entry.meta == MetaBit::BIT_SET_IF_ABSENT.bits() { // Someone else might have written a value, so lets check again if key exists. - let exits = self.exists(&entry.key)?; + let exits = self._exists(&entry.key)?; // Value already exists. don't write. 
            if exits {
                 entry_pair.set_resp(Err(Error::ValueKeyExists));
@@ -618,7 +620,7 @@
         Ok(())
     }
 
-    fn exists(&self, key: &[u8]) -> Result<bool> {
+    fn _exists(&self, key: &[u8]) -> Result<bool> {
         let value = self._get(key)?;
         if value.value.is_empty() && value.meta == 0 {
             return Ok(false);
@@ -755,12 +757,118 @@ impl ArcKV {
         self.manifest.write().await
     }
 
+    /// Returns the value for the key, loading it asynchronously (from the value log if
+    /// needed). If you do not need the value itself, prefer `exists`.
     pub async fn get(&self, key: &[u8]) -> Result<Vec<u8>> {
         let got = self._get(key)?;
         let inner = KVItemInner::new(key.to_vec(), got, self.clone());
         inner.get_value().await
     }
 
+    /// Set sets the provided value for a given key. If key is not present, it is created. If it is
+    /// present, the existing value is overwritten with the one provided.
+    /// Along with key and value, Set can also take an optional userMeta byte. This byte is stored
+    /// alongside the key, and can be used as an aid to interpret the value or store other contextual
+    /// bits corresponding to the key-value pair.
+    pub async fn set(&self, key: Vec<u8>, value: Vec<u8>, user_meta: u8) -> Result<()> {
+        self.to_ref().set(key, value, user_meta).await
+    }
+
+    /// Sets the value for the key only if the key is not already present.
+    /// If it is present, it returns the ValueKeyExists error.
+    /// TODO: the existence check and the write should be one atomic operation.
+    pub async fn set_if_ab_sent(&self, key: Vec<u8>, value: Vec<u8>, user_meta: u8) -> Result<()> {
+        let exists = self.exists(&key).await?;
+        // Found the key, so refuse to overwrite.
+        if exists {
+            return Err(Error::ValueKeyExists);
+        }
+        let entry = Entry::default()
+            .key(key)
+            .value(value)
+            .user_meta(user_meta)
+            .meta(MetaBit::BIT_SET_IF_ABSENT.bits());
+        let ret = self.batch_set(vec![entry]).await;
+        ret[0].to_owned()
+    }
+
+    /// Returns Ok(true) if the key exists, Ok(false) if it does not, and Err(err) if an
+    /// error occurred while looking it up.
+    pub async fn exists(&self, key: &[u8]) -> Result<bool> {
+        return self._exists(key);
+    }
+
+    /// Batch-sets the entries, returning one result per internal request batch.
+    pub async fn batch_set(&self, entries: Vec<Entry>) -> Vec<Result<()>> {
+        let reqs = self.to_ref().batch_set(entries).await.unwrap();
+        let mut res = Vec::with_capacity(reqs.len());
+        for req in reqs {
+            let ret = req.get_resp().await;
+            res.push(ret);
+        }
+        res
+    }
+
+    /// CompareAndSet sets the value for the key iff the key's current cas counter matches
+    /// cas_counter. If the counters do not match, the write fails with ValueCasMisMatch.
+    pub async fn compare_and_set(
+        &self,
+        key: Vec<u8>,
+        value: Vec<u8>,
+        cas_counter: u64,
+    ) -> Result<()> {
+        let entry = Entry::default()
+            .key(key)
+            .value(value)
+            .cas_counter_check(cas_counter);
+        let ret = self.batch_set(vec![entry]).await;
+        ret[0].to_owned()
+    }
+
+    /// Delete deletes a key.
+    /// Exposing this so that user does not have to specify the Entry directly.
+    /// For example, BitDelete seems internal to badger.
+    pub async fn delete(&self, key: Vec<u8>) -> Result<()> {
+        let entry = Entry::default().key(key);
+        let ret = self.batch_set(vec![entry]).await;
+        ret[0].to_owned()
+    }
+
+    /// CompareAndDelete deletes a key ensuring that it has not been changed since the last read.
+    /// If the existing key has a different cas_counter, the key is not deleted and an error is returned.
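+    ///
+    /// A usage sketch; the `counter()` accessor on the item is assumed for illustration
+    /// and is not defined in this patch:
+    ///
+    /// ```ignore
+    /// let item = kv.get_with_ext(b"key").await?;
+    /// let cas = item.read().await.counter();              // cas counter seen at read time
+    /// kv.compare_and_delete(b"key".to_vec(), cas).await?; // fails if the key changed since
+    /// ```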
+    pub async fn compare_and_delete(&self, key: Vec<u8>, cas_counter: u64) -> Result<()> {
+        let entry = Entry::default().key(key).cas_counter_check(cas_counter);
+        let ret = self.batch_set(vec![entry]).await;
+        ret[0].to_owned()
+    }
+
+    /// RunValueLogGC would trigger a value log garbage collection with no guarantees that a call would
+    /// result in a space reclaim. Every run would in the best case rewrite only one log file. So,
+    /// repeated calls may be necessary.
+    ///
+    /// The way it currently works is that it would randomly pick up a value log file, and sample it. If
+    /// the sample shows that we can discard at least discard_ratio space of that file, it would be
+    /// rewritten. Else, a ValueNoRewrite error would be returned indicating that the GC didn't result in
+    /// any file rewrite.
+    ///
+    /// We recommend setting discard_ratio to 0.5, thus indicating that a file be rewritten if half the
+    /// space can be discarded. This results in a lifetime value log write amplification of 2 (1 from
+    /// original write + 0.5 rewrite + 0.25 + 0.125 + ... = 2). Setting it to a higher value would result
+    /// in fewer space reclaims, while setting it to a lower value would result in more space reclaims at
+    /// the cost of increased activity on the LSM tree. discard_ratio must be in the range (0.0, 1.0),
+    /// both endpoints excluded, otherwise a ValueInvalidRequest error is returned.
+    ///
+    /// Only one GC is allowed at a time. If another value log GC is running, or the KV has been closed, this
+    /// would return a ValueRejected error.
+    ///
+    /// Note: Every time GC is run, it would produce a spike of activity on the LSM tree.
+    pub async fn run_value_log_gc(&self, discard_ratio: f64) -> Result<()> {
+        // if discard_ratio >= 1.0 || discard_ratio <= 0.0 {
+        //     Err(Error::ValueInvalidRequest);
+        // }
+        // self.must_vlog().wait_on_gc(lc)
+        todo!()
+    }
+
     /// Closes a KV. It's crucial to call it to ensure all the pending updates
     /// make their way to disk.
pub async fn close(&self) -> Result<()> {
diff --git a/src/levels.rs b/src/levels.rs
index 3a971d0..0743031 100644
--- a/src/levels.rs
+++ b/src/levels.rs
@@ -830,3 +830,8 @@ async fn revert_to_manifest(
     }
     Ok(())
 }
+
+#[test]
+fn it() {
+
+}
\ No newline at end of file
diff --git a/src/lib.rs b/src/lib.rs
index 4e00f24..325a4ea 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -33,11 +33,11 @@ mod value_log;
 #[cfg(test)]
 mod value_log_tests;
 mod y;
+mod level_handler;
 
 mod compaction;
-#[cfg(test)]
-mod kv_test;
-mod level_handler;
+// #[cfg(test)]
+// mod kv_test;
 mod levels;
 mod mmap;
 mod pb;
@@ -45,7 +45,6 @@ mod st_manager;
 #[cfg(test)]
 mod test_util;
 
-mod db;
 pub use skl::*;
 pub use st_manager::*;
diff --git a/src/value_log.rs b/src/value_log.rs
index 4f6e69a..b26eeba 100644
--- a/src/value_log.rs
+++ b/src/value_log.rs
@@ -5,6 +5,7 @@ use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
 use crc32fast::Hasher;
 use drop_cell::defer;
 use either::Either;
+use libc::memchr;
 use log::info;
 use log::kv::Source;
 use memmap::{Mmap, MmapMut};
@@ -118,6 +119,33 @@ pub struct Entry {
     pub(crate) cas_counter: u64,
 }
 
+impl Entry {
+    pub fn key(mut self, key: Vec<u8>) -> Self {
+        self.key = key;
+        self
+    }
+
+    pub fn value(mut self, value: Vec<u8>) -> Self {
+        self.value = value;
+        self
+    }
+
+    pub fn meta(mut self, meta: u8) -> Self {
+        self.meta = meta;
+        self
+    }
+
+    pub fn user_meta(mut self, user_meta: u8) -> Self {
+        self.user_meta = user_meta;
+        self
+    }
+
+    pub fn cas_counter_check(mut self, cas: u64) -> Self {
+        self.cas_counter_check = cas;
+        self
+    }
+}
+
 impl Entry {
     pub(crate) fn from_slice(cursor_offset: u32, m: &[u8]) -> Result {
         let mut entry = Entry::default();
@@ -369,6 +397,8 @@ impl Request {
     }
 }
 
+/// TODO: add a field to indicate that a request is very important and must be handled
+/// immediately, e.g. compare_and_set.
 #[derive(Clone)]
 pub struct ArcRequest {
     inner: Arc,
@@ -425,7 +455,8 @@ pub struct ValueLogCore {
     // TODO
     // guards our view of which files exist, which to be deleted, how many active iterators
     pub(crate) files_log: Arc>,
-    vlogs: Arc>>>>, // TODO It is not good idea that use raw lock for Arc>, it maybe lock AsyncRuntime thread.
+    vlogs: Arc>>>>,
+    // TODO: holding a raw (blocking) lock here is risky; it can block async runtime threads.
     dirty_vlogs: Arc>>,
     // TODO why?
     // A refcount of iterators -- when this hits zero, we can delete the files_to_be_deleted. Why?
@@ -587,7 +618,7 @@
                 vp.offset,
                 self.writable_log_offset.load(Ordering::Acquire)
             )
-                .into());
+            .into());
         }
 
         self.read_value_bytes(vp, |buffer| {
@@ -606,7 +637,7 @@
     pub async fn async_read(
         &self,
         vp: &ValuePointer,
-        consumer: impl FnMut(Vec) -> Pin> + Send>>,
+        consumer: impl FnMut(Vec) -> Pin> + Send>>,
     ) -> Result<()> {
         // Check for valid offset if we are reading to writable log.
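+        // (A pointer at or beyond writable_log_offset in the active fid would read bytes
+        // that have not been written yet, hence the rejection below.)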
if vp.fid == self.max_fid.load(Ordering::Acquire) @@ -617,7 +648,7 @@ impl ValueLogCore { vp.offset, self.writable_log_offset.load(Ordering::Acquire) ) - .into()); + .into()); } self.async_read_bytes(vp, consumer).await?; Ok(()) @@ -630,7 +661,7 @@ impl ValueLogCore { mut f: impl for<'a> FnMut( &'a Entry, &'a ValuePointer, - ) -> Pin> + 'a>>, + ) -> Pin> + 'a>>, ) -> Result<()> { let vlogs = self.pick_log_guard(); info!("Seeking at value pointer: {:?}", vp); @@ -734,7 +765,7 @@ impl ValueLogCore { async fn async_read_bytes( &self, vp: &ValuePointer, - mut consumer: impl FnMut(Vec) -> Pin> + Send>>, + mut consumer: impl FnMut(Vec) -> Pin> + Send>>, ) -> Result<()> { let mut buffer = self.pick_log_by_vlog_id(&vp.fid).read().read(&vp)?.to_vec(); let value_buffer = buffer.split_off(Header::encoded_size()); @@ -1011,12 +1042,246 @@ impl ValueLogCore { } pub(crate) async fn wait_on_gc(&self, lc: Closer) { - defer! {lc.done()}; + defer! {lc.done()} lc.wait().await; // wait for lc to be closed. - // Block any GC in progress to finish, and don't allow any more writes to runGC by filling up - // the channel of size 1. + // Block any GC in progress to finish, and don't allow any more writes to runGC by filling up + // the channel of size 1. self.garbage_ch.send(()).await.unwrap(); } + async fn do_run_gc(&self, gc_threshold: f64) -> Result<()> { + let ptr = self as *const ValueLogCore as *mut ValueLogCore; + let runner = SafeValueLog { gc_channel: self.garbage_ch.clone(), value_log: AtomicPtr::new(ptr) }; + todo!() + } + // async fn do_run_gc(&self, gc_threshold: f64) -> Result<()> { + // let lf = self.pick_log().ok_or(Error::ValueNoRewrite)?; + // #[derive(Debug, Default)] + // struct Reason { + // total: f64, + // keep: f64, + // discard: f64, + // } + // let mut reason: TArcMx = TArcMx::default(); + // let mut window = 100.0; // lasted 100M + // let mut count = 0; + // // Pick a random start point for the log. + // let mut skip_first_m = + // thread_rng_n((self.opt.value_log_file_size / M) as u32) as f64 - window; + // let mut skipped = 0.0; + // let mut start = SystemTime::now(); + // let vlog = self.clone(); + // let kv = vlog.get_kv(); + // let err = lf + // .clone() + // .read() + // .iterate_by_offset(0, &mut |entry, vptr| { + // // let vlg = self.clone(); + // let reason = reason.clone(); + // let lfc = lf.clone(); + // Box::pin(async move { + // // let kv = vlg.get_kv(); + // let mut reason = reason.lock().await; + // let esz = vptr.len as f64 / (1 << 20) as f64; // in MBs, +4 for the CAS stuff. + // skipped += esz; + // if skipped < skip_first_m { + // return Ok(true); + // } + // count += 1; + // if count % 100 == 0 { + // tokio::time::sleep(Duration::from_millis(1)).await; + // } + // reason.total += esz; + // if reason.total > window { + // return Err("stop iteration".into()); + // } + // if start.elapsed().unwrap().as_secs() > 10 { + // return Err("stop iteration".into()); + // } + // let vs = kv._get(&entry.key)?; + // if (vs.meta & MetaBit::BIT_DELETE.bits()) > 0 { + // // Key has been deleted. Discard. + // reason.discard += esz; + // return Ok(true); // Continue + // } + // if (vs.meta & MetaBit::BIT_VALUE_POINTER.bits()) == 0 { + // // Value is stored alongside key. Discard. + // reason.discard += esz; + // return Ok(true); + // } + // // Value is still present in value log. 
+ // assert!(!vs.value.is_empty()); + // let mut vptr = vptr.clone(); // TODO avoid copy + // vptr.dec(&mut io::Cursor::new(vs.value))?; + // if vptr.fid > lfc.read().fid { + // // Value is present in a later log. Discard. + // reason.discard += esz; + // return Ok(true); + // } + // if vptr.offset > entry.offset { + // // Value is present in a later offset, but in the same log. + // reason.discard += esz; + // return Ok(true); + // } + // if vptr.fid == lfc.read().fid && vptr.offset == entry.offset { + // // This is still the active entry, This would need to be rewritten. + // reason.keep += esz; + // } else { + // info!("Reason={:?}", reason); + // let err = vlg.read_value_bytes(&vptr, |buf| { + // let mut unexpect_entry = Entry::default(); + // unexpect_entry.dec(&mut io::Cursor::new(buf))?; + // unexpect_entry.offset = vptr.offset; + // if unexpect_entry.cas_counter == entry.cas_counter { + // info!("Latest Entry Header in LSM: {}", unexpect_entry); + // info!("Latest Entry in Log: {}", entry); + // } + // Ok(()) + // }); + // if err.is_err() { + // return Err("Stop iteration".into()); + // } + // } + // Ok(true) + // }) + // }) + // .await; + // + // if err.is_err() { + // info!( + // "Error while iterating for RunGC: {}", + // err.as_ref().unwrap_err() + // ); + // return err; + // } + // + // info!("Fid: {} Data status={:?}", lf.read().fid, reason); + // if reason.lock().await.total < 10.0 + // || reason.lock().await.discard < gc_threshold * reason.lock().await.total + // { + // info!("Skipping GC on fid: {}", lf.read().fid); + // return Err(Error::ValueNoRewrite); + // } + // + // info!("REWRITING VLOG {}", lf.read().fid); + // // self.rewrite(lf, self.get_kv()).await?; + // Ok(()) + // // todo!() + // } + // async fn do_run_gcc(&self, gc_threshold: f64) -> Result<()> { + // let runner = SafeValueLog{ gc_channel: self.garbage_ch.clone(), value_log: Arc::new(self.clone()) } + // } + // async fn do_run_gcc(&self, gc_threshold: f64) -> Result<()> { + // let lf = self.pick_log().ok_or(Error::ValueNoRewrite)?; + // #[derive(Debug, Default)] + // struct Reason { + // total: f64, + // keep: f64, + // discard: f64, + // } + // let mut reason: TArcMx = TArcMx::default(); + // let mut window = 100.0; // lasted 100M + // let mut count = 0; + // // Pick a random start point for the log. + // let mut skip_first_m = + // thread_rng_n((self.opt.value_log_file_size / M) as u32) as f64 - window; + // let mut skipped = 0.0; + // let mut start = SystemTime::now(); + // // assert!(!self.value_log.kv.is_null()); + // let err = lf + // .clone() + // .read() + // .iterate_by_offset(0, &mut |entry, vptr| { + // let vlg = self.clone(); + // let reason = reason.clone(); + // let lfc = lf.clone(); + // Box::pin(async move { + // let kv = vlg.get_kv(); + // let mut reason = reason.lock().await; + // let esz = vptr.len as f64 / (1 << 20) as f64; // in MBs, +4 for the CAS stuff. + // skipped += esz; + // if skipped < skip_first_m { + // return Ok(true); + // } + // count += 1; + // if count % 100 == 0 { + // tokio::time::sleep(Duration::from_millis(1)).await; + // } + // reason.total += esz; + // if reason.total > window { + // return Err("stop iteration".into()); + // } + // if start.elapsed().unwrap().as_secs() > 10 { + // return Err("stop iteration".into()); + // } + // let vs = kv._get(&entry.key)?; + // if (vs.meta & MetaBit::BIT_DELETE.bits()) > 0 { + // // Key has been deleted. Discard. 
+ // reason.discard += esz; + // return Ok(true); // Continue + // } + // if (vs.meta & MetaBit::BIT_VALUE_POINTER.bits()) == 0 { + // // Value is stored alongside key. Discard. + // reason.discard += esz; + // return Ok(true); + // } + // // Value is still present in value log. + // assert!(!vs.value.is_empty()); + // let mut vptr = vptr.clone(); // TODO avoid copy + // vptr.dec(&mut io::Cursor::new(vs.value))?; + // if vptr.fid > lfc.read().fid { + // // Value is present in a later log. Discard. + // reason.discard += esz; + // return Ok(true); + // } + // if vptr.offset > entry.offset { + // // Value is present in a later offset, but in the same log. + // reason.discard += esz; + // return Ok(true); + // } + // if vptr.fid == lfc.read().fid && vptr.offset == entry.offset { + // // This is still the active entry, This would need to be rewritten. + // reason.keep += esz; + // } else { + // info!("Reason={:?}", reason); + // let err = vlg.read_value_bytes(&vptr, |buf| { + // let mut unexpect_entry = Entry::default(); + // unexpect_entry.dec(&mut io::Cursor::new(buf))?; + // unexpect_entry.offset = vptr.offset; + // if unexpect_entry.cas_counter == entry.cas_counter { + // info!("Latest Entry Header in LSM: {}", unexpect_entry); + // info!("Latest Entry in Log: {}", entry); + // } + // Ok(()) + // }); + // if err.is_err() { + // return Err("Stop iteration".into()); + // } + // } + // Ok(true) + // }) + // }) + // .await; + // + // if err.is_err() { + // info!( + // "Error while iterating for RunGC: {}", + // err.as_ref().unwrap_err() + // ); + // return err; + // } + // + // info!("Fid: {} Data status={:?}", lf.read().fid, reason); + // if reason.lock().await.total < 10.0 + // || reason.lock().await.discard < gc_threshold * reason.lock().await.total + // { + // info!("Skipping GC on fid: {}", lf.read().fid); + // return Err(Error::ValueNoRewrite); + // } + // + // info!("REWRITING VLOG {}", lf.read().fid); + // // self.rewrite(lf, self.get_kv()).await?; + // Ok(()) + // } } struct PickVlogsGuardsReadLock<'a> { @@ -1028,24 +1293,9 @@ struct PickVlogsGuardsReadLock<'a> { fids: Vec, } -struct ValueLogIterator<'a> { - fd: &'a File, -} - -impl<'a> ValueLogIterator<'a> { - fn new(fd: &mut std::fs::File, offset: u32) -> Result> { - fd.seek(SeekFrom::Start(offset as u64))?; - Ok(ValueLogIterator { fd }) - } - - fn iterate(&mut self, log_file: &mut LogFile, offset: u32) -> Result<()> { - todo!() - } -} - pub struct SafeValueLog { gc_channel: Channel<()>, - value_log: Arc, + value_log: AtomicPtr, } impl SafeValueLog { @@ -1061,7 +1311,8 @@ impl SafeValueLog { } async fn do_run_gcc(&self, gc_threshold: f64) -> Result<()> { - let lf = self.value_log.pick_log().ok_or(Error::ValueNoRewrite)?; + let vlog = unsafe { self.value_log.load(Ordering::Relaxed).as_ref().unwrap() }; + let lf = vlog.pick_log().ok_or(Error::ValueNoRewrite)?; #[derive(Debug, Default)] struct Reason { total: f64, @@ -1073,7 +1324,7 @@ impl SafeValueLog { let mut count = 0; // Pick a random start point for the log. 
let mut skip_first_m = - thread_rng_n((self.value_log.opt.value_log_file_size / M) as u32) as f64 - window; + thread_rng_n((vlog.opt.value_log_file_size / M) as u32) as f64 - window; let mut skipped = 0.0; let mut start = SystemTime::now(); // assert!(!self.value_log.kv.is_null()); @@ -1081,7 +1332,7 @@ impl SafeValueLog { .clone() .read() .iterate_by_offset(0, &mut |entry, vptr| { - let vlg = self.value_log.clone(); + let vlg = vlog.clone(); let reason = reason.clone(); let lfc = lf.clone(); Box::pin(async move { @@ -1169,117 +1420,10 @@ impl SafeValueLog { } info!("REWRITING VLOG {}", lf.read().fid); - self.value_log.rewrite(lf, self.value_log.get_kv()).await?; + vlog.rewrite(lf, vlog.get_kv()).await?; Ok(()) } } #[test] -fn it() { - use parking_lot::*; - struct Flock { - df: RwLock>>, - age: u32, - } - // impl Flock { - // fn get_df( - // &self, - // ) -> std::result::Result< - // lock_api::MappedRwLockReadGuard<'_, RawRwLock, String>, - // lock_api::RwLockReadGuard<'_, RawRwLock, HashMap>, - // > { - // RwLockReadGuard::try_map(self.df.read(), |df| df.get(&0)) - // } - // - // fn get_mut( - // &self, - // idx: u32, - // ) -> std::result::Result< - // lock_api::MappedRwLockWriteGuard<'_, RawRwLock, String>, - // lock_api::RwLockWriteGuard<'_, RawRwLock, HashMap>, - // > { - // RwLockWriteGuard::try_map(self.df.write(), |df| df.get_mut(&idx)) - // } - // } - - let mut flock = Flock { - df: RwLock::new(HashMap::new()), - age: 19, - }; - { - flock - .df - .write() - .insert(0, RwLock::new("foobat".to_string())); - flock.df.write().insert(1, RwLock::new("ok!".to_string())); - } - // let lock1 = flock.df.write().get(&0).as_mut().unwrap(); - // let lock2 = flock.df.write().get(&1).as_mut().unwrap(); - // flock.df.write().insert(3, RwLock::new("ok!".to_string())); - // let value = RwLockReadGuard::try_map(lock1.read(), |df| Some(df)); - // println!("WHat??? 
{:?}", value); -} - -#[tokio::test] -async fn lock1() { - let req: RwLock>> = RwLock::new(Vec::new()); - - tokio::spawn(async move { - let _a = &req.write()[0]; - }); -} - -#[tokio::test] -async fn lock() { - use parking_lot::*; - - #[derive(Debug)] - struct FileLog {} - - #[derive(Debug)] - struct FileLogProxy { - files: HashMap>, - } - - impl FileLogProxy { - fn get_file( - &self, - idx: u32, - ) -> parking_lot::lock_api::RwLockReadGuard<'_, RawRwLock, FileLog> { - let flog = self.files.get(&idx).unwrap(); - let c = flog.read(); - c - } - - fn get_mut_file( - &self, - idx: u32, - ) -> std::result::Result< - parking_lot::lock_api::MappedRwLockWriteGuard<'_, RawRwLock, FileLog>, - parking_lot::lock_api::RwLockWriteGuard<'_, RawRwLock, FileLog>, - > { - let flog = self.files.get(&idx).unwrap(); - RwLockWriteGuard::try_map(flog.write(), |df| Some(df)) - } - } - - struct ValueLog { - df: RwLock, - age: u32, - } - - impl ValueLog { - // fn max_vlog_rl( - // &self, - // ) -> parking_lot::lock_api::RwLockReadGuard<'_, RawRwLock, FileLog> { - // let rl = self.rl(); - // let vlog = rl.get_file(0); - // vlog - // } - - // fn rl(&self) -> parking_lot::lock_api::RwLockReadGuard<'_, RawRwLock, FileLog> { - // let df = self.df.read().get_file(0); - // df - // } - } -} +fn it() {} From 2a66ba511e103794a57351ef73bb591deae76e08 Mon Sep 17 00:00:00 2001 From: Rg Date: Tue, 11 Apr 2023 01:09:08 +0800 Subject: [PATCH 72/77] :dog: sleep --- src/kv_test.rs | 137 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 137 insertions(+) create mode 100644 src/kv_test.rs diff --git a/src/kv_test.rs b/src/kv_test.rs new file mode 100644 index 0000000..aeb3ba3 --- /dev/null +++ b/src/kv_test.rs @@ -0,0 +1,137 @@ +use log::info; +use log::kv::ToValue; +use std::env::temp_dir; +use std::io::Write; +use std::time::Duration; + +use crate::iterator::IteratorOptions; +use crate::types::XArc; +use crate::value_log::Entry; +use crate::{kv::KV, options::Options}; + +fn get_test_option(dir: &str) -> Options { + let mut opt = Options::default(); + opt.max_table_size = 1 << 15; //Force more compaction. + opt.level_one_size = 4 << 15; // Force more compaction. 
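+    // 1 << 15 = 32 KB per table and 4 << 15 = 128 KB for level one: deliberately tiny
+    // limits so these tests trigger flushes and compaction quickly.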
+ opt.dir = Box::new(dir.clone().to_string()); + opt.value_dir = Box::new(dir.to_string()); + opt +} + +#[tokio::test] +async fn t_write() { + use crate::test_util::{mock_log, mock_log_terminal, random_tmp_dir, tracing_log}; + tracing_log(); + let dir = random_tmp_dir(); + let kv = KV::open(get_test_option(&dir)).await; + let kv = kv.unwrap(); + let res = kv.set(b"hello".to_vec(), b"word".to_vec(), 10).await; + assert!(res.is_ok()); + let got = kv._get(b"hello"); + assert!(got.is_ok()); + assert_eq!(&got.unwrap().value, b"word"); +} + +#[tokio::test] +async fn t_batch_write() { + use crate::test_util::{mock_log, mock_log_terminal, random_tmp_dir, tracing_log}; + tracing_log(); + let dir = random_tmp_dir(); + let kv = KV::open(get_test_option(&dir)).await; + let kv = kv.unwrap(); + let n = 100; + for i in 0..n { + let res = kv + .set(format!("{}", i).as_bytes().to_vec(), b"word".to_vec(), 10) + .await; + assert!(res.is_ok()); + } + + for i in 0..n { + let got = kv._get(format!("{}", i).as_bytes()); + assert!(got.is_ok()); + assert_eq!(&got.unwrap().value, b"word"); + } +} + +#[tokio::test] +async fn t_concurrent_write() { + use crate::test_util::{mock_log, mock_log_terminal, random_tmp_dir, tracing_log}; + tracing_log(); + let dir = random_tmp_dir(); + let kv = KV::open(get_test_option(&dir)).await; + let kv = kv.unwrap(); + let mut wg = awaitgroup::WaitGroup::new(); + let n = 200; + for i in 0..n { + let kv = kv.clone(); + let wk = wg.worker(); + tokio::spawn(async move { + let res = kv + .set(format!("{}", i).as_bytes().to_vec(), b"word".to_vec(), 10) + .await; + assert!(res.is_ok()); + wk.done(); + }); + } + + wg.wait().await; + info!("Starting iteration"); + let itr = kv + .new_iterator(IteratorOptions { + reverse: false, + pre_fetch_values: true, + pre_fetch_size: 10, + }) + .await; + let mut i = 0; + while let Some(item) = itr.next().await { + let item = item.read().await; + assert_eq!(item.key(), format!("{}", i).as_bytes()); + assert_eq!(item.get_value().await.unwrap(), b"word".to_vec()); + i += 1; + } +} + +#[tokio::test] +async fn t_cas() { + let n = 100; + let kv = build_kv().await; + let entries = (0..n) + .into_iter() + .map(|i| Entry { + key: format!("{}", i).as_bytes().to_vec(), + meta: 0, + user_meta: 0, + value: format!("{}", i).as_bytes().to_vec(), + cas_counter_check: 0, + offset: 0, + cas_counter: 0, + }) + .collect::>(); + + let got = kv.batch_set(entries).await; + assert!(got.is_ok()); + let resp = got.unwrap(); + for res in resp { + assert!(res.is_ok().await); + } + tokio::time::sleep(Duration::from_secs(1)).await; + let mut items = vec![]; + for i in 0..n { + let key = format!("{}", i).as_bytes().to_vec(); + let value = format!("{}", i).as_bytes().to_vec(); + let got = kv.get(&key).await.unwrap(); + assert_eq!(got, value); + items.push(got); + } +} + +async fn build_kv() -> XArc { + use crate::test_util::{mock_log, mock_log_terminal, random_tmp_dir, tracing_log}; + tracing_log(); + let dir = random_tmp_dir(); + let kv = KV::open(get_test_option(&dir)).await; + let kv = kv.unwrap(); + kv +} From 1cab12fba6bc1345fe5a3b58e14bd3f68a6687ee Mon Sep 17 00:00:00 2001 From: Rg Date: Tue, 11 Apr 2023 20:27:16 +0800 Subject: [PATCH 73/77] :dog: --- src/kv.rs | 34 ++--- src/kv_test.rs | 37 ++--- src/lib.rs | 2 + src/log_file.rs | 2 +- src/value_log.rs | 349 +++++++---------------------------------------- src/y/mod.rs | 4 + 6 files changed, 91 insertions(+), 337 deletions(-) diff --git a/src/kv.rs b/src/kv.rs index 52df73a..9d91231 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -764,6 
+764,12 @@ impl ArcKV { inner.get_value().await } + pub(crate) async fn get_with_ext(&self, key: &[u8]) -> Result { + let got = self._get(key)?; + let inner = KVItemInner::new(key.to_vec(), got, self.clone()); + Ok(TArcRW::new(tokio::sync::RwLock::new(inner))) + } + /// Set sets the provided value for a given key. If key is not present, it is created. If it is /// present, the existing value is overwritten with the one provided. /// Along with key and value, Set can also take an optional userMeta byte. This byte is stored @@ -862,11 +868,10 @@ impl ArcKV { /// /// Note: Every time GC is run, it would produce a spike of activity on the LSM tree. pub async fn run_value_log_gc(&self, discard_ratio: f64) -> Result<()> { - // if discard_ratio >= 1.0 || discard_ratio <= 0.0 { - // Err(Error::ValueInvalidRequest); - // } - // self.must_vlog().wait_on_gc(lc) - todo!() + if discard_ratio >= 1.0 || discard_ratio <= 0.0 { + return Err(Error::ValueInvalidRequest); + } + self.must_vlog().trigger_gc(discard_ratio).await } /// Closes a KV. It's crucial to call it to ensure all the pending updates @@ -1090,22 +1095,3 @@ async fn write_level0_table(st: &SkipList, f: &mut tokio::fs::File) -> Result<() fn arena_size(opt: &Options) -> usize { (opt.max_table_size + opt.max_batch_size + opt.max_batch_count * Node::size() as u64) as usize } - -#[test] -fn t_pointer() { - struct Ext { - v: Vec, - name: String, - } - - let t = Ext { - v: vec![], - name: "Name".to_owned(), - }; - - let p = unsafe { &t as *const Ext }; - - let arc_p = Arc::new(t); - - print!("==> {:?}", p); -} diff --git a/src/kv_test.rs b/src/kv_test.rs index aeb3ba3..e69eaac 100644 --- a/src/kv_test.rs +++ b/src/kv_test.rs @@ -99,32 +99,37 @@ async fn t_cas() { let kv = build_kv().await; let entries = (0..n) .into_iter() - .map(|i| Entry { - key: format!("{}", i).as_bytes().to_vec(), - meta: 0, - user_meta: 0, - value: format!("{}", i).as_bytes().to_vec(), - cas_counter_check: 0, - offset: 0, - cas_counter: 0, + .map(|i| { + Entry::default() + .key(format!("{}", i).into_bytes()) + .value(format!("{}", i).into_bytes()) }) .collect::>(); - - let got = kv.batch_set(entries).await; - assert!(got.is_ok()); - let resp = got.unwrap(); - for res in resp { - assert!(res.is_ok().await); + for got in kv.batch_set(entries.clone()).await { + assert!(got.is_ok()); } tokio::time::sleep(Duration::from_secs(1)).await; let mut items = vec![]; for i in 0..n { let key = format!("{}", i).as_bytes().to_vec(); let value = format!("{}", i).as_bytes().to_vec(); - let got = kv.get(&key).await.unwrap(); - assert_eq!(got, value); + let got = kv.get_with_ext(&key).await.unwrap(); + let got_value = got.read().await.get_value().await.unwrap(); + assert_eq!(got_value, value); items.push(got); } + + for i in 0..n { + let key = format!("{}", i).as_bytes().to_vec(); + let value = format!("{}", i).as_bytes().to_vec(); + let mut cc = entries[i].cas_counter; + if cc == 5 { + cc = 6; + } else { + cc = 5; + } + assert!(kv.compare_and_set(key, value, cc).await.is_err()); + } } async fn build_kv() -> XArc { diff --git a/src/lib.rs b/src/lib.rs index 325a4ea..d7e554d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -44,6 +44,8 @@ mod pb; mod st_manager; #[cfg(test)] mod test_util; +#[cfg(test)] +mod kv_test; pub use skl::*; diff --git a/src/log_file.rs b/src/log_file.rs index 359b553..8ae83eb 100644 --- a/src/log_file.rs +++ b/src/log_file.rs @@ -99,7 +99,7 @@ impl LogFile { Ok((v, cursor_offset)) } - // async iterate from offset that must be call with thread safty + // async iterate from 
offset that must be call with thread safety pub(crate) async fn iterate_by_offset( &self, mut offset: u32, diff --git a/src/value_log.rs b/src/value_log.rs index b26eeba..7013f79 100644 --- a/src/value_log.rs +++ b/src/value_log.rs @@ -37,7 +37,7 @@ use crate::log_file::LogFile; use crate::options::Options; use crate::skl::BlockBytes; use crate::table::iterator::BlockSlice; -use crate::types::{ArcRW, Channel, Closer, TArcMx, XArc}; +use crate::types::{ArcMx, ArcRW, Channel, Closer, TArcMx, XArc}; use crate::y::{ create_synced_file, is_eof, open_existing_synced_file, read_at, sync_directory, Decode, Encode, }; @@ -106,7 +106,7 @@ impl Decode for Header { /// Entry provides Key, Value and if required, cas_counter_check to kv.batch_set() API. /// If cas_counter_check is provided, it would be compared against the current `cas_counter` /// assigned to this key-value. Set be done on this key only if the counters match. -#[derive(Default)] +#[derive(Default, Clone)] pub struct Entry { pub(crate) key: Vec, pub(crate) meta: u8, @@ -618,7 +618,7 @@ impl ValueLogCore { vp.offset, self.writable_log_offset.load(Ordering::Acquire) ) - .into()); + .into()); } self.read_value_bytes(vp, |buffer| { @@ -637,7 +637,7 @@ impl ValueLogCore { pub async fn async_read( &self, vp: &ValuePointer, - consumer: impl FnMut(Vec) -> Pin> + Send>>, + consumer: impl FnMut(Vec) -> Pin> + Send>>, ) -> Result<()> { // Check for valid offset if we are reading to writable log. if vp.fid == self.max_fid.load(Ordering::Acquire) @@ -648,7 +648,7 @@ impl ValueLogCore { vp.offset, self.writable_log_offset.load(Ordering::Acquire) ) - .into()); + .into()); } self.async_read_bytes(vp, consumer).await?; Ok(()) @@ -661,7 +661,7 @@ impl ValueLogCore { mut f: impl for<'a> FnMut( &'a Entry, &'a ValuePointer, - ) -> Pin> + 'a>>, + ) -> Pin> + 'a>>, ) -> Result<()> { let vlogs = self.pick_log_guard(); info!("Seeking at value pointer: {:?}", vp); @@ -765,7 +765,7 @@ impl ValueLogCore { async fn async_read_bytes( &self, vp: &ValuePointer, - mut consumer: impl FnMut(Vec) -> Pin> + Send>>, + mut consumer: impl FnMut(Vec) -> Pin> + Send>>, ) -> Result<()> { let mut buffer = self.pick_log_by_vlog_id(&vp.fid).read().read(&vp)?.to_vec(); let value_buffer = buffer.split_off(Header::encoded_size()); @@ -1044,316 +1044,71 @@ impl ValueLogCore { pub(crate) async fn wait_on_gc(&self, lc: Closer) { defer! {lc.done()} lc.wait().await; // wait for lc to be closed. - // Block any GC in progress to finish, and don't allow any more writes to runGC by filling up - // the channel of size 1. + // Block any GC in progress to finish, and don't allow any more writes to runGC by filling up + // the channel of size 1. self.garbage_ch.send(()).await.unwrap(); } - async fn do_run_gc(&self, gc_threshold: f64) -> Result<()> { - let ptr = self as *const ValueLogCore as *mut ValueLogCore; - let runner = SafeValueLog { gc_channel: self.garbage_ch.clone(), value_log: AtomicPtr::new(ptr) }; - todo!() - } - // async fn do_run_gc(&self, gc_threshold: f64) -> Result<()> { - // let lf = self.pick_log().ok_or(Error::ValueNoRewrite)?; - // #[derive(Debug, Default)] - // struct Reason { - // total: f64, - // keep: f64, - // discard: f64, - // } - // let mut reason: TArcMx = TArcMx::default(); - // let mut window = 100.0; // lasted 100M - // let mut count = 0; - // // Pick a random start point for the log. 
- // let mut skip_first_m = - // thread_rng_n((self.opt.value_log_file_size / M) as u32) as f64 - window; - // let mut skipped = 0.0; - // let mut start = SystemTime::now(); - // let vlog = self.clone(); - // let kv = vlog.get_kv(); - // let err = lf - // .clone() - // .read() - // .iterate_by_offset(0, &mut |entry, vptr| { - // // let vlg = self.clone(); - // let reason = reason.clone(); - // let lfc = lf.clone(); - // Box::pin(async move { - // // let kv = vlg.get_kv(); - // let mut reason = reason.lock().await; - // let esz = vptr.len as f64 / (1 << 20) as f64; // in MBs, +4 for the CAS stuff. - // skipped += esz; - // if skipped < skip_first_m { - // return Ok(true); - // } - // count += 1; - // if count % 100 == 0 { - // tokio::time::sleep(Duration::from_millis(1)).await; - // } - // reason.total += esz; - // if reason.total > window { - // return Err("stop iteration".into()); - // } - // if start.elapsed().unwrap().as_secs() > 10 { - // return Err("stop iteration".into()); - // } - // let vs = kv._get(&entry.key)?; - // if (vs.meta & MetaBit::BIT_DELETE.bits()) > 0 { - // // Key has been deleted. Discard. - // reason.discard += esz; - // return Ok(true); // Continue - // } - // if (vs.meta & MetaBit::BIT_VALUE_POINTER.bits()) == 0 { - // // Value is stored alongside key. Discard. - // reason.discard += esz; - // return Ok(true); - // } - // // Value is still present in value log. - // assert!(!vs.value.is_empty()); - // let mut vptr = vptr.clone(); // TODO avoid copy - // vptr.dec(&mut io::Cursor::new(vs.value))?; - // if vptr.fid > lfc.read().fid { - // // Value is present in a later log. Discard. - // reason.discard += esz; - // return Ok(true); - // } - // if vptr.offset > entry.offset { - // // Value is present in a later offset, but in the same log. - // reason.discard += esz; - // return Ok(true); - // } - // if vptr.fid == lfc.read().fid && vptr.offset == entry.offset { - // // This is still the active entry, This would need to be rewritten. 
- // reason.keep += esz; - // } else { - // info!("Reason={:?}", reason); - // let err = vlg.read_value_bytes(&vptr, |buf| { - // let mut unexpect_entry = Entry::default(); - // unexpect_entry.dec(&mut io::Cursor::new(buf))?; - // unexpect_entry.offset = vptr.offset; - // if unexpect_entry.cas_counter == entry.cas_counter { - // info!("Latest Entry Header in LSM: {}", unexpect_entry); - // info!("Latest Entry in Log: {}", entry); - // } - // Ok(()) - // }); - // if err.is_err() { - // return Err("Stop iteration".into()); - // } - // } - // Ok(true) - // }) - // }) - // .await; - // - // if err.is_err() { - // info!( - // "Error while iterating for RunGC: {}", - // err.as_ref().unwrap_err() - // ); - // return err; - // } - // - // info!("Fid: {} Data status={:?}", lf.read().fid, reason); - // if reason.lock().await.total < 10.0 - // || reason.lock().await.discard < gc_threshold * reason.lock().await.total - // { - // info!("Skipping GC on fid: {}", lf.read().fid); - // return Err(Error::ValueNoRewrite); - // } - // - // info!("REWRITING VLOG {}", lf.read().fid); - // // self.rewrite(lf, self.get_kv()).await?; - // Ok(()) - // // todo!() - // } - // async fn do_run_gcc(&self, gc_threshold: f64) -> Result<()> { - // let runner = SafeValueLog{ gc_channel: self.garbage_ch.clone(), value_log: Arc::new(self.clone()) } - // } - // async fn do_run_gcc(&self, gc_threshold: f64) -> Result<()> { - // let lf = self.pick_log().ok_or(Error::ValueNoRewrite)?; - // #[derive(Debug, Default)] - // struct Reason { - // total: f64, - // keep: f64, - // discard: f64, - // } - // let mut reason: TArcMx = TArcMx::default(); - // let mut window = 100.0; // lasted 100M - // let mut count = 0; - // // Pick a random start point for the log. - // let mut skip_first_m = - // thread_rng_n((self.opt.value_log_file_size / M) as u32) as f64 - window; - // let mut skipped = 0.0; - // let mut start = SystemTime::now(); - // // assert!(!self.value_log.kv.is_null()); - // let err = lf - // .clone() - // .read() - // .iterate_by_offset(0, &mut |entry, vptr| { - // let vlg = self.clone(); - // let reason = reason.clone(); - // let lfc = lf.clone(); - // Box::pin(async move { - // let kv = vlg.get_kv(); - // let mut reason = reason.lock().await; - // let esz = vptr.len as f64 / (1 << 20) as f64; // in MBs, +4 for the CAS stuff. - // skipped += esz; - // if skipped < skip_first_m { - // return Ok(true); - // } - // count += 1; - // if count % 100 == 0 { - // tokio::time::sleep(Duration::from_millis(1)).await; - // } - // reason.total += esz; - // if reason.total > window { - // return Err("stop iteration".into()); - // } - // if start.elapsed().unwrap().as_secs() > 10 { - // return Err("stop iteration".into()); - // } - // let vs = kv._get(&entry.key)?; - // if (vs.meta & MetaBit::BIT_DELETE.bits()) > 0 { - // // Key has been deleted. Discard. - // reason.discard += esz; - // return Ok(true); // Continue - // } - // if (vs.meta & MetaBit::BIT_VALUE_POINTER.bits()) == 0 { - // // Value is stored alongside key. Discard. - // reason.discard += esz; - // return Ok(true); - // } - // // Value is still present in value log. - // assert!(!vs.value.is_empty()); - // let mut vptr = vptr.clone(); // TODO avoid copy - // vptr.dec(&mut io::Cursor::new(vs.value))?; - // if vptr.fid > lfc.read().fid { - // // Value is present in a later log. Discard. - // reason.discard += esz; - // return Ok(true); - // } - // if vptr.offset > entry.offset { - // // Value is present in a later offset, but in the same log. 
- // reason.discard += esz; - // return Ok(true); - // } - // if vptr.fid == lfc.read().fid && vptr.offset == entry.offset { - // // This is still the active entry, This would need to be rewritten. - // reason.keep += esz; - // } else { - // info!("Reason={:?}", reason); - // let err = vlg.read_value_bytes(&vptr, |buf| { - // let mut unexpect_entry = Entry::default(); - // unexpect_entry.dec(&mut io::Cursor::new(buf))?; - // unexpect_entry.offset = vptr.offset; - // if unexpect_entry.cas_counter == entry.cas_counter { - // info!("Latest Entry Header in LSM: {}", unexpect_entry); - // info!("Latest Entry in Log: {}", entry); - // } - // Ok(()) - // }); - // if err.is_err() { - // return Err("Stop iteration".into()); - // } - // } - // Ok(true) - // }) - // }) - // .await; - // - // if err.is_err() { - // info!( - // "Error while iterating for RunGC: {}", - // err.as_ref().unwrap_err() - // ); - // return err; - // } - // - // info!("Fid: {} Data status={:?}", lf.read().fid, reason); - // if reason.lock().await.total < 10.0 - // || reason.lock().await.discard < gc_threshold * reason.lock().await.total - // { - // info!("Skipping GC on fid: {}", lf.read().fid); - // return Err(Error::ValueNoRewrite); - // } - // - // info!("REWRITING VLOG {}", lf.read().fid); - // // self.rewrite(lf, self.get_kv()).await?; - // Ok(()) - // } -} -struct PickVlogsGuardsReadLock<'a> { - vlogs: lock_api::RwLockReadGuard< - 'a, - RawRwLock, - HashMap>>, - >, - fids: Vec, -} - -pub struct SafeValueLog { - gc_channel: Channel<()>, - value_log: AtomicPtr, -} - -impl SafeValueLog { - async fn trigger_gc(&self, gc_threshold: f64) -> Result<()> { - return match self.gc_channel.try_send(()) { + // only one gc worker + pub async fn trigger_gc(&self, gc_threshold: f64) -> Result<()> { + return match self.garbage_ch.try_send(()) { Ok(()) => { - let ok = self.do_run_gcc(gc_threshold).await; - self.gc_channel.recv().await.unwrap(); + let ok = self.do_run_gc(gc_threshold).await; + self.garbage_ch.recv().await.unwrap(); ok } Err(err) => Err(Error::ValueRejected), }; } - async fn do_run_gcc(&self, gc_threshold: f64) -> Result<()> { - let vlog = unsafe { self.value_log.load(Ordering::Relaxed).as_ref().unwrap() }; - let lf = vlog.pick_log().ok_or(Error::ValueNoRewrite)?; + pub async fn do_run_gc(&self, gc_threshold: f64) -> Result<()> { #[derive(Debug, Default)] struct Reason { total: f64, keep: f64, discard: f64, } - let mut reason: TArcMx = TArcMx::default(); - let mut window = 100.0; // lasted 100M + let mut reason = ArcMx::new(parking_lot::Mutex::new(Reason::default())); + let mut window = 100.0; // limit 100M for gc every time let mut count = 0; // Pick a random start point for the log. 
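// A minimal sketch of the random-start sampling computed just below: instead
// of scanning the whole vlog file, GC skips a uniformly random prefix and then
// samples roughly one `window` (~100 MB) worth of entries. `M` (1 MB) mirrors
// the crate's constant; the `rand` crate and the clamp to zero are assumptions.
fn random_skip_mb(value_log_file_size: u64, window: f64) -> f64 {
    use rand::Rng;
    const M: u64 = 1 << 20;
    let file_mb = (value_log_file_size / M).max(1) as u32;
    // gen_range picks a start point uniformly; subtracting one window keeps the
    // sampled slice inside the file, and a negative skip just means "start at 0".
    (rand::thread_rng().gen_range(0..file_mb) as f64 - window).max(0.0)
}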
         let mut skip_first_m =
-            thread_rng_n((vlog.opt.value_log_file_size / M) as u32) as f64 - window;
+            thread_rng_n((self.opt.value_log_file_size / M) as u32) as f64 - window;
         let mut skipped = 0.0;
         let mut start = SystemTime::now();
-        // assert!(!self.value_log.kv.is_null());
-        let err = lf
-            .clone()
+        // Randomly pick a vlog file for GC
+        let lf = self.pick_log().ok_or(Error::ValueNoRewrite)?;
+        let fid = lf.read().fid;
+        // HACK: reborrow `self` through a raw pointer so the async closure below can capture it
+        let vlog = unsafe { &*(self as *const ValueLogCore as *mut ValueLogCore) };
+        lf.clone()
             .read()
             .iterate_by_offset(0, &mut |entry, vptr| {
-                let vlg = vlog.clone();
+                let kv = vlog.get_kv();
                 let reason = reason.clone();
-                let lfc = lf.clone();
                 Box::pin(async move {
-                    let kv = vlg.get_kv();
-                    let mut reason = reason.lock().await;
+                    let mut reason = reason.lock();
                     let esz = vptr.len as f64 / (1 << 20) as f64; // in MBs, +4 for the CAS stuff.
                     skipped += esz;
                     if skipped < skip_first_m {
+                        // Still inside the randomly skipped prefix
                         return Ok(true);
                     }
                     count += 1;
+                    // TODO: make this yield interval configurable
                     if count % 100 == 0 {
                         tokio::time::sleep(Duration::from_millis(1)).await;
                     }
                     reason.total += esz;
                     if reason.total > window {
-                        return Err("stop iteration".into());
+                        // return Err(Error::StopGC);
+                        return Ok(false);
                     }
                     if start.elapsed().unwrap().as_secs() > 10 {
-                        return Err("stop iteration".into());
+                        // return Err(Error::StopGC);
+                        return Ok(false);
                     }
+                    // Get the latest value from the LSM
                    let vs = kv._get(&entry.key)?;
                     if (vs.meta & MetaBit::BIT_DELETE.bits()) > 0 {
                         // Key has been deleted. Discard.
@@ -1369,22 +1124,25 @@ impl SafeValueLog {
                     assert!(!vs.value.is_empty());
                     let mut vptr = vptr.clone(); // TODO avoid copy
                     vptr.dec(&mut io::Cursor::new(vs.value))?;
-                    if vptr.fid > lfc.read().fid {
+                    if vptr.fid > fid {
                         // Value is present in a later log. Discard.
                         reason.discard += esz;
                         return Ok(true);
                     }
+
                     if vptr.offset > entry.offset {
                         // Value is present in a later offset, but in the same log.
                         reason.discard += esz;
                         return Ok(true);
                     }
-                    if vptr.fid == lfc.read().fid && vptr.offset == entry.offset {
+
+                    if vptr.fid == fid && vptr.offset == entry.offset {
                         // This is still the active entry, This would need to be rewritten.
reason.keep += esz; } else { + // TODO Maybe abort gc process, it should be happen info!("Reason={:?}", reason); - let err = vlg.read_value_bytes(&vptr, |buf| { + let err = vlog.read_value_bytes(&vptr, |buf| { let mut unexpect_entry = Entry::default(); unexpect_entry.dec(&mut io::Cursor::new(buf))?; unexpect_entry.offset = vptr.offset; @@ -1401,29 +1159,28 @@ impl SafeValueLog { Ok(true) }) }) - .await; - - if err.is_err() { - info!( - "Error while iterating for RunGC: {}", - err.as_ref().unwrap_err() - ); - return err; - } - - info!("Fid: {} Data status={:?}", lf.read().fid, reason); - if reason.lock().await.total < 10.0 - || reason.lock().await.discard < gc_threshold * reason.lock().await.total - { + .await?; + let reason = reason.lock(); + info!("Fid: {} Data status={:?}", fid, reason); + if reason.total < 10.0 || reason.discard < gc_threshold * reason.total { info!("Skipping GC on fid: {}", lf.read().fid); return Err(Error::ValueNoRewrite); } info!("REWRITING VLOG {}", lf.read().fid); - vlog.rewrite(lf, vlog.get_kv()).await?; + self.rewrite(lf, self.get_kv()).await?; Ok(()) } } +struct PickVlogsGuardsReadLock<'a> { + vlogs: lock_api::RwLockReadGuard< + 'a, + RawRwLock, + HashMap>>, + >, + fids: Vec, +} + #[test] fn it() {} diff --git a/src/y/mod.rs b/src/y/mod.rs index a45aab7..5c49257 100644 --- a/src/y/mod.rs +++ b/src/y/mod.rs @@ -79,6 +79,10 @@ pub enum Error { ///////////////////////////////// #[error("Not found")] NotFound, + //////////////////////////////// + // GC + #[error("Stop iteration")] + StopGC, } impl Default for Error { From 331aff23fd057a9298741e6e367030528928d95e Mon Sep 17 00:00:00 2001 From: Rg Date: Wed, 12 Apr 2023 21:05:00 +0800 Subject: [PATCH 74/77] :dog: --- src/kv.rs | 88 +++++++++++++++++++++++++++++++++++++++++++----- src/kv_test.rs | 8 +++-- src/value_log.rs | 1 + 3 files changed, 86 insertions(+), 11 deletions(-) diff --git a/src/kv.rs b/src/kv.rs index 9d91231..003ddec 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -91,7 +91,7 @@ pub struct KV { pub write_ch: Channel, // Incremented in the non-concurrently accessed write loop. But also accessed outside. So // we use an atomic op. 
- last_used_cas_counter: AtomicU64, + pub(crate) last_used_cas_counter: AtomicU64, share_lock: tokio::sync::RwLock<()>, // TODO user ctx replace closer ctx: tokio_context::context::Context, @@ -166,7 +166,7 @@ impl KV { value_dir_guard, closers, write_ch: Channel::new(1), - last_used_cas_counter: Default::default(), + last_used_cas_counter: AtomicU64::new(1), mem_st_manger: Arc::new(SkipListManager::new(opt.arena_size() as usize)), share_lock: tokio::sync::RwLock::new(()), ctx, @@ -408,6 +408,7 @@ impl KV { let counter_base = self.new_cas_counter(entries.len() as u64); for (idx, entry) in entries.iter().enumerate() { entry.write().mut_entry().cas_counter = counter_base + idx as u64; + info!("update cas counter: {}", entry.read().entry().cas_counter); } } @@ -562,6 +563,76 @@ impl KV { Ok(reqs) } + pub(crate) async fn batch_set2(&self, entries: Vec) -> Result> { + let mut bad = vec![]; + let mut batch_reqs = vec![]; + let mut b = Some(Request::default()); + let mut count = 0; + let mut sz = 0u64; + for entry in entries { + if entry.key.len() > MAX_KEY_SIZE { + bad.push(entry); + continue; + } + if entry.value.len() as u64 > self.opt.value_log_file_size { + bad.push(entry); + continue; + } + count += 1; + sz += self.opt.estimate_size(&entry) as u64; + + { + b.as_ref() + .unwrap() + .entries + .write() + .push(parking_lot::RwLock::new(EntryType::from(entry))); + b.as_ref().unwrap().ptrs.lock().push(None); + } + + if count >= self.opt.max_batch_count || sz >= self.opt.max_batch_count { + let task_req = b.replace(Request::default()); + batch_reqs.push(task_req.unwrap()); + count = 0; + sz = 0; + } + } + if let Some(req) = b { + if !req.entries.read().is_empty() { + batch_reqs.push(req); + } + } + + let mut reqs = vec![]; + for req in batch_reqs { + if req.entries.read().is_empty() { + break; + } + let arc_req = ArcRequest::from(req); + reqs.push(arc_req.clone()); + assert!(!self.write_ch.is_close()); + info!( + "send tasks to write, entries: {}", + arc_req.get_req().entries.read().len() + ); + self.write_ch.send(arc_req).await.unwrap(); + } + if !bad.is_empty() { + let req = Request::default(); + *req.entries.write() = Vec::from_iter( + bad.into_iter() + .map(|bad| parking_lot::RwLock::new(EntryType::from(bad))) + .into_iter(), + ); + let arc_req = ArcRequest::from(req); + arc_req + .set_err(Err("key too big or value to big".into())) + .await; + reqs.push(arc_req); + } + Ok(reqs) + } + fn write_to_lsm(&self, req: ArcRequest) -> Result<()> { let req = req.get_req(); //.entries.read(); let ptrs = req.ptrs.lock(); @@ -633,8 +704,7 @@ impl KV { } fn new_cas_counter(&self, how_many: u64) -> u64 { - self.last_used_cas_counter - .fetch_add(how_many, Ordering::Relaxed) + self.last_used_cas_counter.fetch_add(how_many, Ordering::Relaxed) + 1 } async fn ensure_room_for_write(&self) -> Result<()> { @@ -971,7 +1041,7 @@ impl ArcKV { self.must_vlog().incr_iterator_count(); // Create iterators across all the tables involved first. - let mut itrs: Vec>> = vec![]; + let mut itrs: Vec>> = vec![]; for tb in tables.clone() { let st = unsafe { tb.as_ref().unwrap().clone() }; let iter = Box::new(UniIterator::new(st, opt.reverse)); @@ -986,8 +1056,10 @@ impl ArcKV { impl ArcKV { async fn do_writes(&self, lc: Closer, without_close_write_ch: bool) { info!("start do writes task!"); - defer! {info!("exit writes task!")}; - defer! {lc.done()}; + defer! {info!("exit writes task!")} + ; + defer! 
{lc.done()} + ; // TODO add metrics let has_been_close = lc.has_been_closed(); let write_ch = self.write_ch.clone(); @@ -1060,7 +1132,7 @@ impl ArcKV { pub(crate) async fn yield_item_value( &self, item: KVItemInner, - mut consumer: impl FnMut(Vec) -> Pin> + Send>>, + mut consumer: impl FnMut(Vec) -> Pin> + Send>>, ) -> Result<()> { // no value if !item.has_value() { diff --git a/src/kv_test.rs b/src/kv_test.rs index e69eaac..2fc3d84 100644 --- a/src/kv_test.rs +++ b/src/kv_test.rs @@ -2,6 +2,7 @@ use log::info; use log::kv::ToValue; use std::env::temp_dir; use std::io::Write; +use std::sync::atomic::Ordering; use std::time::Duration; use crate::iterator::IteratorOptions; @@ -120,9 +121,10 @@ async fn t_cas() { } for i in 0..n { - let key = format!("{}", i).as_bytes().to_vec(); - let value = format!("{}", i).as_bytes().to_vec(); - let mut cc = entries[i].cas_counter; + let key = format!("{}", i).into_bytes(); + let value = format!("{}", i).into_bytes(); + let mut cc = items[i].read().await.counter(); + println!("counter: {}", cc); if cc == 5 { cc = 6; } else { diff --git a/src/value_log.rs b/src/value_log.rs index 7013f79..6ebe542 100644 --- a/src/value_log.rs +++ b/src/value_log.rs @@ -374,6 +374,7 @@ pub struct Request { pub(crate) entries: RwLock>>, // Output Values and wait group stuff below pub(crate) ptrs: Mutex>>, + // The res not same to EntryType.1 error pub(crate) res: Channel>, } From 18f319b872359fce6c79bb2740ddd355b5ecada5 Mon Sep 17 00:00:00 2001 From: Rg Date: Thu, 13 Apr 2023 01:43:52 +0800 Subject: [PATCH 75/77] :dog: sleep --- src/kv.rs | 267 +++++++++++++++++++++-------------------------- src/kv_test.rs | 8 +- src/value_log.rs | 46 ++++---- 3 files changed, 153 insertions(+), 168 deletions(-) diff --git a/src/kv.rs b/src/kv.rs index 003ddec..5bf99ad 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -360,10 +360,9 @@ impl KV { entry.key = key; entry.value = value; entry.user_meta = user_meta; - let res = self.batch_set(vec![entry]).await?; + let res = self.batch_set(vec![entry]).await; assert_eq!(res.len(), 1); - let first = res.first().unwrap().get_req(); - first.get_resp().await + res[0].to_owned() } // Returns the current `mem_tables` and get references. 
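// A minimal sketch of the `set`/`batch_set` pattern above: a single `set` is
// just a one-element batch, and `batch_set` returns one `Result` per entry in
// submission order. The free function and its use of crate-private fields are
// assumptions for illustration.
async fn set_one(kv: &KV, key: Vec<u8>, value: Vec<u8>) -> Result<()> {
    let mut entry = Entry::default();
    entry.key = key;
    entry.value = value;
    // One entry in, one result out; surfacing slot 0 is the whole contract.
    let mut res = kv.batch_set(vec![entry]).await;
    debug_assert_eq!(res.len(), 1);
    res.remove(0)
}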
@@ -413,7 +412,11 @@ impl KV { } // TODO add error set - self.vlog.as_ref().unwrap().write(reqs.clone())?; + if let Err(err) = self.vlog.as_ref().unwrap().write(reqs.clone()) { + for req in reqs.iter() { + req.set_err(Err(err.clone())); + } + } info!("Writing to memory table"); let mut count = 0; @@ -426,7 +429,11 @@ impl KV { tokio::time::sleep(Duration::from_millis(10)).await; } info!("waiting for write"); - self.write_to_lsm(req.clone())?; + if let Err(err) = self.write_to_lsm(req.clone()) { + req.set_err(Err(err)); + } else { + req.set_err(Ok(())); + } self.update_offset(req.get_req().ptrs.lock()); } info!("{} entries written", count); @@ -493,144 +500,120 @@ impl KV { // Check(e.Error); // } // TODO - pub(crate) async fn batch_set(&self, entries: Vec) -> Result> { - let mut bad = vec![]; - let mut batch_reqs = vec![]; - let mut b = Some(Request::default()); - let mut count = 0; - let mut sz = 0u64; - for entry in entries { - if entry.key.len() > MAX_KEY_SIZE { - bad.push(entry); - continue; - } - if entry.value.len() as u64 > self.opt.value_log_file_size { - bad.push(entry); - continue; - } - count += 1; - sz += self.opt.estimate_size(&entry) as u64; - - { - b.as_ref() - .unwrap() - .entries - .write() - .push(parking_lot::RwLock::new(EntryType::from(entry))); - b.as_ref().unwrap().ptrs.lock().push(None); - } - - if count >= self.opt.max_batch_count || sz >= self.opt.max_batch_count { - let task_req = b.replace(Request::default()); - batch_reqs.push(task_req.unwrap()); - count = 0; - sz = 0; - } - } - if let Some(req) = b { - if !req.entries.read().is_empty() { - batch_reqs.push(req); - } - } - - let mut reqs = vec![]; - for req in batch_reqs { - if req.entries.read().is_empty() { - break; - } - let arc_req = ArcRequest::from(req); - reqs.push(arc_req.clone()); - assert!(!self.write_ch.is_close()); - info!( - "send tasks to write, entries: {}", - arc_req.get_req().entries.read().len() - ); - self.write_ch.send(arc_req).await.unwrap(); - } - if !bad.is_empty() { - let req = Request::default(); - *req.entries.write() = Vec::from_iter( - bad.into_iter() - .map(|bad| parking_lot::RwLock::new(EntryType::from(bad))) - .into_iter(), - ); - let arc_req = ArcRequest::from(req); - arc_req - .set_err(Err("key too big or value to big".into())) - .await; - reqs.push(arc_req); - } - Ok(reqs) - } - - pub(crate) async fn batch_set2(&self, entries: Vec) -> Result> { - let mut bad = vec![]; - let mut batch_reqs = vec![]; - let mut b = Some(Request::default()); + // pub(crate) async fn batch_set(&self, entries: Vec) -> Result> { + // let mut bad = vec![]; + // let mut batch_reqs = vec![]; + // let mut b = Some(Request::default()); + // let mut count = 0; + // let mut sz = 0u64; + // for entry in entries { + // if entry.key.len() > MAX_KEY_SIZE { + // bad.push(entry); + // continue; + // } + // if entry.value.len() as u64 > self.opt.value_log_file_size { + // bad.push(entry); + // continue; + // } + // count += 1; + // sz += self.opt.estimate_size(&entry) as u64; + // + // { + // b.as_ref() + // .unwrap() + // .entries + // .write() + // .push(parking_lot::RwLock::new(EntryType::from(entry))); + // b.as_ref().unwrap().ptrs.lock().push(None); + // } + // + // if count >= self.opt.max_batch_count || sz >= self.opt.max_batch_count { + // let task_req = b.replace(Request::default()); + // batch_reqs.push(task_req.unwrap()); + // count = 0; + // sz = 0; + // } + // } + // if let Some(req) = b { + // if !req.entries.read().is_empty() { + // batch_reqs.push(req); + // } + // } + // + // let mut reqs = vec![]; + 
// for req in batch_reqs { + // if req.entries.read().is_empty() { + // break; + // } + // let arc_req = ArcRequest::from(req); + // reqs.push(arc_req.clone()); + // assert!(!self.write_ch.is_close()); + // info!( + // "send tasks to write, entries: {}", + // arc_req.get_req().entries.read().len() + // ); + // self.write_ch.send(arc_req).await.unwrap(); + // } + // if !bad.is_empty() { + // let req = Request::default(); + // *req.entries.write() = Vec::from_iter( + // bad.into_iter() + // .map(|bad| parking_lot::RwLock::new(EntryType::from(bad))) + // .into_iter(), + // ); + // let arc_req = ArcRequest::from(req); + // arc_req + // .set_err(Err("key too big or value to big".into())) + // .await; + // reqs.push(arc_req); + // } + // Ok(reqs) + // } + pub(crate) async fn batch_set(&self, entries: Vec) -> Vec> { let mut count = 0; let mut sz = 0u64; + let mut res = vec![]; + let mut req = ArcRequest::from(Request::default()); for entry in entries { if entry.key.len() > MAX_KEY_SIZE { - bad.push(entry); + res.push(Err("Key too big".into())); continue; } if entry.value.len() as u64 > self.opt.value_log_file_size { - bad.push(entry); + res.push(Err("Value to big".into())); continue; } count += 1; sz += self.opt.estimate_size(&entry) as u64; - { - b.as_ref() - .unwrap() - .entries - .write() - .push(parking_lot::RwLock::new(EntryType::from(entry))); - b.as_ref().unwrap().ptrs.lock().push(None); - } + req.req_ref() + .entries + .write() + .push(parking_lot::RwLock::new(EntryType::from(entry))); + req.req_ref().ptrs.lock().push(None); if count >= self.opt.max_batch_count || sz >= self.opt.max_batch_count { - let task_req = b.replace(Request::default()); - batch_reqs.push(task_req.unwrap()); + assert!(!self.write_ch.is_close()); + info!( + "send tasks to write, entries: {}", + req.get_req().entries.read().len() + ); + self.write_ch.send(req.clone()).await.unwrap(); count = 0; sz = 0; + let errs = req.get_req().get_errs(); + res.extend(errs.into_iter()); + req.req_ref().entries.write().clear(); + req.req_ref().ptrs.lock().clear(); } } - if let Some(req) = b { - if !req.entries.read().is_empty() { - batch_reqs.push(req); - } - } - - let mut reqs = vec![]; - for req in batch_reqs { - if req.entries.read().is_empty() { - break; - } - let arc_req = ArcRequest::from(req); - reqs.push(arc_req.clone()); - assert!(!self.write_ch.is_close()); - info!( - "send tasks to write, entries: {}", - arc_req.get_req().entries.read().len() - ); - self.write_ch.send(arc_req).await.unwrap(); - } - if !bad.is_empty() { - let req = Request::default(); - *req.entries.write() = Vec::from_iter( - bad.into_iter() - .map(|bad| parking_lot::RwLock::new(EntryType::from(bad))) - .into_iter(), - ); - let arc_req = ArcRequest::from(req); - arc_req - .set_err(Err("key too big or value to big".into())) - .await; - reqs.push(arc_req); + if !req.req_ref().entries.read().is_empty() { + self.write_ch.send(req.clone()).await.unwrap(); + res.extend(req.get_req().get_errs().into_iter()); + req.req_ref().entries.write().clear(); + req.req_ref().ptrs.lock().clear(); } - Ok(reqs) + res } fn write_to_lsm(&self, req: ArcRequest) -> Result<()> { @@ -704,7 +687,9 @@ impl KV { } fn new_cas_counter(&self, how_many: u64) -> u64 { - self.last_used_cas_counter.fetch_add(how_many, Ordering::Relaxed) + 1 + self.last_used_cas_counter + .fetch_add(how_many, Ordering::Relaxed) + + 1 } async fn ensure_room_for_write(&self) -> Result<()> { @@ -874,13 +859,7 @@ impl ArcKV { /// Batch set entries, returns result sets pub async fn batch_set(&self, entries: Vec) -> 
Vec> { - let reqs = self.to_ref().batch_set(entries).await.unwrap(); - let mut res = Vec::with_capacity(reqs.len()); - for req in reqs { - let ret = req.get_resp().await; - res.push(ret); - } - res + self.to_ref().batch_set(entries).await } /// CompareAndSetAsync is the asynchronous version of CompareAndSet. It accepts a callback function @@ -1041,7 +1020,7 @@ impl ArcKV { self.must_vlog().incr_iterator_count(); // Create iterators across all the tables involved first. - let mut itrs: Vec>> = vec![]; + let mut itrs: Vec>> = vec![]; for tb in tables.clone() { let st = unsafe { tb.as_ref().unwrap().clone() }; let iter = Box::new(UniIterator::new(st, opt.reverse)); @@ -1056,10 +1035,8 @@ impl ArcKV { impl ArcKV { async fn do_writes(&self, lc: Closer, without_close_write_ch: bool) { info!("start do writes task!"); - defer! {info!("exit writes task!")} - ; - defer! {lc.done()} - ; + defer! {info!("exit writes task!")}; + defer! {lc.done()}; // TODO add metrics let has_been_close = lc.has_been_closed(); let write_ch = self.write_ch.clone(); @@ -1101,10 +1078,9 @@ impl ArcKV { }; if !to_reqs.is_empty() { - let res = self.write_requests(to_reqs.clone()).await; - for req in to_reqs.clone().to_vec() { - req.set_err(res.clone()).await; - } + self.write_requests(to_reqs.clone()) + .await + .expect("TODO: panic message"); } } @@ -1121,10 +1097,9 @@ impl ArcKV { } reqs.lock().push(req.unwrap()); let to_reqs = to_reqs(); - let res = self.write_requests(to_reqs.clone()).await; - for req in to_reqs.clone().to_vec() { - req.set_err(res.clone()).await; - } + self.write_requests(to_reqs.clone()) + .await + .expect("TODO: panic message"); } } @@ -1132,7 +1107,7 @@ impl ArcKV { pub(crate) async fn yield_item_value( &self, item: KVItemInner, - mut consumer: impl FnMut(Vec) -> Pin> + Send>>, + mut consumer: impl FnMut(Vec) -> Pin> + Send>>, ) -> Result<()> { // no value if !item.has_value() { diff --git a/src/kv_test.rs b/src/kv_test.rs index 2fc3d84..b2a7b28 100644 --- a/src/kv_test.rs +++ b/src/kv_test.rs @@ -27,10 +27,10 @@ async fn t_write() { let kv = KV::open(get_test_option(&dir)).await; let kv = kv.unwrap(); let res = kv.set(b"hello".to_vec(), b"word".to_vec(), 10).await; - assert!(res.is_ok()); - let got = kv._get(b"hello"); - assert!(got.is_ok()); - assert_eq!(&got.unwrap().value, b"word"); + // assert!(res.is_ok()); + // let got = kv._get(b"hello"); + // assert!(got.is_ok()); + // assert_eq!(&got.unwrap().value, b"word"); } #[tokio::test] diff --git a/src/value_log.rs b/src/value_log.rs index 6ebe542..8cc279c 100644 --- a/src/value_log.rs +++ b/src/value_log.rs @@ -374,8 +374,6 @@ pub struct Request { pub(crate) entries: RwLock>>, // Output Values and wait group stuff below pub(crate) ptrs: Mutex>>, - // The res not same to EntryType.1 error - pub(crate) res: Channel>, } impl Default for Request { @@ -383,18 +381,33 @@ impl Default for Request { Request { entries: Default::default(), ptrs: Default::default(), - res: Channel::new(1), } } } impl Request { - pub(crate) async fn get_resp(&self) -> Result<()> { - self.res.recv().await.unwrap() + pub(crate) fn set_entries_resp(&self, ret: Result<()>) { + for entry in self.entries.write().iter_mut() { + info!("set resp"); + entry.get_mut().set_resp(ret.clone()); + } } - pub(crate) async fn set_resp(&self, ret: Result<()>) { - self.res.send(ret).await.unwrap() + pub async fn get_first_err(&self) -> Result<()> { + for entry in self.entries.read().iter() { + if let Err(err) = entry.read().as_ref().right().unwrap() { + return Err(err.clone()); + } + } + Ok(()) + 
} + + pub fn get_errs(&self) -> Vec> { + let mut res = vec![]; + for entry in self.entries.read().iter() { + res.push(entry.read().as_ref().right().unwrap().clone()); + } + res } } @@ -402,7 +415,7 @@ impl Request { /// Eg: compare_and_set #[derive(Clone)] pub struct ArcRequest { - inner: Arc, + pub(crate) inner: Arc, } impl std::fmt::Debug for ArcRequest { @@ -425,16 +438,16 @@ impl ArcRequest { } pub async fn is_ok(&self) -> bool { - let resp = self.get_req().get_resp().await; + let resp = self.get_req().get_first_err().await; resp.is_ok() } pub async fn get_resp(&self) -> Result<()> { - self.get_req().get_resp().await + self.get_req().get_first_err().await } - pub(crate) async fn set_err(&self, err: Result<()>) { - self.inner.res.send(err).await.expect("TODO: panic message"); + pub fn set_err(&self, err: Result<()>) { + self.inner.set_entries_resp(err); } pub(crate) fn get_req(&self) -> Arc { @@ -800,7 +813,6 @@ impl ValueLogCore { .unwrap() .write(self.buf.read().buffer())?; // todo add metrics - info!("Done"); self.writable_log_offset .fetch_add(n as u32, Ordering::Release); self.buf.write().get_mut().clear(); @@ -827,10 +839,8 @@ impl ValueLogCore { && entry.read().entry().value.len() < self.opt.value_threshold { // No need to write to value log. - info!("ptrs {}", req.ptrs.lock().len()); + // WARN: if mt not flush into disk but process abort, that will discard data(the data not write into vlog that WAL file) req.ptrs.lock()[idx] = None; - info!("to disk~"); - continue; } @@ -840,7 +850,7 @@ impl ValueLogCore { ptr.offset = self.writable_log_offset.load(Ordering::Acquire) + self.buf.read().buffer().len() as u32; let mut buf = self.buf.write(); - entry.write().entry().enc(&mut *buf)?; + entry.write().entry().enc(&mut *buf).unwrap(); } } to_disk() @@ -950,7 +960,7 @@ impl ValueLogCore { count ); info!("REWRITE: Removing fid: {}", lf.read().fid); - kv.batch_set(write_batch).await?; + kv.batch_set(write_batch).await; info!("REWRITE: Processed {} entries in total", count); info!("REWRITE: Removing fid: {}", lf.read().fid); let mut deleted_file_now = false; From e26c011790b2803bd352cad4f8a9dc903bebd519 Mon Sep 17 00:00:00 2001 From: Rg Date: Thu, 13 Apr 2023 21:30:10 +0800 Subject: [PATCH 76/77] :dog: --- src/kv.rs | 189 +++++++++++++++++------------------------------ src/types.rs | 4 + src/value_log.rs | 146 +++++++++++++++--------------------- 3 files changed, 129 insertions(+), 210 deletions(-) diff --git a/src/kv.rs b/src/kv.rs index 5bf99ad..064c054 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -7,7 +7,7 @@ use crate::table::iterator::{IteratorImpl, IteratorItem}; use crate::table::table::{new_file_name, Table, TableCore}; use crate::types::{ArcMx, Channel, Closer, TArcMx, TArcRW, XArc, XWeak}; use crate::value_log::{ - ArcRequest, Entry, EntryPair, EntryType, MetaBit, Request, ValueLogCore, ValuePointer, + ArcRequest, Entry, EntryType, MetaBit, Request, ValueLogCore, ValuePointer, MAX_KEY_SIZE, }; use crate::y::{ @@ -43,7 +43,7 @@ use std::time::Duration; use std::{string, vec}; use tokio::fs::create_dir_all; use tokio::io::{AsyncReadExt, AsyncWriteExt}; -use tokio::sync::{RwLock, RwLockWriteGuard}; +use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; pub const _BADGER_PREFIX: &[u8; 8] = b"!badger!"; // Prefix for internal keys used by badger. @@ -356,11 +356,7 @@ impl KV { // alongside the key, and can be used as an aid to interpret the value or store other contextual // bits corresponding to the key-value pair. 
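// A minimal sketch of one way to use the `user_meta` byte described above:
// callers can tag how the value is encoded and pick a decoder on read. The tag
// constants and helper are assumptions, not crate definitions.
const META_RAW: u8 = 0;
const META_JSON: u8 = 1;
async fn put_json(kv: &KV, key: Vec<u8>, json: String) -> Result<()> {
    // The tag is stored alongside the key, so readers know this value is JSON.
    kv.set(key, json.into_bytes(), META_JSON).await
}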
pub(crate) async fn set(&self, key: Vec, value: Vec, user_meta: u8) -> Result<()> { - let mut entry = Entry::default(); - entry.key = key; - entry.value = value; - entry.user_meta = user_meta; - let res = self.batch_set(vec![entry]).await; + let res = self.batch_set(vec![Entry::default().key(key).value(value).user_meta(user_meta)]).await; assert_eq!(res.len(), 1); res[0].to_owned() } @@ -403,38 +399,39 @@ impl KV { // There is code (in flush_mem_table) whose correctness depends on us generating CAS Counter // values _before_ we modify s.vptr here. for req in reqs.iter() { - let entries = req.req_ref().entries.read(); + let entries = req.req_ref().entries.read().await; let counter_base = self.new_cas_counter(entries.len() as u64); for (idx, entry) in entries.iter().enumerate() { - entry.write().mut_entry().cas_counter = counter_base + idx as u64; - info!("update cas counter: {}", entry.read().entry().cas_counter); + entry.write().await.mut_entry().cas_counter = counter_base + idx as u64; + info!("update cas counter: {}", entry.read().await.entry().cas_counter); } } // TODO add error set - if let Err(err) = self.vlog.as_ref().unwrap().write(reqs.clone()) { + if let Err(err) = self.vlog.as_ref().unwrap().write(reqs.clone()).await { for req in reqs.iter() { - req.set_err(Err(err.clone())); + req.set_err(Err(err.clone())).await; } + return Err(err); } info!("Writing to memory table"); let mut count = 0; for req in reqs.iter() { - if req.get_req().entries.read().is_empty() { + if req.get_req().entries.read().await.is_empty() { continue; } - count += req.get_req().entries.read().len(); + count += req.get_req().entries.read().await.len(); while let Err(err) = self.ensure_room_for_write().await { tokio::time::sleep(Duration::from_millis(10)).await; } info!("waiting for write"); - if let Err(err) = self.write_to_lsm(req.clone()) { - req.set_err(Err(err)); + if let Err(err) = self.write_to_lsm(req.clone()).await { + req.set_err(Err(err)).await; } else { - req.set_err(Ok(())); + req.set_err(Ok(())).await; } - self.update_offset(req.get_req().ptrs.lock()); + self.update_offset(req.get_req().ptrs.read().await); } info!("{} entries written", count); Ok(()) @@ -499,137 +496,85 @@ impl KV { // for e in entries { // Check(e.Error); // } - // TODO - // pub(crate) async fn batch_set(&self, entries: Vec) -> Result> { - // let mut bad = vec![]; - // let mut batch_reqs = vec![]; - // let mut b = Some(Request::default()); - // let mut count = 0; - // let mut sz = 0u64; - // for entry in entries { - // if entry.key.len() > MAX_KEY_SIZE { - // bad.push(entry); - // continue; - // } - // if entry.value.len() as u64 > self.opt.value_log_file_size { - // bad.push(entry); - // continue; - // } - // count += 1; - // sz += self.opt.estimate_size(&entry) as u64; - // - // { - // b.as_ref() - // .unwrap() - // .entries - // .write() - // .push(parking_lot::RwLock::new(EntryType::from(entry))); - // b.as_ref().unwrap().ptrs.lock().push(None); - // } - // - // if count >= self.opt.max_batch_count || sz >= self.opt.max_batch_count { - // let task_req = b.replace(Request::default()); - // batch_reqs.push(task_req.unwrap()); - // count = 0; - // sz = 0; - // } - // } - // if let Some(req) = b { - // if !req.entries.read().is_empty() { - // batch_reqs.push(req); - // } - // } - // - // let mut reqs = vec![]; - // for req in batch_reqs { - // if req.entries.read().is_empty() { - // break; - // } - // let arc_req = ArcRequest::from(req); - // reqs.push(arc_req.clone()); - // assert!(!self.write_ch.is_close()); - // info!( - // 
"send tasks to write, entries: {}", - // arc_req.get_req().entries.read().len() - // ); - // self.write_ch.send(arc_req).await.unwrap(); - // } - // if !bad.is_empty() { - // let req = Request::default(); - // *req.entries.write() = Vec::from_iter( - // bad.into_iter() - // .map(|bad| parking_lot::RwLock::new(EntryType::from(bad))) - // .into_iter(), - // ); - // let arc_req = ArcRequest::from(req); - // arc_req - // .set_err(Err("key too big or value to big".into())) - // .await; - // reqs.push(arc_req); - // } - // Ok(reqs) - // } pub(crate) async fn batch_set(&self, entries: Vec) -> Vec> { let mut count = 0; let mut sz = 0u64; - let mut res = vec![]; + let mut res = vec![Ok(()); entries.len()]; let mut req = ArcRequest::from(Request::default()); - for entry in entries { + let mut req_index = vec![]; + + for (i, entry) in entries.into_iter().enumerate() { if entry.key.len() > MAX_KEY_SIZE { - res.push(Err("Key too big".into())); + res[i] = Err("Key too big".into()); continue; } if entry.value.len() as u64 > self.opt.value_log_file_size { - res.push(Err("Value to big".into())); + res[i] = Err("Value to big".into()); continue; } - count += 1; - sz += self.opt.estimate_size(&entry) as u64; - - req.req_ref() - .entries - .write() - .push(parking_lot::RwLock::new(EntryType::from(entry))); - req.req_ref().ptrs.lock().push(None); + { + count += 1; + sz += self.opt.estimate_size(&entry) as u64; + req.req_ref() + .entries + .write() + .await + .push(tokio::sync::RwLock::new(EntryType::from(entry))); + req.req_ref().ptrs.write().await.push(None); + req_index.push(i); + } if count >= self.opt.max_batch_count || sz >= self.opt.max_batch_count { assert!(!self.write_ch.is_close()); info!( "send tasks to write, entries: {}", - req.get_req().entries.read().len() + req.get_req().entries.read().await.len() ); self.write_ch.send(req.clone()).await.unwrap(); - count = 0; - sz = 0; - let errs = req.get_req().get_errs(); - res.extend(errs.into_iter()); - req.req_ref().entries.write().clear(); - req.req_ref().ptrs.lock().clear(); + { + count = 0; + sz = 0; + for (index, err) in req.get_req().get_errs().await.into_iter().enumerate() { + let entry_index = req_index[index]; + res[entry_index] = err; + } + req.req_ref().entries.write().await.clear(); + req.req_ref().ptrs.write().await.clear(); + req_index.clear(); + } } } - if !req.req_ref().entries.read().is_empty() { + if !req.req_ref().entries.read().await.is_empty() { self.write_ch.send(req.clone()).await.unwrap(); - res.extend(req.get_req().get_errs().into_iter()); - req.req_ref().entries.write().clear(); - req.req_ref().ptrs.lock().clear(); + { + count = 0; + sz = 0; + for (index, err) in req.get_req().get_errs().await.into_iter().enumerate() { + let entry_index = req_index[index]; + res[entry_index] = err; + } + req.req_ref().entries.write().await.clear(); + req.req_ref().ptrs.write().await.clear(); + req_index.clear(); + } } res } - fn write_to_lsm(&self, req: ArcRequest) -> Result<()> { + async fn write_to_lsm(&self, req: ArcRequest) -> Result<()> { let req = req.get_req(); //.entries.read(); - let ptrs = req.ptrs.lock(); - let entries = req.entries.read(); + let ptrs = req.ptrs.read().await; + let entries = req.entries.read().await; assert_eq!(entries.len(), ptrs.len()); for (i, pair) in entries.iter().enumerate() { - let mut entry_pair = pair.write(); + let mut entry_pair = pair.write().await; let entry = entry_pair.entry(); if entry.cas_counter_check != 0 { let old_value = self._get(&entry.key)?; // No need to decode existing value. 
Just need old CAS counter. if old_value.cas_counter != entry.cas_counter_check { - entry_pair.set_resp(Err(Error::ValueCasMisMatch)); + entry_pair.set_resp(Err(Error::ValueCasMisMatch)).await; continue; } } @@ -639,7 +584,7 @@ impl KV { let exits = self._exists(&entry.key)?; // Value already exists. don't write. if exits { - entry_pair.set_resp(Err(Error::ValueKeyExists)); + entry_pair.set_resp(Err(Error::ValueKeyExists)).await; continue; } } @@ -728,7 +673,7 @@ impl KV { entry.value.len() < self.opt.value_threshold } - fn update_offset(&self, ptrs: MutexGuard>>) { + fn update_offset(&self, ptrs: RwLockReadGuard>>) { let mut ptr = &ValuePointer::default(); for tmp_ptr in ptrs.iter().rev() { if tmp_ptr.is_none() || tmp_ptr.as_ref().unwrap().is_zero() { @@ -1020,7 +965,7 @@ impl ArcKV { self.must_vlog().incr_iterator_count(); // Create iterators across all the tables involved first. - let mut itrs: Vec>> = vec![]; + let mut itrs: Vec>> = vec![]; for tb in tables.clone() { let st = unsafe { tb.as_ref().unwrap().clone() }; let iter = Box::new(UniIterator::new(st, opt.reverse)); @@ -1035,8 +980,8 @@ impl ArcKV { impl ArcKV { async fn do_writes(&self, lc: Closer, without_close_write_ch: bool) { info!("start do writes task!"); - defer! {info!("exit writes task!")}; - defer! {lc.done()}; + defer! {info!("exit writes task!")} + defer! {lc.done()} // TODO add metrics let has_been_close = lc.has_been_closed(); let write_ch = self.write_ch.clone(); @@ -1107,7 +1052,7 @@ impl ArcKV { pub(crate) async fn yield_item_value( &self, item: KVItemInner, - mut consumer: impl FnMut(Vec) -> Pin> + Send>>, + mut consumer: impl FnMut(Vec) -> Pin> + Send>>, ) -> Result<()> { // no value if !item.has_value() { diff --git a/src/types.rs b/src/types.rs index 3fea7a8..b8fd48f 100644 --- a/src/types.rs +++ b/src/types.rs @@ -75,6 +75,10 @@ impl Channel { self.tx.as_ref().unwrap().clone() } + pub fn rx(&self) -> Receiver { + self.rx.as_ref().unwrap().clone() + } + /// consume tx and return it if exist pub fn take_tx(&mut self) -> Option> { self.tx.take() diff --git a/src/value_log.rs b/src/value_log.rs index 8cc279c..b6b4266 100644 --- a/src/value_log.rs +++ b/src/value_log.rs @@ -1,11 +1,11 @@ -use async_channel::RecvError; +use async_channel::{Receiver, RecvError, Sender}; use awaitgroup::{WaitGroup, Worker}; use bitflags::bitflags; use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use crc32fast::Hasher; use drop_cell::defer; use either::Either; -use libc::memchr; +use libc::{endgrent, memchr}; use log::info; use log::kv::Source; use memmap::{Mmap, MmapMut}; @@ -106,7 +106,7 @@ impl Decode for Header { /// Entry provides Key, Value and if required, cas_counter_check to kv.batch_set() API. /// If cas_counter_check is provided, it would be compared against the current `cas_counter` /// assigned to this key-value. Set be done on this key only if the counters match. 
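// A minimal sketch of the compare-and-set contract documented above, assuming
// an in-crate caller (the fields and `_get` are crate-private) and `kv: &KV`:
async fn cas_update(kv: &KV, key: Vec<u8>, new_value: Vec<u8>) -> Result<()> {
    // Remember the CAS counter of the version we read.
    let current = kv._get(&key)?;
    let mut entry = Entry::default();
    entry.key = key;
    entry.value = new_value;
    // The write lands only if the key still carries the counter we saw;
    // otherwise it fails with Error::ValueCasMisMatch.
    entry.cas_counter_check = current.cas_counter;
    kv.batch_set(vec![entry]).await.remove(0)
}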
-#[derive(Default, Clone)] +#[derive(Default, Clone, Debug)] pub struct Entry { pub(crate) key: Vec, pub(crate) meta: u8, @@ -290,92 +290,59 @@ impl Decode for ValuePointer { } } -pub(crate) struct EntryType(Either>); +// pub(crate) struct EntryType(Either>>); + +pub(crate) struct EntryType { + entry: Entry, + fut_ch: Channel>, +} impl EntryType { pub(crate) fn entry(&self) -> &Entry { - match self.0 { - Either::Left(ref entry) => entry, - _ => panic!("It should be not happen"), - } + &self.entry } pub(crate) fn mut_entry(&mut self) -> &mut Entry { - match self.0 { - Either::Left(ref mut entry) => entry, - _ => panic!("It should be not happen"), - } + &mut self.entry } - pub(crate) fn ret(&self) -> &Result<()> { - match self.0 { - Either::Right(ref m) => m, - _ => panic!("It should be not happen"), - } + pub(crate) fn ret(&self) -> Receiver> { + self.fut_ch.rx() } - pub(crate) fn set_resp(&mut self, ret: Result<()>) { - self.0 = Either::Right(ret); + pub(crate) async fn set_resp(&mut self, ret: Result<()>) { + self.fut_ch.tx().send(ret).await.unwrap(); } } -impl Deref for EntryType { - type Target = Either>; - - fn deref(&self) -> &Self::Target { - &self.0 - } -} +// impl Deref for EntryType { +// type Target = Either>; +// +// fn deref(&self) -> &Self::Target { +// &self.0 +// } +// } impl From for EntryType { fn from(value: Entry) -> Self { - Self(Either::Left(value)) - } -} - -impl From> for EntryType { - fn from(value: Result<()>) -> Self { - Self(Either::Right(value)) - } -} - -pub(crate) struct EntryPair { - entry: Entry, - ret: RwLock>, -} - -impl EntryPair { - pub(crate) fn new(entry: Entry) -> Self { - EntryPair { - entry, - ret: RwLock::new(Ok(())), + Self { + entry: value, + fut_ch: Channel::new(1), } } - - pub(crate) fn set_resp(&self, ret: Result<()>) { - *self.ret.write() = ret - } - - pub(crate) fn entry(&self) -> &Entry { - &self.entry - } - - pub(crate) fn mut_entry(&mut self) -> &mut Entry { - &mut self.entry - } - - pub(crate) fn resp(&self) -> Result<()> { - self.ret.read().clone() - } } pub struct Request { // Input values, NOTE: RefCell is called concurrency - pub(crate) entries: RwLock>>, + pub(crate) entries: tokio::sync::RwLock>>, // Output Values and wait group stuff below - pub(crate) ptrs: Mutex>>, + pub(crate) ptrs: tokio::sync::RwLock>>, } +// unsafe impl Send for Request {} +// +// unsafe impl Sync for Request {} + impl Default for Request { fn default() -> Self { Request { @@ -386,26 +353,27 @@ impl Default for Request { } impl Request { - pub(crate) fn set_entries_resp(&self, ret: Result<()>) { - for entry in self.entries.write().iter_mut() { + pub(crate) async fn set_entries_resp(&self, ret: Result<()>) { + for entry in self.entries.write().await.iter_mut() { info!("set resp"); - entry.get_mut().set_resp(ret.clone()); + entry.get_mut().set_resp(ret.clone()).await; } } pub async fn get_first_err(&self) -> Result<()> { - for entry in self.entries.read().iter() { - if let Err(err) = entry.read().as_ref().right().unwrap() { - return Err(err.clone()); - } + let ret = self.entries.read().await; + if let Some(ret) = ret.get(0) { + ret.read().await.ret().recv().await.unwrap() + }else { + Ok(()) } - Ok(()) } - pub fn get_errs(&self) -> Vec> { + pub async fn get_errs(&self) -> Vec> { let mut res = vec![]; - for entry in self.entries.read().iter() { - res.push(entry.read().as_ref().right().unwrap().clone()); + for entry in self.entries.read().await.iter() { + let ret = entry.read().await.ret().recv().await.unwrap(); + res.push(ret); } res } @@ -446,7 +414,7 @@ impl 
ArcRequest { self.get_req().get_first_err().await } - pub fn set_err(&self, err: Result<()>) { + pub async fn set_err(&self, err: Result<()>) { self.inner.set_entries_resp(err); } @@ -632,7 +600,7 @@ impl ValueLogCore { vp.offset, self.writable_log_offset.load(Ordering::Acquire) ) - .into()); + .into()); } self.read_value_bytes(vp, |buffer| { @@ -651,7 +619,7 @@ impl ValueLogCore { pub async fn async_read( &self, vp: &ValuePointer, - consumer: impl FnMut(Vec) -> Pin> + Send>>, + consumer: impl FnMut(Vec) -> Pin> + Send>>, ) -> Result<()> { // Check for valid offset if we are reading to writable log. if vp.fid == self.max_fid.load(Ordering::Acquire) @@ -662,7 +630,7 @@ impl ValueLogCore { vp.offset, self.writable_log_offset.load(Ordering::Acquire) ) - .into()); + .into()); } self.async_read_bytes(vp, consumer).await?; Ok(()) @@ -675,7 +643,7 @@ impl ValueLogCore { mut f: impl for<'a> FnMut( &'a Entry, &'a ValuePointer, - ) -> Pin> + 'a>>, + ) -> Pin> + 'a>>, ) -> Result<()> { let vlogs = self.pick_log_guard(); info!("Seeking at value pointer: {:?}", vp); @@ -779,7 +747,7 @@ impl ValueLogCore { async fn async_read_bytes( &self, vp: &ValuePointer, - mut consumer: impl FnMut(Vec) -> Pin> + Send>>, + mut consumer: impl FnMut(Vec) -> Pin> + Send>>, ) -> Result<()> { let mut buffer = self.pick_log_by_vlog_id(&vp.fid).read().read(&vp)?.to_vec(); let value_buffer = buffer.split_off(Header::encoded_size()); @@ -793,7 +761,7 @@ impl ValueLogCore { } // write is thread-unsafe by design and should not be called concurrently. - pub(crate) fn write(&self, reqs: Arc>) -> Result<()> { + pub(crate) async fn write(&self, reqs: Arc>) -> Result<()> { defer! {info!("finished write value log");} let cur_vlog_file = self.pick_log_by_vlog_id(&self.max_fid.load(Ordering::Acquire)); let to_disk = || -> Result<()> { @@ -834,13 +802,13 @@ impl ValueLogCore { for req in reqs.iter() { let req = req.get_req(); - for (idx, entry) in req.entries.read().iter().enumerate() { + for (idx, entry) in req.entries.read().await.iter().enumerate() { if !self.opt.sync_writes - && entry.read().entry().value.len() < self.opt.value_threshold + && entry.read().await.entry().value.len() < self.opt.value_threshold { // No need to write to value log. // WARN: if mt not flush into disk but process abort, that will discard data(the data not write into vlog that WAL file) - req.ptrs.lock()[idx] = None; + req.ptrs.write().await[idx] = None; continue; } @@ -850,7 +818,9 @@ impl ValueLogCore { ptr.offset = self.writable_log_offset.load(Ordering::Acquire) + self.buf.read().buffer().len() as u32; let mut buf = self.buf.write(); - entry.write().entry().enc(&mut *buf).unwrap(); + let mut entry = entry.write().await; + let mut entry = entry.mut_entry(); + entry.enc(&mut *buf).unwrap(); } } to_disk() @@ -1055,8 +1025,8 @@ impl ValueLogCore { pub(crate) async fn wait_on_gc(&self, lc: Closer) { defer! {lc.done()} lc.wait().await; // wait for lc to be closed. - // Block any GC in progress to finish, and don't allow any more writes to runGC by filling up - // the channel of size 1. + // Block any GC in progress to finish, and don't allow any more writes to runGC by filling up + // the channel of size 1. 
self.garbage_ch.send(()).await.unwrap(); } From acb90c26703603e78c94aaf40445d192e78c0d7b Mon Sep 17 00:00:00 2001 From: Rg Date: Fri, 14 Apr 2023 01:34:49 +0800 Subject: [PATCH 77/77] :dog: sleep --- Cargo.toml | 5 ++- src/kv.rs | 63 +++++++++++++------------- src/kv_test.rs | 3 +- src/value_log.rs | 114 +++++++++++++++++++++++------------------------ 4 files changed, 94 insertions(+), 91 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 3c21d40..e299aca 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,7 @@ serde = { version = "1.0.131", features = ["derive"] } serde_json = { version = "1.0", default-features = true, features = ["alloc"] } anyhow = "1.0.31" thiserror = "1.0" -tokio = { version = "1.16.1", features = ["full"] } +tokio = { version = "1.16.1", features = ["full", "tracing"] } byteorder = "1.4.3" rand = "0.8.5" maligned = "0.2.1" @@ -47,6 +47,7 @@ eieio = "1.0.0" either = "1.8.1" enum-unitary = "0.5.0" atom_box = "0.1.2" +console-subscriber = "0.1.8" [dev-dependencies] tracing-subscriber = "0.3.16" tracing-log = "0.1.3" @@ -54,3 +55,5 @@ chrono = "0.4.22" env_logger = "0.9.1" console_log = { version = "0.2.0", features = ["color"] } +[build] +rustflags = ["--cfg", "tokio_unstable"] \ No newline at end of file diff --git a/src/kv.rs b/src/kv.rs index 064c054..24c23d3 100644 --- a/src/kv.rs +++ b/src/kv.rs @@ -7,8 +7,7 @@ use crate::table::iterator::{IteratorImpl, IteratorItem}; use crate::table::table::{new_file_name, Table, TableCore}; use crate::types::{ArcMx, Channel, Closer, TArcMx, TArcRW, XArc, XWeak}; use crate::value_log::{ - ArcRequest, Entry, EntryType, MetaBit, Request, ValueLogCore, ValuePointer, - MAX_KEY_SIZE, + ArcRequest, Entry, EntryType, MetaBit, Request, ValueLogCore, ValuePointer, MAX_KEY_SIZE, }; use crate::y::{ async_sync_directory, create_synced_file, sync_directory, Encode, Result, ValueStruct, @@ -21,7 +20,7 @@ use crate::{ use anyhow::__private::kind::TraitKind; use async_channel::RecvError; use atomic::Atomic; -use bytes::BufMut; +use bytes::{Buf, BufMut}; use crossbeam_epoch::{Owned, Shared}; use drop_cell::defer; use fs2::FileExt; @@ -356,7 +355,12 @@ impl KV { // alongside the key, and can be used as an aid to interpret the value or store other contextual // bits corresponding to the key-value pair. pub(crate) async fn set(&self, key: Vec, value: Vec, user_meta: u8) -> Result<()> { - let res = self.batch_set(vec![Entry::default().key(key).value(value).user_meta(user_meta)]).await; + let res = self + .batch_set(vec![Entry::default() + .key(key) + .value(value) + .user_meta(user_meta)]) + .await; assert_eq!(res.len(), 1); res[0].to_owned() } @@ -399,11 +403,12 @@ impl KV { // There is code (in flush_mem_table) whose correctness depends on us generating CAS Counter // values _before_ we modify s.vptr here. 
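// A minimal sketch of the counter discipline described above: one atomic
// fetch_add hands out a contiguous block of CAS counters for a whole batch,
// so entry `i` gets `base + i`. The standalone function is an assumption.
use std::sync::atomic::{AtomicU64, Ordering};
fn alloc_cas_block(last_used: &AtomicU64, how_many: u64) -> u64 {
    // fetch_add returns the previous value, so the first fresh id is prev + 1.
    last_used.fetch_add(how_many, Ordering::Relaxed) + 1
}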
 for req in reqs.iter() {
-            let entries = req.req_ref().entries.read().await;
+            let entries = &req.req_ref().entries;
             let counter_base = self.new_cas_counter(entries.len() as u64);
             for (idx, entry) in entries.iter().enumerate() {
-                entry.write().await.mut_entry().cas_counter = counter_base + idx as u64;
-                info!("update cas counter: {}", entry.read().await.entry().cas_counter);
+                let mut entry = entry.write().await;
+                entry.mut_entry().cas_counter = counter_base + idx as u64;
+                info!("update cas counter: {}", entry.entry().cas_counter);
             }
         }

@@ -418,10 +423,10 @@ impl KV {
         info!("Writing to memory table");
         let mut count = 0;
         for req in reqs.iter() {
-            if req.get_req().entries.read().await.is_empty() {
+            if req.get_req().entries.is_empty() {
                 continue;
             }
-            count += req.get_req().entries.read().await.len();
+            count += req.get_req().entries.len();
             while let Err(err) = self.ensure_room_for_write().await {
                 tokio::time::sleep(Duration::from_millis(10)).await;
             }
@@ -500,7 +505,7 @@ impl KV {
         let mut count = 0;
         let mut sz = 0u64;
         let mut res = vec![Ok(()); entries.len()];
-        let mut req = ArcRequest::from(Request::default());
+        let mut req = Request::default();
         let mut req_index = vec![];

         for (i, entry) in entries.into_iter().enumerate() {
             if entry.key.len() > MAX_KEY_SIZE {
                 res[i] = Err("Key too big".into());
                 continue;
             }
             if entry.value.len() as u64 > self.opt.value_log_file_size {
                 res[i] = Err("Value too big".into());
                 continue;
             }
             {
                 count += 1;
                 sz += self.opt.estimate_size(&entry) as u64;
-                req.req_ref()
-                    .entries
-                    .write()
-                    .await
-                    .push(tokio::sync::RwLock::new(EntryType::from(entry)));
-                req.req_ref().ptrs.write().await.push(None);
+                req.entries
+                    .push(tokio::sync::RwLock::new(EntryType::from(entry)));
+                req.ptrs.write().await.push(None);
                 req_index.push(i);
             }

             if count >= self.opt.max_batch_count || sz >= self.opt.max_batch_size {
                 assert!(!self.write_ch.is_close());
                 info!(
                     "send tasks to write, entries: {}",
-                    req.get_req().entries.read().await.len()
+                    req.entries.len()
                 );
-                self.write_ch.send(req.clone()).await.unwrap();
+                let arc_req = ArcRequest::from(req);
+                self.write_ch.send(arc_req.clone()).await.unwrap();
                 {
                     count = 0;
                     sz = 0;
-                    for (index, err) in req.get_req().get_errs().await.into_iter().enumerate() {
+                    for (index, err) in arc_req.req_ref().get_errs().await.into_iter().enumerate() {
                         let entry_index = req_index[index];
                         res[entry_index] = err;
                     }
-                    req.req_ref().entries.write().await.clear();
-                    req.req_ref().ptrs.write().await.clear();
+                    req = Request::default();
                     req_index.clear();
                 }
             }
         }
-        if !req.req_ref().entries.read().await.is_empty() {
-            self.write_ch.send(req.clone()).await.unwrap();
+        if !req.entries.is_empty() {
+            let arc_req = ArcRequest::from(req);
+            self.write_ch.send(arc_req.clone()).await.unwrap();
             {
                 count = 0;
                 sz = 0;
-                for (index, err) in req.get_req().get_errs().await.into_iter().enumerate() {
+                for (index, err) in arc_req.get_req().get_errs().await.into_iter().enumerate() {
                     let entry_index = req_index[index];
                     res[entry_index] = err;
                 }
-                req.req_ref().entries.write().await.clear();
-                req.req_ref().ptrs.write().await.clear();
+                req = Request::default();
                 req_index.clear();
             }
         }
         res
     }

-    async fn write_to_lsm(&self, req: ArcRequest) -> Result<()> {
+    async fn write_to_lsm(&self, req: ArcRequest) -> Result<()> {
+        defer!
{info!("exit write to lsm")} let req = req.get_req(); //.entries.read(); let ptrs = req.ptrs.read().await; - let entries = req.entries.read().await; + let entries = &req.entries; assert_eq!(entries.len(), ptrs.len()); for (i, pair) in entries.iter().enumerate() { - let mut entry_pair = pair.write().await; + let entry_pair = pair.read().await; let entry = entry_pair.entry(); if entry.cas_counter_check != 0 { let old_value = self._get(&entry.key)?; @@ -965,7 +968,7 @@ impl ArcKV { self.must_vlog().incr_iterator_count(); // Create iterators across all the tables involved first. - let mut itrs: Vec>> = vec![]; + let mut itrs: Vec>> = vec![]; for tb in tables.clone() { let st = unsafe { tb.as_ref().unwrap().clone() }; let iter = Box::new(UniIterator::new(st, opt.reverse)); @@ -1052,7 +1055,7 @@ impl ArcKV { pub(crate) async fn yield_item_value( &self, item: KVItemInner, - mut consumer: impl FnMut(Vec) -> Pin> + Send>>, + mut consumer: impl FnMut(Vec) -> Pin> + Send>>, ) -> Result<()> { // no value if !item.has_value() { diff --git a/src/kv_test.rs b/src/kv_test.rs index b2a7b28..a656842 100644 --- a/src/kv_test.rs +++ b/src/kv_test.rs @@ -20,9 +20,10 @@ fn get_test_option(dir: &str) -> Options { } #[tokio::test] -async fn t_write() { +async fn t_1_write() { use crate::test_util::{mock_log, mock_log_terminal, random_tmp_dir, tracing_log}; tracing_log(); + // console_subscriber::init(); let dir = random_tmp_dir(); let kv = KV::open(get_test_option(&dir)).await; let kv = kv.unwrap(); diff --git a/src/value_log.rs b/src/value_log.rs index b6b4266..6eead10 100644 --- a/src/value_log.rs +++ b/src/value_log.rs @@ -37,7 +37,7 @@ use crate::log_file::LogFile; use crate::options::Options; use crate::skl::BlockBytes; use crate::table::iterator::BlockSlice; -use crate::types::{ArcMx, ArcRW, Channel, Closer, TArcMx, XArc}; +use crate::types::{ArcMx, ArcRW, Channel, Closer, TArcMx, TArcRW, XArc}; use crate::y::{ create_synced_file, is_eof, open_existing_synced_file, read_at, sync_directory, Decode, Encode, }; @@ -310,7 +310,7 @@ impl EntryType { self.fut_ch.rx() } - pub(crate) async fn set_resp(&mut self, ret: Result<()>) { + pub(crate) async fn set_resp(&self, ret: Result<()>) { self.fut_ch.tx().send(ret).await.unwrap(); } } @@ -334,7 +334,7 @@ impl From for EntryType { pub struct Request { // Input values, NOTE: RefCell is called concurrency - pub(crate) entries: tokio::sync::RwLock>>, + pub(crate) entries: Vec>, // Output Values and wait group stuff below pub(crate) ptrs: tokio::sync::RwLock>>, } @@ -354,25 +354,25 @@ impl Default for Request { impl Request { pub(crate) async fn set_entries_resp(&self, ret: Result<()>) { - for entry in self.entries.write().await.iter_mut() { + for entry in self.entries.iter() { info!("set resp"); - entry.get_mut().set_resp(ret.clone()).await; + entry.read().await.set_resp(ret.clone()).await; } } pub async fn get_first_err(&self) -> Result<()> { - let ret = self.entries.read().await; - if let Some(ret) = ret.get(0) { + if let Some(ret) = self.entries.get(0) { ret.read().await.ret().recv().await.unwrap() - }else { + } else { Ok(()) } } pub async fn get_errs(&self) -> Vec> { let mut res = vec![]; - for entry in self.entries.read().await.iter() { - let ret = entry.read().await.ret().recv().await.unwrap(); + for entry in self.entries.iter() { + let ch = entry.read().await.fut_ch.rx(); + let ret = ch.recv().await.unwrap(); res.push(ret); } res @@ -415,7 +415,7 @@ impl ArcRequest { } pub async fn set_err(&self, err: Result<()>) { - self.inner.set_entries_resp(err); + 
self.inner.set_entries_resp(err).await; } pub(crate) fn get_req(&self) -> Arc { @@ -444,7 +444,7 @@ pub struct ValueLogCore { // A refcount of iterators -- when this hits zero, we can delete the files_to_be_deleted. Why? num_active_iterators: AtomicI32, writable_log_offset: AtomicU32, - buf: ArcRW>>, + buf: TArcRW>>, opt: Options, kv: BoxKV, // Only allow one GC at a time. @@ -461,7 +461,7 @@ impl Default for ValueLogCore { dirty_vlogs: Arc::new(Default::default()), num_active_iterators: Default::default(), writable_log_offset: Default::default(), - buf: Arc::new(RwLock::new(BufWriter::new(vec![0u8; 0]))), + buf: Arc::new(tokio::sync::RwLock::new(BufWriter::new(vec![0u8; 0]))), opt: Default::default(), kv: BoxKV::new(ptr::null_mut()), garbage_ch: Channel::new(1), @@ -600,7 +600,7 @@ impl ValueLogCore { vp.offset, self.writable_log_offset.load(Ordering::Acquire) ) - .into()); + .into()); } self.read_value_bytes(vp, |buffer| { @@ -619,7 +619,7 @@ impl ValueLogCore { pub async fn async_read( &self, vp: &ValuePointer, - consumer: impl FnMut(Vec) -> Pin> + Send>>, + consumer: impl FnMut(Vec) -> Pin> + Send>>, ) -> Result<()> { // Check for valid offset if we are reading to writable log. if vp.fid == self.max_fid.load(Ordering::Acquire) @@ -630,7 +630,7 @@ impl ValueLogCore { vp.offset, self.writable_log_offset.load(Ordering::Acquire) ) - .into()); + .into()); } self.async_read_bytes(vp, consumer).await?; Ok(()) @@ -643,7 +643,7 @@ impl ValueLogCore { mut f: impl for<'a> FnMut( &'a Entry, &'a ValuePointer, - ) -> Pin> + 'a>>, + ) -> Pin> + 'a>>, ) -> Result<()> { let vlogs = self.pick_log_guard(); info!("Seeking at value pointer: {:?}", vp); @@ -747,7 +747,7 @@ impl ValueLogCore { async fn async_read_bytes( &self, vp: &ValuePointer, - mut consumer: impl FnMut(Vec) -> Pin> + Send>>, + mut consumer: impl FnMut(Vec) -> Pin> + Send>>, ) -> Result<()> { let mut buffer = self.pick_log_by_vlog_id(&vp.fid).read().read(&vp)?.to_vec(); let value_buffer = buffer.split_off(Header::encoded_size()); @@ -764,66 +764,62 @@ impl ValueLogCore { pub(crate) async fn write(&self, reqs: Arc>) -> Result<()> { defer! {info!("finished write value log");} let cur_vlog_file = self.pick_log_by_vlog_id(&self.max_fid.load(Ordering::Acquire)); - let to_disk = || -> Result<()> { - if self.buf.read().buffer().is_empty() { + + for req in reqs.iter() { + let req = req.get_req(); + for (idx, entry) in req.entries.iter().enumerate() { + if !self.opt.sync_writes + && entry.read().await.entry().value.len() < self.opt.value_threshold + { + // No need to write to value log. + // WARN: if mt not flush into disk but process abort, that will discard data(the data not write into vlog that WAL file) + req.ptrs.write().await[idx] = None; + continue; + } + + let mut ptr = ValuePointer::default(); + ptr.fid = cur_vlog_file.read().fid; + // Use the offset including buffer length so far. 
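// A minimal sketch of the offset rule stated above: entries are staged in an
// in-memory buffer before one big file write, so an entry's eventual file
// offset is the durable write head plus the bytes already staged ahead of it.
// The standalone helper is an assumption for illustration.
fn staged_entry_offset(writable_log_offset: u32, staged_bytes: usize) -> u32 {
    writable_log_offset + staged_bytes as u32
}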
+ ptr.offset = self.writable_log_offset.load(Ordering::Acquire) + + self.buf.read().await.buffer().len() as u32; + let mut buf = self.buf.write().await; + let mut entry = entry.write().await; + let mut entry = entry.mut_entry(); + entry.enc(buf.get_mut()).unwrap(); + } + } + { + if self.buf.read().await.buffer().is_empty() { return Ok(()); } info!( " Flushing {} blocks of total size: {}", reqs.len(), - self.buf.read().buffer().len() + self.buf.read().await.buffer().len() ); - let n = cur_vlog_file - .write() - .fd - .as_mut() - .unwrap() - .write(self.buf.read().buffer())?; + let mut buffer = self.buf.write().await; + let mut buffer = buffer.get_mut(); + let mut cur_vlog_file_wt = cur_vlog_file.write(); + let fp = cur_vlog_file_wt.fd.as_mut().unwrap(); + let n = fp.write(&buffer)?; // todo add metrics self.writable_log_offset .fetch_add(n as u32, Ordering::Release); - self.buf.write().get_mut().clear(); + buffer.clear(); if self.writable_log_offset.load(Ordering::Acquire) > self.opt.value_log_file_size as u32 { - cur_vlog_file - .write() - .done_writing(self.writable_log_offset.load(Ordering::Acquire))?; + cur_vlog_file_wt.done_writing(self.writable_log_offset.load(Ordering::Acquire))?; let new_id = self.max_fid.fetch_add(1, Ordering::Release); assert!(new_id < 1 << 16, "newid will overflow u16: {}", new_id); - *cur_vlog_file.write() = + *cur_vlog_file_wt = self.create_mmap_vlog_file(new_id, 2 * self.opt.value_log_file_size)?; } Ok(()) - }; - - for req in reqs.iter() { - let req = req.get_req(); - for (idx, entry) in req.entries.read().await.iter().enumerate() { - if !self.opt.sync_writes - && entry.read().await.entry().value.len() < self.opt.value_threshold - { - // No need to write to value log. - // WARN: if mt not flush into disk but process abort, that will discard data(the data not write into vlog that WAL file) - req.ptrs.write().await[idx] = None; - continue; - } - - let mut ptr = ValuePointer::default(); - ptr.fid = cur_vlog_file.read().fid; - // Use the offset including buffer length so far. - ptr.offset = self.writable_log_offset.load(Ordering::Acquire) - + self.buf.read().buffer().len() as u32; - let mut buf = self.buf.write(); - let mut entry = entry.write().await; - let mut entry = entry.mut_entry(); - entry.enc(&mut *buf).unwrap(); - } } - to_disk() } // rewrite the log_file @@ -1025,8 +1021,8 @@ impl ValueLogCore { pub(crate) async fn wait_on_gc(&self, lc: Closer) { defer! {lc.done()} lc.wait().await; // wait for lc to be closed. - // Block any GC in progress to finish, and don't allow any more writes to runGC by filling up - // the channel of size 1. + // Block any GC in progress to finish, and don't allow any more writes to runGC by filling up + // the channel of size 1. self.garbage_ch.send(()).await.unwrap(); }
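// A minimal sketch of the `garbage_ch` discipline used above: a bounded
// channel of capacity 1 acts as a try-lock, so at most one GC runs at a time,
// and `wait_on_gc` can park a token in the slot to block future GCs. The
// helper and its parameters are assumptions; `async_channel` is the channel
// crate already used here.
async fn with_gc_slot<F>(
    tx: &async_channel::Sender<()>,
    rx: &async_channel::Receiver<()>,
    gc: F,
) -> bool
where
    F: std::future::Future<Output = ()>,
{
    if tx.try_send(()).is_err() {
        return false; // slot occupied: a GC is already in flight
    }
    gc.await; // run the GC body while holding the slot
    let _ = rx.recv().await; // drain our token, freeing the slot
    true
}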