From 45b453b229940e2ae1a9cf8b6701a71b77a7ff3b Mon Sep 17 00:00:00 2001 From: Nitin Singla Date: Sat, 25 Jan 2025 18:38:50 +0000 Subject: [PATCH 1/9] First cut changes for commit. --- turbonfs/inc/file_cache.h | 104 ++++++++ turbonfs/inc/nfs_inode.h | 75 +++++- turbonfs/inc/rpc_task.h | 1 + turbonfs/src/file_cache.cpp | 115 +++++++++ turbonfs/src/nfs_inode.cpp | 472 ++++++++++++++++++++++++++++++----- turbonfs/src/rpc_task.cpp | 484 +++++++++++++++++++++++++++++++----- 6 files changed, 1126 insertions(+), 125 deletions(-) diff --git a/turbonfs/inc/file_cache.h b/turbonfs/inc/file_cache.h index 33729300..be7a8ac0 100644 --- a/turbonfs/inc/file_cache.h +++ b/turbonfs/inc/file_cache.h @@ -237,6 +237,9 @@ struct membuf std::vector *flush_waiters = nullptr; std::mutex flush_waiters_lock_44; + std::vector *commit_waiters = nullptr; + std::mutex commit_waiters_lock_44; + /** * get_flush_waiters() returns the list of tasks waiting for this * membuf to be flushed to the Blob. All these tasks must be completed @@ -247,6 +250,16 @@ struct membuf */ std::vector get_flush_waiters(); + /** + * get_commit_waiters() returns the list of tasks waiting for this + * membuf to be committed to the Blob. All these tasks must be completed + * appropriately when membuf commit completes (success or failure). + * + * Note: This splices the commit_waiters list and returns, and hence + * it's not idempotent. + */ + std::vector get_commit_waiters(); + /* * If is_file_backed() is true then 'allocated_buffer' is the mmap()ed * address o/w it's the heap allocation address. @@ -1222,6 +1235,11 @@ class bytes_chunk_cache uint64_t length, struct rpc_task *task); + bool add_commit_waiter(uint64_t offset, + uint64_t length, + struct rpc_task *task); + + /* * Returns all dirty chunks for a given range in chunkmap. * Before returning it increases the inuse count of underlying membuf(s). @@ -1259,6 +1277,16 @@ class bytes_chunk_cache std::vector get_dirty_nonflushing_bcs_range( uint64_t st_off = 0, uint64_t end_off = UINT64_MAX) const; + /** + * Returns contiguous dirty (and not flushing) chunks from chunmap, starting + * with the lowest dirty offset, and returns the total number of (dirty) + * bytes contained in the returned chunks. + * Before returning it increases the inuse count of underlying membuf(s). + * Caller will typically flush these to the backing Blob as UNSTABLE + * writes. + */ + std::vector get_contiguous_dirty_bcs(uint64_t& size) const; + /* * Returns all dirty chunks which are currently flushing for a given range * in chunkmap. Before returning it increases the inuse count of underlying @@ -1384,6 +1412,36 @@ class bytes_chunk_cache return std::min(max_dirty_extent, uint64_t(1024 * 1024 * 1024ULL)); } + /* + * Maximum size of commit bytes that can be pending in cache, before we + * should commit it to Blob. + * It should be greater than max_dirty_extent_bytes() and smaller than + * inline_dirty_threshold. So, that inline pruning can be avoided. + * This is 60% of the allowed cache size. + * f.e = Cache size of 4GB then max_commit_bytes = 2.4GB + * - Flush will start every 1GB dirty data and each 1GB dirty data + * converted to commit_pending_bytes. + * + * This is 60% of the allowed cache size. + */ + uint64_t max_commit_bytes() const + { + // Maximum cache size allowed in bytes. 
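+        //
+        // Illustrative arithmetic (assuming a 4GiB cache and
+        // AZNFSCFG_WSIZE_MAX much smaller than 1GiB):
+        // max_dirty_extent_bytes() = min(0.6 * 4GiB, 1GiB) = 1GiB, so
+        // max_commit_bytes = (uint64_t(4GiB * 0.6) / 1GiB) * 1GiB = 2GiB,
+        // min_commit_bytes = 2GiB - AZNFSCFG_WSIZE_MAX, and the value
+        // returned below is 2GiB - AZNFSCFG_WSIZE_MAX.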
+ static const uint64_t max_total = + (aznfsc_cfg.cache.data.user.max_size_mb * 1024 * 1024ULL); + assert(max_total != 0); + + static const uint64_t max_commit_bytes = + std::max((((uint64_t)(max_total * 0.6))/ max_dirty_extent_bytes()) * max_dirty_extent_bytes(), + (uint64_t) (2 * AZNFSCFG_WSIZE_MAX)); + + static const uint64_t min_commit_bytes = + std::max(((2 * max_dirty_extent_bytes()) - AZNFSCFG_WSIZE_MAX), + (uint64_t) (2 * AZNFSCFG_WSIZE_MAX)); + + return std::max((max_commit_bytes - AZNFSCFG_WSIZE_MAX), min_commit_bytes); + } + /** * Get the amount of dirty data that needs to be flushed. * This excludes the data which is already flushing. @@ -1432,6 +1490,52 @@ class bytes_chunk_cache return bytes_flushing > 0; } + /* + * Check if bytes_commit_pending is equal or more than + * max_commit_bytes limit. + */ + bool commit_required() const + { + return (bytes_commit_pending >= max_commit_bytes()); + } + + /* + * get_bytes_to_prune() returns the number of bytes that need to be flushed + * inline to free up the space. If there are enough bytes_flushing then we + * can just wait for them to complete. + * + * Note : We are not considering bytes_commit_pending in this calculation. + * If bytes_commit_pending are high then commit already started and + * if bytes_flushing are high once flushing is done commit will be + * triggered. + */ + uint64_t get_bytes_to_prune() const + { + static const uint64_t max_dirty_allowed_per_cache = + max_dirty_extent_bytes() * 2; + int64_t total_bytes = + std::max(int64_t(bytes_dirty - bytes_flushing), int64_t(0)); + const bool local_pressure = total_bytes > (int64_t)max_dirty_allowed_per_cache; + + if (local_pressure) { + return std::max(int64_t(0), (int64_t)(total_bytes - max_dirty_extent_bytes())); + } + + /* + * Global pressure is when get_prune_goals() returns non-zero bytes + * to be pruned inline. + */ + uint64_t inline_bytes; + + /* + * TODO: Noisy neighbor syndrome, where one file is hogging the cache, + * inline pruning will be triggered for other files. + */ + get_prune_goals(&inline_bytes, nullptr); + return std::max(int64_t(inline_bytes - (bytes_flushing + bytes_commit_pending)), + (int64_t)0); + } + /** * This should be called by writer threads to find out if they must wait * for the write to complete. This will check both the cache specific and diff --git a/turbonfs/inc/nfs_inode.h b/turbonfs/inc/nfs_inode.h index bef85bb9..959e88b5 100644 --- a/turbonfs/inc/nfs_inode.h +++ b/turbonfs/inc/nfs_inode.h @@ -279,6 +279,29 @@ struct nfs_inode */ struct stat attr; + /** + * We maintain following multiple views of the file and thus multiple file + * sizes for those views. + * - Cached. + * This is the view of the file that comprises of data that has been + * written by the application and saved in file cache. It may or may not + * have been flushed and/or committed. This is the most uptodate view of + * the file and applications must use this view. + * cached_filesize tracks the file size for this view. + * - Uncommited. + * This is the view of the file that tracks data that has been flushed + * using UNSTABLE writes but not yet COMMITted to the Blob. This view of + * the file is only used to see if the next PB call will write after the + * last PB'ed byte and thus can be appended. + * putblock_filesize tracks the file size for this view. + * - Committed. + * This is the view of the file that tracks data committed to the Blob. + * Other clients will see this view. + * attr.st_size tracks the file size for this view. 
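+     *
+     * For a file that only grows through this client (the append pattern
+     * that unstable writes target), one would expect
+     * cached_filesize >= putblock_filesize >= attr.st_size, since data is
+     * cached before it is PB'ed and PB'ed before it is committed. This
+     * ordering is an informal expectation and is not asserted anywhere.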
+ */ + off_t cached_filesize = 0; + off_t putblock_filesize = 0; + /* * Has this inode seen any non-append write? * This starts as false and remains false as long as copy_to_cache() only @@ -384,6 +407,13 @@ struct nfs_inode */ int write_error = 0; + /* + * Last offset and length of the last flushing operation. + * This is used add task waiting for flush/commit to complete. + */ + uint64_t last_flushing_offset = 0; + uint64_t last_flushing_length = 0; + /* * Commit state for this inode. * This is used to track the state of commit operation for this inode, which @@ -418,6 +448,12 @@ struct nfs_inode std::atomic commit_state = commit_state_t::COMMIT_NOT_NEEDED; + /* + * commit_lock_5 is used to synchronize flush thread and write thread + * for commit operation. + */ + std::mutex commit_lock_5; + /** * TODO: Initialize attr with postop attributes received in the RPC * response. @@ -1319,17 +1355,50 @@ struct nfs_inode * progress (which must have held the is_flushing lock). */ int flush_cache_and_wait(uint64_t start_off = 0, - uint64_t end_off = UINT64_MAX); + uint64_t end_off = UINT64_MAX, + struct rpc_task *parent_task = nullptr); + + void mark_commit_in_progress(); /** * Wait for currently flushing membufs to complete. + * If parent_task is non-null, then it's added to the commit_waiters list + * and returned. Otherwise it waits for the ongoing flush to complete. + * * Returns 0 on success and a positive errno value on error. * * Note : Caller must hold the inode is_flushing lock to ensure that * no new membufs are added till this call completes. */ - int wait_for_ongoing_flush(uint64_t start_off = 0, - uint64_t end_off = UINT64_MAX); + int wait_for_ongoing_flush_commit(uint64_t start_off = 0, + uint64_t end_off = UINT64_MAX, + struct rpc_task *parent_task = nullptr); + + /* + * commit_membufs() is called to commit uncommitted membufs to the BLOB. + * It creates commit RPC and sends it to the NFS server. + */ + void commit_membufs(); + + /* + * switch_to_stable_write() is called to switch the inode to stable write + * mode. There is should be no ongoing commit/flusing operation when this + * is called. It creates a commit RPC to commit all the uncopmmitted membufs + * to the BLOB. + */ + void switch_to_stable_write(); + + /** + * Check if stable write is required for the given offset. + * Given offset is the start of contigious dirty membufs that need to be + * flushed to the BLOB. + */ + bool check_stable_write_required(off_t offset); + + /** + * Wait for ongoing commit operation to complete. + */ + void wait_for_ongoing_commit(); /** * Sync the dirty membufs in the file cache to the NFS server. diff --git a/turbonfs/inc/rpc_task.h b/turbonfs/inc/rpc_task.h index a7f049ae..c9fd4303 100644 --- a/turbonfs/inc/rpc_task.h +++ b/turbonfs/inc/rpc_task.h @@ -2387,6 +2387,7 @@ struct rpc_task */ bool add_bc(const bytes_chunk& bc); void issue_write_rpc(); + void issue_commit_rpc(); #ifdef ENABLE_NO_FUSE /* diff --git a/turbonfs/src/file_cache.cpp b/turbonfs/src/file_cache.cpp index 2b51abbb..d133ed7e 100644 --- a/turbonfs/src/file_cache.cpp +++ b/turbonfs/src/file_cache.cpp @@ -712,6 +712,36 @@ void membuf::clear_inuse() inuse--; } +std::vector membuf::get_commit_waiters() +{ + /* + * We cannot assert for !is_dirty() as in case of write failure + * we complete flush_waiters w/o clearing dirty flag. 
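+     *
+     * Typical use from a commit completion path (sketch, see
+     * complete_commit_waiter_tasks() in rpc_task.cpp):
+     *
+     *   for (struct rpc_task *t : mb->get_commit_waiters())
+     *       t->reply_write(t->rpc_api->write_task.get_size());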
+ */ + assert(is_inuse()); + assert(is_locked()); + assert(!is_flushing()); + assert(!is_commit_pending()); + + std::unique_lock _lock(commit_waiters_lock_44); + std::vector tasks; + + if (commit_waiters) { + // flush_waiters is dynamically allocated when first task is added. + assert(!commit_waiters->empty()); + tasks.swap(*commit_waiters); + assert(commit_waiters->empty()); + + /* + * Free the commit_waiters vector. + */ + delete commit_waiters; + commit_waiters = nullptr; + } + + return tasks; +} + std::vector membuf::get_flush_waiters() { /* @@ -2362,6 +2392,43 @@ std::vector bytes_chunk_cache::get_dirty_nonflushing_bcs_range( return bc_vec; } +std::vector bytes_chunk_cache::get_contiguous_dirty_bcs( + uint64_t& size) const +{ + std::vector bc_vec; + size = 0; + + // TODO: Make it shared lock. + const std::unique_lock _lock(chunkmap_lock_43); + auto it = chunkmap.lower_bound(0); + uint64_t prev_offset = AZNFSC_BAD_OFFSET; + + while (it != chunkmap.cend()) { + const struct bytes_chunk& bc = it->second; + struct membuf *mb = bc.get_membuf(); + + if (mb->is_dirty() && !mb->is_flushing()) { + if (prev_offset != AZNFSC_BAD_OFFSET) { + if (prev_offset != bc.offset) { + break; + } else { + size += bc.length; + prev_offset = bc.offset + bc.length; + } + } else { + size += bc.length; + prev_offset = bc.offset + bc.length; + } + mb->set_inuse(); + bc_vec.emplace_back(bc); + } + + ++it; + } + + return bc_vec; +} + bool bytes_chunk_cache::add_flush_waiter(uint64_t offset, uint64_t length, struct rpc_task *task) @@ -2409,6 +2476,54 @@ bool bytes_chunk_cache::add_flush_waiter(uint64_t offset, return false; } +bool bytes_chunk_cache::add_commit_waiter(uint64_t offset, + uint64_t length, + struct rpc_task *task) +{ + assert(offset < AZNFSC_MAX_FILE_SIZE); + assert(length > 0); + + // Only frontend write tasks must ever wait. + assert(task->magic == RPC_TASK_MAGIC); + assert(task->get_op_type() == FUSE_WRITE); + assert(task->rpc_api->write_task.is_fe()); + + const std::unique_lock _lock(chunkmap_lock_43); + auto it = chunkmap.lower_bound(offset); + struct bytes_chunk *last_bc = nullptr; + + // Get the last bc for the given range. + while (it != chunkmap.cend() && (it->first < (offset + length))) { + last_bc = &(it->second); + ++it; + } + + if (last_bc != nullptr) { + // membuf must have at least one byte in the requested range. + assert(last_bc->offset >= offset && + last_bc->offset < (offset + length)); + + struct membuf *mb = last_bc->get_membuf(); + + std::unique_lock _lock2(mb->commit_waiters_lock_44); + /* + * Only add flush waiters to dirty membufs, else it may have already + * completed flush and we don't want the task to be waiting forever. + */ + if (mb->is_flushing() || mb->is_commit_pending()) { + if (mb->commit_waiters == nullptr) { + mb->commit_waiters = new std::vector(); + } + + mb->commit_waiters->emplace_back(task); + return true; + } + } + + return false; +} + + std::vector bytes_chunk_cache::get_dirty_bc_range( uint64_t start_off, uint64_t end_off) const { diff --git a/turbonfs/src/nfs_inode.cpp b/turbonfs/src/nfs_inode.cpp index 1fb31583..dcb8b18d 100644 --- a/turbonfs/src/nfs_inode.cpp +++ b/turbonfs/src/nfs_inode.cpp @@ -375,6 +375,142 @@ int nfs_inode::get_actimeo_max() const } } +/* + * Caller should hold flush_lock(). 
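+ * The wait is a simple polling loop: it sleeps 1ms at a time until
+ * is_commit_in_progress() turns false; no condition variable is used.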
+ */ +void nfs_inode::wait_for_ongoing_commit() +{ + assert(is_flushing); + + while (is_commit_in_progress()) { + assert(!get_filecache()->is_flushing_in_progress()); + ::usleep(1000); + } + + assert(is_commit_in_progress() == false); +} + +/* + * This function is called with flush_lock() held. + * In normal case this function should be no-op as + * when we reach here all the previous flushes/commits + * should have been completed. But in case of committing + * at the close of file, we need to wait if any flush + * going on and commit all the dirty membufs which are + * not yet committed. + */ +void nfs_inode::switch_to_stable_write() +{ + assert(is_flushing); + AZLogInfo("[{}] Switching to stable write", ino); + + /* + * We should not be in commit in progress state. + * Switch_to_stable_write() is called by flush thread + * and it shouldn't be called when commit is in progress. + */ + assert(is_commit_in_progress() == false); + + /* + * switch_to_stable_write() called from following paths. + * 1. sync_membufs(), No new flush should be started if + * there is ongoing flush is in progress. + * 2. flush_cache_and_wait(), Wait for ongoing flush to + * complete. + * As we are allowing multiple flush in parallel in case + * of unstable write as well, we should wait for ongoing + * flush to complere before switching to stable write. + * switch_to_stable_write() is called by flush_lock() held + * so no new flush can be started. + */ + wait_for_ongoing_flush_commit(); + assert(!get_filecache()->is_flushing_in_progress()); + + /* + * Check if there is anything to commit, if not then + * switch to stable write. + */ + if (get_filecache()->get_bytes_to_commit() == 0) { + AZLogDebug("[{}] Nothing to commit, switching to stable write", ino); + set_stable_write(); + return; + } + + /* + * If there is something to commit, then commit it and + * wait for the commit to complete. + */ + std::unique_lock commit_lock(commit_lock_5); + assert(!is_commit_in_progress()); + set_commit_in_progress(); + commit_lock.unlock(); + + /* + * Issue the commit RPC to commit the dirty membufs. + */ + commit_membufs(); + + /* + * Wait for the commit to complete. + */ + wait_for_ongoing_commit(); + + assert(get_filecache()->get_bytes_to_commit() == 0); + assert(!get_filecache()->is_flushing_in_progress()); + assert(!is_commit_in_progress()); + + set_stable_write(); + return; +} + +/* + * This function checks, whether switch to stable write or not. + */ +bool nfs_inode::check_stable_write_required(off_t offset) +{ + /* + * If stable_write is already set, we don't need to do anything. + * We don't need lock here as once stable_write is set it's never + * unset. + */ + if (is_stable_write()) { + return false; + } + + /* + * If current write is not append write, then we can't go for unstable writes + * It may be overwrite to existing data and we don't have the knowldege of + * existing block list, it maye require read modified write. So, we can't go + * for unstable write. Similarly, if the offset is more than end of the file, + * we need to write zero block in between the current end of the file and the + * offset. + */ + if (putblock_filesize != offset) { + AZLogInfo("Stable write required as putblock_filesize:{} is not at the" + "offset:{}", putblock_filesize, offset); + + return true; + } + + return false; +} + +/** + * commit_membufs() is called by writer thread to commit flushed membufs. + */ +void nfs_inode::commit_membufs() +{ + /* + * Create the commit task to carry out the write. 
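+     * The commit is modelled as a FUSE_FLUSH task with a null fuse_req,
+     * so no fuse reply is sent for it; waiters are completed from
+     * commit_callback() via the membuf commit_waiters lists.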
+ */ + struct rpc_task *commit_task = + get_client()->get_rpc_task_helper()->alloc_rpc_task(FUSE_FLUSH); + commit_task->init_flush(nullptr /* fuse_req */, ino); + assert(commit_task->rpc_api->pvt == nullptr); + + commit_task->issue_commit_rpc(); +} + void nfs_inode::sync_membufs(std::vector &bc_vec, bool is_flush, struct rpc_task *parent_task) @@ -407,6 +543,24 @@ void nfs_inode::sync_membufs(std::vector &bc_vec, parent_task->num_ongoing_backend_writes = 1; } + /* + * New flush should not be started if flush/commit is in progress. + * But it may happen commit_is_pending() is set by another thread. + */ + assert(!is_commit_in_progress() || is_commit_pending()); + + if (bc_vec.empty()) { + return; + } + + /* + * If new offset is not at the end of the file, + * then we need to switch to stable write. + */ + if(check_stable_write_required(bc_vec[0].offset)) { + switch_to_stable_write(); + } + /* * Create the flush task to carry out the write. */ @@ -495,6 +649,12 @@ void nfs_inode::sync_membufs(std::vector &bc_vec, continue; } + /* + * Update the last flushing offset and length. + */ + last_flushing_offset = bc.offset; + last_flushing_length = bc.length; + if (write_task == nullptr) { write_task = get_client()->get_rpc_task_helper()->alloc_rpc_task(FUSE_WRITE); @@ -767,12 +927,69 @@ int nfs_inode::copy_to_cache(const struct fuse_bufvec* bufv, return err; } +void nfs_inode::mark_commit_in_progress() +{ + /* + * We want to commit, but we need to wait for current flushing to complete. + * 1. If flushing is going on, set commit_pending to inode, when last membuf flushed, + * write_iov_callback(), start new task to commit the data. + * 2. If flushing is not going on, then create the task and start commit of data. + * + * Note: Ensure lock is held till we set the commit state depending on flushing + * going on or not. This lock to synchronize the commit and flush task. + */ + bool start_commit_task = false; + + /* + * Take the flush_lock to ensure no new flush or commit task is started. + * commit_lock is held to synchronize the flush thread and this thread + * to set the commit state depending on flushing going on or not. + * + * Note: flush_lock can't be used as flush task can be started under the + * flush_lock (flush_cache_and_wait()) and it can cause deadlock. + * flush_cache_and_wait() is called under flush_lock and it checks + * if commit required, then it starts the commit task. + * + * Note: commit_lock held for very small duration, so it's safe to hold in + * fuse context. + */ + if (!is_commit_in_progress()) + { + std::unique_lock lock(commit_lock_5 /*commit_lock*/); + /* + * It may be possible that commit is in progress, so check again. + * Might be set by other thread. + */ + if (!is_commit_in_progress()) + { + if (get_filecache()->is_flushing_in_progress()) { + set_commit_pending(); + } else { + set_commit_in_progress(); + start_commit_task = true; + } + } + lock.unlock(); + } + + if (start_commit_task) { + commit_membufs(); + } +} + /** * Note: Caller should call with flush_lock() held. + * On return there is no ongoing flush/commit in progress. */ -int nfs_inode::wait_for_ongoing_flush(uint64_t start_off, uint64_t end_off) +int nfs_inode::wait_for_ongoing_flush_commit(uint64_t start_off, + uint64_t end_off, + struct rpc_task *task) { assert(start_off < end_off); + assert(!is_stable_write()); + + // NAND Gate + assert(!(is_commit_in_progress() && get_filecache()->is_flushing_in_progress())); // Caller must call us with is_flushing lock held. 
assert(is_flushing); @@ -796,78 +1013,141 @@ int nfs_inode::wait_for_ongoing_flush(uint64_t start_off, uint64_t end_off) /* * Flushing not in progress and no new flushing can be started as we hold * the flush_lock(). + * It may be possible that the flush is not in progress and but we may have + * commit_in_progress or commit_pending_bytes more than max_commit_bytes. + * In that case we should start the commit task if it's not already in + * progress. And wait for the commit to complete. */ - if (!get_filecache()->is_flushing_in_progress()) { + if (!get_filecache()->is_flushing_in_progress() && + !is_commit_in_progress() && + !get_filecache()->commit_required()) { AZLogDebug("[{}] No flush in progress, returning", ino); + + /* + * Complete the write task if it was passed. + */ + if (task) { + task->reply_write(task->rpc_api->write_task.get_size()); + } return 0; } /* - * Get the flushing bytes_chunk from the filecache handle. - * This will grab an exclusive lock on the file cache and return the list - * of flushing bytes_chunks at that point. Note that we can have new dirty - * bytes_chunks created but we don't want to wait for those. + * Try to get the flushing bytes_chunk from the filecache handle if the + * flush is in progress. */ - std::vector bc_vec = - filecache_handle->get_flushing_bc_range(start_off, end_off); + std::vector bc_vec = {}; + if (get_filecache()->is_flushing_in_progress()) { + /* + * Get the flushing bytes_chunk from the filecache handle. + * This will grab an exclusive lock on the file cache and return the list + * of flushing bytes_chunks at that point. Note that we can have new dirty + * bytes_chunks created but we don't want to wait for those. + */ + bc_vec = + filecache_handle->get_flushing_bc_range(start_off, end_off); + } + /* - * Our caller expects us to return only after the flush completes. + * If there is no task passed, then our caller expects us to return + * only after the flush completes (f.e. truncate, switch_to_stable_write). * Wait for all the membufs to flush and get result back. */ - for (bytes_chunk &bc : bc_vec) { - struct membuf *mb = bc.get_membuf(); + if (task == nullptr) { + for (bytes_chunk &bc : bc_vec) { + struct membuf *mb = bc.get_membuf(); - assert(mb != nullptr); - assert(mb->is_inuse()); + assert(mb != nullptr); + assert(mb->is_inuse()); + mb->set_locked(); - /* - * sync_membufs() would have taken the membuf lock for the duration - * of the backend wite that flushes the membuf, so once we get the - * lock we know that the flush write has completed. - */ - mb->set_locked(); + /* + * If still dirty after we get the lock, it may mean two things: + * - Write failed. + * - Some other thread got the lock before us and it made the + * membuf dirty again. + */ + if (mb->is_dirty() && get_write_error()) { + AZLogError("[{}] Flush [{}, {}) failed with error: {}", + ino, + bc.offset, bc.offset + bc.length, + get_write_error()); + } - /* - * If still dirty after we get the lock, it may mean two things: - * - Write failed. - * - Some other thread got the lock before us and it made the - * membuf dirty again. - */ - if (mb->is_dirty() && get_write_error()) { - AZLogError("[{}] Flush [{}, {}) failed with error: {}", - ino, - bc.offset, bc.offset + bc.length, - get_write_error()); - } + mb->clear_locked(); + mb->clear_inuse(); - mb->clear_locked(); - mb->clear_inuse(); + /* + * Release the bytes_chunk back to the filecache. + * These bytes_chunks are not needed anymore as the flush is done. 
+ * + * Note: We come here for bytes_chunks which were found dirty by the + * above loop. These writes may or may not have been issued by + * us (if not issued by us it was because some other thread, + * mostly the writer issued the write so we found it flushing + * and hence didn't issue). In any case since we have an inuse + * count, release() called from write_callback() would not have + * released it, so we need to release it now. + */ + filecache_handle->release(bc.offset, bc.length); + } + mark_commit_in_progress(); + } else { /* - * Release the bytes_chunk back to the filecache. - * These bytes_chunks are not needed anymore as the flush is done. - * - * Note: We come here for bytes_chunks which were found dirty by the - * above loop. These writes may or may not have been issued by - * us (if not issued by us it was because some other thread, - * mostly the writer issued the write so we found it flushing - * and hence didn't issue). In any case since we have an inuse - * count, release() called from write_callback() would not have - * released it, so we need to release it now. + * Add task to commit_waiters list. */ - filecache_handle->release(bc.offset, bc.length); + get_filecache()->add_commit_waiter(last_flushing_offset, + last_flushing_length, task); + mark_commit_in_progress(); + return 0; } - AZLogDebug("[{}] wait_for_ongoing_flush() returning with error: {}", - ino, get_write_error()); + + /* + * Wait for the ongoing commit to complete. + */ + wait_for_ongoing_commit(); + + /* + * Note: There can be bytes_commit_pending(), but that's fine + * as if bytes_commit_pending() reached max_commit_bytes() + * we would have started the commit task. As we held the + * flush_lock() no new flush can be started, which increases + * the bytes_commit_pending(). + */ + assert(is_commit_in_progress() == false); + assert(get_filecache()->get_bytes_to_flush() == 0); return get_write_error(); } /** - * Note: This takes shared lock on ilock_1. + * flush_cache_and_wait() is called only from the release/flush call. + * It's called with flush_lock() held. + * Things it does: + * - Before flushing it needs to wait for any ongoing commit to complete. + * - First it try to get membufs through get_contiguous_dirty_bcs() or + * get_dirty_bc_range() based on the unstable write or stable write. + * - Issue flush for the dirty membufs and wait for the flush to complete. + * - If it's unstable write, it issue commit for commit pending membufs. + * - Wait for the commit to complete. + * - Returns the error code if any. + * + * Note: Flush_cache_and_wait() block the fuse thread till the flush completes. + * It's called from the release(), flush() and getattr() calls. getattr() + * + * TODO: Need to add the support of completion of different calls + * (flush and release). */ -int nfs_inode::flush_cache_and_wait(uint64_t start_off, uint64_t end_off) +int nfs_inode::flush_cache_and_wait(uint64_t start_off, + uint64_t end_off, + struct rpc_task *parent_task) { + assert(is_flushing); + assert(start_off < end_off); + assert(!is_stable_write() || get_filecache()->get_bytes_to_commit() == 0); + assert(!is_stable_write() || is_commit_in_progress() == false); + /* * MUST be called only for regular files. * Leave the assert to catch if fuse ever calls flush() on non-reg files. @@ -906,14 +1186,62 @@ int nfs_inode::flush_cache_and_wait(uint64_t start_off, uint64_t end_off) */ flush_lock(); + /* + * If flush is in progress, wait for it to complete. 
+ * It may happen that last flush is still in progress and we need to wait + * for it to complete before we start the new flush. + * As we have the flush_lock() we are guaranteed that no new flush can be + * started till we release the flush_lock(). + * + * wait_for_ongoing_flush_commit() will wait for the ongoing flush/commt to complete. + * + * Note: For stable write case, we don't need to wait for the ongoing flush, + * as there is no commit required for stable write. + */ + if (!is_stable_write()) { + const int err = wait_for_ongoing_flush_commit(); + if (err != 0) { + AZLogError("[{}] Failed to flush cache to stable storage, " + "error={}", ino, err); + return err; + } + } + + if (get_filecache()->get_bytes_to_flush() == 0) { + AZLogDebug("[{}] Nothing to flush, returning", ino); + flush_unlock(); + return 0; + } + /* * Get the dirty bytes_chunk from the filecache handle. * This will grab an exclusive lock on the file cache and return the list - * of dirty bytes_chunks at that point. Note that we can have new dirty - * bytes_chunks created but we don't want to wait for those. + * of dirty bytes_chunks at that point. */ - std::vector bc_vec = - filecache_handle->get_dirty_bc_range(start_off, end_off); + std::vector bc_vec; + uint64_t size = 0; + if (!is_stable_write()) { + bc_vec = filecache_handle->get_contiguous_dirty_bcs(size); + + if (size != get_filecache()->get_bytes_to_flush()) { + AZLogInfo("[{}] Flushed size: {} != bytes to flush: {}", + ino, size, get_filecache()->get_bytes_to_flush()); + AZLogInfo("SWITCHING TO STABLE WRITE"); + + /* + * This is rare and no-op as we already completed the pending + * flush and commit. + */ + switch_to_stable_write(); + for (auto& bc : bc_vec) { + bc.get_membuf()->clear_inuse(); + } + bc_vec = filecache_handle->get_dirty_bc_range(start_off, end_off); + } + } else { + assert(get_filecache()->get_bytes_to_commit() == 0); + bc_vec = filecache_handle->get_dirty_bc_range(start_off, end_off); + } /* * sync_membufs() iterate over the bc_vec and starts flushing the dirty @@ -964,20 +1292,44 @@ int nfs_inode::flush_cache_and_wait(uint64_t start_off, uint64_t end_off) filecache_handle->release(bc.offset, bc.length); } - flush_unlock(); - /* - * If the file is deleted while we still have data in the cache, don't - * treat it as a failure to flush. The file has gone and we don't really - * care about the unwritten data. + * Flush_cache_and_wait called from a release call only, + * we need to commit all the dirty membufs which are not yet committed. */ - if (get_write_error() == ENOENT || get_write_error() == ESTALE) { - return 0; + bool start_commit_task = false; + + if (get_filecache()->get_bytes_to_commit() > 0) { + assert(!is_stable_write()); + assert(!is_commit_in_progress()); + + /* + * In case of release, there is no thread which starts the commit + * operation. So, we need to start the commit operation here. + */ + std::unique_lock commit_lock(commit_lock_5); + set_commit_in_progress(); + commit_lock.unlock(); + start_commit_task = true; + } + + if(start_commit_task) { + assert(!is_stable_write()); + assert(!get_filecache()->is_flushing_in_progress()); + assert(is_commit_in_progress()); + + commit_membufs(); } + /* + * Wait for the ongoing commit to complete. 
+ */ + wait_for_ongoing_commit(); + flush_unlock(); + return get_write_error(); } + void nfs_inode::flush_lock() const { AZLogDebug("[{}] flush_lock() called", ino); @@ -1053,7 +1405,7 @@ bool nfs_inode::truncate_start(size_t size) */ flush_lock(); - wait_for_ongoing_flush(0, UINT64_MAX); + wait_for_ongoing_flush_commit(); AZLogDebug("[{}] Ongoing flush operations completed", ino); diff --git a/turbonfs/src/rpc_task.cpp b/turbonfs/src/rpc_task.cpp index 9f62bbbe..b63abb57 100644 --- a/turbonfs/src/rpc_task.cpp +++ b/turbonfs/src/rpc_task.cpp @@ -806,6 +806,194 @@ void access_callback( } } +static void complete_commit_waiter_tasks(struct membuf *mb) +{ + std::vector tvec = mb->get_commit_waiters(); + for (auto& task : tvec) { + assert(task->magic == RPC_TASK_MAGIC); + assert(task->get_op_type() == FUSE_WRITE); + assert(task->rpc_api->write_task.is_fe()); + assert(task->rpc_api->write_task.get_size() > 0); + + AZLogDebug("Completing flush waiter task {} for [{}, {})", + fmt::ptr(task), + task->rpc_api->write_task.get_offset(), + task->rpc_api->write_task.get_offset() + + task->rpc_api->write_task.get_size()); + task->reply_write(task->rpc_api->write_task.get_size()); + } +} + +static void commit_callback( + struct rpc_context *rpc, + int rpc_status, + void *data, + void *private_data) +{ + rpc_task *task = (rpc_task*) private_data; + assert(task->magic == RPC_TASK_MAGIC); + assert(task->get_op_type() == FUSE_FLUSH); + assert(task->rpc_api->pvt != nullptr); + + auto res = (COMMIT3res*)data; + + INJECT_JUKEBOX(res, task); + + const fuse_ino_t ino = + task->rpc_api->flush_task.get_ino(); + struct nfs_inode *inode = + task->get_client()->get_nfs_inode_from_ino(ino); + auto bc_vec_ptr = (std::vector *)task->rpc_api->pvt; + + const int status = task->status(rpc_status, NFS_STATUS(res)); + UPDATE_INODE_WCC(inode, res->COMMIT3res_u.resok.file_wcc); + AZLogInfo("commit_callback"); + /* + * Now that the request has completed, we can query libnfs for the + * dispatch time. + */ + task->get_stats().on_rpc_complete(rpc_get_pdu(rpc), NFS_STATUS(res)); + if (status == 0) { + uint64_t offset = 0; + uint64_t length = 0; + auto &bc_vec = *bc_vec_ptr; // get the vector from the pointer. + + for (auto &bc : bc_vec) + { + struct membuf *mb = bc.get_membuf(); + assert(mb->is_inuse()); + assert(mb->is_locked()); + assert(mb->is_commit_pending()); + assert(mb->is_uptodate()); + + mb->clear_commit_pending(); + complete_commit_waiter_tasks(mb); + mb->clear_locked(); + mb->clear_inuse(); + + /** + * Collect the contiguous range of bc's to release. + */ + if (offset == 0 && length == 0) { + offset = bc.offset; + length = bc.length; + } else if (offset + length == bc.offset) { + length += bc.length; + } else { + auto release = inode->get_filecache()->release(offset, length); + AZLogDebug("BC released {}, asked for {}", release, length); + offset = bc.offset; + length = bc.length; + } + } + + if (length != 0) { + assert(length != 0); + auto release = inode->get_filecache()->release(offset, length); + + AZLogDebug("BC offset {}, asked for {}, release {}", (offset/1048576), (length/1048576), release); + } + + task->free_rpc_task(); + #if 0 + if (task->get_fuse_req() != nullptr) { + task->reply_error(status); + } else { + task->free_rpc_task(); + } + #endif + } else if (NFS_STATUS(res) == NFS3ERR_JUKEBOX) { + task->get_client()->jukebox_retry(task); + return; + } else { + auto &bc_vec = *bc_vec_ptr; // get the vector from the pointer. + /* + * Mark the membufs as dirty and clear the commit_pending flag. 
+ * Next write will initiate the flush again with stable write. + */ + for (auto &bc : bc_vec) + { + struct membuf *mb = bc.get_membuf(); + assert(mb->is_inuse()); + assert(mb->is_locked()); + assert(mb->is_commit_pending()); + assert(mb->is_uptodate()); + + mb->clear_commit_pending(); + complete_commit_waiter_tasks(mb); + mb->set_dirty(); + mb->clear_locked(); + mb->clear_inuse(); + } + inode->set_stable_write(); + + task->free_rpc_task(); + #if 0 + if (task->get_fuse_req() != nullptr) { + task->reply_error(status); + } else { + inode->set_write_error(status); + task->free_rpc_task(); + } + #endif + } + + delete bc_vec_ptr; + + inode->clear_commit_in_progress(); +} + +void rpc_task::issue_commit_rpc() +{ + // Must only be called for a flush task. + assert(get_op_type() == FUSE_FLUSH); + + const fuse_ino_t ino = rpc_api->flush_task.get_ino(); + struct nfs_inode *inode = get_client()->get_nfs_inode_from_ino(ino); + assert(inode->is_stable_write() == false); + assert(inode->get_filecache()->is_flushing_in_progress() == false); + + COMMIT3args args; + ::memset(&args, 0, sizeof(args)); + bool rpc_retry = false; + + AZLogInfo("issue_commit_rpc"); + + /* + * Get the bcs marked for commit_pending. + */ + if (rpc_api->pvt == nullptr) { + rpc_api->pvt = static_cast(new std::vector(inode->get_filecache()->get_commit_pending_bcs())); + } + + args.file = inode->get_fh(); + args.offset = 0; + args.count = 0; + + do { + rpc_retry = false; + stats.on_rpc_issue(); + + if (rpc_nfs3_commit_task(get_rpc_ctx(), + commit_callback, &args, + this) == NULL) { + stats.on_rpc_cancel(); + /* + * Most common reason for this is memory allocation failure, + * hence wait for some time before retrying. Also block the + * current thread as we really want to slow down things. + * + * TODO: For soft mount should we fail this? + */ + rpc_retry = true; + + AZLogWarn("rpc_nfs3_write_task failed to issue, retrying " + "after 5 secs!"); + ::sleep(5); + } + } while (rpc_retry); +} + /** * Must be called when bytes_completed bytes are successfully read/written. */ @@ -967,7 +1155,7 @@ void bc_iovec::on_io_fail(int status) /* * This membuf has completed flushing albeit with failure, check if any - * task is waiting for this membuf to be flushed. + * task is waiting for this membuf to be flushed/commit. */ std::vector tvec = mb->get_flush_waiters(); for (auto& task : tvec) { @@ -986,6 +1174,29 @@ void bc_iovec::on_io_fail(int status) task->reply_error(status); } + /* + * This membuf has completed flushing albeit with failure, check if any + * task is waiting for this membuf to be committed. Now this membuf + * failed to flush, so we don't issue commit for this membuf. Hence + * we must complete the waiter tasks. 
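+     * They are completed with reply_error(status), mirroring the
+     * flush_waiters handling above.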
+ */ + std::vector tvec_commit = mb->get_commit_waiters(); + for (auto& task : tvec_commit) { + assert(task->magic == RPC_TASK_MAGIC); + assert(task->get_op_type() == FUSE_WRITE); + assert(task->rpc_api->write_task.is_fe()); + + AZLogError("Completing flush waiter task {} for [{}, {}), " + "failed: {}", + fmt::ptr(task), + task->rpc_api->write_task.get_offset(), + task->rpc_api->write_task.get_offset() + + task->rpc_api->write_task.get_size(), + status); + + task->reply_error(status); + } + mb->clear_locked(); mb->clear_inuse(); iov++; @@ -2308,6 +2519,141 @@ void rpc_task::run_access() } while (rpc_retry); } +static void perform_inline_writes(struct rpc_task *task, + struct nfs_inode *inode) +{ + const size_t length = task->rpc_api->write_task.get_size(); + const off_t offset = task->rpc_api->write_task.get_offset(); + const fuse_ino_t ino = task->rpc_api->write_task.get_ino(); + std::vector bc_vec; + + /* + * Grab flush_lock to get exclusive list of dirty chunks, which are not + * already being flushed. This also protects us racing with a truncate + * call and growing the file size after truncate shrinks the file. + */ + inode->flush_lock(); + + /* + * prune_bytes is the number of bytes that need to be flushed to + * make space for the new writes. If it's 0, then we don't need to + * flush more data we can just wait for the ongoing flush/commit + * to complete. + */ + uint64_t prune_bytes = inode->get_filecache()->get_bytes_to_prune(); + + if (prune_bytes == 0 && !inode->is_stable_write()) { + AZLogDebug("No bytes to prune, Wait for ongoing flush/commit" + "to complete."); + /* + * In case of inline_write, we want to block the writes till the + * enough space is made for new writes. Every write thread will + * come here and try to take the lock, only one thread succeeds + * and wait for the ongoing flush/commit to complete. Rest of the + * threads will wait for the lock to be released by the first + * thread. After first thread completes the wait, check if any ongoing + * flush/commit is in progress and if it's still true. Mostly it will + * be false as new writes are blocked. + * + * Note: flush_lock() is held so that no new flush or commit task + * is started. + */ + inode->wait_for_ongoing_flush_commit(0, UINT64_MAX, task); + } else { + uint64_t size = 0; + if (inode->is_stable_write()) { + /* + * If the write is stable, we can directly write to the + * membufs and issue the writes to the backend. + */ + bc_vec = inode->get_filecache()->get_dirty_nonflushing_bcs_range(); + if (bc_vec.empty()) { + /* + * Another application write raced with us and it got to do the + * inline write before us. Try adding this task to the flush_waiters + * list for the membuf where we copied the application data. If + * we are able to add successfully, then we will call the fuse + * callback when the flush completes at the backend. This will + * slow down the writer application, thus relieving the memory + * pressure. + * If add_flush_waiter() returns false, that means this membuf + * is already flushed by that other thread and we can complete + * the fuse callback in this context. 
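+                 *
+                 * Note: here we wait on this task's own [offset,
+                 * offset + length) range, unlike the unstable-write path
+                 * below which waits on last_flushing_offset/length via
+                 * add_commit_waiter().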
+ */ + inode->flush_unlock(); + + if (inode->get_filecache()->add_flush_waiter(offset, + length, + task)) { + AZLogDebug("[{}] Inline write, membuf not flushed, write " + "[{}, {}) will be completed when membuf is flushed", + ino, offset, offset+length); + return; + } else { + AZLogDebug("[{}] Inline write, membuf already flushed, " + "completing fuse write [{}, {})", + ino, offset, offset+length); + task->reply_write(length); + return; + } + } + } else { + /* + * If the write is not stable, we need to copy the application + * data into the membufs and issue the writes to the backend. + */ + bc_vec = inode->get_filecache()->get_contiguous_dirty_bcs(size); + if (!bc_vec.empty()) { + /* + * Perform inline sync. + * In case of unstable writes, we need to complete the write + * on commit complete. So, we don't pass the 3rd arg to + * sync_membufs(), but add the task to the commit_waiters list. + */ + inode->sync_membufs(bc_vec, false /* is_flush */, task); + } + inode->flush_unlock(); + + /* + * Set the commit_in_progress flag to ensure that the commit + * task is started after the flush completes. + */ + inode->mark_commit_in_progress(); + + if (inode->get_filecache()->add_commit_waiter( + inode->last_flushing_offset, + inode->last_flushing_length, + task)) { + AZLogDebug("[{}] Inline write, membuf not flushed, write " + "[{}, {}) will be completed when membuf is flushed", + ino, inode->last_flushing_offset, + inode->last_flushing_length); + return; + } else { + AZLogDebug("[{}] Inline write, membuf already flushed, " + "completing fuse write [{}, {})", + ino, inode->last_flushing_offset, + inode->last_flushing_length); + task->reply_write(length); + return; + } + } + } + + /* + * Perform inline sync. + * Since we pass the 3rd arg to sync_membufs, it tells sync_membufs() + * to call the fuse callback after all the issued backend writes + * complete. This will be done asynchronously while the sync_membufs() + * call will return after issuing the writes. + */ + inode->sync_membufs(bc_vec, false /* is_flush */, task); + inode->flush_unlock(); + + // Free up the fuse thread without completing the application write. + return; +} + void rpc_task::run_write() { // This must be called only for front end tasks. @@ -2464,69 +2810,25 @@ void rpc_task::run_write() ino, sparse_write, (extent_right - extent_left), extent_left, extent_right); - /* - * Grab flush_lock to get exclusive list of dirty chunks, which are not - * already being flushed. This also protects us racing with a truncate - * call and growing the file size after truncate shrinks the file. - */ - inode->flush_lock(); - - std::vector bc_vec = - inode->get_filecache()->get_dirty_nonflushing_bcs_range(); - - if (bc_vec.empty()) { - /* - * Another application write raced with us and it got to do the - * inline write before us. Try adding this task to the flush_waiters - * list for the membuf where we copied the application data. If - * we are able to add successfully, then we will call the fuse - * callback when the flush completes at the backend. This will - * slow down the writer application, thus relieving the memory - * pressure. - * If add_flush_waiter() returns false, that means this membuf - * is already flushed by that other thread and we can complete - * the fuse callback in this context. 
- */ - inode->flush_unlock(); - - if (inode->get_filecache()->add_flush_waiter(offset, - length, - this)) { - AZLogDebug("[{}] Inline write, membuf not flushed, write " - "[{}, {}) will be completed when membuf is flushed", - ino, offset, offset+length); - return; - } else { - AZLogDebug("[{}] Inline write, membuf already flushed, " - "completing fuse write [{}, {})", - ino, offset, offset+length); - reply_write(length); - return; - } - } - - /* - * Perform inline sync. - * Since we pass the 3rd arg to sync_membufs, it tells sync_membufs() - * to call the fuse callback after all the issued backend writes - * complete. This will be done asynchronously while the sync_membufs() - * call will return after issuing the writes. - * - * Note: sync_membufs() can free this rpc_task if all issued backend - * writes complete before sync_membufs() can return. - * DO NOT access rpc_task after sync_membufs() call. - */ - inode->sync_membufs(bc_vec, false /* is_flush */, this); - inode->flush_unlock(); - // Free up the fuse thread without completing the application write. - return; + return perform_inline_writes(this, inode); } /* - * Ok, we don't need to do inline writes. See if we have enough dirty - * data and we need to start async flush. + * We don't need to do inline writes. See if we have enough bytes to + * commit to the backend. */ + const bool needs_commit = inode->get_filecache()->commit_required(); + if (needs_commit) { + /* + * Set the commit_in_progress flag to ensure that the commit + * task is startet. + */ + inode->mark_commit_in_progress(); + + assert(inode->is_commit_in_progress()); + assert(!need_inline_write); + } /* * If the extent size exceeds the max allowed dirty size as returned by @@ -2557,6 +2859,54 @@ void rpc_task::run_write() bytes_to_flush, max_dirty_extent); + if (inode->is_commit_in_progress()) { + /* + * Commit is in progress, we can't start new flush to the BLOB. + * Complete the write if it doesn't need flush, otherwise add to + * commit_waiters list. + * Note: It make sense to complete the write only after commit is done + * if we already reached the flush limit. As otherwise it will + * trigger inline writes. + */ + if (bytes_to_flush < max_dirty_extent) { + AZLogDebug("[{}] Commit in progress, skipping write, inline_write is false", + ino); + reply_write(length); + return; + } else { + /* + * Add this task to the commit_waiters list, so that it can be + * completed after the commit is done. + */ + if(inode->get_filecache()->add_commit_waiter( + inode->last_flushing_offset, + inode->last_flushing_length, + this)) { + AZLogDebug("[{}] Commit in progress, added to commit_waiters list", + ino); + return; + } else { + /* + * Commit is done, but this task is not added to commit_waiters + * list, so complete the write. + */ + AZLogDebug("[{}] Commit in progress, but not added to commit_waiters list, " + "completing write", ino); + reply_write(length); + return; + } + } + + // We should never reach here. + assert(false); + return; + } + + /* + * Ok, we neither need inline writes nor commit in progress. See if we have + * enough dirty data and we need to start async flush. 
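+     * For stable writes the flush below picks the dirty membufs in
+     * [extent_left, extent_right), while for unstable writes it picks
+     * the contiguous dirty run via get_contiguous_dirty_bcs().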
+ */ + if ((extent_right - extent_left) < max_dirty_extent) { /* * Current extent is not large enough to be flushed, see if we have @@ -2588,9 +2938,19 @@ void rpc_task::run_write() */ inode->flush_lock(); - std::vector bc_vec = - inode->get_filecache()->get_dirty_nonflushing_bcs_range(extent_left, - extent_right); + uint64_t size = 0; + std::vector bc_vec; + /* + * If the inode is stable write, then get the dirty membufs in the range. + * otherwise get the dirty membufs contigious range. + */ + if (inode->is_stable_write()) { + bc_vec = + inode->get_filecache()->get_dirty_nonflushing_bcs_range(extent_left, + extent_right); + } else { + bc_vec = inode->get_filecache()->get_contiguous_dirty_bcs(size); + } if (bc_vec.size() == 0) { inode->flush_unlock(); From 9d545d11a90c8fd135e6df4e378e8cc7e88fec2d Mon Sep 17 00:00:00 2001 From: Nitin Singla Date: Sun, 26 Jan 2025 14:12:11 +0000 Subject: [PATCH 2/9] Some fixes. --- turbonfs/inc/file_cache.h | 21 +- turbonfs/inc/nfs_inode.h | 10 +- turbonfs/inc/rpc_stats.h | 9 - turbonfs/inc/rpc_task.h | 8 + turbonfs/src/file_cache.cpp | 28 +- turbonfs/src/nfs_inode.cpp | 222 ++++++++++---- turbonfs/src/rpc_stats.cpp | 1 + turbonfs/src/rpc_task.cpp | 575 +++++++++++++++++++----------------- 8 files changed, 539 insertions(+), 335 deletions(-) diff --git a/turbonfs/inc/file_cache.h b/turbonfs/inc/file_cache.h index be7a8ac0..39ddee58 100644 --- a/turbonfs/inc/file_cache.h +++ b/turbonfs/inc/file_cache.h @@ -1239,6 +1239,8 @@ class bytes_chunk_cache uint64_t length, struct rpc_task *task); + void cleanup_on_error(); + /* * Returns all dirty chunks for a given range in chunkmap. @@ -1400,6 +1402,9 @@ class bytes_chunk_cache * holding more data than the Blob NFS server's scheduler cache size. * We want to send as prompt as possible to utilize the n/w b/w but slow * enough to give the write scheduler an opportunity to merge better. + * + * Note: max_dirty_extent is static as it doesn't change after it's + * queried for the first time. */ uint64_t max_dirty_extent_bytes() const { @@ -1407,9 +1412,12 @@ class bytes_chunk_cache static const uint64_t max_total = (aznfsc_cfg.cache.data.user.max_size_mb * 1024 * 1024ULL); assert(max_total != 0); - static const uint64_t max_dirty_extent = (max_total * 0.6); + static const uint64_t max_dirty_extent = std::min( + (uint64_t)(max_total * 0.6), (uint64_t)(1024 * 1024 * 1024ULL)); - return std::min(max_dirty_extent, uint64_t(1024 * 1024 * 1024ULL)); + assert(max_dirty_extent != 0); + + return max_dirty_extent; } /* @@ -1499,6 +1507,15 @@ class bytes_chunk_cache return (bytes_commit_pending >= max_commit_bytes()); } + /* + * Check if bytes_to_flush is equal or more than + * max_dirty_extent_bytes limit. + */ + bool flush_required() const + { + return (get_bytes_to_flush() >= max_dirty_extent_bytes()); + } + /* * get_bytes_to_prune() returns the number of bytes that need to be flushed * inline to free up the space. If there are enough bytes_flushing then we diff --git a/turbonfs/inc/nfs_inode.h b/turbonfs/inc/nfs_inode.h index 959e88b5..ea3a2c95 100644 --- a/turbonfs/inc/nfs_inode.h +++ b/turbonfs/inc/nfs_inode.h @@ -323,7 +323,7 @@ struct nfs_inode * Note: As of now, we are not using this flag as commit changes not yet * integrated, so we are setting this flag to true. 
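+     * (With commit support now integrated, the default below is false,
+     * i.e. unstable writes are used until something forces a switch to
+     * stable writes.)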
*/ - bool stable_write = true; + bool stable_write = false; public: /* @@ -1169,7 +1169,8 @@ struct nfs_inode bool is_commit_in_progress() const { assert(commit_state != commit_state_t::INVALID); - return (commit_state == commit_state_t::COMMIT_IN_PROGRESS); + return ((commit_state == commit_state_t::COMMIT_IN_PROGRESS) || + (commit_state == commit_state_t::NEEDS_COMMIT)); } /** @@ -1355,10 +1356,11 @@ struct nfs_inode * progress (which must have held the is_flushing lock). */ int flush_cache_and_wait(uint64_t start_off = 0, - uint64_t end_off = UINT64_MAX, - struct rpc_task *parent_task = nullptr); + uint64_t end_off = UINT64_MAX); void mark_commit_in_progress(); + void add_task_to_commit_waiters(struct rpc_task *task); + void add_task_to_flush_waiters(struct rpc_task *task); /** * Wait for currently flushing membufs to complete. diff --git a/turbonfs/inc/rpc_stats.h b/turbonfs/inc/rpc_stats.h index 626914b1..d0a7bbed 100644 --- a/turbonfs/inc/rpc_stats.h +++ b/turbonfs/inc/rpc_stats.h @@ -139,9 +139,6 @@ class rpc_stats_az */ void on_rpc_issue() { - // FUSE_FLUSH is never issued as an RPC task to the server. FUSE_WRITE is issued instead. - assert(optype != FUSE_FLUSH); - assert(stamp.issue == 0); stamp.issue = get_current_usecs(); assert(stamp.issue >= stamp.create); @@ -156,9 +153,6 @@ class rpc_stats_az */ void on_rpc_cancel() { - // FUSE_FLUSH is never issued as an RPC task to the server. FUSE_WRITE is issued instead. - assert(optype != FUSE_FLUSH); - assert(stamp.issue != 0); assert((int64_t) stamp.issue <= get_current_usecs()); assert(stamp.dispatch == 0); @@ -178,9 +172,6 @@ class rpc_stats_az */ void on_rpc_complete(struct rpc_pdu *pdu, nfsstat3 status) { - // FUSE_FLUSH is never issued as an RPC task to the server. FUSE_WRITE is issued instead. - assert(optype != FUSE_FLUSH); - assert((status == NFS3ERR_RPC_ERROR) || (nfsstat3_to_errno(status) != -ERANGE)); diff --git a/turbonfs/inc/rpc_task.h b/turbonfs/inc/rpc_task.h index c9fd4303..72c77788 100644 --- a/turbonfs/inc/rpc_task.h +++ b/turbonfs/inc/rpc_task.h @@ -2000,6 +2000,14 @@ struct rpc_task csched = _csched; } + void set_task_csched(bool stable_write) + { + if (client->mnt_options.nfs_port == 2048) { + csched = stable_write ? CONN_SCHED_FH_HASH : CONN_SCHED_RR; + } + return; + } + conn_sched_t get_csched() const { assert(csched > CONN_SCHED_INVALID && diff --git a/turbonfs/src/file_cache.cpp b/turbonfs/src/file_cache.cpp index d133ed7e..a6ff86be 100644 --- a/turbonfs/src/file_cache.cpp +++ b/turbonfs/src/file_cache.cpp @@ -483,7 +483,7 @@ void membuf::clear_flushing() * TODO: Remove me once we have enough testing. * Don't release it to production. 
*/ - assert(!is_dirty()); + // assert(!is_dirty()); flag &= ~MB_Flag::Flushing; @@ -2268,7 +2268,7 @@ void bytes_chunk_cache::clear_nolock() assert(bytes_cached == 0); if (bytes_allocated != 0) { - AZLogWarn("[{}] Cache purge: bytes_allocated is still {}, some user " + AZLogDebug("[{}] Cache purge: bytes_allocated is still {}, some user " "is still holding on to the bytes_chunk/membuf even after " "dropping the inuse count: backing_file_name={}", CACHE_TAG, bytes_allocated.load(), backing_file_name); @@ -2523,6 +2523,30 @@ bool bytes_chunk_cache::add_commit_waiter(uint64_t offset, return false; } +void bytes_chunk_cache::cleanup_on_error() +{ + const std::unique_lock _lock(chunkmap_lock_43); + + for (auto it = chunkmap.begin(); it != chunkmap.cend(); ++it) { + struct bytes_chunk& bc = it->second; + struct membuf *mb = bc.get_membuf(); + if (!mb->is_inuse() && !mb->is_locked()) { + mb->set_inuse(); + mb->set_locked(); + if (mb->is_dirty()) { + mb->clear_dirty(); + } + + if (mb->is_flushing()) { + mb->clear_flushing(); + } + + if (mb->is_commit_pending()) { + mb->clear_commit_pending(); + } + } + } +} std::vector bytes_chunk_cache::get_dirty_bc_range( uint64_t start_off, uint64_t end_off) const diff --git a/turbonfs/src/nfs_inode.cpp b/turbonfs/src/nfs_inode.cpp index dcb8b18d..a6eeba51 100644 --- a/turbonfs/src/nfs_inode.cpp +++ b/turbonfs/src/nfs_inode.cpp @@ -34,7 +34,7 @@ nfs_inode::nfs_inode(const struct nfs_fh3 *filehandle, assert(write_error == 0); // TODO: Revert this to false once commit changes integrated. - assert(stable_write == true); + assert(stable_write == false); assert(commit_state == commit_state_t::COMMIT_NOT_NEEDED); #ifndef ENABLE_NON_AZURE_NFS @@ -547,7 +547,7 @@ void nfs_inode::sync_membufs(std::vector &bc_vec, * New flush should not be started if flush/commit is in progress. * But it may happen commit_is_pending() is set by another thread. */ - assert(!is_commit_in_progress() || is_commit_pending()); + assert(!is_commit_in_progress()); if (bc_vec.empty()) { return; @@ -649,11 +649,13 @@ void nfs_inode::sync_membufs(std::vector &bc_vec, continue; } + #if 0 /* * Update the last flushing offset and length. */ last_flushing_offset = bc.offset; last_flushing_length = bc.length; + #endif if (write_task == nullptr) { write_task = @@ -678,6 +680,10 @@ void nfs_inode::sync_membufs(std::vector &bc_vec, * Once packed completely, then dispatch the write. */ if (write_task->add_bc(bc)) { + assert(is_stable_write() || ((uint64_t) putblock_filesize == bc.offset)); + putblock_filesize += bc.length; + last_flushing_offset = bc.offset; + last_flushing_length = bc.length; continue; } else { /* @@ -703,6 +709,10 @@ void nfs_inode::sync_membufs(std::vector &bc_vec, // Single bc addition should not fail. [[maybe_unused]] bool res = write_task->add_bc(bc); assert(res == true); + assert(is_stable_write() || ((uint64_t) putblock_filesize == bc.offset)); + putblock_filesize += bc.length; + last_flushing_offset = bc.offset; + last_flushing_length = bc.length; } } @@ -929,6 +939,8 @@ int nfs_inode::copy_to_cache(const struct fuse_bufvec* bufv, void nfs_inode::mark_commit_in_progress() { + assert(is_stable_write() == false); + /* * We want to commit, but we need to wait for current flushing to complete. * 1. 
If flushing is going on, set commit_pending to inode, when last membuf flushed, @@ -977,22 +989,90 @@ void nfs_inode::mark_commit_in_progress() } } +void nfs_inode::add_task_to_flush_waiters(struct rpc_task *task) +{ + /* + * Add this task to the flush_waiter list, so that it can be + * completed after the flush is done. + */ + if(get_filecache()->add_flush_waiter(last_flushing_offset, + last_flushing_length, + task)) { + AZLogDebug("[{}] flush in progress, added to flush_waiter list" + " for [{}, {})", ino, last_flushing_offset, + last_flushing_length); + return; + } else { + /* + * flush is done, but this task is not added to flush_waiter + * list, so complete the write. + */ + AZLogDebug("[{}] flush is done, not added to flush_waiter list, " + "completing write", ino); + task->reply_write(task->rpc_api->write_task.get_size()); + return; + } +} + +void nfs_inode::add_task_to_commit_waiters(struct rpc_task *task) +{ + /* + * This should be called only for unstable writes. + */ + assert(is_stable_write() == false); + + /* + * Add this task to the commit_waiters list, so that it can be + * completed after the commit is done. + */ + if(get_filecache()->add_commit_waiter(last_flushing_offset, + last_flushing_length, + task)) { + AZLogDebug("[{}] Commit in progress, added to commit_waiters list" + " for [{}, {})", ino, last_flushing_offset, + last_flushing_length); + return; + } else { + /* + * Commit is done, but this task is not added to commit_waiters + * list, so complete the write. + */ + AZLogDebug("[{}] Commit in progress, but not added to commit_waiters list, " + "completing write", ino); + task->reply_write(task->rpc_api->write_task.get_size()); + return; + } +} + /** * Note: Caller should call with flush_lock() held. * On return there is no ongoing flush/commit in progress. + * + * flush_cache_and_wait() checks for return value, as it's + * called on close() and need to know final status of the writes. */ int nfs_inode::wait_for_ongoing_flush_commit(uint64_t start_off, uint64_t end_off, struct rpc_task *task) { + // Caller must call us with is_flushing lock held. + assert(is_flushing); assert(start_off < end_off); - assert(!is_stable_write()); - // NAND Gate - assert(!(is_commit_in_progress() && get_filecache()->is_flushing_in_progress())); + if (is_stable_write()) { + assert(!is_commit_in_progress()); + assert(get_filecache()->bytes_commit_pending == 0); + } - // Caller must call us with is_flushing lock held. - assert(is_flushing); + /* + * If commit_pending limit reached, we need to start the commit, + * but we need to wait for the current flushing to complete, so + * we set the commit_pending flag and once flusing completes it + * will start the commit task. + */ + if (get_filecache()->is_flushing_in_progress()) { + assert(!is_commit_in_progress() || is_commit_pending()); + } /* * MUST be called only for regular files. @@ -1014,13 +1094,13 @@ int nfs_inode::wait_for_ongoing_flush_commit(uint64_t start_off, * Flushing not in progress and no new flushing can be started as we hold * the flush_lock(). * It may be possible that the flush is not in progress and but we may have - * commit_in_progress or commit_pending_bytes more than max_commit_bytes. + * commit_in_progress or commit_pending_bytes to commit. * In that case we should start the commit task if it's not already in * progress. And wait for the commit to complete. 
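+     *
+     * Informal summary of the cases handled below:
+     * - Nothing flushing, no commit in progress and nothing commit
+     *   pending: complete 'task' (if passed) and return.
+     * - task == nullptr: wait inline for the flushing membufs and then,
+     *   for unstable writes, for the commit.
+     * - task != nullptr: queue it on the flush_waiters (stable write) or
+     *   commit_waiters (unstable write) list and return without blocking.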
*/ if (!get_filecache()->is_flushing_in_progress() && !is_commit_in_progress() && - !get_filecache()->commit_required()) { + get_filecache()->bytes_commit_pending == 0) { AZLogDebug("[{}] No flush in progress, returning", ino); /* @@ -1037,7 +1117,7 @@ int nfs_inode::wait_for_ongoing_flush_commit(uint64_t start_off, * flush is in progress. */ std::vector bc_vec = {}; - if (get_filecache()->is_flushing_in_progress()) { + if (task == nullptr && get_filecache()->is_flushing_in_progress()) { /* * Get the flushing bytes_chunk from the filecache handle. * This will grab an exclusive lock on the file cache and return the list @@ -1048,11 +1128,13 @@ int nfs_inode::wait_for_ongoing_flush_commit(uint64_t start_off, filecache_handle->get_flushing_bc_range(start_off, end_off); } - /* * If there is no task passed, then our caller expects us to return * only after the flush completes (f.e. truncate, switch_to_stable_write). * Wait for all the membufs to flush and get result back. + * Otherwise add the task to the flush_waiter list for stable write and + * commit_waiter list for unstable write. + * Task will wait on last flushed offset and length. */ if (task == nullptr) { for (bytes_chunk &bc : bc_vec) { @@ -1092,32 +1174,47 @@ int nfs_inode::wait_for_ongoing_flush_commit(uint64_t start_off, */ filecache_handle->release(bc.offset, bc.length); } - - mark_commit_in_progress(); } else { - /* - * Add task to commit_waiters list. - */ - get_filecache()->add_commit_waiter(last_flushing_offset, - last_flushing_length, task); - mark_commit_in_progress(); - return 0; + if (is_stable_write()) { + /* + * Add task to flush_waiters list. + */ + add_task_to_flush_waiters(task); + } else { + /* + * Add task to commit_waiters list. + */ + add_task_to_commit_waiters(task); + } } /* - * Wait for the ongoing commit to complete. + * Unstable write case, we need to wait for the commit to complete. + * So, mark the commit in progress and wait for the commit to complete. */ - wait_for_ongoing_commit(); + if (!is_stable_write()) { + mark_commit_in_progress(); + if (task) { + return 0; + } + + /* + * Wait for the ongoing commit to complete. + */ + wait_for_ongoing_commit(); + } /* - * Note: There can be bytes_commit_pending(), but that's fine - * as if bytes_commit_pending() reached max_commit_bytes() - * we would have started the commit task. As we held the - * flush_lock() no new flush can be started, which increases - * the bytes_commit_pending(). + * As we have flush_lock held, no new flush can be started. + * We already waited for the ongoing flush and commit to complete. + * For stable write case there should not be any commit pending. + * So, following conditions should be true for both stable and + * unstable writes.: */ assert(is_commit_in_progress() == false); - assert(get_filecache()->get_bytes_to_flush() == 0); + assert(get_filecache()->bytes_commit_pending == 0); + assert(get_filecache()->is_flushing_in_progress() == 0); + return get_write_error(); } @@ -1134,16 +1231,14 @@ int nfs_inode::wait_for_ongoing_flush_commit(uint64_t start_off, * - Returns the error code if any. * * Note: Flush_cache_and_wait() block the fuse thread till the flush completes. - * It's called from the release(), flush() and getattr() calls. getattr() - * - * TODO: Need to add the support of completion of different calls - * (flush and release). + * It's called from the release(), flush() and getattr() calls. It's ok + * as of now as it's not very often called. We can optimize to complete + * the flush in background and return immediately. 
For that we need to add + * special handling for the getattr() call. */ int nfs_inode::flush_cache_and_wait(uint64_t start_off, - uint64_t end_off, - struct rpc_task *parent_task) + uint64_t end_off) { - assert(is_flushing); assert(start_off < end_off); assert(!is_stable_write() || get_filecache()->get_bytes_to_commit() == 0); assert(!is_stable_write() || is_commit_in_progress() == false); @@ -1163,6 +1258,9 @@ int nfs_inode::flush_cache_and_wait(uint64_t start_off, */ const int error_code = get_write_error(); if (error_code != 0) { + // Cleanup of the membufs if any left. + get_filecache()->cleanup_on_error(); + AZLogWarn("[{}] Previous write to this Blob failed with error={}, " "skipping new flush!", ino, error_code); @@ -1207,7 +1305,23 @@ int nfs_inode::flush_cache_and_wait(uint64_t start_off, } } - if (get_filecache()->get_bytes_to_flush() == 0) { + /* + * For unstable write, If no bytes to flush, then return. + * For stable write, If no bytes to flush and no flush in + * progress, then return. As for stable write, we didn't + * wait for the flush to complete as above. + * + * TODO: Need to check getattr() call. As that can cause this + * assert to fail. + */ + if ((get_filecache()->get_bytes_to_flush() == 0) && + (get_filecache()->is_flushing_in_progress() == false)) { + assert(is_commit_in_progress() == false); + assert(get_filecache()->is_flushing_in_progress() == false); + assert(get_filecache()->bytes_dirty == 0); + assert(get_filecache()->bytes_commit_pending == 0); + assert(get_filecache()->bytes_flushing == 0); + AZLogDebug("[{}] Nothing to flush, returning", ino); flush_unlock(); return 0; @@ -1296,28 +1410,13 @@ int nfs_inode::flush_cache_and_wait(uint64_t start_off, * Flush_cache_and_wait called from a release call only, * we need to commit all the dirty membufs which are not yet committed. */ - bool start_commit_task = false; - if (get_filecache()->get_bytes_to_commit() > 0) { assert(!is_stable_write()); assert(!is_commit_in_progress()); + assert(get_filecache()->get_bytes_to_flush() == 0 || + get_write_error() != 0); - /* - * In case of release, there is no thread which starts the commit - * operation. So, we need to start the commit operation here. - */ - std::unique_lock commit_lock(commit_lock_5); - set_commit_in_progress(); - commit_lock.unlock(); - start_commit_task = true; - } - - if(start_commit_task) { - assert(!is_stable_write()); - assert(!get_filecache()->is_flushing_in_progress()); - assert(is_commit_in_progress()); - - commit_membufs(); + mark_commit_in_progress(); } /* @@ -1326,6 +1425,21 @@ int nfs_inode::flush_cache_and_wait(uint64_t start_off, wait_for_ongoing_commit(); flush_unlock(); + /* + * Cleanup on error. + */ + if (get_write_error()) { + AZLogError("[{}] Failed to flush cache to stable storage, " + "error={}", ino, get_write_error()); + get_filecache()->cleanup_on_error(); + } + + assert(is_commit_in_progress() == false); + assert(get_filecache()->is_flushing_in_progress() == false); + assert(get_filecache()->bytes_dirty == 0); + assert(get_filecache()->bytes_commit_pending == 0); + assert(get_filecache()->bytes_flushing == 0); + return get_write_error(); } diff --git a/turbonfs/src/rpc_stats.cpp b/turbonfs/src/rpc_stats.cpp index ad26697a..c3fe7be4 100644 --- a/turbonfs/src/rpc_stats.cpp +++ b/turbonfs/src/rpc_stats.cpp @@ -303,6 +303,7 @@ do { \ DUMP_OP(FUSE_READDIRPLUS); DUMP_OP(FUSE_READ); DUMP_OP(FUSE_WRITE); + DUMP_OP(FUSE_FLUSH); /* * TODO: Add more ops. 
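The nfs_inode.cpp hunks above route a blocked application write to one of two waiter lists: with stable (FILE_SYNC) writes the task is completed by the flush callback (add_task_to_flush_waiters), while with unstable writes only the later COMMIT makes the data durable, so the task is parked on the commit waiters instead (add_task_to_commit_waiters). Below is a minimal, self-contained sketch of that routing; the types and fields are simplified stand-ins for illustration only, not the actual nfs_inode/rpc_task/membuf API.

#include <functional>
#include <vector>

// Illustrative stand-ins for rpc_task and the per-inode flush/commit state;
// the real code keeps per-membuf waiter lists guarded by their own locks.
struct write_task { std::function<void()> reply_write; };

struct flush_state {
    bool stable_write = false;        // FILE_SYNC writes vs UNSTABLE+COMMIT
    bool flushing = false;            // WRITE RPCs outstanding
    bool commit_pending = false;      // flushed UNSTABLE data not yet committed
    std::vector<write_task*> flush_waiters;
    std::vector<write_task*> commit_waiters;
};

// Park a blocked application write until its data is durable enough:
// after the flush for stable writes, after the commit for unstable writes.
void park_blocked_write(flush_state& fs, write_task* task)
{
    if (fs.stable_write && fs.flushing) {
        fs.flush_waiters.push_back(task);      // completed by the flush callback
    } else if (!fs.stable_write && (fs.flushing || fs.commit_pending)) {
        fs.commit_waiters.push_back(task);     // completed by the commit callback
    } else {
        task->reply_write();                   // nothing outstanding, reply now
    }
}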
diff --git a/turbonfs/src/rpc_task.cpp b/turbonfs/src/rpc_task.cpp index b63abb57..145493d6 100644 --- a/turbonfs/src/rpc_task.cpp +++ b/turbonfs/src/rpc_task.cpp @@ -847,7 +847,7 @@ static void commit_callback( const int status = task->status(rpc_status, NFS_STATUS(res)); UPDATE_INODE_WCC(inode, res->COMMIT3res_u.resok.file_wcc); - AZLogInfo("commit_callback"); + AZLogDebug("commit_callback"); /* * Now that the request has completed, we can query libnfs for the * dispatch time. @@ -881,7 +881,7 @@ static void commit_callback( length += bc.length; } else { auto release = inode->get_filecache()->release(offset, length); - AZLogDebug("BC released {}, asked for {}", release, length); + AZLogInfo("BC released {}, asked for {}", release, length); offset = bc.offset; length = bc.length; } @@ -891,7 +891,7 @@ static void commit_callback( assert(length != 0); auto release = inode->get_filecache()->release(offset, length); - AZLogDebug("BC offset {}, asked for {}, release {}", (offset/1048576), (length/1048576), release); + AZLogInfo("BC offset {}, asked for {}, release {}", (offset/1048576), (length/1048576), release); } task->free_rpc_task(); @@ -925,6 +925,12 @@ static void commit_callback( mb->clear_locked(); mb->clear_inuse(); } + /* + * Set the inode to stable write, so that next write will initiate + * the flush again with stable write. + * There should be no flush in progress as this moment. + */ + assert(inode->get_filecache()->is_flushing_in_progress() == false); inode->set_stable_write(); task->free_rpc_task(); @@ -957,7 +963,7 @@ void rpc_task::issue_commit_rpc() ::memset(&args, 0, sizeof(args)); bool rpc_retry = false; - AZLogInfo("issue_commit_rpc"); + AZLogDebug("issue_commit_rpc"); /* * Get the bcs marked for commit_pending. @@ -1314,7 +1320,7 @@ static void write_iov_callback( bciov->orig_offset + bciov->orig_length); // Update bciov after the current write. - bciov->on_io_complete(res->WRITE3res_u.resok.count); + bciov->on_io_complete(res->WRITE3res_u.resok.count, !inode->is_stable_write()); // Create a new write_task for the remaining bc_iovec. struct rpc_task *write_task = @@ -1352,7 +1358,7 @@ static void write_iov_callback( return; } else { // Complete bc_iovec IO completed. - bciov->on_io_complete(res->WRITE3res_u.resok.count); + bciov->on_io_complete(res->WRITE3res_u.resok.count, !inode->is_stable_write()); // Complete data writen to blob. AZLogDebug("[{}] <{}> Completed write, off: {}, len: {}", @@ -1384,8 +1390,28 @@ static void write_iov_callback( bciov->on_io_fail(status); } - delete bciov; - task->rpc_api->pvt = nullptr; + // check if commit is pending. + if (!inode->get_filecache()->is_flushing_in_progress()) { + bool create_commit_task = false; + + std::unique_lock lock(inode->commit_lock_5); + { + if (inode->is_commit_pending()) { + assert(inode->is_stable_write() == false); + inode->set_commit_in_progress(); + create_commit_task = true; + } + } + lock.unlock(); + + if (create_commit_task) { + // Create a new flush_task for the remaining bc_iovec. + struct rpc_task *commit_task = + client->get_rpc_task_helper()->alloc_rpc_task_reserved(FUSE_FLUSH); + commit_task->init_flush(nullptr /* fuse_req */, ino); + commit_task->issue_commit_rpc(); + } + } /* * If this write_rpc was issued as part of a parent write task, then @@ -1411,6 +1437,9 @@ static void write_iov_callback( } } + delete bciov; + task->rpc_api->pvt = nullptr; + // Release the task. 
task->free_rpc_task(); } @@ -1469,7 +1498,8 @@ void rpc_task::issue_write_rpc() args.file = inode->get_fh(); args.offset = offset; args.count = length; - args.stable = FILE_SYNC; + args.stable = inode->is_stable_write() ? FILE_SYNC : UNSTABLE; + set_task_csched(inode->is_stable_write()); do { rpc_retry = false; @@ -2522,9 +2552,8 @@ void rpc_task::run_access() static void perform_inline_writes(struct rpc_task *task, struct nfs_inode *inode) { + AZLogInfo("Performing inline writes for inode {}", inode->get_fuse_ino()); const size_t length = task->rpc_api->write_task.get_size(); - const off_t offset = task->rpc_api->write_task.get_offset(); - const fuse_ino_t ino = task->rpc_api->write_task.get_ino(); std::vector bc_vec; /* @@ -2534,6 +2563,12 @@ static void perform_inline_writes(struct rpc_task *task, */ inode->flush_lock(); + if (!inode->get_filecache()->do_inline_write()) { + inode->flush_unlock(); + task->reply_write(length); + return; + } + /* * prune_bytes is the number of bytes that need to be flushed to * make space for the new writes. If it's 0, then we don't need to @@ -2543,7 +2578,7 @@ static void perform_inline_writes(struct rpc_task *task, uint64_t prune_bytes = inode->get_filecache()->get_bytes_to_prune(); if (prune_bytes == 0 && !inode->is_stable_write()) { - AZLogDebug("No bytes to prune, Wait for ongoing flush/commit" + AZLogInfo("No bytes to prune, Wait for ongoing flush/commit" "to complete."); /* * In case of inline_write, we want to block the writes till the @@ -2559,99 +2594,291 @@ static void perform_inline_writes(struct rpc_task *task, * is started. */ inode->wait_for_ongoing_flush_commit(0, UINT64_MAX, task); + inode->flush_unlock(); + return; } else { uint64_t size = 0; - if (inode->is_stable_write()) { + if (!inode->is_stable_write()) { /* - * If the write is stable, we can directly write to the - * membufs and issue the writes to the backend. + * We need to wait for the ongoing flush/commit to complete. + * This is only for the case where we have to prune some data + * to make space for the new writes. Usually this will not be + * happen so often, it's ok to wait here. */ - bc_vec = inode->get_filecache()->get_dirty_nonflushing_bcs_range(); - if (bc_vec.empty()) { - /* - * Another application write raced with us and it got to do the - * inline write before us. Try adding this task to the flush_waiters - * list for the membuf where we copied the application data. If - * we are able to add successfully, then we will call the fuse - * callback when the flush completes at the backend. This will - * slow down the writer application, thus relieving the memory - * pressure. - * If add_flush_waiter() returns false, that means this membuf - * is already flushed by that other thread and we can complete - * the fuse callback in this context. 
- */ - inode->flush_unlock(); + AZLogDebug("Wait for ongoing flush/commit to complete."); + inode->wait_for_ongoing_flush_commit(0, UINT64_MAX); + + assert(!inode->is_commit_in_progress()); + assert(!inode->get_filecache()->is_flushing_in_progress()); - if (inode->get_filecache()->add_flush_waiter(offset, - length, - task)) { - AZLogDebug("[{}] Inline write, membuf not flushed, write " - "[{}, {}) will be completed when membuf is flushed", - ino, offset, offset+length); - return; - } else { - AZLogDebug("[{}] Inline write, membuf already flushed, " - "completing fuse write [{}, {})", - ino, offset, offset+length); - task->reply_write(length); - return; - } - } - } else { /* * If the write is not stable, we need to copy the application * data into the membufs and issue the writes to the backend. */ bc_vec = inode->get_filecache()->get_contiguous_dirty_bcs(size); - if (!bc_vec.empty()) { + if (size >= prune_bytes) { + assert(!bc_vec.empty()); /* * Perform inline sync. * In case of unstable writes, we need to complete the write * on commit complete. So, we don't pass the 3rd arg to * sync_membufs(), but add the task to the commit_waiters list. */ - inode->sync_membufs(bc_vec, false /* is_flush */, task); + inode->sync_membufs(bc_vec, false /* is_flush */, nullptr); + inode->flush_unlock(); + + /* + * Set the commit_in_progress flag to ensure that the commit + * task is started after the flush completes. + * Whether we hit commit limit or not, we need to start commit + * as we need to release the memory. + */ + inode->mark_commit_in_progress(); + + inode->add_task_to_commit_waiters(task); + return; } - inode->flush_unlock(); + } + + if (!inode->is_stable_write()) { + AZLogWarn("Not enough contiguous dirty data to prune, may be" + "pattern is not sequential. Prune_bytes={}, size={}", + prune_bytes, size); /* - * Set the commit_in_progress flag to ensure that the commit - * task is started after the flush completes. + * Let's switch to stable write mode. */ - inode->mark_commit_in_progress(); - - if (inode->get_filecache()->add_commit_waiter( - inode->last_flushing_offset, - inode->last_flushing_length, - task)) { - AZLogDebug("[{}] Inline write, membuf not flushed, write " - "[{}, {}) will be completed when membuf is flushed", - ino, inode->last_flushing_offset, - inode->last_flushing_length); - return; - } else { - AZLogDebug("[{}] Inline write, membuf already flushed, " - "completing fuse write [{}, {})", - ino, inode->last_flushing_offset, - inode->last_flushing_length); - task->reply_write(length); - return; - } + inode->switch_to_stable_write(); + } + + /* + * If the write is stable, we can directly write to the + * membufs and issue the writes to the backend. + */ + bc_vec = inode->get_filecache()->get_dirty_nonflushing_bcs_range(); + if (!bc_vec.empty()) { + /* + * Perform inline sync. + * Since we pass the 3rd arg to sync_membufs, it tells sync_membufs() + * to call the fuse callback after all the issued backend writes + * complete. This will be done asynchronously while the sync_membufs() + * call will return after issuing the writes. + */ + inode->sync_membufs(bc_vec, false /* is_flush */, task); + inode->flush_unlock(); + + // Free up the fuse thread without completing the application write. + return; } + + /* + * Another application write raced with us and it got to do the + * inline write before us. Try adding this task to the flush_waiters + * list for the membuf where we copied the application data. 
If + * we are able to add successfully, then we will call the fuse + * callback when the flush completes at the backend. This will + * slow down the writer application, thus relieving the memory + * pressure. + * If add_flush_waiter() returns false, that means this membuf + * is already flushed by that other thread and we can complete + * the fuse callback in this context. + */ + inode->flush_unlock(); + + inode->add_task_to_flush_waiters(task); + return; + } + + assert(false); +} + +static bool handle_writes(struct nfs_inode *inode, + struct rpc_task *task, + const fuse_ino_t ino, + const bool sparse_write, + uint64_t extent_left, + uint64_t extent_right) +{ + assert(task->magic == RPC_TASK_MAGIC); + assert(task->rpc_api->write_task.is_fe()); + assert(task->get_op_type() == FUSE_WRITE); + + const size_t length = task->rpc_api->write_task.get_size(); + + /* + * Check what kind of limit we have hit. + */ + const bool need_inline_write = + (sparse_write || inode->get_filecache()->do_inline_write()); + const bool need_commit = + inode->get_filecache()->commit_required(); + const bool need_flush = + inode->get_filecache()->flush_required(); + + AZLogDebug("[{}] handle write (sparse={}, need_inline_write={}, need_commit={}," + " need_flush={} )", ino, sparse_write, need_inline_write, + need_commit, need_flush); + + /* + * Nothing to do, we can complete the application write rightaway. + */ + if (!need_inline_write && !need_commit && !need_flush) { + task->reply_write(length); + return true; + } + + /* + * We have successfully copied the user data into the cache. + * We can have the following cases: + * 1. This new write caused a contiguous extent to be greater than + * max_dirty_extent_bytes(), aka MDEB. + * In this case we begin flush of this contiguous extent (in multiple + * parallel wsize sized blocks) since there's no benefit in waiting more + * as the data is sufficient for the server scheduler to effectively + * write, in optimal sized blocks. + * We complete the application write rightaway without waiting for the + * flush to complete as we are not under memory pressure. + * This will happen when user is sequentially writing to the file. + * 2. A single contiguous extent is not greater than MDEB but total dirty + * bytes waiting to be flushed is more than MDEB. + * In this case we begin flush of the entire dirty data from the cache + * as we have sufficient data for the server scheduler to perform + * batched writes effectively. + * We complete the application write rightaway without waiting for the + * flush to complete as we are not under memory pressure. + * This will happen when user is writing to the file in a random or + * pseudo-sequential fashion. + * 3. Cache has dirty data beyond the "inline write" threshold, ref + * do_inline_write(). + * In this case we begin flush of the entire dirty data. + * This is a memory pressure situation and hence we do not complete the + * application write till all the backend writes complete. + * This will happen when user is writing faster than our backend write + * throughput, eventually dirty data will grow beyond the "inline write" + * threshold and then we have to slow down the writers by delaying + * completion. + * + * Other than this we have a special case of "write beyond eof" (termed + * sparse write). In sparse write case also we perform "inline write" of + * all the dirty data. This is needed for correct read behaviour. Imagine + * a reader reading from the sparse part of the file which is not yet in + * the bytes_chunk_cache. 
This read will be issued to the server and since + * server doesn't know the updated file size (as the write may still be + * sitting in our bytes_chunk_cache) it will return eof. This is not correct + * as such reads issued after successful write, are valid and should return + * 0 bytes for the sparse range. + */ + + // Case 1: Inline writes + /* + * Do we need to perform "inline write"? + * Inline write implies, we flush all the dirty data and wait for all the + * corresponding backend writes to complete. + */ + if (need_inline_write) { + INC_GBL_STATS(inline_writes, 1); + + // Free up the fuse thread without completing the application write. + perform_inline_writes(task, inode); + return true; } + // Case 2: Commit /* - * Perform inline sync. - * Since we pass the 3rd arg to sync_membufs, it tells sync_membufs() - * to call the fuse callback after all the issued backend writes - * complete. This will be done asynchronously while the sync_membufs() - * call will return after issuing the writes. + * We don't need to do inline writes. See if we need to commit the dirty + * data to the backend */ - inode->sync_membufs(bc_vec, false /* is_flush */, task); - inode->flush_unlock(); + if (need_commit) { + /* + * Set the commit_in_progress flag to ensure that the commit + * task is startet. + */ + inode->mark_commit_in_progress(); + + assert(inode->is_commit_in_progress()); + assert(!need_inline_write); + } + + // Case 3: Flush + if (need_flush) { + /* + * We need to flush the dirty data in the range [extent_left, extent_right), + * get the membufs for the dirty data. + * We don't want to run over an inprogress truncate and resetting the file + * size set by truncate, so grab the is_flushing lock. + */ + inode->flush_lock(); + + /* + * If flush is required but flush/commit already in progress, then + * add this task to the respective waiters list and return. + * As we already 1GB worth of dirty data in the cache, we don't want + * to add more data to the cache. So we wait for the ongoing flush/commit. + */ + if (inode->is_commit_in_progress()) { + /* + * If commit is in progress, we need to wait for the commit + * to complete before we can proceed with the write. + */ + inode->add_task_to_commit_waiters(task); + inode->flush_unlock(); + return true; + } + + if (inode->get_filecache()->is_flushing_in_progress()) { + /* + * If flush is in progress, we need to wait for the flush to + * complete before we can proceed with the write. + */ + inode->add_task_to_flush_waiters(task); + inode->flush_unlock(); + return true; + } + + /* + * If we are here, then we need to flush the dirty data to the backend. + * We don't need to wait for the flush to complete, we can complete the + * application write rightaway. + */ + uint64_t size = 0; + std::vector bc_vec; + + /* + * If the inode is stable write, then get the dirty membufs in the range. + * otherwise get the dirty membufs contigious range. + */ + if (inode->is_stable_write()) { + bc_vec = + inode->get_filecache()->get_dirty_nonflushing_bcs_range(extent_left, + extent_right); + } else { + bc_vec = inode->get_filecache()->get_contiguous_dirty_bcs(size); + } + + if (bc_vec.size() == 0) { + inode->flush_unlock(); + task->reply_write(length); + return true; + } - // Free up the fuse thread without completing the application write. - return; + /* + * Pass is_flush as false, since we don't want the writes to complete + * before returning. 
+ */ + inode->sync_membufs(bc_vec, false /* is_flush */); + inode->flush_unlock(); + + // Send reply to original request without waiting for the backend write to complete. + task->reply_write(length); + return true; + } + + /* + * If we reached true, then we have need_commit true, but need_flush false. + */ + assert(need_commit && !need_flush); + task->reply_write(length); + return true; } void rpc_task::run_write() @@ -2664,7 +2891,7 @@ void rpc_task::run_write() const size_t length = rpc_api->write_task.get_size(); struct fuse_bufvec *const bufv = rpc_api->write_task.get_buffer_vector(); const off_t offset = rpc_api->write_task.get_offset(); - const bool sparse_write = (offset > inode->get_file_size(true)); + const bool sparse_write = false; uint64_t extent_left = 0; uint64_t extent_right = 0; @@ -2753,83 +2980,6 @@ void rpc_task::run_write() assert(extent_right >= (extent_left + length)); - /* - * We have successfully copied the user data into the cache. - * We can have the following cases: - * 1. This new write caused a contiguous extent to be greater than - * max_dirty_extent_bytes(), aka MDEB. - * In this case we begin flush of this contiguous extent (in multiple - * parallel wsize sized blocks) since there's no benefit in waiting more - * as the data is sufficient for the server scheduler to effectively - * write, in optimal sized blocks. - * We complete the application write rightaway without waiting for the - * flush to complete as we are not under memory pressure. - * This will happen when user is sequentially writing to the file. - * 2. A single contiguous extent is not greater than MDEB but total dirty - * bytes waiting to be flushed is more than MDEB. - * In this case we begin flush of the entire dirty data from the cache - * as we have sufficient data for the server scheduler to perform - * batched writes effectively. - * We complete the application write rightaway without waiting for the - * flush to complete as we are not under memory pressure. - * This will happen when user is writing to the file in a random or - * pseudo-sequential fashion. - * 3. Cache has dirty data beyond the "inline write" threshold, ref - * do_inline_write(). - * In this case we begin flush of the entire dirty data. - * This is a memory pressure situation and hence we do not complete the - * application write till all the backend writes complete. - * This will happen when user is writing faster than our backend write - * throughput, eventually dirty data will grow beyond the "inline write" - * threshold and then we have to slow down the writers by delaying - * completion. - * - * Other than this we have a special case of "write beyond eof" (termed - * sparse write). In sparse write case also we perform "inline write" of - * all the dirty data. This is needed for correct read behaviour. Imagine - * a reader reading from the sparse part of the file which is not yet in - * the bytes_chunk_cache. This read will be issued to the server and since - * server doesn't know the updated file size (as the write may still be - * sitting in our bytes_chunk_cache) it will return eof. This is not correct - * as such reads issued after successful write, are valid and should return - * 0 bytes for the sparse range. - */ - - /* - * Do we need to perform "inline write"? - * Inline write implies, we flush all the dirty data and wait for all the - * corresponding backend writes to complete. 
- */ - const bool need_inline_write = - (sparse_write || inode->get_filecache()->do_inline_write()); - - if (need_inline_write) { - INC_GBL_STATS(inline_writes, 1); - - AZLogDebug("[{}] Inline write (sparse={}), {} bytes, extent @ [{}, {})", - ino, sparse_write, (extent_right - extent_left), - extent_left, extent_right); - - // Free up the fuse thread without completing the application write. - return perform_inline_writes(this, inode); - } - - /* - * We don't need to do inline writes. See if we have enough bytes to - * commit to the backend. - */ - const bool needs_commit = inode->get_filecache()->commit_required(); - if (needs_commit) { - /* - * Set the commit_in_progress flag to ensure that the commit - * task is startet. - */ - inode->mark_commit_in_progress(); - - assert(inode->is_commit_in_progress()); - assert(!need_inline_write); - } - /* * If the extent size exceeds the max allowed dirty size as returned by * max_dirty_extent_bytes(), then it's time to flush the extent. @@ -2845,83 +2995,11 @@ void rpc_task::run_write() inode->get_filecache()->max_dirty_extent_bytes(); assert(max_dirty_extent > 0); - /* - * How many bytes in the cache need to be flushed. These are dirty chunks - * which have not started flushing yet. - */ - const uint64_t bytes_to_flush = - inode->get_filecache()->get_bytes_to_flush(); - - AZLogDebug("[{}] extent_left: {}, extent_right: {}, size: {}, " - "bytes_to_flush: {} (max_dirty_extent: {})", - ino, extent_left, extent_right, - (extent_right - extent_left), - bytes_to_flush, - max_dirty_extent); - - if (inode->is_commit_in_progress()) { - /* - * Commit is in progress, we can't start new flush to the BLOB. - * Complete the write if it doesn't need flush, otherwise add to - * commit_waiters list. - * Note: It make sense to complete the write only after commit is done - * if we already reached the flush limit. As otherwise it will - * trigger inline writes. - */ - if (bytes_to_flush < max_dirty_extent) { - AZLogDebug("[{}] Commit in progress, skipping write, inline_write is false", - ino); - reply_write(length); - return; - } else { - /* - * Add this task to the commit_waiters list, so that it can be - * completed after the commit is done. - */ - if(inode->get_filecache()->add_commit_waiter( - inode->last_flushing_offset, - inode->last_flushing_length, - this)) { - AZLogDebug("[{}] Commit in progress, added to commit_waiters list", - ino); - return; - } else { - /* - * Commit is done, but this task is not added to commit_waiters - * list, so complete the write. - */ - AZLogDebug("[{}] Commit in progress, but not added to commit_waiters list, " - "completing write", ino); - reply_write(length); - return; - } - } - - // We should never reach here. - assert(false); - return; - } - /* * Ok, we neither need inline writes nor commit in progress. See if we have * enough dirty data and we need to start async flush. */ - if ((extent_right - extent_left) < max_dirty_extent) { - /* - * Current extent is not large enough to be flushed, see if we have - * enough total dirty data that needs to be flushed. This is to cause - * random writes to be periodically flushed. - */ - if (bytes_to_flush < max_dirty_extent) { - /* - * No memory pressure. - */ - AZLogDebug("Reply write without syncing to Blob"); - reply_write(length); - return; - } - /* * This is the case of non-sequential writes causing enough dirty * data to be accumulated, need to flush all of that. 
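Taken together, handle_writes() above reduces the old run_write() decision logic to three independent checks: inline write under memory pressure (or for a sparse write), commit when enough UNSTABLE-written data is pending, and async flush when enough dirty data has accumulated. The sketch below is a rough illustration of that decision; the counters and thresholds are illustrative stand-ins for do_inline_write(), commit_required() and flush_required(), and the global memory-pressure inputs are omitted.

#include <cstdint>

// Illustrative thresholds; in the patch they come from max_dirty_extent_bytes(),
// max_commit_bytes() and the inline-write check, all derived from the cache size.
struct cache_counters {
    uint64_t bytes_dirty = 0;            // copied to cache, not yet flushed
    uint64_t bytes_commit_pending = 0;   // flushed UNSTABLE, not yet committed
    uint64_t flush_threshold  = 1000ULL << 20;       // ~max_dirty_extent_bytes()
    uint64_t commit_threshold = 2000ULL << 20;       // ~max_commit_bytes()
    uint64_t inline_threshold = 2 * (1000ULL << 20); // ~inline write threshold
};

struct write_decision {
    bool inline_write;   // block the application write until the flush completes
    bool start_commit;   // issue a COMMIT for the commit_pending data
    bool start_flush;    // issue async WRITE RPCs, ack the application now
};

// Mirrors the order of handling in handle_writes(): inline write wins (memory
// pressure or sparse write), otherwise commit and/or flush may be started.
write_decision classify_write(const cache_counters& c, bool sparse_write)
{
    write_decision d{};
    d.inline_write = sparse_write || (c.bytes_dirty >= c.inline_threshold);
    d.start_commit = !d.inline_write && (c.bytes_commit_pending >= c.commit_threshold);
    d.start_flush  = !d.inline_write && (c.bytes_dirty >= c.flush_threshold);
    return d;
}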
@@ -2930,43 +3008,12 @@ void rpc_task::run_write() extent_right = UINT64_MAX; } - /* - * We need to flush the dirty data in the range [extent_left, extent_right), - * get the membufs for the dirty data. - * We don't want to run over an inprogress truncate and resetting the file - * size set by truncate, so grab the is_flushing lock. - */ - inode->flush_lock(); - - uint64_t size = 0; - std::vector bc_vec; - /* - * If the inode is stable write, then get the dirty membufs in the range. - * otherwise get the dirty membufs contigious range. - */ - if (inode->is_stable_write()) { - bc_vec = - inode->get_filecache()->get_dirty_nonflushing_bcs_range(extent_left, - extent_right); - } else { - bc_vec = inode->get_filecache()->get_contiguous_dirty_bcs(size); - } - - if (bc_vec.size() == 0) { - inode->flush_unlock(); - reply_write(length); + if (handle_writes(inode, this, ino, sparse_write, + extent_left, extent_right)) { return; + } else { + assert(false); } - - /* - * Pass is_flush as false, since we don't want the writes to complete - * before returning. - */ - inode->sync_membufs(bc_vec, false /* is_flush */); - inode->flush_unlock(); - - // Send reply to original request without waiting for the backend write to complete. - reply_write(length); } void rpc_task::run_flush() From fff47d6f9866a94d5c609055514fb825b26c5bbe Mon Sep 17 00:00:00 2001 From: Nagendra Tomar Date: Tue, 28 Jan 2025 13:47:39 +0000 Subject: [PATCH 3/9] Audit changes. - audited file_cache.h --- turbonfs/inc/file_cache.h | 226 ++++++++++++++++++++++++++------------ turbonfs/src/rpc_task.cpp | 2 +- 2 files changed, 158 insertions(+), 70 deletions(-) diff --git a/turbonfs/inc/file_cache.h b/turbonfs/inc/file_cache.h index 39ddee58..6a6165d1 100644 --- a/turbonfs/inc/file_cache.h +++ b/turbonfs/inc/file_cache.h @@ -233,10 +233,34 @@ struct membuf * immediately so that it can handle other interactive requests (note that * fuse threads are very limited). Later when the flush/write to Blob * actually completes, it completes the fuse task(s) queued in this list. + * + * Note: Depending on whether a membuf is written using UNSTABLE or STABLE + * write, it will or won't need to be committed. A membuf written + * using UNSTABLE write will be just added to the TBL (Temporary Block + * List) of the Blob and won't be visible to other readers and + * writers, till it's committed (which will add it to the Committed + * Block List, the CBL). + * A membuf written using STABLE write will be committed, i.e., it'll + * be added to the CBL, and won't need an extra commit call. + * So, with UNSTABLE write a membuf will have a flush call to write + * the membuf data and then a commit call to commit it to the Blob, + * (usually) along with more such membufs. Various membufs can be + * flushed in parallel, which allows us to get higher write throughput + * than what a single connection can support. */ std::vector *flush_waiters = nullptr; std::mutex flush_waiters_lock_44; + /** + * Like flush_waiters, commit_waiters is the list of tasks(s) waiting for + * this membuf to be committed, i.e., data in this membuf to be added to + * the CBL (Committed Block List) of the backing Blob. Note that the membuf + * must have started flushing, but it may not have yet completed flushing. + * When the membuf successfully completes flushing, the flush callback will + * start the commit and on completion of commit, the commit_waiters will be + * called. + * Also see flush_waiters. 
+ */ std::vector *commit_waiters = nullptr; std::mutex commit_waiters_lock_44; @@ -1235,6 +1259,27 @@ class bytes_chunk_cache uint64_t length, struct rpc_task *task); + /** + * Add 'task' to the commit_waiters list for the membuf covering the region + * [offset, offset+length). If the region is covered by more than one membuf + * then 'task' is added to the commit_waiters list of the last membuf. + * + * Returns true if task was successfully added. This will happen when the + * following conditions are met: + * 1. There is a membuf with at least one byte overlapping with the given + * region [offset, offset+length). + * 2. The membuf is either in flushing or commit_pending state. + * If in flushing state, it'll first move to commit_pending state + * after a successful flush and then, after the commit completes, this + * task will be completed. + * + * If it returns true, caller need not call the fuse callback as it'll + * get called when the membuf is committed. + * If task cannot be added to the commit_waiters list, it returns false, + * and in that case caller must complete the fuse callback. + * + * LOCK: This takes chunkmap_lock_43 lock exclusively. + */ bool add_commit_waiter(uint64_t offset, uint64_t length, struct rpc_task *task); @@ -1396,58 +1441,65 @@ } /** - * Maximum size a dirty extent can grow before we should flush it. + * Maximum size a dirty extent can grow before we must flush it. * This is 60% of the allowed cache size or 1GB whichever is lower. * The reason for limiting it to 1GB is because there's not much value in * holding more data than the Blob NFS server's scheduler cache size. * We want to send as prompt as possible to utilize the n/w b/w but slow * enough to give the write scheduler an opportunity to merge better. * - * Note: max_dirty_extent is static as it doesn't change after it's - * queried for the first time. + * TODO: For unstable writes, it may not make sense to wait after we have + * a full block worth of data. */ - uint64_t max_dirty_extent_bytes() const + static uint64_t max_dirty_extent_bytes() { // Maximum cache size allowed in bytes. static const uint64_t max_total = (aznfsc_cfg.cache.data.user.max_size_mb * 1024 * 1024ULL); assert(max_total != 0); - static const uint64_t max_dirty_extent = std::min( - (uint64_t)(max_total * 0.6), (uint64_t)(1024 * 1024 * 1024ULL)); + /* + * Minimum of 60% of max cache and 1000MiB. + * 1000MiB, as it can be equally divided into 100MiB blocks. + */ + static const uint64_t max_dirty_extent = + std::min((uint64_t)(max_total * 0.6), 1000 * 1024 * 1024UL); + + // Must be non-zero. assert(max_dirty_extent != 0); return max_dirty_extent; } - /* - * Maximum size of commit bytes that can be pending in cache, before we - * should commit it to Blob. - * It should be greater than max_dirty_extent_bytes() and smaller than - * inline_dirty_threshold. So, that inline pruning can be avoided. - * This is 60% of the allowed cache size. - * f.e = Cache size of 4GB then max_commit_bytes = 2.4GB - * - Flush will start every 1GB dirty data and each 1GB dirty data - * converted to commit_pending_bytes. - * - * This is 60% of the allowed cache size. - */ - uint64_t max_commit_bytes() const + /** + * Maximum size of commit_pending data that can be in cache, before we + * must commit it to Blob. 
+ * It should be greater than or equal to the flush threshold (as returned + * by max_dirty_extent_bytes()) and smaller than the inline write threshold + * (as suggested by do_inline_write()), to minimize inline flush waits as + * much as possible, in steady state. + */ + static uint64_t max_commit_bytes() { // Maximum cache size allowed in bytes. static const uint64_t max_total = (aznfsc_cfg.cache.data.user.max_size_mb * 1024 * 1024ULL); assert(max_total != 0); + /* + * Minimum of 60% of max cache and 2 times the flush limit. + * We want to commit as soon as possible w/o affecting performance. + * If we commit too often, since commit is a serializing operation, + * it'll affect the write throughput, otoh, if we commit too late + * then we might hit the inline write threshold, which again would + * serialize writes, bringing down throughput. + */ static const uint64_t max_commit_bytes = - std::max((((uint64_t)(max_total * 0.6))/ max_dirty_extent_bytes()) * max_dirty_extent_bytes(), - (uint64_t) (2 * AZNFSCFG_WSIZE_MAX)); - - static const uint64_t min_commit_bytes = - std::max(((2 * max_dirty_extent_bytes()) - AZNFSCFG_WSIZE_MAX), - (uint64_t) (2 * AZNFSCFG_WSIZE_MAX)); + std::min((uint64_t)(max_total * 0.6), + 2 * max_dirty_extent_bytes()); + assert(max_commit_bytes > 0); - return std::max((max_commit_bytes - AZNFSCFG_WSIZE_MAX), min_commit_bytes); + return max_commit_bytes; } /** @@ -1498,59 +1550,61 @@ class bytes_chunk_cache return bytes_flushing > 0; } - /* - * Check if bytes_commit_pending is equal or more than - * max_commit_bytes limit. + /** + * Check if we must initiate a COMMIT RPC now. Note that the caller would + * just send the COMMIT RPC and not necessarily block the user write + * request till the COMMIT RPC completes, i.e., it's not an inline commit. + * + * We must start commit if: + * 1. We have enough commit_pending data for this file/cache, or, + * 2. Global memory pressure dictates that we commit now to free up + * memory. In this case we might be committing more frequently which + * won't necessarily be optimal, but we have no choice due to the + * memory pressure. */ bool commit_required() const { - return (bytes_commit_pending >= max_commit_bytes()); - } + const bool local_pressure = + (bytes_commit_pending >= max_commit_bytes()); - /* - * Check if bytes_to_flush is equal or more than - * max_dirty_extent_bytes limit. - */ - bool flush_required() const - { - return (get_bytes_to_flush() >= max_dirty_extent_bytes()); + if (local_pressure) { + return true; + } + + /* + * TODO: Take cue from global memory pressure. + */ + return false; } - /* - * get_bytes_to_prune() returns the number of bytes that need to be flushed - * inline to free up the space. If there are enough bytes_flushing then we - * can just wait for them to complete. - * - * Note : We are not considering bytes_commit_pending in this calculation. - * If bytes_commit_pending are high then commit already started and - * if bytes_flushing are high once flushing is done commit will be - * triggered. + /** + * Check if we must initiate flush of some cached data. Note that the caller + * would just send the corresponding WRITE RPC and not necessarily block the + * user write request till the WRITE RPC completes, i.e., it's not an inline + * write. + * + * We must start flush/write if: + * 1. We have enough bytes to flush so that we can write a full sized + * block, or for the case of stable write, we have enough data to fill + * the scheduler queue. + * 2. 
Global memory pressure dictates that we flush now to free up memory. + * In this case we might be flushing more frequently which won't + * necessarily be optimal, but we have no choice due to the memory + * pressure. */ - uint64_t get_bytes_to_prune() const + bool flush_required() const { - static const uint64_t max_dirty_allowed_per_cache = - max_dirty_extent_bytes() * 2; - int64_t total_bytes = - std::max(int64_t(bytes_dirty - bytes_flushing), int64_t(0)); - const bool local_pressure = total_bytes > (int64_t)max_dirty_allowed_per_cache; + const bool local_pressure = + (get_bytes_to_flush() >= max_dirty_extent_bytes()); if (local_pressure) { - return std::max(int64_t(0), (int64_t)(total_bytes - max_dirty_extent_bytes())); + return true; } /* - * Global pressure is when get_prune_goals() returns non-zero bytes - * to be pruned inline. + * TODO: Take cue from global memory pressure. */ - uint64_t inline_bytes; - - /* - * TODO: Noisy neighbor syndrome, where one file is hogging the cache, - * inline pruning will be triggered for other files. - */ - get_prune_goals(&inline_bytes, nullptr); - return std::max(int64_t(inline_bytes - (bytes_flushing + bytes_commit_pending)), - (int64_t)0); + return false; } /** @@ -1559,6 +1613,21 @@ class bytes_chunk_cache * global memory pressure. */ bool do_inline_write() const + { + return get_inline_flush_bytes() > 0; + } + + /** + * Inline write/flush means that we are under sufficient memory pressure + * that we want to slow down the application writes by not completing them + * after copying the data to cache (using copy_to_cache()) but instead + * complete application writes only after the flush completes. + * + * This function returns a non-zero number representing the number of bytes + * that we need to write inline, else if not under memory pressure returns + * zero. + */ + uint64_t get_inline_flush_bytes() const { /* * Allow two dirty extents before we force inline write. @@ -1566,11 +1635,18 @@ class bytes_chunk_cache * the second one. */ static const uint64_t max_dirty_allowed_per_cache = - max_dirty_extent_bytes() * 2; - const bool local_pressure = bytes_dirty > max_dirty_allowed_per_cache; + (max_dirty_extent_bytes() * 2); + const bool local_pressure = + ((int64_t) get_bytes_to_flush() > (int64_t) max_dirty_allowed_per_cache); if (local_pressure) { - return true; + /* + * Leave one max_dirty_extent_bytes worth of dirty bytes, and + * flush the rest. + */ + const int64_t flush_now = + (get_bytes_to_flush() - max_dirty_extent_bytes()); + return std::max((int64_t) 0, flush_now); } /* @@ -1579,8 +1655,20 @@ class bytes_chunk_cache */ uint64_t inline_bytes; + /* + * TODO: Noisy neighbor syndrome, where one file is hogging the cache, + * inline pruning will be triggered for other files. + */ get_prune_goals(&inline_bytes, nullptr); - return (inline_bytes > 0); + + /* + * (bytes_flushing + bytes_commit_pending) represents the data which + * is either already flushed or being flushed. Exclude that from the + * needs-to-be-flushed data. + */ + const int64_t flush_now = + (inline_bytes - (bytes_flushing + bytes_commit_pending)); + return std::max((int64_t) 0, flush_now); } /** diff --git a/turbonfs/src/rpc_task.cpp b/turbonfs/src/rpc_task.cpp index 145493d6..ba6117da 100644 --- a/turbonfs/src/rpc_task.cpp +++ b/turbonfs/src/rpc_task.cpp @@ -2575,7 +2575,7 @@ static void perform_inline_writes(struct rpc_task *task, * flush more data we can just wait for the ongoing flush/commit * to complete. 
*/ - uint64_t prune_bytes = inode->get_filecache()->get_bytes_to_prune(); + uint64_t prune_bytes = inode->get_filecache()->get_inline_flush_bytes(); if (prune_bytes == 0 && !inode->is_stable_write()) { AZLogInfo("No bytes to prune, Wait for ongoing flush/commit" From 00353ef4158e24c956c38765a2931b3eccbf74d5 Mon Sep 17 00:00:00 2001 From: Nagendra Tomar Date: Wed, 29 Jan 2025 05:14:14 +0000 Subject: [PATCH 4/9] Audit rpc_task.cpp --- turbonfs/src/rpc_task.cpp | 179 ++++++++++++++++++++++++-------------- 1 file changed, 112 insertions(+), 67 deletions(-) diff --git a/turbonfs/src/rpc_task.cpp b/turbonfs/src/rpc_task.cpp index ba6117da..6f776571 100644 --- a/turbonfs/src/rpc_task.cpp +++ b/turbonfs/src/rpc_task.cpp @@ -806,8 +806,20 @@ void access_callback( } } +/** + * Helper function to complete all write tasks waiting for this membuf to be + * committed. + */ static void complete_commit_waiter_tasks(struct membuf *mb) { + /* + * membuf must have finished committing. + * Obviously must not be dirty, as we will only ever commit a clean membuf + * (after it completes flushing). + */ + assert(!mb->is_commit_pending()); + assert(!mb->is_dirty()); + std::vector tvec = mb->get_commit_waiters(); for (auto& task : tvec) { assert(task->magic == RPC_TASK_MAGIC); @@ -815,7 +827,7 @@ static void complete_commit_waiter_tasks(struct membuf *mb) assert(task->rpc_api->write_task.is_fe()); assert(task->rpc_api->write_task.get_size() > 0); - AZLogDebug("Completing flush waiter task {} for [{}, {})", + AZLogDebug("Completing commit waiter task {} for [{}, {})", fmt::ptr(task), task->rpc_api->write_task.get_offset(), task->rpc_api->write_task.get_offset() + @@ -831,40 +843,61 @@ static void commit_callback( void *private_data) { rpc_task *task = (rpc_task*) private_data; + + /* + * Commit is issued as a FUSE_FLUSH. + * TODO: Maybe it should have a type of its own. + */ assert(task->magic == RPC_TASK_MAGIC); assert(task->get_op_type() == FUSE_FLUSH); assert(task->rpc_api->pvt != nullptr); + // Commit is never called for a fuse request. + assert(task->get_fuse_req() == nullptr); - auto res = (COMMIT3res*)data; + auto res = (COMMIT3res*) data; INJECT_JUKEBOX(res, task); - const fuse_ino_t ino = - task->rpc_api->flush_task.get_ino(); + const fuse_ino_t ino = task->rpc_api->flush_task.get_ino(); struct nfs_inode *inode = task->get_client()->get_nfs_inode_from_ino(ino); - auto bc_vec_ptr = (std::vector *)task->rpc_api->pvt; + // List of bcs committed by this commit call. + auto bc_vec_ptr = (std::vector *) task->rpc_api->pvt; + + // We should call commit only when inode is doing unstable+commit. + assert(!inode->is_stable_write()); + // Caller must be inprogress. + assert(inode->is_commit_in_progress()); const int status = task->status(rpc_status, NFS_STATUS(res)); UPDATE_INODE_WCC(inode, res->COMMIT3res_u.resok.file_wcc); - AZLogDebug("commit_callback"); + + AZLogDebug("[{}] commit_callback", ino); + /* * Now that the request has completed, we can query libnfs for the * dispatch time. */ task->get_stats().on_rpc_complete(rpc_get_pdu(rpc), NFS_STATUS(res)); + if (status == 0) { uint64_t offset = 0; uint64_t length = 0; - auto &bc_vec = *bc_vec_ptr; // get the vector from the pointer. - for (auto &bc : bc_vec) - { + /* + * Go over all the successfully committed bcs and release them from + * file cache (note that successful commit confirms that server has + * persisted the data and client can fee it). + * Also complete any tasks waiting for these membufs to be committed. 
+ */ + for (auto &bc : *bc_vec_ptr) { struct membuf *mb = bc.get_membuf(); assert(mb->is_inuse()); assert(mb->is_locked()); assert(mb->is_commit_pending()); assert(mb->is_uptodate()); + // Dirty membufs must not be committed. + assert(!mb->is_dirty()); mb->clear_commit_pending(); complete_commit_waiter_tasks(mb); @@ -880,44 +913,41 @@ static void commit_callback( } else if (offset + length == bc.offset) { length += bc.length; } else { - auto release = inode->get_filecache()->release(offset, length); - AZLogInfo("BC released {}, asked for {}", release, length); + const uint64_t released = + inode->get_filecache()->release(offset, length); + AZLogDebug("[{}] commit_callback releasing bc [{}, {}), " + "released {} bytes", + ino, offset, offset+length, released); offset = bc.offset; length = bc.length; } } + // Release the last bc not released by the above loop. if (length != 0) { - assert(length != 0); - auto release = inode->get_filecache()->release(offset, length); - - AZLogInfo("BC offset {}, asked for {}, release {}", (offset/1048576), (length/1048576), release); - } - - task->free_rpc_task(); - #if 0 - if (task->get_fuse_req() != nullptr) { - task->reply_error(status); - } else { - task->free_rpc_task(); + const uint64_t released = + inode->get_filecache()->release(offset, length); + AZLogDebug("[{}] commit_callback releasing bc [{}, {}), " + "released {} bytes", + ino, offset, offset+length, released); } - #endif } else if (NFS_STATUS(res) == NFS3ERR_JUKEBOX) { task->get_client()->jukebox_retry(task); return; } else { - auto &bc_vec = *bc_vec_ptr; // get the vector from the pointer. /* - * Mark the membufs as dirty and clear the commit_pending flag. + * Commit has failed. + * Go over all the bcs that this commit was targetting, mark the + * membufs back as dirty and clear the commit_pending flag. * Next write will initiate the flush again with stable write. */ - for (auto &bc : bc_vec) - { + for (auto &bc : *bc_vec_ptr) { struct membuf *mb = bc.get_membuf(); assert(mb->is_inuse()); assert(mb->is_locked()); assert(mb->is_commit_pending()); assert(mb->is_uptodate()); + assert(!mb->is_dirty()); mb->clear_commit_pending(); complete_commit_waiter_tasks(mb); @@ -932,45 +962,48 @@ static void commit_callback( */ assert(inode->get_filecache()->is_flushing_in_progress() == false); inode->set_stable_write(); - - task->free_rpc_task(); - #if 0 - if (task->get_fuse_req() != nullptr) { - task->reply_error(status); - } else { - inode->set_write_error(status); - task->free_rpc_task(); - } - #endif } - delete bc_vec_ptr; - + /* + * Clear commit inprogress for this inode. + * Only after any new flushes or commits can be sent for this file. + */ inode->clear_commit_in_progress(); + + delete bc_vec_ptr; + task->rpc_api->pvt = nullptr; + task->free_rpc_task(); } void rpc_task::issue_commit_rpc() { // Must only be called for a flush task. assert(get_op_type() == FUSE_FLUSH); + // Commit is never called for a fuse request. + assert(get_fuse_req() == nullptr); const fuse_ino_t ino = rpc_api->flush_task.get_ino(); struct nfs_inode *inode = get_client()->get_nfs_inode_from_ino(ino); - assert(inode->is_stable_write() == false); - assert(inode->get_filecache()->is_flushing_in_progress() == false); + + // Commit must not be called if we are doing stable writes. + assert(!inode->is_stable_write()); + // Commit must mot be sent with ongoing flushes. + assert(!inode->get_filecache()->is_flushing_in_progress()); + // Caller must have marked commit inprogress. 
+ assert(inode->is_commit_in_progress()); COMMIT3args args; ::memset(&args, 0, sizeof(args)); bool rpc_retry = false; - AZLogDebug("issue_commit_rpc"); + AZLogDebug("[{}] issue_commit_rpc", ino); /* * Get the bcs marked for commit_pending. */ - if (rpc_api->pvt == nullptr) { - rpc_api->pvt = static_cast(new std::vector(inode->get_filecache()->get_commit_pending_bcs())); - } + assert(rpc_api->pvt != nullptr); + rpc_api->pvt = static_cast(new std::vector( + inode->get_filecache()->get_commit_pending_bcs())); args.file = inode->get_fh(); args.offset = 0; @@ -981,8 +1014,7 @@ void rpc_task::issue_commit_rpc() stats.on_rpc_issue(); if (rpc_nfs3_commit_task(get_rpc_ctx(), - commit_callback, &args, - this) == NULL) { + commit_callback, &args, this) == NULL) { stats.on_rpc_cancel(); /* * Most common reason for this is memory allocation failure, @@ -1161,7 +1193,7 @@ void bc_iovec::on_io_fail(int status) /* * This membuf has completed flushing albeit with failure, check if any - * task is waiting for this membuf to be flushed/commit. + * task is waiting for this membuf to be flushed. */ std::vector tvec = mb->get_flush_waiters(); for (auto& task : tvec) { @@ -1184,7 +1216,7 @@ void bc_iovec::on_io_fail(int status) * This membuf has completed flushing albeit with failure, check if any * task is waiting for this membuf to be committed. Now this membuf * failed to flush, so we don't issue commit for this membuf. Hence - * we must complete the waiter tasks. + * we must complete the commit waiter tasks. */ std::vector tvec_commit = mb->get_commit_waiters(); for (auto& task : tvec_commit) { @@ -1192,7 +1224,7 @@ void bc_iovec::on_io_fail(int status) assert(task->get_op_type() == FUSE_WRITE); assert(task->rpc_api->write_task.is_fe()); - AZLogError("Completing flush waiter task {} for [{}, {}), " + AZLogError("Completing commit waiter task {} for [{}, {}), " "failed: {}", fmt::ptr(task), task->rpc_api->write_task.get_offset(), @@ -1390,22 +1422,26 @@ static void write_iov_callback( bciov->on_io_fail(status); } - // check if commit is pending. + /* + * Check if commit is pending. + * Any thread that decides to commit and finds that there's an ongoing + * flush, will mark the inode as "needing commit" and when the flush + * completes, we need to start the commit. + */ if (!inode->get_filecache()->is_flushing_in_progress()) { bool create_commit_task = false; - std::unique_lock lock(inode->commit_lock_5); { + std::unique_lock lock(inode->commit_lock_5); if (inode->is_commit_pending()) { - assert(inode->is_stable_write() == false); + assert(!inode->is_stable_write()); inode->set_commit_in_progress(); create_commit_task = true; } } - lock.unlock(); if (create_commit_task) { - // Create a new flush_task for the remaining bc_iovec. + // Create a commit task to commit all bcs needing commit. 
struct rpc_task *commit_task = client->get_rpc_task_helper()->alloc_rpc_task_reserved(FUSE_FLUSH); commit_task->init_flush(nullptr /* fuse_req */, ino); @@ -2552,7 +2588,8 @@ void rpc_task::run_access() static void perform_inline_writes(struct rpc_task *task, struct nfs_inode *inode) { - AZLogInfo("Performing inline writes for inode {}", inode->get_fuse_ino()); + AZLogInfo("[{}] Performing inline write", inode->get_fuse_ino()); + const size_t length = task->rpc_api->write_task.get_size(); std::vector bc_vec; @@ -2570,16 +2607,17 @@ static void perform_inline_writes(struct rpc_task *task, } /* - * prune_bytes is the number of bytes that need to be flushed to + * flush_now is the number of bytes that need to be flushed to * make space for the new writes. If it's 0, then we don't need to * flush more data we can just wait for the ongoing flush/commit * to complete. */ - uint64_t prune_bytes = inode->get_filecache()->get_inline_flush_bytes(); + const uint64_t flush_now = inode->get_filecache()->get_inline_flush_bytes(); + + if (flush_now == 0 && !inode->is_stable_write()) { + AZLogInfo("No bytes to flush, Wait for ongoing flush/commit " + "to complete"); - if (prune_bytes == 0 && !inode->is_stable_write()) { - AZLogInfo("No bytes to prune, Wait for ongoing flush/commit" - "to complete."); /* * In case of inline_write, we want to block the writes till the * enough space is made for new writes. Every write thread will @@ -2616,7 +2654,7 @@ static void perform_inline_writes(struct rpc_task *task, * data into the membufs and issue the writes to the backend. */ bc_vec = inode->get_filecache()->get_contiguous_dirty_bcs(size); - if (size >= prune_bytes) { + if (size >= flush_now) { assert(!bc_vec.empty()); /* * Perform inline sync. @@ -2641,9 +2679,9 @@ static void perform_inline_writes(struct rpc_task *task, } if (!inode->is_stable_write()) { - AZLogWarn("Not enough contiguous dirty data to prune, may be" - "pattern is not sequential. Prune_bytes={}, size={}", - prune_bytes, size); + AZLogWarn("Not enough contiguous dirty data to flush, may be " + "pattern is not sequential. flush_now={}, size={}", + flush_now, size); /* * Let's switch to stable write mode. @@ -2795,6 +2833,7 @@ static bool handle_writes(struct nfs_inode *inode, */ inode->mark_commit_in_progress(); + // XXX What if commit completes before we reach here? assert(inode->is_commit_in_progress()); assert(!need_inline_write); } @@ -2814,6 +2853,12 @@ static bool handle_writes(struct nfs_inode *inode, * add this task to the respective waiters list and return. * As we already 1GB worth of dirty data in the cache, we don't want * to add more data to the cache. So we wait for the ongoing flush/commit. + * + * XXX What if is_commit_in_progress() returns false but some other + * thread starts commit right after that? + * Note that we hold flush_lock which only protects us against some + * other thread starting a flush but commit is protected by + * commit_lock_5. */ if (inode->is_commit_in_progress()) { /* @@ -3009,7 +3054,7 @@ void rpc_task::run_write() } if (handle_writes(inode, this, ino, sparse_write, - extent_left, extent_right)) { + extent_left, extent_right)) { return; } else { assert(false); From 624924d48bf8f820695db37e608fe4610833d311 Mon Sep 17 00:00:00 2001 From: Nitin Singla Date: Wed, 29 Jan 2025 06:34:21 +0000 Subject: [PATCH 5/9] Removed the commit_lock. 
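Dropping commit_lock_5 relies on flush_lock() being the single serialization point for commit-state transitions: mark_commit_in_progress() now runs with the flush lock held, and the write (flush) completion path re-acquires it before promoting a pending commit. The sketch below illustrates that protocol under simplifying assumptions: a plain std::mutex stands in for flush_lock()/flush_unlock(), and unlike the real code, which asserts that the caller already holds the lock, the sketch acquires it internally so it stays self-contained.

#include <mutex>

enum class commit_state { not_needed, needs_commit, in_progress };

struct inode_sync {
    std::mutex flush_lock;              // stands in for flush_lock()/flush_unlock()
    bool flushing_in_progress = false;  // WRITE RPCs outstanding
    commit_state cstate = commit_state::not_needed;
};

// Writer side: wants a commit, but must not overlap an ongoing flush.
void mark_commit_in_progress(inode_sync& in)
{
    std::lock_guard<std::mutex> g(in.flush_lock);
    if (in.cstate == commit_state::in_progress)
        return;
    if (in.flushing_in_progress)
        in.cstate = commit_state::needs_commit;   // flush completion will promote
    else
        in.cstate = commit_state::in_progress;    // issue_commit_rpc() would go here
}

// Flush completion side: last WRITE finished, promote a deferred commit.
void on_flush_complete(inode_sync& in)
{
    std::lock_guard<std::mutex> g(in.flush_lock);
    in.flushing_in_progress = false;
    if (in.cstate == commit_state::needs_commit)
        in.cstate = commit_state::in_progress;    // issue_commit_rpc() would go here
}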
--- turbonfs/inc/nfs_inode.h | 6 ----- turbonfs/src/nfs_inode.cpp | 55 ++++++++++---------------------------- turbonfs/src/rpc_task.cpp | 35 +++++++++++------------- 3 files changed, 30 insertions(+), 66 deletions(-) diff --git a/turbonfs/inc/nfs_inode.h b/turbonfs/inc/nfs_inode.h index ea3a2c95..17b6c4e5 100644 --- a/turbonfs/inc/nfs_inode.h +++ b/turbonfs/inc/nfs_inode.h @@ -448,12 +448,6 @@ struct nfs_inode std::atomic commit_state = commit_state_t::COMMIT_NOT_NEEDED; - /* - * commit_lock_5 is used to synchronize flush thread and write thread - * for commit operation. - */ - std::mutex commit_lock_5; - /** * TODO: Initialize attr with postop attributes received in the RPC * response. diff --git a/turbonfs/src/nfs_inode.cpp b/turbonfs/src/nfs_inode.cpp index a6eeba51..de342b89 100644 --- a/turbonfs/src/nfs_inode.cpp +++ b/turbonfs/src/nfs_inode.cpp @@ -440,15 +440,8 @@ void nfs_inode::switch_to_stable_write() * If there is something to commit, then commit it and * wait for the commit to complete. */ - std::unique_lock commit_lock(commit_lock_5); assert(!is_commit_in_progress()); - set_commit_in_progress(); - commit_lock.unlock(); - - /* - * Issue the commit RPC to commit the dirty membufs. - */ - commit_membufs(); + mark_commit_in_progress(); /* * Wait for the commit to complete. @@ -497,9 +490,13 @@ bool nfs_inode::check_stable_write_required(off_t offset) /** * commit_membufs() is called by writer thread to commit flushed membufs. + * It's always issued under flush_lock(). */ void nfs_inode::commit_membufs() { + assert(is_flushing); + assert(is_commit_in_progress()); + /* * Create the commit task to carry out the write. */ @@ -937,8 +934,12 @@ int nfs_inode::copy_to_cache(const struct fuse_bufvec* bufv, return err; } +/* + * This function is called with flush_lock() held. + */ void nfs_inode::mark_commit_in_progress() { + assert(is_flushing); assert(is_stable_write() == false); /* @@ -950,42 +951,14 @@ void nfs_inode::mark_commit_in_progress() * Note: Ensure lock is held till we set the commit state depending on flushing * going on or not. This lock to synchronize the commit and flush task. */ - bool start_commit_task = false; - - /* - * Take the flush_lock to ensure no new flush or commit task is started. - * commit_lock is held to synchronize the flush thread and this thread - * to set the commit state depending on flushing going on or not. - * - * Note: flush_lock can't be used as flush task can be started under the - * flush_lock (flush_cache_and_wait()) and it can cause deadlock. - * flush_cache_and_wait() is called under flush_lock and it checks - * if commit required, then it starts the commit task. - * - * Note: commit_lock held for very small duration, so it's safe to hold in - * fuse context. - */ if (!is_commit_in_progress()) { - std::unique_lock lock(commit_lock_5 /*commit_lock*/); - /* - * It may be possible that commit is in progress, so check again. - * Might be set by other thread. 
- */ - if (!is_commit_in_progress()) - { - if (get_filecache()->is_flushing_in_progress()) { - set_commit_pending(); - } else { - set_commit_in_progress(); - start_commit_task = true; - } + if (get_filecache()->is_flushing_in_progress()) { + set_commit_pending(); + } else { + set_commit_in_progress(); + commit_membufs(); } - lock.unlock(); - } - - if (start_commit_task) { - commit_membufs(); } } diff --git a/turbonfs/src/rpc_task.cpp b/turbonfs/src/rpc_task.cpp index 6f776571..9228efdc 100644 --- a/turbonfs/src/rpc_task.cpp +++ b/turbonfs/src/rpc_task.cpp @@ -868,6 +868,7 @@ static void commit_callback( assert(!inode->is_stable_write()); // Caller must be inprogress. assert(inode->is_commit_in_progress()); + assert(inode->get_filecache()->is_flushing_in_progress() == false); const int status = task->status(rpc_status, NFS_STATUS(res)); UPDATE_INODE_WCC(inode, res->COMMIT3res_u.resok.file_wcc); @@ -1001,7 +1002,7 @@ void rpc_task::issue_commit_rpc() /* * Get the bcs marked for commit_pending. */ - assert(rpc_api->pvt != nullptr); + assert(rpc_api->pvt == nullptr); rpc_api->pvt = static_cast(new std::vector( inode->get_filecache()->get_commit_pending_bcs())); @@ -1425,28 +1426,21 @@ static void write_iov_callback( /* * Check if commit is pending. * Any thread that decides to commit and finds that there's an ongoing - * flush, will mark the inode as "needing commit" and when the flush + * flush, will mark the inode as 'NEEDS_COMMIT' and when the flush * completes, we need to start the commit. */ if (!inode->get_filecache()->is_flushing_in_progress()) { - bool create_commit_task = false; - - { - std::unique_lock lock(inode->commit_lock_5); - if (inode->is_commit_pending()) { - assert(!inode->is_stable_write()); - inode->set_commit_in_progress(); - create_commit_task = true; - } - } + inode->flush_lock(); + if (inode->is_commit_pending()) { + assert(!inode->is_stable_write()); - if (create_commit_task) { - // Create a commit task to commit all bcs needing commit. - struct rpc_task *commit_task = - client->get_rpc_task_helper()->alloc_rpc_task_reserved(FUSE_FLUSH); - commit_task->init_flush(nullptr /* fuse_req */, ino); - commit_task->issue_commit_rpc(); + /* + * We are issuing the commit rpc, so set the commit_in_progress flag. + */ + inode->set_commit_in_progress(); + inode->commit_membufs(); } + inode->flush_unlock(); } /* @@ -2663,7 +2657,6 @@ static void perform_inline_writes(struct rpc_task *task, * sync_membufs(), but add the task to the commit_waiters list. */ inode->sync_membufs(bc_vec, false /* is_flush */, nullptr); - inode->flush_unlock(); /* * Set the commit_in_progress flag to ensure that the commit @@ -2672,6 +2665,7 @@ static void perform_inline_writes(struct rpc_task *task, * as we need to release the memory. */ inode->mark_commit_in_progress(); + inode->flush_unlock(); inode->add_task_to_commit_waiters(task); return; @@ -2827,6 +2821,7 @@ static bool handle_writes(struct nfs_inode *inode, * data to the backend */ if (need_commit) { + inode->flush_lock(); /* * Set the commit_in_progress flag to ensure that the commit * task is startet. @@ -2835,6 +2830,8 @@ static bool handle_writes(struct nfs_inode *inode, // XXX What if commit completes before we reach here? 
assert(inode->is_commit_in_progress()); + + inode->flush_unlock(); assert(!need_inline_write); } From 86107211a76ef613f2ef1c6ddf958a0afd94ddd9 Mon Sep 17 00:00:00 2001 From: Nagendra Tomar Date: Wed, 29 Jan 2025 22:59:15 +0000 Subject: [PATCH 6/9] Audit changes Audit nfs_inode.h Audit rpc_task.h --- turbonfs/inc/file_cache.h | 4 +- turbonfs/inc/nfs_inode.h | 130 ++++++++++++++++++++++++++++++-------- turbonfs/inc/rpc_task.h | 8 --- 3 files changed, 106 insertions(+), 36 deletions(-) diff --git a/turbonfs/inc/file_cache.h b/turbonfs/inc/file_cache.h index 6a6165d1..ef2b65df 100644 --- a/turbonfs/inc/file_cache.h +++ b/turbonfs/inc/file_cache.h @@ -1349,8 +1349,8 @@ class bytes_chunk_cache * check for that after holding the membuf lock, before it tries to * commit those membuf(s). */ - std::vector get_flushing_bc_range(uint64_t st_off, - uint64_t end_off) const; + std::vector get_flushing_bc_range(uint64_t st_off = 0, + uint64_t end_off = UINT64_MAX) const; /* * Returns *all* commit pending chunks in chunkmap. diff --git a/turbonfs/inc/nfs_inode.h b/turbonfs/inc/nfs_inode.h index 17b6c4e5..9abfc0ce 100644 --- a/turbonfs/inc/nfs_inode.h +++ b/turbonfs/inc/nfs_inode.h @@ -127,8 +127,16 @@ struct nfs_inode * * See flush_lock()/flush_unlock() for the actual locking. * + * Note: While flush_lock is held it's guaranteed that no new flush can + * start, but there can be ongoing flushes at the time when flush_lock + * is called (and it returns with the lock held). If the caller wants + * to proceed only after all ongoing flushes complete (and no new + * flushes are started), it must hold the flush_lock and then wait + * for ongoing flushes to complete. Ref flush_cache_and_wait(). + * * Note: Though it's called flush lock, but it protects backend file size - * changes through both flush and/or commit. + * changes through both flush and/or commit, i.e., if flush_lock is + * held it's guaranteed that no new flush or commit can start. */ mutable std::atomic is_flushing = false; mutable std::condition_variable_any flush_cv; @@ -319,9 +327,6 @@ struct nfs_inode * writes (either overwrite or sparse write) must go as a stable write * (since server knows best how to allocate blocks for them). * Once set to true, it remains true for the life of the inode. - * - * Note: As of now, we are not using this flag as commit changes not yet - * integrated, so we are setting this flag to true. */ bool stable_write = false; @@ -408,8 +413,10 @@ struct nfs_inode int write_error = 0; /* - * Last offset and length of the last flushing operation. - * This is used add task waiting for flush/commit to complete. + * Offset and length of the latest flush operation. + * Tasks that want to wait for multiple membufs to be flushed/committed, + * will add themselves to flush_waiters/commit_waiters list for this + * membuf. See add_task_to_flush_waiters()/add_task_to_commit_waiters(). */ uint64_t last_flushing_offset = 0; uint64_t last_flushing_length = 0; @@ -1138,42 +1145,86 @@ struct nfs_inode } /** - * Is commit pending for this inode? + * Is commit pending/scheduled for this inode? + * + * It's called from write_iov_callback() when flush/write completes to see + * if some other thread has scheduled commit (as it could not start commit + * since flush was in progress) and it needs to start commit now. + * + * This MUST be called with flush_lock held. 
*/ bool is_commit_pending() const { + assert(is_flushing); assert(commit_state != commit_state_t::INVALID); return (commit_state == commit_state_t::NEEDS_COMMIT); } /** - * set needs_commit state for this inode. - * Note this is set to let flushing task know that commit is pending and start commit task. + * Schedule commit for this inode. + * + * This MUST be called with flush_lock held, as only one thread must + * schedule commit. Note that a thread wanting to commit some data, checks + * if some other thread is already flushing data from that cache, if yes it + * cannot start the commit till the flush completes (as we need consistent + * TBL at the time of commit), so it just schedules the commit and + * write_iov_callback() then starts the commit when the last ongoing flush + * completes. If there's no ongoing flush at the time of this call, then the + * thread that wants to commit must start the commit itself, in that case + * it'll call set_commit_in_progress() and not set_commit_pending(). + * See schedule_or_start_commit(). + * + * XXX Do not confuse it with similar function in membuf. + * That is used to set "commit needed" for a membuf after it's written + * using UNSTABLE write. This one is for the inode and conveys that one + * or more membufs in this inode's filecache need to be committed. */ void set_commit_pending() { - // Commit can be set to pending only if it is in commit_not_needed state. + assert(is_flushing); + /* + * Commit MUST be scheduled only if it is not already running or + * scheduled. + */ assert(commit_state == commit_state_t::COMMIT_NOT_NEEDED); commit_state = commit_state_t::NEEDS_COMMIT; } /** * Is commit in progress for this inode? + * Note that we consider commit to be in-progress even if it's scheduled + * (commit_state == NEEDS_COMMIT) but not yet running. This is because + * once scheduled the commit WILL DEFINITELY run (when the last flush + * completes) and we usually don't need to distinguish between that and + * whether it's actually running now. + * + * This can only be safely called with flush_lock held. + * Depending on other conditions, caller may be able to call it w/o the + * flush_lock but be careful and think through scenarios where some + * other thread can call set_commit_pending()/set_commit_in_progress(), + * right after or before our call to is_commit_in_progress(). */ bool is_commit_in_progress() const { assert(commit_state != commit_state_t::INVALID); return ((commit_state == commit_state_t::COMMIT_IN_PROGRESS) || - (commit_state == commit_state_t::NEEDS_COMMIT)); + (commit_state == commit_state_t::NEEDS_COMMIT)); } /** * Set commit_in_progress state for this inode. + * + * This MUST be called with flush_lock held, as only one thread must + * start commit. */ void set_commit_in_progress() { + assert(is_flushing); + assert(commit_state != commit_state_t::INVALID); + // Must not already be in-progress or scheduled. assert(commit_state != commit_state_t::COMMIT_IN_PROGRESS); + commit_state = commit_state_t::COMMIT_IN_PROGRESS; } @@ -1352,42 +1403,67 @@ struct nfs_inode int flush_cache_and_wait(uint64_t start_off = 0, uint64_t end_off = UINT64_MAX); - void mark_commit_in_progress(); - void add_task_to_commit_waiters(struct rpc_task *task); + /** + * Start commit of (all) uncommitted data. Since commit and flush cannot + * run in parallel, if there's an ongoing flush it schedules commit by + * calling set_commit_pending() which will cause the last completing flush + * to trigger the commit, else it starts the commit. 
+ * MUST be called only when doing unstable writes. + */ + void schedule_or_start_commit(); + + /** + * Helper method for calling file_cache's add_flush_waiter() method. + * If not able to add successfully, then it completes the task else the + * task will be completed when ongoing flush completes. + * Call to this function guarantees that task will be completed and + * caller need not complete the task. + */ void add_task_to_flush_waiters(struct rpc_task *task); + /** + * Helper method for calling file_cache's add_commit_waiter() method. + * If not able to add successfully, then it completes the task else the + * task will be completed when ongoing commit or commit initiated after + * the ongoing flush completes. + * Call to this function guarantees that task will be completed and + * caller need not complete the task. + */ + void add_task_to_commit_waiters(struct rpc_task *task); + /** * Wait for currently flushing membufs to complete. * If parent_task is non-null, then it's added to the commit_waiters list - * and returned. Otherwise it waits for the ongoing flush to complete. + * and returned, otherwise it waits for the ongoing flush (and subsequent + * commit) to complete. * * Returns 0 on success and a positive errno value on error. * * Note : Caller must hold the inode is_flushing lock to ensure that - * no new membufs are added till this call completes. + * no new flush is started till this call completes. */ - int wait_for_ongoing_flush_commit(uint64_t start_off = 0, - uint64_t end_off = UINT64_MAX, - struct rpc_task *parent_task = nullptr); + int wait_for_ongoing_flush_commit(struct rpc_task *parent_task = nullptr); - /* - * commit_membufs() is called to commit uncommitted membufs to the BLOB. + /** + * commit_membufs() is called to commit uncommitted membufs to the Blob. * It creates commit RPC and sends it to the NFS server. */ void commit_membufs(); - /* + /** * switch_to_stable_write() is called to switch the inode to stable write - * mode. There is should be no ongoing commit/flusing operation when this - * is called. It creates a commit RPC to commit all the uncopmmitted membufs - * to the BLOB. + * mode. It waits for all ongoing flush and subsequent commit to complete. + * If not already scheduled, it'll perform an explicit commit after the + * flush complete. + * Post that it'll mark inode for stable write and return. From then on + * any writes to this inode will be sent as stable writes. */ void switch_to_stable_write(); /** * Check if stable write is required for the given offset. - * Given offset is the start of contigious dirty membufs that need to be - * flushed to the BLOB. + * Given offset is the start of contiguous dirty membufs that need to be + * flushed to the Blob. */ bool check_stable_write_required(off_t offset); @@ -1405,6 +1481,8 @@ struct nfs_inode * caller in case of memory pressure when we want to delay fuse callbacks * to slow down writes which can cause more memory to be dirtied. * + * Note: Must be called with flush_lock held. + * * Note: sync_membufs() can free parent_task if all issued backend * writes complete before sync_membufs() could return. * DO NOT access parent_task after sync_membufs() returns. diff --git a/turbonfs/inc/rpc_task.h b/turbonfs/inc/rpc_task.h index 72c77788..c9fd4303 100644 --- a/turbonfs/inc/rpc_task.h +++ b/turbonfs/inc/rpc_task.h @@ -2000,14 +2000,6 @@ struct rpc_task csched = _csched; } - void set_task_csched(bool stable_write) - { - if (client->mnt_options.nfs_port == 2048) { - csched = stable_write ? 
CONN_SCHED_FH_HASH : CONN_SCHED_RR; - } - return; - } - conn_sched_t get_csched() const { assert(csched > CONN_SCHED_INVALID && From ba09ab205411c74d7592137d6d8682e2f19f7898 Mon Sep 17 00:00:00 2001 From: Nagendra Tomar Date: Thu, 30 Jan 2025 05:58:43 +0000 Subject: [PATCH 7/9] Audit changes --- turbonfs/inc/file_cache.h | 7 +- turbonfs/inc/nfs_inode.h | 7 +- turbonfs/src/file_cache.cpp | 25 -- turbonfs/src/nfs_client.cpp | 2 + turbonfs/src/nfs_inode.cpp | 338 ++++++++++++++----------- turbonfs/src/rpc_task.cpp | 488 ++++++++++++++++++------------------ 6 files changed, 448 insertions(+), 419 deletions(-) diff --git a/turbonfs/inc/file_cache.h b/turbonfs/inc/file_cache.h index ef2b65df..38afcc89 100644 --- a/turbonfs/inc/file_cache.h +++ b/turbonfs/inc/file_cache.h @@ -1284,9 +1284,6 @@ class bytes_chunk_cache uint64_t length, struct rpc_task *task); - void cleanup_on_error(); - - /* * Returns all dirty chunks for a given range in chunkmap. * Before returning it increases the inuse count of underlying membuf(s). @@ -1302,8 +1299,8 @@ class bytes_chunk_cache * check for that after holding the membuf lock, before it tries to * flush those membuf(s). */ - std::vector get_dirty_bc_range(uint64_t st_off, - uint64_t end_off) const; + std::vector get_dirty_bc_range(uint64_t st_off = 0, + uint64_t end_off = UINT64_MAX) const; /* * Returns dirty chunks which are not already flushing, in the given range, diff --git a/turbonfs/inc/nfs_inode.h b/turbonfs/inc/nfs_inode.h index 9abfc0ce..e33e61f9 100644 --- a/turbonfs/inc/nfs_inode.h +++ b/turbonfs/inc/nfs_inode.h @@ -1383,9 +1383,7 @@ struct nfs_inode /** * Flush the dirty file cache represented by filecache_handle and wait - * till all dirty data is sync'ed with the NFS server. Only dirty data - * in the given range is flushed if provided, else all dirty data is - * flushed. + * till all dirty data is sync'ed with the NFS server. * Note that filecache_handle is the only writeback cache that we have * and hence this only flushes that. * For a non-reg file inode this will be a no-op. @@ -1400,8 +1398,7 @@ struct nfs_inode * initiate any new flush operations while some truncate call is in * progress (which must have held the is_flushing lock). */ - int flush_cache_and_wait(uint64_t start_off = 0, - uint64_t end_off = UINT64_MAX); + int flush_cache_and_wait(); /** * Start commit of (all) uncommitted data. 
Since commit and flush cannot diff --git a/turbonfs/src/file_cache.cpp b/turbonfs/src/file_cache.cpp index a6ff86be..f948c3c1 100644 --- a/turbonfs/src/file_cache.cpp +++ b/turbonfs/src/file_cache.cpp @@ -2523,31 +2523,6 @@ bool bytes_chunk_cache::add_commit_waiter(uint64_t offset, return false; } -void bytes_chunk_cache::cleanup_on_error() -{ - const std::unique_lock _lock(chunkmap_lock_43); - - for (auto it = chunkmap.begin(); it != chunkmap.cend(); ++it) { - struct bytes_chunk& bc = it->second; - struct membuf *mb = bc.get_membuf(); - if (!mb->is_inuse() && !mb->is_locked()) { - mb->set_inuse(); - mb->set_locked(); - if (mb->is_dirty()) { - mb->clear_dirty(); - } - - if (mb->is_flushing()) { - mb->clear_flushing(); - } - - if (mb->is_commit_pending()) { - mb->clear_commit_pending(); - } - } - } -} - std::vector bytes_chunk_cache::get_dirty_bc_range( uint64_t start_off, uint64_t end_off) const { diff --git a/turbonfs/src/nfs_client.cpp b/turbonfs/src/nfs_client.cpp index 0589c5e4..d5ec131e 100644 --- a/turbonfs/src/nfs_client.cpp +++ b/turbonfs/src/nfs_client.cpp @@ -1180,6 +1180,8 @@ void nfs_client::getattr( * updating nfs_inode::attr during cached writes and then returning * attributes from that instead of making a getattr call here. * We need to think carefully though. + * + * TODO: flush_cache_and_wait() */ if (inode->is_regfile()) { AZLogDebug("[{}] Flushing file data ahead of getattr", diff --git a/turbonfs/src/nfs_inode.cpp b/turbonfs/src/nfs_inode.cpp index de342b89..4776069f 100644 --- a/turbonfs/src/nfs_inode.cpp +++ b/turbonfs/src/nfs_inode.cpp @@ -382,48 +382,47 @@ void nfs_inode::wait_for_ongoing_commit() { assert(is_flushing); + /* + * TODO: See if we can eliminate inline sleep. + */ + if (is_commit_in_progress()) { + AZLogWarn("[{}] wait_for_ongoing_commit() will sleep inline!!", + get_fuse_ino()); + } + while (is_commit_in_progress()) { assert(!get_filecache()->is_flushing_in_progress()); ::usleep(1000); } - assert(is_commit_in_progress() == false); + assert(!is_commit_in_progress()); } /* * This function is called with flush_lock() held. - * In normal case this function should be no-op as - * when we reach here all the previous flushes/commits - * should have been completed. But in case of committing - * at the close of file, we need to wait if any flush - * going on and commit all the dirty membufs which are - * not yet committed. + * This should be called whenever we figure out that we cannot proceed with + * unstable writes (most common reason being, next write is not an append + * write). Once this function returns, following is guaranteed: + * - There will be no flushes in progress. + * - There will be no commit_pending data and no commit inprogress. + * - inode->stable_write will be set to true. */ void nfs_inode::switch_to_stable_write() { assert(is_flushing); - AZLogInfo("[{}] Switching to stable write", ino); + assert(!is_stable_write()); - /* - * We should not be in commit in progress state. - * Switch_to_stable_write() is called by flush thread - * and it shouldn't be called when commit is in progress. - */ - assert(is_commit_in_progress() == false); + AZLogInfo("[{}] Switching to stable write", ino); /* - * switch_to_stable_write() called from following paths. - * 1. sync_membufs(), No new flush should be started if - * there is ongoing flush is in progress. - * 2. flush_cache_and_wait(), Wait for ongoing flush to - * complete. 
- * As we are allowing multiple flush in parallel in case - * of unstable write as well, we should wait for ongoing - * flush to complere before switching to stable write. - * switch_to_stable_write() is called by flush_lock() held - * so no new flush can be started. + * switch_to_stable_write() is called from places where we are about to + * start a flush operation. Before that we check to see if we need to + * change to stable write. Since we are not flushing yet and since we + * do not support multiple ongoing flushes, we are guaranteed that no + * flush should be in progress when we reach here. + * Similarly commit should not be in progress. */ - wait_for_ongoing_flush_commit(); + assert(!is_commit_in_progress()); assert(!get_filecache()->is_flushing_in_progress()); /* @@ -440,8 +439,7 @@ void nfs_inode::switch_to_stable_write() * If there is something to commit, then commit it and * wait for the commit to complete. */ - assert(!is_commit_in_progress()); - mark_commit_in_progress(); + schedule_or_start_commit(); /* * Wait for the commit to complete. @@ -495,7 +493,8 @@ bool nfs_inode::check_stable_write_required(off_t offset) void nfs_inode::commit_membufs() { assert(is_flushing); - assert(is_commit_in_progress()); + + set_commit_in_progress(); /* * Create the commit task to carry out the write. @@ -508,10 +507,29 @@ void nfs_inode::commit_membufs() commit_task->issue_commit_rpc(); } +/** + * sync_membufs() + */ void nfs_inode::sync_membufs(std::vector &bc_vec, bool is_flush, struct rpc_task *parent_task) { + assert(is_flushing); + + if (!is_stable_write()) { + /* + * We do not allow a new flush while there's an ongoing one, in case + * of unstable writes. + */ + assert(!get_filecache()->is_flushing_in_progress()); + } + + /* + * Stable won't have commit and for unstable we cannot flush while + * commit is going on. + */ + assert(!is_commit_in_progress()); + if (bc_vec.empty()) { return; } @@ -540,21 +558,11 @@ void nfs_inode::sync_membufs(std::vector &bc_vec, parent_task->num_ongoing_backend_writes = 1; } - /* - * New flush should not be started if flush/commit is in progress. - * But it may happen commit_is_pending() is set by another thread. - */ - assert(!is_commit_in_progress()); - - if (bc_vec.empty()) { - return; - } - /* * If new offset is not at the end of the file, * then we need to switch to stable write. */ - if(check_stable_write_required(bc_vec[0].offset)) { + if (check_stable_write_required(bc_vec[0].offset)) { switch_to_stable_write(); } @@ -563,7 +571,20 @@ void nfs_inode::sync_membufs(std::vector &bc_vec, */ struct rpc_task *write_task = nullptr; - // Flush dirty membufs to backend. + /* + * Flush dirty membufs to backend. + * We increment bytes_flushing by 1 before starting any of the flush to + * protect is against the condition where whatever flush we have issued + * completes till we could issue all the flushes. In write_iov_callback() + * we will wrongly treat it as "all flush completed" and start the commit, + * while this thread may still keep doing flush for membufs. + * + * Also assert that for unstable writes, we should never start a new flush + * while there is an ongoing flush. + */ + assert((get_filecache()->bytes_flushing == 0) || is_stable_write()); + get_filecache()->bytes_flushing++; + for (bytes_chunk& bc : bc_vec) { /* * Get the underlying membuf for bc. @@ -718,6 +739,22 @@ void nfs_inode::sync_membufs(std::vector &bc_vec, write_task->issue_write_rpc(); } + /* + * Drop the protective bytes_flushing count. 
+ * If bytes_flushing becomes 0 then all flushes issued above have completed + * by the time we reached here. Check if we need to start the commit. + * Any thread that decides to commit and finds that there's an ongoing + * flush, will mark the inode as 'NEEDS_COMMIT' and when the flush + * completes, we need to start the commit. + */ + if (--get_filecache()->bytes_flushing == 0) { + if (is_commit_pending()) { + assert(!is_stable_write()); + + commit_membufs(); + } + } + /* * Drop the protective num_ongoing_backend_writes count taken at the start * of this function, and if it's the only one remaining that means all @@ -936,27 +973,28 @@ int nfs_inode::copy_to_cache(const struct fuse_bufvec* bufv, /* * This function is called with flush_lock() held. + * Once this function is called it's guaranteed that a commit will be done, + * either from here or from the last flush (write_iov_callback()). */ -void nfs_inode::mark_commit_in_progress() +void nfs_inode::schedule_or_start_commit() { assert(is_flushing); - assert(is_stable_write() == false); + assert(!is_stable_write()); /* * We want to commit, but we need to wait for current flushing to complete. - * 1. If flushing is going on, set commit_pending to inode, when last membuf flushed, - * write_iov_callback(), start new task to commit the data. - * 2. If flushing is not going on, then create the task and start commit of data. + * 1. If flushing is going on, set commit_pending for the inode, when last + * membuf flush completes write_iov_callback() will commit the data. + * 2. If flushing is not going on, then start commit of data rightaway. * - * Note: Ensure lock is held till we set the commit state depending on flushing - * going on or not. This lock to synchronize the commit and flush task. + * Note: Ensure lock is held till we set the commit state depending on + * flushing going on or not. This lock is to synchronize the commit + * and flush task. */ - if (!is_commit_in_progress()) - { + if (!is_commit_in_progress()) { if (get_filecache()->is_flushing_in_progress()) { set_commit_pending(); } else { - set_commit_in_progress(); commit_membufs(); } } @@ -964,13 +1002,20 @@ void nfs_inode::mark_commit_in_progress() void nfs_inode::add_task_to_flush_waiters(struct rpc_task *task) { + // Must be called only for unstable writes. + assert(!is_stable_write()); + assert(task->magic == RPC_TASK_MAGIC); + // Must be a frontend write task. + assert(task->get_op_type() == FUSE_WRITE); + assert(task->rpc_api->write_task.is_fe()); + /* * Add this task to the flush_waiter list, so that it can be * completed after the flush is done. */ - if(get_filecache()->add_flush_waiter(last_flushing_offset, - last_flushing_length, - task)) { + if (get_filecache()->add_flush_waiter(last_flushing_offset, + last_flushing_length, + task)) { AZLogDebug("[{}] flush in progress, added to flush_waiter list" " for [{}, {})", ino, last_flushing_offset, last_flushing_length); @@ -980,8 +1025,8 @@ void nfs_inode::add_task_to_flush_waiters(struct rpc_task *task) * flush is done, but this task is not added to flush_waiter * list, so complete the write. 
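The bytes_flushing bump before the issue loop in sync_membufs(), and the matching decrement after it, follow the usual protective in-flight count idiom; num_ongoing_backend_writes is handled the same way. A generic sketch with illustrative names is below; in the patch the counters are byte counts and the "all done" action is the commit or parent-task completion, so treat this only as an illustration of the idiom.

    // Hold one extra reference across the issue loop so completions that
    // race with issuing cannot observe the count dropping to zero early.
    #include <atomic>
    #include <functional>

    struct inflight_guard
    {
        std::atomic<int> count{0};
        std::function<void()> on_all_done;

        void issue_all(int n, const std::function<void(int)>& issue_one)
        {
            count.fetch_add(1);            // protective reference
            for (int i = 0; i < n; i++) {
                count.fetch_add(1);        // one per async operation
                issue_one(i);              // completion must call complete_one()
            }
            // Drop the protective reference; if every completion already ran,
            // this call is the one that triggers on_all_done().
            complete_one();
        }

        void complete_one()
        {
            if (count.fetch_sub(1) == 1) {
                on_all_done();
            }
        }
    };

Whichever thread brings the count to zero, issuer or last completion, runs the final action exactly once.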
*/ - AZLogDebug("[{}] flush is done, not added to flush_waiter list, " - "completing write", ino); + AZLogDebug("[{}] Could not add to flush_waiters list, completing " + "write task {}", ino, fmt::ptr(task)); task->reply_write(task->rpc_api->write_task.get_size()); return; } @@ -989,63 +1034,55 @@ void nfs_inode::add_task_to_flush_waiters(struct rpc_task *task) void nfs_inode::add_task_to_commit_waiters(struct rpc_task *task) { - /* - * This should be called only for unstable writes. - */ - assert(is_stable_write() == false); + // Must be called only for unstable writes. + assert(!is_stable_write()); + assert(task->magic == RPC_TASK_MAGIC); + // Must be a frontend write task. + assert(task->get_op_type() == FUSE_WRITE); + assert(task->rpc_api->write_task.is_fe()); /* * Add this task to the commit_waiters list, so that it can be * completed after the commit is done. */ - if(get_filecache()->add_commit_waiter(last_flushing_offset, - last_flushing_length, - task)) { + if (get_filecache()->add_commit_waiter(last_flushing_offset, + last_flushing_length, + task)) { AZLogDebug("[{}] Commit in progress, added to commit_waiters list" " for [{}, {})", ino, last_flushing_offset, last_flushing_length); + /* + * Since we don't complete the write task here, add_commit_waiter() + * MUST ensure task will be completed if it returns true. + */ return; } else { /* * Commit is done, but this task is not added to commit_waiters * list, so complete the write. */ - AZLogDebug("[{}] Commit in progress, but not added to commit_waiters list, " - "completing write", ino); + AZLogDebug("[{}] Could not add to commit_waiters list, completing " + "write task {}", ino, fmt::ptr(task)); task->reply_write(task->rpc_api->write_task.get_size()); return; } } /** - * Note: Caller should call with flush_lock() held. - * On return there is no ongoing flush/commit in progress. + * Wait for ongoing flushes and commit all uncommitted data. + * Caller must call with flush_lock() held. + * On return there would be no ongoing flush/commit in progress and all + * uncommitted data would have been committed. * - * flush_cache_and_wait() checks for return value, as it's - * called on close() and need to know final status of the writes. + * If caller passes a valid frontend write task, the call itself will + * return immediately but the task will be completed after flush and commit + * have completed. If caller does not pass a tak pointer, then the call + * itself would block till all flush and commit complete. */ -int nfs_inode::wait_for_ongoing_flush_commit(uint64_t start_off, - uint64_t end_off, - struct rpc_task *task) +int nfs_inode::wait_for_ongoing_flush_commit(struct rpc_task *task) { - // Caller must call us with is_flushing lock held. + // Caller must call us with flush_lock held. assert(is_flushing); - assert(start_off < end_off); - - if (is_stable_write()) { - assert(!is_commit_in_progress()); - assert(get_filecache()->bytes_commit_pending == 0); - } - - /* - * If commit_pending limit reached, we need to start the commit, - * but we need to wait for the current flushing to complete, so - * we set the commit_pending flag and once flusing completes it - * will start the commit task. - */ - if (get_filecache()->is_flushing_in_progress()) { - assert(!is_commit_in_progress() || is_commit_pending()); - } /* * MUST be called only for regular files. 
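add_flush_waiter()/add_commit_waiter() follow a register-or-complete contract: a true return means the completion path now owns the task, a false return means the operation already finished and the caller must complete the task itself, so either way the task is completed exactly once. A simplified sketch of that contract follows; the types and names are illustrative, whereas the real waiter lists hang off the membuf covering [last_flushing_offset, last_flushing_length).

    #include <functional>
    #include <mutex>
    #include <vector>

    struct waiter_list
    {
        std::mutex lock;
        bool operation_done = false;               // e.g. commit already completed
        std::vector<std::function<void()>> waiters;

        // true  => queued; the flush/commit completion path will run it later.
        // false => operation already finished; caller completes the task itself.
        bool add_waiter(std::function<void()> completion)
        {
            std::lock_guard<std::mutex> g(lock);
            if (operation_done)
                return false;
            waiters.emplace_back(std::move(completion));
            return true;
        }

        // Called exactly once when the flush/commit completes.
        void complete_all()
        {
            std::vector<std::function<void()>> to_run;
            {
                std::lock_guard<std::mutex> g(lock);
                operation_done = true;
                to_run.swap(waiters);   // splice, like get_commit_waiters()
            }
            for (auto& fn : to_run)
                fn();
        }
    };

The decision of done-vs-pending and the enqueue happen under one lock, which is what lets the helper promise "the task will be completed" whenever it returns true.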
@@ -1056,6 +1093,14 @@ int nfs_inode::wait_for_ongoing_flush_commit(uint64_t start_off, return 0; } + if (task) { + assert(task->magic == RPC_TASK_MAGIC); + // Must be a frontend write task. + assert(task->get_op_type() == FUSE_WRITE); + assert(task->rpc_api->write_task.is_fe()); + assert(task->rpc_api->write_task.get_size() > 0); + } + /* * If flush() is called w/o open(), there won't be any cache, skip. */ @@ -1064,17 +1109,27 @@ int nfs_inode::wait_for_ongoing_flush_commit(uint64_t start_off, } /* - * Flushing not in progress and no new flushing can be started as we hold - * the flush_lock(). - * It may be possible that the flush is not in progress and but we may have - * commit_in_progress or commit_pending_bytes to commit. - * In that case we should start the commit task if it's not already in - * progress. And wait for the commit to complete. + * Stable writes do not need commit, so no commit inprogress and no pending + * commit data. */ - if (!get_filecache()->is_flushing_in_progress() && - !is_commit_in_progress() && - get_filecache()->bytes_commit_pending == 0) { - AZLogDebug("[{}] No flush in progress, returning", ino); + if (is_stable_write()) { + assert(!is_commit_in_progress()); + assert(get_filecache()->bytes_commit_pending == 0); + } + + if (get_filecache()->is_flushing_in_progress()) { + /* + * Flushing and committing cannot happen simultaneously. + * This is effectively asserting the following: + * assert(commit_state != commit_state_t::COMMIT_IN_PROGRESS); + */ + assert(!is_commit_in_progress() || is_commit_pending()); + } else if (!is_commit_in_progress() && + get_filecache()->bytes_commit_pending == 0) { + /* + * No flush or commit in progress and nothing to commit, return. + */ + AZLogDebug("[{}] No flush or commit in progress, returning", ino); /* * Complete the write task if it was passed. @@ -1086,19 +1141,18 @@ int nfs_inode::wait_for_ongoing_flush_commit(uint64_t start_off, } /* - * Try to get the flushing bytes_chunk from the filecache handle if the - * flush is in progress. + * Ok, so either flushing is going on or commit is going on or we have at + * least one byte to commit. */ std::vector bc_vec = {}; - if (task == nullptr && get_filecache()->is_flushing_in_progress()) { + if ((task == nullptr) && get_filecache()->is_flushing_in_progress()) { /* - * Get the flushing bytes_chunk from the filecache handle. - * This will grab an exclusive lock on the file cache and return the list - * of flushing bytes_chunks at that point. Note that we can have new dirty - * bytes_chunks created but we don't want to wait for those. + * Get the flushing bc from the filecache. + * We will need to wait for these inline. + * Note that we can have new dirty membufs created after the call but + * we don't want to wait for those. */ - bc_vec = - filecache_handle->get_flushing_bc_range(start_off, end_off); + bc_vec = filecache_handle->get_flushing_bc_range(); } /* @@ -1110,7 +1164,7 @@ int nfs_inode::wait_for_ongoing_flush_commit(uint64_t start_off, * Task will wait on last flushed offset and length. */ if (task == nullptr) { - for (bytes_chunk &bc : bc_vec) { + for (bytes_chunk& bc : bc_vec) { struct membuf *mb = bc.get_membuf(); assert(mb != nullptr); @@ -1119,15 +1173,16 @@ int nfs_inode::wait_for_ongoing_flush_commit(uint64_t start_off, /* * If still dirty after we get the lock, it may mean two things: - * - Write failed. + * - Write failed. We ignore the failure and pretend as if flush + * completed. 
* - Some other thread got the lock before us and it made the - * membuf dirty again. + * membuf dirty again. Since we wanted to wait only for ongoing + * flushes and not newly written data, we ignore this. */ if (mb->is_dirty() && get_write_error()) { AZLogError("[{}] Flush [{}, {}) failed with error: {}", - ino, - bc.offset, bc.offset + bc.length, - get_write_error()); + ino, bc.offset, bc.offset + bc.length, + get_write_error()); } mb->clear_locked(); @@ -1163,17 +1218,21 @@ int nfs_inode::wait_for_ongoing_flush_commit(uint64_t start_off, /* * Unstable write case, we need to wait for the commit to complete. - * So, mark the commit in progress and wait for the commit to complete. + * We must start/schedule commit. */ if (!is_stable_write()) { - mark_commit_in_progress(); + schedule_or_start_commit(); if (task) { return 0; } /* * Wait for the ongoing commit to complete. + * If task is not valid we would have waited for all ongoing flushes, + * and no new flush can start because we hold the flush_lock, so when + * we come here we should have flushing going on. */ + assert(!get_filecache()->is_flushing_in_progress()); wait_for_ongoing_commit(); } @@ -1184,9 +1243,9 @@ int nfs_inode::wait_for_ongoing_flush_commit(uint64_t start_off, * So, following conditions should be true for both stable and * unstable writes.: */ - assert(is_commit_in_progress() == false); + assert(!get_filecache()->is_flushing_in_progress()); + assert(!is_commit_in_progress()); assert(get_filecache()->bytes_commit_pending == 0); - assert(get_filecache()->is_flushing_in_progress() == 0); return get_write_error(); } @@ -1203,18 +1262,17 @@ int nfs_inode::wait_for_ongoing_flush_commit(uint64_t start_off, * - Wait for the commit to complete. * - Returns the error code if any. * - * Note: Flush_cache_and_wait() block the fuse thread till the flush completes. + * Note: Flush_cache_and_wait() blocks the fuse thread till the flush completes. * It's called from the release(), flush() and getattr() calls. It's ok * as of now as it's not very often called. We can optimize to complete * the flush in background and return immediately. For that we need to add * special handling for the getattr() call. */ -int nfs_inode::flush_cache_and_wait(uint64_t start_off, - uint64_t end_off) +int nfs_inode::flush_cache_and_wait() { - assert(start_off < end_off); - assert(!is_stable_write() || get_filecache()->get_bytes_to_commit() == 0); - assert(!is_stable_write() || is_commit_in_progress() == false); + assert(!is_stable_write() || + get_filecache()->bytes_commit_pending == 0); + assert(!is_stable_write() || !is_commit_in_progress()); /* * MUST be called only for regular files. @@ -1231,9 +1289,6 @@ int nfs_inode::flush_cache_and_wait(uint64_t start_off, */ const int error_code = get_write_error(); if (error_code != 0) { - // Cleanup of the membufs if any left. - get_filecache()->cleanup_on_error(); - AZLogWarn("[{}] Previous write to this Blob failed with error={}, " "skipping new flush!", ino, error_code); @@ -1264,7 +1319,8 @@ int nfs_inode::flush_cache_and_wait(uint64_t start_off, * As we have the flush_lock() we are guaranteed that no new flush can be * started till we release the flush_lock(). * - * wait_for_ongoing_flush_commit() will wait for the ongoing flush/commt to complete. + * wait_for_ongoing_flush_commit() will wait for the ongoing flush/commt to + * complete. * * Note: For stable write case, we don't need to wait for the ongoing flush, * as there is no commit required for stable write. 
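The wait loop in wait_for_ongoing_flush_commit() above relies on the flusher holding the membuf lock for the duration of the WRITE, so acquiring and releasing the lock of every membuf that was flushing when we looked is enough to let those flushes drain. A reduced sketch of that pattern follows, assuming a blocking set_locked(); the real membuf additionally tracks inuse counts, dirty/flushing flags and the write error.

    #include <condition_variable>
    #include <mutex>
    #include <vector>

    struct membuf_model
    {
        std::mutex mtx;
        std::condition_variable cv;
        bool locked = false;   // held by the flusher while the WRITE is in flight

        void set_locked()      // blocks until the current holder releases
        {
            std::unique_lock<std::mutex> l(mtx);
            cv.wait(l, [this] { return !locked; });
            locked = true;
        }

        void clear_locked()
        {
            { std::lock_guard<std::mutex> l(mtx); locked = false; }
            cv.notify_all();
        }
    };

    // Waiter side: once every buffer that was flushing has been locked and
    // released, all of those flushes have completed (success or failure).
    inline void wait_for_flushes(std::vector<membuf_model*>& flushing)
    {
        for (membuf_model *mb : flushing) {
            mb->set_locked();
            mb->clear_locked();
        }
    }

New dirty data created after the snapshot is deliberately not waited for, matching the "ongoing flushes only" semantics described above.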
@@ -1272,6 +1328,7 @@ int nfs_inode::flush_cache_and_wait(uint64_t start_off, if (!is_stable_write()) { const int err = wait_for_ongoing_flush_commit(); if (err != 0) { + flush_unlock(); AZLogError("[{}] Failed to flush cache to stable storage, " "error={}", ino, err); return err; @@ -1287,12 +1344,13 @@ int nfs_inode::flush_cache_and_wait(uint64_t start_off, * TODO: Need to check getattr() call. As that can cause this * assert to fail. */ - if ((get_filecache()->get_bytes_to_flush() == 0) && - (get_filecache()->is_flushing_in_progress() == false)) { - assert(is_commit_in_progress() == false); - assert(get_filecache()->is_flushing_in_progress() == false); + assert(!get_filecache()->is_flushing_in_progress() || + is_stable_write()); + assert(!is_commit_in_progress()); + assert(get_filecache()->bytes_commit_pending == 0); + + if (get_filecache()->get_bytes_to_flush() == 0) { assert(get_filecache()->bytes_dirty == 0); - assert(get_filecache()->bytes_commit_pending == 0); assert(get_filecache()->bytes_flushing == 0); AZLogDebug("[{}] Nothing to flush, returning", ino); @@ -1323,11 +1381,10 @@ int nfs_inode::flush_cache_and_wait(uint64_t start_off, for (auto& bc : bc_vec) { bc.get_membuf()->clear_inuse(); } - bc_vec = filecache_handle->get_dirty_bc_range(start_off, end_off); + bc_vec = filecache_handle->get_dirty_bc_range(); } } else { - assert(get_filecache()->get_bytes_to_commit() == 0); - bc_vec = filecache_handle->get_dirty_bc_range(start_off, end_off); + bc_vec = filecache_handle->get_dirty_bc_range(); } /* @@ -1335,13 +1392,13 @@ int nfs_inode::flush_cache_and_wait(uint64_t start_off, * membufs. It batches the contiguous dirty membufs and issues a single * write RPC for them. */ - sync_membufs(bc_vec, true); + sync_membufs(bc_vec, true /* is_flush */); /* * Our caller expects us to return only after the flush completes. * Wait for all the membufs to flush and get result back. */ - for (bytes_chunk &bc : bc_vec) { + for (bytes_chunk& bc : bc_vec) { struct membuf *mb = bc.get_membuf(); assert(mb != nullptr); @@ -1389,7 +1446,7 @@ int nfs_inode::flush_cache_and_wait(uint64_t start_off, assert(get_filecache()->get_bytes_to_flush() == 0 || get_write_error() != 0); - mark_commit_in_progress(); + schedule_or_start_commit(); } /* @@ -1404,11 +1461,10 @@ int nfs_inode::flush_cache_and_wait(uint64_t start_off, if (get_write_error()) { AZLogError("[{}] Failed to flush cache to stable storage, " "error={}", ino, get_write_error()); - get_filecache()->cleanup_on_error(); } - assert(is_commit_in_progress() == false); - assert(get_filecache()->is_flushing_in_progress() == false); + assert(!is_commit_in_progress()); + assert(!get_filecache()->is_flushing_in_progress()); assert(get_filecache()->bytes_dirty == 0); assert(get_filecache()->bytes_commit_pending == 0); assert(get_filecache()->bytes_flushing == 0); diff --git a/turbonfs/src/rpc_task.cpp b/turbonfs/src/rpc_task.cpp index 9228efdc..62d3f5fa 100644 --- a/turbonfs/src/rpc_task.cpp +++ b/turbonfs/src/rpc_task.cpp @@ -868,7 +868,7 @@ static void commit_callback( assert(!inode->is_stable_write()); // Caller must be inprogress. assert(inode->is_commit_in_progress()); - assert(inode->get_filecache()->is_flushing_in_progress() == false); + assert(!inode->get_filecache()->is_flushing_in_progress()); const int status = task->status(rpc_status, NFS_STATUS(res)); UPDATE_INODE_WCC(inode, res->COMMIT3res_u.resok.file_wcc); @@ -1437,7 +1437,6 @@ static void write_iov_callback( /* * We are issuing the commit rpc, so set the commit_in_progress flag. 
*/ - inode->set_commit_in_progress(); inode->commit_membufs(); } inode->flush_unlock(); @@ -1529,17 +1528,25 @@ void rpc_task::issue_write_rpc() args.offset = offset; args.count = length; args.stable = inode->is_stable_write() ? FILE_SYNC : UNSTABLE; - set_task_csched(inode->is_stable_write()); + + /* + * Unstable writes want to make use of multiple connections for better perf. + * Note that they aren't affected by the optimistic concurrency backoff + * issues as seen by stable writes. + */ + if (!inode->is_stable_write()) { + set_csched(CONN_SCHED_RR); + } do { rpc_retry = false; stats.on_rpc_issue(); if (rpc_nfs3_writev_task(get_rpc_ctx(), - write_iov_callback, &args, - bciov->iov, - bciov->iovcnt, - this) == NULL) { + write_iov_callback, &args, + bciov->iov, + bciov->iovcnt, + this) == NULL) { stats.on_rpc_cancel(); /* * Most common reason for this is memory allocation failure, @@ -2606,46 +2613,53 @@ static void perform_inline_writes(struct rpc_task *task, * flush more data we can just wait for the ongoing flush/commit * to complete. */ - const uint64_t flush_now = inode->get_filecache()->get_inline_flush_bytes(); + const uint64_t flush_now = + inode->get_filecache()->get_inline_flush_bytes(); - if (flush_now == 0 && !inode->is_stable_write()) { - AZLogInfo("No bytes to flush, Wait for ongoing flush/commit " + if ((flush_now == 0) && !inode->is_stable_write()) { + AZLogInfo("No new bytes to flush, Wait for ongoing flush/commit " "to complete"); /* - * In case of inline_write, we want to block the writes till the - * enough space is made for new writes. Every write thread will - * come here and try to take the lock, only one thread succeeds - * and wait for the ongoing flush/commit to complete. Rest of the - * threads will wait for the lock to be released by the first - * thread. After first thread completes the wait, check if any ongoing - * flush/commit is in progress and if it's still true. Mostly it will - * be false as new writes are blocked. - * - * Note: flush_lock() is held so that no new flush or commit task - * is started. - */ - inode->wait_for_ongoing_flush_commit(0, UINT64_MAX, task); + * get_inline_flush_bytes() tells us that we have no new data to flush + * inline, but since we are in memory pressure we need to slow down the + * writers by waiting till all the ongoing flush+commit complete. + * wait_for_ongoing_flush_commit() will complete task after all the + * ongoing flush and subsequent commit completes. + */ + inode->wait_for_ongoing_flush_commit(task); inode->flush_unlock(); return; } else { uint64_t size = 0; if (!inode->is_stable_write()) { + assert(flush_now > 0); /* - * We need to wait for the ongoing flush/commit to complete. + * We need to wait for the ongoing flush/commit to complete, as + * for unstable write case we only issue one active flush at a + * time. * This is only for the case where we have to prune some data * to make space for the new writes. Usually this will not be * happen so often, it's ok to wait here. + * + * TODO: See if we can eliminate this wait as it blocks the fuse + * thread. */ - AZLogDebug("Wait for ongoing flush/commit to complete."); - inode->wait_for_ongoing_flush_commit(0, UINT64_MAX); + AZLogWarn("Wait for ongoing flush/commit to complete!!"); + inode->wait_for_ongoing_flush_commit(); + /* + * We already waited for flush and commit to complete and we have + * the flush_lock held, so no new flush and commit can start. 
+ */ assert(!inode->is_commit_in_progress()); assert(!inode->get_filecache()->is_flushing_in_progress()); /* * If the write is not stable, we need to copy the application * data into the membufs and issue the writes to the backend. + * If we don't find flush_now contiguous bytes, then we switch + * to stable write. */ bc_vec = inode->get_filecache()->get_contiguous_dirty_bcs(size); if (size >= flush_now) { @@ -2657,14 +2671,8 @@ static void perform_inline_writes(struct rpc_task *task, * sync_membufs(), but add the task to the commit_waiters list. */ inode->sync_membufs(bc_vec, false /* is_flush */, nullptr); + inode->schedule_or_start_commit(); - /* - * Set the commit_in_progress flag to ensure that the commit - * task is started after the flush completes. - * Whether we hit commit limit or not, we need to start commit - * as we need to release the memory. - */ - inode->mark_commit_in_progress(); inode->flush_unlock(); inode->add_task_to_commit_waiters(task); @@ -2675,11 +2683,8 @@ static void perform_inline_writes(struct rpc_task *task, if (!inode->is_stable_write()) { AZLogWarn("Not enough contiguous dirty data to flush, may be " "pattern is not sequential. flush_now={}, size={}", - flush_now, size); + flush_now, size); - /* - * Let's switch to stable write mode. - */ inode->switch_to_stable_write(); } @@ -2687,6 +2692,7 @@ static void perform_inline_writes(struct rpc_task *task, * If the write is stable, we can directly write to the * membufs and issue the writes to the backend. */ + assert(inode->is_stable_write()); bc_vec = inode->get_filecache()->get_dirty_nonflushing_bcs_range(); if (!bc_vec.empty()) { /* @@ -2724,205 +2730,6 @@ static void perform_inline_writes(struct rpc_task *task, assert(false); } -static bool handle_writes(struct nfs_inode *inode, - struct rpc_task *task, - const fuse_ino_t ino, - const bool sparse_write, - uint64_t extent_left, - uint64_t extent_right) -{ - assert(task->magic == RPC_TASK_MAGIC); - assert(task->rpc_api->write_task.is_fe()); - assert(task->get_op_type() == FUSE_WRITE); - - const size_t length = task->rpc_api->write_task.get_size(); - - /* - * Check what kind of limit we have hit. - */ - const bool need_inline_write = - (sparse_write || inode->get_filecache()->do_inline_write()); - const bool need_commit = - inode->get_filecache()->commit_required(); - const bool need_flush = - inode->get_filecache()->flush_required(); - - AZLogDebug("[{}] handle write (sparse={}, need_inline_write={}, need_commit={}," - " need_flush={} )", ino, sparse_write, need_inline_write, - need_commit, need_flush); - - /* - * Nothing to do, we can complete the application write rightaway. - */ - if (!need_inline_write && !need_commit && !need_flush) { - task->reply_write(length); - return true; - } - - /* - * We have successfully copied the user data into the cache. - * We can have the following cases: - * 1. This new write caused a contiguous extent to be greater than - * max_dirty_extent_bytes(), aka MDEB. - * In this case we begin flush of this contiguous extent (in multiple - * parallel wsize sized blocks) since there's no benefit in waiting more - * as the data is sufficient for the server scheduler to effectively - * write, in optimal sized blocks. - * We complete the application write rightaway without waiting for the - * flush to complete as we are not under memory pressure. - * This will happen when user is sequentially writing to the file. - * 2. A single contiguous extent is not greater than MDEB but total dirty - * bytes waiting to be flushed is more than MDEB. 
- * In this case we begin flush of the entire dirty data from the cache - * as we have sufficient data for the server scheduler to perform - * batched writes effectively. - * We complete the application write rightaway without waiting for the - * flush to complete as we are not under memory pressure. - * This will happen when user is writing to the file in a random or - * pseudo-sequential fashion. - * 3. Cache has dirty data beyond the "inline write" threshold, ref - * do_inline_write(). - * In this case we begin flush of the entire dirty data. - * This is a memory pressure situation and hence we do not complete the - * application write till all the backend writes complete. - * This will happen when user is writing faster than our backend write - * throughput, eventually dirty data will grow beyond the "inline write" - * threshold and then we have to slow down the writers by delaying - * completion. - * - * Other than this we have a special case of "write beyond eof" (termed - * sparse write). In sparse write case also we perform "inline write" of - * all the dirty data. This is needed for correct read behaviour. Imagine - * a reader reading from the sparse part of the file which is not yet in - * the bytes_chunk_cache. This read will be issued to the server and since - * server doesn't know the updated file size (as the write may still be - * sitting in our bytes_chunk_cache) it will return eof. This is not correct - * as such reads issued after successful write, are valid and should return - * 0 bytes for the sparse range. - */ - - // Case 1: Inline writes - /* - * Do we need to perform "inline write"? - * Inline write implies, we flush all the dirty data and wait for all the - * corresponding backend writes to complete. - */ - if (need_inline_write) { - INC_GBL_STATS(inline_writes, 1); - - // Free up the fuse thread without completing the application write. - perform_inline_writes(task, inode); - return true; - } - - // Case 2: Commit - /* - * We don't need to do inline writes. See if we need to commit the dirty - * data to the backend - */ - if (need_commit) { - inode->flush_lock(); - /* - * Set the commit_in_progress flag to ensure that the commit - * task is startet. - */ - inode->mark_commit_in_progress(); - - // XXX What if commit completes before we reach here? - assert(inode->is_commit_in_progress()); - - inode->flush_unlock(); - assert(!need_inline_write); - } - - // Case 3: Flush - if (need_flush) { - /* - * We need to flush the dirty data in the range [extent_left, extent_right), - * get the membufs for the dirty data. - * We don't want to run over an inprogress truncate and resetting the file - * size set by truncate, so grab the is_flushing lock. - */ - inode->flush_lock(); - - /* - * If flush is required but flush/commit already in progress, then - * add this task to the respective waiters list and return. - * As we already 1GB worth of dirty data in the cache, we don't want - * to add more data to the cache. So we wait for the ongoing flush/commit. - * - * XXX What if is_commit_in_progress() returns false but some other - * thread starts commit right after that? - * Note that we hold flush_lock which only protects us against some - * other thread starting a flush but commit is protected by - * commit_lock_5. - */ - if (inode->is_commit_in_progress()) { - /* - * If commit is in progress, we need to wait for the commit - * to complete before we can proceed with the write. 
- */ - inode->add_task_to_commit_waiters(task); - inode->flush_unlock(); - return true; - } - - if (inode->get_filecache()->is_flushing_in_progress()) { - /* - * If flush is in progress, we need to wait for the flush to - * complete before we can proceed with the write. - */ - inode->add_task_to_flush_waiters(task); - inode->flush_unlock(); - return true; - } - - /* - * If we are here, then we need to flush the dirty data to the backend. - * We don't need to wait for the flush to complete, we can complete the - * application write rightaway. - */ - uint64_t size = 0; - std::vector bc_vec; - - /* - * If the inode is stable write, then get the dirty membufs in the range. - * otherwise get the dirty membufs contigious range. - */ - if (inode->is_stable_write()) { - bc_vec = - inode->get_filecache()->get_dirty_nonflushing_bcs_range(extent_left, - extent_right); - } else { - bc_vec = inode->get_filecache()->get_contiguous_dirty_bcs(size); - } - - if (bc_vec.size() == 0) { - inode->flush_unlock(); - task->reply_write(length); - return true; - } - - /* - * Pass is_flush as false, since we don't want the writes to complete - * before returning. - */ - inode->sync_membufs(bc_vec, false /* is_flush */); - inode->flush_unlock(); - - // Send reply to original request without waiting for the backend write to complete. - task->reply_write(length); - return true; - } - - /* - * If we reached true, then we have need_commit true, but need_flush false. - */ - assert(need_commit && !need_flush); - task->reply_write(length); - return true; -} - void rpc_task::run_write() { // This must be called only for front end tasks. @@ -2933,6 +2740,9 @@ void rpc_task::run_write() const size_t length = rpc_api->write_task.get_size(); struct fuse_bufvec *const bufv = rpc_api->write_task.get_buffer_vector(); const off_t offset = rpc_api->write_task.get_offset(); + /* + * TODO: Fix sparse writes. + */ const bool sparse_write = false; uint64_t extent_left = 0; uint64_t extent_right = 0; @@ -3022,6 +2832,50 @@ void rpc_task::run_write() assert(extent_right >= (extent_left + length)); + /* + * We have successfully copied the user data into the cache. + * We can have the following cases (in decreasing order of criticality): + * 1. Cache has dirty+uncommitted data beyond the "inline write threshold", + * ref do_inline_write(). + * In this case we begin flush of the dirty data and/or commit of the + * uncommitted data. + * This is a memory pressure situation and hence we do not complete the + * application write till all the above backend writes complete. + * This will happen when application is writing faster than our backend + * write throughput, eventually dirty data will grow beyond the "inline + * write threshold" and then we have to slow down the writers by delaying + * completion. + * 2. Cache has uncommitted data beyond the "commit threshold", ref + * commit_required(). + * In this case we free up space in the cache by committing data. + * We just initiate a commit while the current write request is + * completed. Note that we want to delay commits so that we can reduce + * the commit calls (commit more data per call) as commit calls sort + * of serialize the writes as we cannot send any other write to the + * server while commit is on. + * 3. Cache has enough dirty data that we can flush. + * For sequential writes, this means the new write caused a contiguous + * extent to be greater than max_dirty_extent_bytes(), aka MDEB, while + * for non-seq writes it would mean that the total dirty data grew beyond + * MDEB. 
+ * In this case we begin flush of this contiguous extent (in multiple
+ * parallel wsize sized blocks) since there's no benefit in waiting longer,
+ * as the data is sufficient for the server scheduler to write effectively
+ * in optimal sized blocks.
+ * We complete the application write right away without waiting for the
+ * flush to complete as we are not under memory pressure.
+ *
+ * Other than this we have a special case of "write beyond eof" (termed
+ * sparse write). In the sparse write case also we perform "inline write" of
+ * all the dirty data. This is needed for correct read behaviour. Imagine
+ * a reader reading from the sparse part of the file which is not yet in
+ * the bytes_chunk_cache. This read will be issued to the server and since
+ * the server doesn't know the updated file size (as the write may still be
+ * sitting in our bytes_chunk_cache) it will return eof. This is not correct
+ * as such reads issued after a successful write are valid and should return
+ * 0 bytes for the sparse range.
+ */
+
 /*
 * If the extent size exceeds the max allowed dirty size as returned by
 * max_dirty_extent_bytes(), then it's time to flush the extent.
@@ -3038,10 +2892,31 @@ void rpc_task::run_write()
 assert(max_dirty_extent > 0);
 
 /*
- * Ok, we neither need inline writes nor commit in progress. See if we have
- * enough dirty data and we need to start async flush.
+ * Check what kind of limit we have hit.
 */
- if ((extent_right - extent_left) < max_dirty_extent) {
+ const bool need_inline_write =
+ (sparse_write || inode->get_filecache()->do_inline_write());
+ const bool need_commit =
+ inode->get_filecache()->commit_required();
+ const bool need_flush =
+ ((extent_right - extent_left) >= max_dirty_extent) ||
+ inode->get_filecache()->flush_required();
+
+ AZLogDebug("[{}] handle write (sparse={}, need_inline_write={}, "
+ "need_commit={}, need_flush={}, extent=[{}, {}))",
+ inode->get_fuse_ino(), sparse_write, need_inline_write,
+ need_commit, need_flush, extent_left, extent_right);
+
+ /*
+ * Nothing to do, we can complete the application write right away.
+ * This should be the happy path!
+ */
+ if (!need_inline_write && !need_commit && !need_flush) {
+ reply_write(length);
+ return;
+ }
+
+ if (need_flush && ((extent_right - extent_left) < max_dirty_extent)) {
 /*
 * This is the case of non-sequential writes causing enough dirty
 * data to be accumulated, need to flush all of that.
@@ -3050,12 +2925,139 @@ void rpc_task::run_write()
 extent_right = UINT64_MAX;
 }
 
- if (handle_writes(inode, this, ino, sparse_write,
- extent_left, extent_right)) {
+
+ // Case 1: Inline writes
+ /*
+ * Do we need to perform "inline write"?
+ * Inline write implies we flush dirty data and commit uncommitted data
+ * and wait for all the corresponding backend writes to complete.
+ */
+ if (need_inline_write) {
+ INC_GBL_STATS(inline_writes, 1);
+
+ // Free up the fuse thread without completing the application write.
+ perform_inline_writes(this, inode);
 return;
- } else {
- assert(false);
 }
+
+ // Case 2: Commit
+ /*
+ * We don't need to do inline writes. See if we need to commit the
+ * uncommitted data to the backend. We just need to start the commit
+ * and not hold the current write task till the commit completes.
+ */
+ if (need_commit) {
+ inode->flush_lock();
+
+ /*
+ * Start commit if no flushing is in progress, else mark commit
+ * pending, so that the last flush can then start the commit.
+ */
+ inode->schedule_or_start_commit();
+
+ /*
+ * Note: Commit request sent by the above call can potentially
+ * complete before we reach here. In that case the following
+ * assert will fail. It's very unlikely, so leave the assert
+ * as it's useful.
+ */
+ assert(inode->is_commit_in_progress());
+
+ inode->flush_unlock();
+ assert(!need_inline_write);
+ }
+
+ // Case 3: Flush
+ if (need_flush) {
+ /*
+ * We need to flush the dirty data in the range [extent_left, extent_right),
+ * so get the membufs for the dirty data.
+ * We don't want to run over an in-progress truncate and reset the file
+ * size set by truncate, so grab the is_flushing lock.
+ */
+ inode->flush_lock();
+
+ /*
+ * If flush is required but flush/commit already in progress, then
+ * add this task to the respective waiters list and return.
+ * As we already have 1GB worth of dirty data in the cache, we don't
+ * want to add more data to the cache. So we wait for the ongoing
+ * flush/commit.
+ *
+ * XXX What if is_commit_in_progress() returns false but some other
+ * thread starts a commit right after that?
+ * Note that we hold flush_lock which only protects us against some
+ * other thread starting a flush, but commit is protected by
+ * commit_lock_5.
+ */
+ if (inode->is_commit_in_progress()) {
+ inode->flush_unlock();
+ /*
+ * If commit is in progress, we need to wait for the commit
+ * to complete before we can complete the write.
+ */
+ inode->add_task_to_commit_waiters(this);
+ return;
+ }
+
+ if (!inode->is_stable_write() &&
+ inode->get_filecache()->is_flushing_in_progress()) {
+ inode->flush_unlock();
+ /*
+ * If flush is in progress, we need to wait for the flush to
+ * complete before we can proceed with the write.
+ * Note that for unstable writes we don't start a new flush if
+ * there's an ongoing one, but for stable writes it's fine to start
+ * another flush.
+ */
+ inode->add_task_to_flush_waiters(this);
+ return;
+ }
+
+ /*
+ * If we are here, then we need to flush the dirty data to the backend.
+ * We don't need to wait for the flush to complete, we can complete the
+ * application write right away.
+ */
+ uint64_t size = 0;
+ std::vector bc_vec;
+
+ /*
+ * If the inode is stable write, then get the dirty membufs in the range,
+ * otherwise get the contiguous range of dirty membufs.
+ */
+ if (inode->is_stable_write()) {
+ bc_vec =
+ inode->get_filecache()->get_dirty_nonflushing_bcs_range(extent_left,
+ extent_right);
+ } else {
+ bc_vec = inode->get_filecache()->get_contiguous_dirty_bcs(size);
+ }
+
+ if (bc_vec.size() == 0) {
+ inode->flush_unlock();
+ reply_write(length);
+ return;
+ }
+
+ /*
+ * Pass is_flush as false, since we don't want to wait for the writes to
+ * complete before returning.
+ */
+ inode->sync_membufs(bc_vec, false /* is_flush */);
+ inode->flush_unlock();
+
+ // Send reply to the original request without waiting for the backend write to complete.
+ reply_write(length);
+ return;
+ }
+
+ /*
+ * If we reached here, then we have need_commit true, but need_flush false.
+ */ + assert(need_commit && !need_flush); + reply_write(length); + return; } void rpc_task::run_flush() From 2a9f3b686ccb725c1ce4e6971ca9765c1ff1997c Mon Sep 17 00:00:00 2001 From: Nitin Singla Date: Thu, 30 Jan 2025 14:57:53 +0000 Subject: [PATCH 8/9] Some fixes about deadlck --- turbonfs/inc/file_cache.h | 2 +- turbonfs/inc/nfs_inode.h | 11 ++++++++--- turbonfs/src/nfs_inode.cpp | 14 ++++++++++---- turbonfs/src/rpc_task.cpp | 15 ++++++--------- 4 files changed, 25 insertions(+), 17 deletions(-) diff --git a/turbonfs/inc/file_cache.h b/turbonfs/inc/file_cache.h index 38afcc89..867abe50 100644 --- a/turbonfs/inc/file_cache.h +++ b/turbonfs/inc/file_cache.h @@ -1632,7 +1632,7 @@ class bytes_chunk_cache * the second one. */ static const uint64_t max_dirty_allowed_per_cache = - (max_dirty_extent_bytes() * 2); + (max_dirty_extent_bytes() * 4); const bool local_pressure = ((int64_t) get_bytes_to_flush() > (int64_t) max_dirty_allowed_per_cache); diff --git a/turbonfs/inc/nfs_inode.h b/turbonfs/inc/nfs_inode.h index e33e61f9..35e8ff54 100644 --- a/turbonfs/inc/nfs_inode.h +++ b/turbonfs/inc/nfs_inode.h @@ -559,7 +559,7 @@ struct nfs_inode * change the file size after truncate sets it. */ bool truncate_start(size_t size); - void truncate_end() const; + void truncate_end(); /** * This MUST be called only after has_filecache() returns true, else @@ -989,7 +989,7 @@ struct nfs_inode */ if (!non_append_writes_seen && (offset != attr.st_size)) { non_append_writes_seen = true; - AZLogInfo("[{}] Non-append write seen [{}, {}), file size: {}", + AZLogDebug("[{}] Non-append write seen [{}, {}), file size: {}", ino, offset, offset+length, attr.st_size); } @@ -1512,7 +1512,12 @@ struct nfs_inode * Lock the inode for flushing. */ void flush_lock() const; - void flush_unlock() const; + void flush_unlock(); + + bool flushtry_lock() + { + return !is_flushing.exchange(true); + } /** * Revalidate the inode. diff --git a/turbonfs/src/nfs_inode.cpp b/turbonfs/src/nfs_inode.cpp index 4776069f..bd7ce930 100644 --- a/turbonfs/src/nfs_inode.cpp +++ b/turbonfs/src/nfs_inode.cpp @@ -980,7 +980,6 @@ void nfs_inode::schedule_or_start_commit() { assert(is_flushing); assert(!is_stable_write()); - /* * We want to commit, but we need to wait for current flushing to complete. * 1. If flushing is going on, set commit_pending for the inode, when last @@ -993,6 +992,7 @@ void nfs_inode::schedule_or_start_commit() */ if (!is_commit_in_progress()) { if (get_filecache()->is_flushing_in_progress()) { + AZLogWarn("[{}] Flushing in progress, setting commit_pending", ino); set_commit_pending(); } else { commit_membufs(); @@ -1233,6 +1233,10 @@ int nfs_inode::wait_for_ongoing_flush_commit(struct rpc_task *task) * we come here we should have flushing going on. 
*/ assert(!get_filecache()->is_flushing_in_progress()); + if (is_commit_pending()) { + AZLogInfo("[{}] Commit is pending, we need to start the commit", ino); + commit_membufs(); + } wait_for_ongoing_commit(); } @@ -1497,7 +1501,7 @@ void nfs_inode::flush_lock() const return; } -void nfs_inode::flush_unlock() const +void nfs_inode::flush_unlock() { AZLogDebug("[{}] flush_unlock() called", ino); @@ -1506,7 +1510,9 @@ void nfs_inode::flush_unlock() const */ assert(has_filecache()); assert(is_flushing == true); - + if (!filecache_handle->is_flushing_in_progress() && is_commit_pending()) { + commit_membufs(); + } { std::unique_lock _lock(iflush_lock_3); is_flushing = false; @@ -1516,7 +1522,7 @@ void nfs_inode::flush_unlock() const flush_cv.notify_one(); } -void nfs_inode::truncate_end() const +void nfs_inode::truncate_end() { AZLogDebug("[{}] truncate_end() called", ino); diff --git a/turbonfs/src/rpc_task.cpp b/turbonfs/src/rpc_task.cpp index 62d3f5fa..c458955d 100644 --- a/turbonfs/src/rpc_task.cpp +++ b/turbonfs/src/rpc_task.cpp @@ -1430,16 +1430,13 @@ static void write_iov_callback( * completes, we need to start the commit. */ if (!inode->get_filecache()->is_flushing_in_progress()) { - inode->flush_lock(); - if (inode->is_commit_pending()) { - assert(!inode->is_stable_write()); - - /* - * We are issuing the commit rpc, so set the commit_in_progress flag. - */ - inode->commit_membufs(); + if(inode->flushtry_lock()) { + if (!inode->get_filecache()->is_flushing_in_progress() && + inode->is_commit_pending()) { + inode->commit_membufs(); + } + inode->flush_unlock(); } - inode->flush_unlock(); } /* From e7fa84484a13a6306a90948b20425ab5ffaa5864 Mon Sep 17 00:00:00 2001 From: Nagendra Tomar Date: Fri, 31 Jan 2025 10:31:01 +0000 Subject: [PATCH 9/9] Some comment changes. --- turbonfs/src/nfs_inode.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/turbonfs/src/nfs_inode.cpp b/turbonfs/src/nfs_inode.cpp index bd7ce930..50886418 100644 --- a/turbonfs/src/nfs_inode.cpp +++ b/turbonfs/src/nfs_inode.cpp @@ -1307,12 +1307,11 @@ int nfs_inode::flush_cache_and_wait() } /* - * Grab the inode is_flushing lock to ensure that we doen't initiate - * any new flush operation while some truncate call is in progress - * (which must have taken the is_flushing lock). - * Once flush_lock() returns we have the is_flushing lock and we are + * Grab the flush_lock to ensure that we don't initiate any new flushes + * while some truncate call is in progress (which must have taken the + * flush_lock). Once flush_lock() returns we have the lock and we are * guaranteed that no new truncate operation can start till we release - * the is_flushing lock. We can safely start the flush then. + * the flush_lock. We can safely start the flush then. */ flush_lock();