From 45b453b229940e2ae1a9cf8b6701a71b77a7ff3b Mon Sep 17 00:00:00 2001 From: Nitin Singla Date: Sat, 25 Jan 2025 18:38:50 +0000 Subject: [PATCH 1/9] First cut changes for commit. --- turbonfs/inc/file_cache.h | 104 ++++++++ turbonfs/inc/nfs_inode.h | 75 +++++- turbonfs/inc/rpc_task.h | 1 + turbonfs/src/file_cache.cpp | 115 +++++++++ turbonfs/src/nfs_inode.cpp | 472 ++++++++++++++++++++++++++++++----- turbonfs/src/rpc_task.cpp | 484 +++++++++++++++++++++++++++++++----- 6 files changed, 1126 insertions(+), 125 deletions(-) diff --git a/turbonfs/inc/file_cache.h b/turbonfs/inc/file_cache.h index 33729300..be7a8ac0 100644 --- a/turbonfs/inc/file_cache.h +++ b/turbonfs/inc/file_cache.h @@ -237,6 +237,9 @@ struct membuf std::vector *flush_waiters = nullptr; std::mutex flush_waiters_lock_44; + std::vector *commit_waiters = nullptr; + std::mutex commit_waiters_lock_44; + /** * get_flush_waiters() returns the list of tasks waiting for this * membuf to be flushed to the Blob. All these tasks must be completed @@ -247,6 +250,16 @@ struct membuf */ std::vector get_flush_waiters(); + /** + * get_commit_waiters() returns the list of tasks waiting for this + * membuf to be committed to the Blob. All these tasks must be completed + * appropriately when membuf commit completes (success or failure). + * + * Note: This splices the commit_waiters list and returns, and hence + * it's not idempotent. + */ + std::vector get_commit_waiters(); + /* * If is_file_backed() is true then 'allocated_buffer' is the mmap()ed * address o/w it's the heap allocation address. @@ -1222,6 +1235,11 @@ class bytes_chunk_cache uint64_t length, struct rpc_task *task); + bool add_commit_waiter(uint64_t offset, + uint64_t length, + struct rpc_task *task); + + /* * Returns all dirty chunks for a given range in chunkmap. * Before returning it increases the inuse count of underlying membuf(s). @@ -1259,6 +1277,16 @@ class bytes_chunk_cache std::vector get_dirty_nonflushing_bcs_range( uint64_t st_off = 0, uint64_t end_off = UINT64_MAX) const; + /** + * Returns contiguous dirty (and not flushing) chunks from chunmap, starting + * with the lowest dirty offset, and returns the total number of (dirty) + * bytes contained in the returned chunks. + * Before returning it increases the inuse count of underlying membuf(s). + * Caller will typically flush these to the backing Blob as UNSTABLE + * writes. + */ + std::vector get_contiguous_dirty_bcs(uint64_t& size) const; + /* * Returns all dirty chunks which are currently flushing for a given range * in chunkmap. Before returning it increases the inuse count of underlying @@ -1384,6 +1412,36 @@ class bytes_chunk_cache return std::min(max_dirty_extent, uint64_t(1024 * 1024 * 1024ULL)); } + /* + * Maximum size of commit bytes that can be pending in cache, before we + * should commit it to Blob. + * It should be greater than max_dirty_extent_bytes() and smaller than + * inline_dirty_threshold. So, that inline pruning can be avoided. + * This is 60% of the allowed cache size. + * f.e = Cache size of 4GB then max_commit_bytes = 2.4GB + * - Flush will start every 1GB dirty data and each 1GB dirty data + * converted to commit_pending_bytes. + * + * This is 60% of the allowed cache size. + */ + uint64_t max_commit_bytes() const + { + // Maximum cache size allowed in bytes. 
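+        //
+        // Illustrative arithmetic (assuming a 4GiB cache and
+        // AZNFSCFG_WSIZE_MAX much smaller than 1GiB):
+        // max_dirty_extent_bytes() = min(0.6 * 4GiB, 1GiB) = 1GiB, so
+        // max_commit_bytes = (uint64_t(4GiB * 0.6) / 1GiB) * 1GiB = 2GiB,
+        // min_commit_bytes = 2GiB - AZNFSCFG_WSIZE_MAX, and the value
+        // returned below is 2GiB - AZNFSCFG_WSIZE_MAX.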
+ static const uint64_t max_total = + (aznfsc_cfg.cache.data.user.max_size_mb * 1024 * 1024ULL); + assert(max_total != 0); + + static const uint64_t max_commit_bytes = + std::max((((uint64_t)(max_total * 0.6))/ max_dirty_extent_bytes()) * max_dirty_extent_bytes(), + (uint64_t) (2 * AZNFSCFG_WSIZE_MAX)); + + static const uint64_t min_commit_bytes = + std::max(((2 * max_dirty_extent_bytes()) - AZNFSCFG_WSIZE_MAX), + (uint64_t) (2 * AZNFSCFG_WSIZE_MAX)); + + return std::max((max_commit_bytes - AZNFSCFG_WSIZE_MAX), min_commit_bytes); + } + /** * Get the amount of dirty data that needs to be flushed. * This excludes the data which is already flushing. @@ -1432,6 +1490,52 @@ class bytes_chunk_cache return bytes_flushing > 0; } + /* + * Check if bytes_commit_pending is equal or more than + * max_commit_bytes limit. + */ + bool commit_required() const + { + return (bytes_commit_pending >= max_commit_bytes()); + } + + /* + * get_bytes_to_prune() returns the number of bytes that need to be flushed + * inline to free up the space. If there are enough bytes_flushing then we + * can just wait for them to complete. + * + * Note : We are not considering bytes_commit_pending in this calculation. + * If bytes_commit_pending are high then commit already started and + * if bytes_flushing are high once flushing is done commit will be + * triggered. + */ + uint64_t get_bytes_to_prune() const + { + static const uint64_t max_dirty_allowed_per_cache = + max_dirty_extent_bytes() * 2; + int64_t total_bytes = + std::max(int64_t(bytes_dirty - bytes_flushing), int64_t(0)); + const bool local_pressure = total_bytes > (int64_t)max_dirty_allowed_per_cache; + + if (local_pressure) { + return std::max(int64_t(0), (int64_t)(total_bytes - max_dirty_extent_bytes())); + } + + /* + * Global pressure is when get_prune_goals() returns non-zero bytes + * to be pruned inline. + */ + uint64_t inline_bytes; + + /* + * TODO: Noisy neighbor syndrome, where one file is hogging the cache, + * inline pruning will be triggered for other files. + */ + get_prune_goals(&inline_bytes, nullptr); + return std::max(int64_t(inline_bytes - (bytes_flushing + bytes_commit_pending)), + (int64_t)0); + } + /** * This should be called by writer threads to find out if they must wait * for the write to complete. This will check both the cache specific and diff --git a/turbonfs/inc/nfs_inode.h b/turbonfs/inc/nfs_inode.h index bef85bb9..959e88b5 100644 --- a/turbonfs/inc/nfs_inode.h +++ b/turbonfs/inc/nfs_inode.h @@ -279,6 +279,29 @@ struct nfs_inode */ struct stat attr; + /** + * We maintain following multiple views of the file and thus multiple file + * sizes for those views. + * - Cached. + * This is the view of the file that comprises of data that has been + * written by the application and saved in file cache. It may or may not + * have been flushed and/or committed. This is the most uptodate view of + * the file and applications must use this view. + * cached_filesize tracks the file size for this view. + * - Uncommited. + * This is the view of the file that tracks data that has been flushed + * using UNSTABLE writes but not yet COMMITted to the Blob. This view of + * the file is only used to see if the next PB call will write after the + * last PB'ed byte and thus can be appended. + * putblock_filesize tracks the file size for this view. + * - Committed. + * This is the view of the file that tracks data committed to the Blob. + * Other clients will see this view. + * attr.st_size tracks the file size for this view. 
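+     *
+     * For a file that only grows through this client (the append pattern
+     * that unstable writes target), one would expect
+     * cached_filesize >= putblock_filesize >= attr.st_size, since data is
+     * cached before it is PB'ed and PB'ed before it is committed. This
+     * ordering is an informal expectation and is not asserted anywhere.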
+ */ + off_t cached_filesize = 0; + off_t putblock_filesize = 0; + /* * Has this inode seen any non-append write? * This starts as false and remains false as long as copy_to_cache() only @@ -384,6 +407,13 @@ struct nfs_inode */ int write_error = 0; + /* + * Last offset and length of the last flushing operation. + * This is used add task waiting for flush/commit to complete. + */ + uint64_t last_flushing_offset = 0; + uint64_t last_flushing_length = 0; + /* * Commit state for this inode. * This is used to track the state of commit operation for this inode, which @@ -418,6 +448,12 @@ struct nfs_inode std::atomic commit_state = commit_state_t::COMMIT_NOT_NEEDED; + /* + * commit_lock_5 is used to synchronize flush thread and write thread + * for commit operation. + */ + std::mutex commit_lock_5; + /** * TODO: Initialize attr with postop attributes received in the RPC * response. @@ -1319,17 +1355,50 @@ struct nfs_inode * progress (which must have held the is_flushing lock). */ int flush_cache_and_wait(uint64_t start_off = 0, - uint64_t end_off = UINT64_MAX); + uint64_t end_off = UINT64_MAX, + struct rpc_task *parent_task = nullptr); + + void mark_commit_in_progress(); /** * Wait for currently flushing membufs to complete. + * If parent_task is non-null, then it's added to the commit_waiters list + * and returned. Otherwise it waits for the ongoing flush to complete. + * * Returns 0 on success and a positive errno value on error. * * Note : Caller must hold the inode is_flushing lock to ensure that * no new membufs are added till this call completes. */ - int wait_for_ongoing_flush(uint64_t start_off = 0, - uint64_t end_off = UINT64_MAX); + int wait_for_ongoing_flush_commit(uint64_t start_off = 0, + uint64_t end_off = UINT64_MAX, + struct rpc_task *parent_task = nullptr); + + /* + * commit_membufs() is called to commit uncommitted membufs to the BLOB. + * It creates commit RPC and sends it to the NFS server. + */ + void commit_membufs(); + + /* + * switch_to_stable_write() is called to switch the inode to stable write + * mode. There is should be no ongoing commit/flusing operation when this + * is called. It creates a commit RPC to commit all the uncopmmitted membufs + * to the BLOB. + */ + void switch_to_stable_write(); + + /** + * Check if stable write is required for the given offset. + * Given offset is the start of contigious dirty membufs that need to be + * flushed to the BLOB. + */ + bool check_stable_write_required(off_t offset); + + /** + * Wait for ongoing commit operation to complete. + */ + void wait_for_ongoing_commit(); /** * Sync the dirty membufs in the file cache to the NFS server. diff --git a/turbonfs/inc/rpc_task.h b/turbonfs/inc/rpc_task.h index a7f049ae..c9fd4303 100644 --- a/turbonfs/inc/rpc_task.h +++ b/turbonfs/inc/rpc_task.h @@ -2387,6 +2387,7 @@ struct rpc_task */ bool add_bc(const bytes_chunk& bc); void issue_write_rpc(); + void issue_commit_rpc(); #ifdef ENABLE_NO_FUSE /* diff --git a/turbonfs/src/file_cache.cpp b/turbonfs/src/file_cache.cpp index 2b51abbb..d133ed7e 100644 --- a/turbonfs/src/file_cache.cpp +++ b/turbonfs/src/file_cache.cpp @@ -712,6 +712,36 @@ void membuf::clear_inuse() inuse--; } +std::vector membuf::get_commit_waiters() +{ + /* + * We cannot assert for !is_dirty() as in case of write failure + * we complete flush_waiters w/o clearing dirty flag. 
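+     *
+     * Typical use from a commit completion path (sketch, see
+     * complete_commit_waiter_tasks() in rpc_task.cpp):
+     *
+     *   for (struct rpc_task *t : mb->get_commit_waiters())
+     *       t->reply_write(t->rpc_api->write_task.get_size());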
+ */ + assert(is_inuse()); + assert(is_locked()); + assert(!is_flushing()); + assert(!is_commit_pending()); + + std::unique_lock _lock(commit_waiters_lock_44); + std::vector tasks; + + if (commit_waiters) { + // flush_waiters is dynamically allocated when first task is added. + assert(!commit_waiters->empty()); + tasks.swap(*commit_waiters); + assert(commit_waiters->empty()); + + /* + * Free the commit_waiters vector. + */ + delete commit_waiters; + commit_waiters = nullptr; + } + + return tasks; +} + std::vector membuf::get_flush_waiters() { /* @@ -2362,6 +2392,43 @@ std::vector bytes_chunk_cache::get_dirty_nonflushing_bcs_range( return bc_vec; } +std::vector bytes_chunk_cache::get_contiguous_dirty_bcs( + uint64_t& size) const +{ + std::vector bc_vec; + size = 0; + + // TODO: Make it shared lock. + const std::unique_lock _lock(chunkmap_lock_43); + auto it = chunkmap.lower_bound(0); + uint64_t prev_offset = AZNFSC_BAD_OFFSET; + + while (it != chunkmap.cend()) { + const struct bytes_chunk& bc = it->second; + struct membuf *mb = bc.get_membuf(); + + if (mb->is_dirty() && !mb->is_flushing()) { + if (prev_offset != AZNFSC_BAD_OFFSET) { + if (prev_offset != bc.offset) { + break; + } else { + size += bc.length; + prev_offset = bc.offset + bc.length; + } + } else { + size += bc.length; + prev_offset = bc.offset + bc.length; + } + mb->set_inuse(); + bc_vec.emplace_back(bc); + } + + ++it; + } + + return bc_vec; +} + bool bytes_chunk_cache::add_flush_waiter(uint64_t offset, uint64_t length, struct rpc_task *task) @@ -2409,6 +2476,54 @@ bool bytes_chunk_cache::add_flush_waiter(uint64_t offset, return false; } +bool bytes_chunk_cache::add_commit_waiter(uint64_t offset, + uint64_t length, + struct rpc_task *task) +{ + assert(offset < AZNFSC_MAX_FILE_SIZE); + assert(length > 0); + + // Only frontend write tasks must ever wait. + assert(task->magic == RPC_TASK_MAGIC); + assert(task->get_op_type() == FUSE_WRITE); + assert(task->rpc_api->write_task.is_fe()); + + const std::unique_lock _lock(chunkmap_lock_43); + auto it = chunkmap.lower_bound(offset); + struct bytes_chunk *last_bc = nullptr; + + // Get the last bc for the given range. + while (it != chunkmap.cend() && (it->first < (offset + length))) { + last_bc = &(it->second); + ++it; + } + + if (last_bc != nullptr) { + // membuf must have at least one byte in the requested range. + assert(last_bc->offset >= offset && + last_bc->offset < (offset + length)); + + struct membuf *mb = last_bc->get_membuf(); + + std::unique_lock _lock2(mb->commit_waiters_lock_44); + /* + * Only add flush waiters to dirty membufs, else it may have already + * completed flush and we don't want the task to be waiting forever. + */ + if (mb->is_flushing() || mb->is_commit_pending()) { + if (mb->commit_waiters == nullptr) { + mb->commit_waiters = new std::vector(); + } + + mb->commit_waiters->emplace_back(task); + return true; + } + } + + return false; +} + + std::vector bytes_chunk_cache::get_dirty_bc_range( uint64_t start_off, uint64_t end_off) const { diff --git a/turbonfs/src/nfs_inode.cpp b/turbonfs/src/nfs_inode.cpp index 1fb31583..dcb8b18d 100644 --- a/turbonfs/src/nfs_inode.cpp +++ b/turbonfs/src/nfs_inode.cpp @@ -375,6 +375,142 @@ int nfs_inode::get_actimeo_max() const } } +/* + * Caller should hold flush_lock(). 
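+ * The wait is a simple polling loop: it sleeps 1ms at a time until
+ * is_commit_in_progress() turns false; no condition variable is used.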
+ */ +void nfs_inode::wait_for_ongoing_commit() +{ + assert(is_flushing); + + while (is_commit_in_progress()) { + assert(!get_filecache()->is_flushing_in_progress()); + ::usleep(1000); + } + + assert(is_commit_in_progress() == false); +} + +/* + * This function is called with flush_lock() held. + * In normal case this function should be no-op as + * when we reach here all the previous flushes/commits + * should have been completed. But in case of committing + * at the close of file, we need to wait if any flush + * going on and commit all the dirty membufs which are + * not yet committed. + */ +void nfs_inode::switch_to_stable_write() +{ + assert(is_flushing); + AZLogInfo("[{}] Switching to stable write", ino); + + /* + * We should not be in commit in progress state. + * Switch_to_stable_write() is called by flush thread + * and it shouldn't be called when commit is in progress. + */ + assert(is_commit_in_progress() == false); + + /* + * switch_to_stable_write() called from following paths. + * 1. sync_membufs(), No new flush should be started if + * there is ongoing flush is in progress. + * 2. flush_cache_and_wait(), Wait for ongoing flush to + * complete. + * As we are allowing multiple flush in parallel in case + * of unstable write as well, we should wait for ongoing + * flush to complere before switching to stable write. + * switch_to_stable_write() is called by flush_lock() held + * so no new flush can be started. + */ + wait_for_ongoing_flush_commit(); + assert(!get_filecache()->is_flushing_in_progress()); + + /* + * Check if there is anything to commit, if not then + * switch to stable write. + */ + if (get_filecache()->get_bytes_to_commit() == 0) { + AZLogDebug("[{}] Nothing to commit, switching to stable write", ino); + set_stable_write(); + return; + } + + /* + * If there is something to commit, then commit it and + * wait for the commit to complete. + */ + std::unique_lock commit_lock(commit_lock_5); + assert(!is_commit_in_progress()); + set_commit_in_progress(); + commit_lock.unlock(); + + /* + * Issue the commit RPC to commit the dirty membufs. + */ + commit_membufs(); + + /* + * Wait for the commit to complete. + */ + wait_for_ongoing_commit(); + + assert(get_filecache()->get_bytes_to_commit() == 0); + assert(!get_filecache()->is_flushing_in_progress()); + assert(!is_commit_in_progress()); + + set_stable_write(); + return; +} + +/* + * This function checks, whether switch to stable write or not. + */ +bool nfs_inode::check_stable_write_required(off_t offset) +{ + /* + * If stable_write is already set, we don't need to do anything. + * We don't need lock here as once stable_write is set it's never + * unset. + */ + if (is_stable_write()) { + return false; + } + + /* + * If current write is not append write, then we can't go for unstable writes + * It may be overwrite to existing data and we don't have the knowldege of + * existing block list, it maye require read modified write. So, we can't go + * for unstable write. Similarly, if the offset is more than end of the file, + * we need to write zero block in between the current end of the file and the + * offset. + */ + if (putblock_filesize != offset) { + AZLogInfo("Stable write required as putblock_filesize:{} is not at the" + "offset:{}", putblock_filesize, offset); + + return true; + } + + return false; +} + +/** + * commit_membufs() is called by writer thread to commit flushed membufs. + */ +void nfs_inode::commit_membufs() +{ + /* + * Create the commit task to carry out the write. 
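+     * The commit is modelled as a FUSE_FLUSH task with a null fuse_req,
+     * so no fuse reply is sent for it; waiters are completed from
+     * commit_callback() via the membuf commit_waiters lists.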
+ */ + struct rpc_task *commit_task = + get_client()->get_rpc_task_helper()->alloc_rpc_task(FUSE_FLUSH); + commit_task->init_flush(nullptr /* fuse_req */, ino); + assert(commit_task->rpc_api->pvt == nullptr); + + commit_task->issue_commit_rpc(); +} + void nfs_inode::sync_membufs(std::vector &bc_vec, bool is_flush, struct rpc_task *parent_task) @@ -407,6 +543,24 @@ void nfs_inode::sync_membufs(std::vector &bc_vec, parent_task->num_ongoing_backend_writes = 1; } + /* + * New flush should not be started if flush/commit is in progress. + * But it may happen commit_is_pending() is set by another thread. + */ + assert(!is_commit_in_progress() || is_commit_pending()); + + if (bc_vec.empty()) { + return; + } + + /* + * If new offset is not at the end of the file, + * then we need to switch to stable write. + */ + if(check_stable_write_required(bc_vec[0].offset)) { + switch_to_stable_write(); + } + /* * Create the flush task to carry out the write. */ @@ -495,6 +649,12 @@ void nfs_inode::sync_membufs(std::vector &bc_vec, continue; } + /* + * Update the last flushing offset and length. + */ + last_flushing_offset = bc.offset; + last_flushing_length = bc.length; + if (write_task == nullptr) { write_task = get_client()->get_rpc_task_helper()->alloc_rpc_task(FUSE_WRITE); @@ -767,12 +927,69 @@ int nfs_inode::copy_to_cache(const struct fuse_bufvec* bufv, return err; } +void nfs_inode::mark_commit_in_progress() +{ + /* + * We want to commit, but we need to wait for current flushing to complete. + * 1. If flushing is going on, set commit_pending to inode, when last membuf flushed, + * write_iov_callback(), start new task to commit the data. + * 2. If flushing is not going on, then create the task and start commit of data. + * + * Note: Ensure lock is held till we set the commit state depending on flushing + * going on or not. This lock to synchronize the commit and flush task. + */ + bool start_commit_task = false; + + /* + * Take the flush_lock to ensure no new flush or commit task is started. + * commit_lock is held to synchronize the flush thread and this thread + * to set the commit state depending on flushing going on or not. + * + * Note: flush_lock can't be used as flush task can be started under the + * flush_lock (flush_cache_and_wait()) and it can cause deadlock. + * flush_cache_and_wait() is called under flush_lock and it checks + * if commit required, then it starts the commit task. + * + * Note: commit_lock held for very small duration, so it's safe to hold in + * fuse context. + */ + if (!is_commit_in_progress()) + { + std::unique_lock lock(commit_lock_5 /*commit_lock*/); + /* + * It may be possible that commit is in progress, so check again. + * Might be set by other thread. + */ + if (!is_commit_in_progress()) + { + if (get_filecache()->is_flushing_in_progress()) { + set_commit_pending(); + } else { + set_commit_in_progress(); + start_commit_task = true; + } + } + lock.unlock(); + } + + if (start_commit_task) { + commit_membufs(); + } +} + /** * Note: Caller should call with flush_lock() held. + * On return there is no ongoing flush/commit in progress. */ -int nfs_inode::wait_for_ongoing_flush(uint64_t start_off, uint64_t end_off) +int nfs_inode::wait_for_ongoing_flush_commit(uint64_t start_off, + uint64_t end_off, + struct rpc_task *task) { assert(start_off < end_off); + assert(!is_stable_write()); + + // NAND Gate + assert(!(is_commit_in_progress() && get_filecache()->is_flushing_in_progress())); // Caller must call us with is_flushing lock held. 
assert(is_flushing); @@ -796,78 +1013,141 @@ int nfs_inode::wait_for_ongoing_flush(uint64_t start_off, uint64_t end_off) /* * Flushing not in progress and no new flushing can be started as we hold * the flush_lock(). + * It may be possible that the flush is not in progress and but we may have + * commit_in_progress or commit_pending_bytes more than max_commit_bytes. + * In that case we should start the commit task if it's not already in + * progress. And wait for the commit to complete. */ - if (!get_filecache()->is_flushing_in_progress()) { + if (!get_filecache()->is_flushing_in_progress() && + !is_commit_in_progress() && + !get_filecache()->commit_required()) { AZLogDebug("[{}] No flush in progress, returning", ino); + + /* + * Complete the write task if it was passed. + */ + if (task) { + task->reply_write(task->rpc_api->write_task.get_size()); + } return 0; } /* - * Get the flushing bytes_chunk from the filecache handle. - * This will grab an exclusive lock on the file cache and return the list - * of flushing bytes_chunks at that point. Note that we can have new dirty - * bytes_chunks created but we don't want to wait for those. + * Try to get the flushing bytes_chunk from the filecache handle if the + * flush is in progress. */ - std::vector bc_vec = - filecache_handle->get_flushing_bc_range(start_off, end_off); + std::vector bc_vec = {}; + if (get_filecache()->is_flushing_in_progress()) { + /* + * Get the flushing bytes_chunk from the filecache handle. + * This will grab an exclusive lock on the file cache and return the list + * of flushing bytes_chunks at that point. Note that we can have new dirty + * bytes_chunks created but we don't want to wait for those. + */ + bc_vec = + filecache_handle->get_flushing_bc_range(start_off, end_off); + } + /* - * Our caller expects us to return only after the flush completes. + * If there is no task passed, then our caller expects us to return + * only after the flush completes (f.e. truncate, switch_to_stable_write). * Wait for all the membufs to flush and get result back. */ - for (bytes_chunk &bc : bc_vec) { - struct membuf *mb = bc.get_membuf(); + if (task == nullptr) { + for (bytes_chunk &bc : bc_vec) { + struct membuf *mb = bc.get_membuf(); - assert(mb != nullptr); - assert(mb->is_inuse()); + assert(mb != nullptr); + assert(mb->is_inuse()); + mb->set_locked(); - /* - * sync_membufs() would have taken the membuf lock for the duration - * of the backend wite that flushes the membuf, so once we get the - * lock we know that the flush write has completed. - */ - mb->set_locked(); + /* + * If still dirty after we get the lock, it may mean two things: + * - Write failed. + * - Some other thread got the lock before us and it made the + * membuf dirty again. + */ + if (mb->is_dirty() && get_write_error()) { + AZLogError("[{}] Flush [{}, {}) failed with error: {}", + ino, + bc.offset, bc.offset + bc.length, + get_write_error()); + } - /* - * If still dirty after we get the lock, it may mean two things: - * - Write failed. - * - Some other thread got the lock before us and it made the - * membuf dirty again. - */ - if (mb->is_dirty() && get_write_error()) { - AZLogError("[{}] Flush [{}, {}) failed with error: {}", - ino, - bc.offset, bc.offset + bc.length, - get_write_error()); - } + mb->clear_locked(); + mb->clear_inuse(); - mb->clear_locked(); - mb->clear_inuse(); + /* + * Release the bytes_chunk back to the filecache. + * These bytes_chunks are not needed anymore as the flush is done. 
+ * + * Note: We come here for bytes_chunks which were found dirty by the + * above loop. These writes may or may not have been issued by + * us (if not issued by us it was because some other thread, + * mostly the writer issued the write so we found it flushing + * and hence didn't issue). In any case since we have an inuse + * count, release() called from write_callback() would not have + * released it, so we need to release it now. + */ + filecache_handle->release(bc.offset, bc.length); + } + mark_commit_in_progress(); + } else { /* - * Release the bytes_chunk back to the filecache. - * These bytes_chunks are not needed anymore as the flush is done. - * - * Note: We come here for bytes_chunks which were found dirty by the - * above loop. These writes may or may not have been issued by - * us (if not issued by us it was because some other thread, - * mostly the writer issued the write so we found it flushing - * and hence didn't issue). In any case since we have an inuse - * count, release() called from write_callback() would not have - * released it, so we need to release it now. + * Add task to commit_waiters list. */ - filecache_handle->release(bc.offset, bc.length); + get_filecache()->add_commit_waiter(last_flushing_offset, + last_flushing_length, task); + mark_commit_in_progress(); + return 0; } - AZLogDebug("[{}] wait_for_ongoing_flush() returning with error: {}", - ino, get_write_error()); + + /* + * Wait for the ongoing commit to complete. + */ + wait_for_ongoing_commit(); + + /* + * Note: There can be bytes_commit_pending(), but that's fine + * as if bytes_commit_pending() reached max_commit_bytes() + * we would have started the commit task. As we held the + * flush_lock() no new flush can be started, which increases + * the bytes_commit_pending(). + */ + assert(is_commit_in_progress() == false); + assert(get_filecache()->get_bytes_to_flush() == 0); return get_write_error(); } /** - * Note: This takes shared lock on ilock_1. + * flush_cache_and_wait() is called only from the release/flush call. + * It's called with flush_lock() held. + * Things it does: + * - Before flushing it needs to wait for any ongoing commit to complete. + * - First it try to get membufs through get_contiguous_dirty_bcs() or + * get_dirty_bc_range() based on the unstable write or stable write. + * - Issue flush for the dirty membufs and wait for the flush to complete. + * - If it's unstable write, it issue commit for commit pending membufs. + * - Wait for the commit to complete. + * - Returns the error code if any. + * + * Note: Flush_cache_and_wait() block the fuse thread till the flush completes. + * It's called from the release(), flush() and getattr() calls. getattr() + * + * TODO: Need to add the support of completion of different calls + * (flush and release). */ -int nfs_inode::flush_cache_and_wait(uint64_t start_off, uint64_t end_off) +int nfs_inode::flush_cache_and_wait(uint64_t start_off, + uint64_t end_off, + struct rpc_task *parent_task) { + assert(is_flushing); + assert(start_off < end_off); + assert(!is_stable_write() || get_filecache()->get_bytes_to_commit() == 0); + assert(!is_stable_write() || is_commit_in_progress() == false); + /* * MUST be called only for regular files. * Leave the assert to catch if fuse ever calls flush() on non-reg files. @@ -906,14 +1186,62 @@ int nfs_inode::flush_cache_and_wait(uint64_t start_off, uint64_t end_off) */ flush_lock(); + /* + * If flush is in progress, wait for it to complete. 
+ * It may happen that last flush is still in progress and we need to wait + * for it to complete before we start the new flush. + * As we have the flush_lock() we are guaranteed that no new flush can be + * started till we release the flush_lock(). + * + * wait_for_ongoing_flush_commit() will wait for the ongoing flush/commt to complete. + * + * Note: For stable write case, we don't need to wait for the ongoing flush, + * as there is no commit required for stable write. + */ + if (!is_stable_write()) { + const int err = wait_for_ongoing_flush_commit(); + if (err != 0) { + AZLogError("[{}] Failed to flush cache to stable storage, " + "error={}", ino, err); + return err; + } + } + + if (get_filecache()->get_bytes_to_flush() == 0) { + AZLogDebug("[{}] Nothing to flush, returning", ino); + flush_unlock(); + return 0; + } + /* * Get the dirty bytes_chunk from the filecache handle. * This will grab an exclusive lock on the file cache and return the list - * of dirty bytes_chunks at that point. Note that we can have new dirty - * bytes_chunks created but we don't want to wait for those. + * of dirty bytes_chunks at that point. */ - std::vector bc_vec = - filecache_handle->get_dirty_bc_range(start_off, end_off); + std::vector bc_vec; + uint64_t size = 0; + if (!is_stable_write()) { + bc_vec = filecache_handle->get_contiguous_dirty_bcs(size); + + if (size != get_filecache()->get_bytes_to_flush()) { + AZLogInfo("[{}] Flushed size: {} != bytes to flush: {}", + ino, size, get_filecache()->get_bytes_to_flush()); + AZLogInfo("SWITCHING TO STABLE WRITE"); + + /* + * This is rare and no-op as we already completed the pending + * flush and commit. + */ + switch_to_stable_write(); + for (auto& bc : bc_vec) { + bc.get_membuf()->clear_inuse(); + } + bc_vec = filecache_handle->get_dirty_bc_range(start_off, end_off); + } + } else { + assert(get_filecache()->get_bytes_to_commit() == 0); + bc_vec = filecache_handle->get_dirty_bc_range(start_off, end_off); + } /* * sync_membufs() iterate over the bc_vec and starts flushing the dirty @@ -964,20 +1292,44 @@ int nfs_inode::flush_cache_and_wait(uint64_t start_off, uint64_t end_off) filecache_handle->release(bc.offset, bc.length); } - flush_unlock(); - /* - * If the file is deleted while we still have data in the cache, don't - * treat it as a failure to flush. The file has gone and we don't really - * care about the unwritten data. + * Flush_cache_and_wait called from a release call only, + * we need to commit all the dirty membufs which are not yet committed. */ - if (get_write_error() == ENOENT || get_write_error() == ESTALE) { - return 0; + bool start_commit_task = false; + + if (get_filecache()->get_bytes_to_commit() > 0) { + assert(!is_stable_write()); + assert(!is_commit_in_progress()); + + /* + * In case of release, there is no thread which starts the commit + * operation. So, we need to start the commit operation here. + */ + std::unique_lock commit_lock(commit_lock_5); + set_commit_in_progress(); + commit_lock.unlock(); + start_commit_task = true; + } + + if(start_commit_task) { + assert(!is_stable_write()); + assert(!get_filecache()->is_flushing_in_progress()); + assert(is_commit_in_progress()); + + commit_membufs(); } + /* + * Wait for the ongoing commit to complete. 
+ */ + wait_for_ongoing_commit(); + flush_unlock(); + return get_write_error(); } + void nfs_inode::flush_lock() const { AZLogDebug("[{}] flush_lock() called", ino); @@ -1053,7 +1405,7 @@ bool nfs_inode::truncate_start(size_t size) */ flush_lock(); - wait_for_ongoing_flush(0, UINT64_MAX); + wait_for_ongoing_flush_commit(); AZLogDebug("[{}] Ongoing flush operations completed", ino); diff --git a/turbonfs/src/rpc_task.cpp b/turbonfs/src/rpc_task.cpp index 9f62bbbe..b63abb57 100644 --- a/turbonfs/src/rpc_task.cpp +++ b/turbonfs/src/rpc_task.cpp @@ -806,6 +806,194 @@ void access_callback( } } +static void complete_commit_waiter_tasks(struct membuf *mb) +{ + std::vector tvec = mb->get_commit_waiters(); + for (auto& task : tvec) { + assert(task->magic == RPC_TASK_MAGIC); + assert(task->get_op_type() == FUSE_WRITE); + assert(task->rpc_api->write_task.is_fe()); + assert(task->rpc_api->write_task.get_size() > 0); + + AZLogDebug("Completing flush waiter task {} for [{}, {})", + fmt::ptr(task), + task->rpc_api->write_task.get_offset(), + task->rpc_api->write_task.get_offset() + + task->rpc_api->write_task.get_size()); + task->reply_write(task->rpc_api->write_task.get_size()); + } +} + +static void commit_callback( + struct rpc_context *rpc, + int rpc_status, + void *data, + void *private_data) +{ + rpc_task *task = (rpc_task*) private_data; + assert(task->magic == RPC_TASK_MAGIC); + assert(task->get_op_type() == FUSE_FLUSH); + assert(task->rpc_api->pvt != nullptr); + + auto res = (COMMIT3res*)data; + + INJECT_JUKEBOX(res, task); + + const fuse_ino_t ino = + task->rpc_api->flush_task.get_ino(); + struct nfs_inode *inode = + task->get_client()->get_nfs_inode_from_ino(ino); + auto bc_vec_ptr = (std::vector *)task->rpc_api->pvt; + + const int status = task->status(rpc_status, NFS_STATUS(res)); + UPDATE_INODE_WCC(inode, res->COMMIT3res_u.resok.file_wcc); + AZLogInfo("commit_callback"); + /* + * Now that the request has completed, we can query libnfs for the + * dispatch time. + */ + task->get_stats().on_rpc_complete(rpc_get_pdu(rpc), NFS_STATUS(res)); + if (status == 0) { + uint64_t offset = 0; + uint64_t length = 0; + auto &bc_vec = *bc_vec_ptr; // get the vector from the pointer. + + for (auto &bc : bc_vec) + { + struct membuf *mb = bc.get_membuf(); + assert(mb->is_inuse()); + assert(mb->is_locked()); + assert(mb->is_commit_pending()); + assert(mb->is_uptodate()); + + mb->clear_commit_pending(); + complete_commit_waiter_tasks(mb); + mb->clear_locked(); + mb->clear_inuse(); + + /** + * Collect the contiguous range of bc's to release. + */ + if (offset == 0 && length == 0) { + offset = bc.offset; + length = bc.length; + } else if (offset + length == bc.offset) { + length += bc.length; + } else { + auto release = inode->get_filecache()->release(offset, length); + AZLogDebug("BC released {}, asked for {}", release, length); + offset = bc.offset; + length = bc.length; + } + } + + if (length != 0) { + assert(length != 0); + auto release = inode->get_filecache()->release(offset, length); + + AZLogDebug("BC offset {}, asked for {}, release {}", (offset/1048576), (length/1048576), release); + } + + task->free_rpc_task(); + #if 0 + if (task->get_fuse_req() != nullptr) { + task->reply_error(status); + } else { + task->free_rpc_task(); + } + #endif + } else if (NFS_STATUS(res) == NFS3ERR_JUKEBOX) { + task->get_client()->jukebox_retry(task); + return; + } else { + auto &bc_vec = *bc_vec_ptr; // get the vector from the pointer. + /* + * Mark the membufs as dirty and clear the commit_pending flag. 
+ * Next write will initiate the flush again with stable write. + */ + for (auto &bc : bc_vec) + { + struct membuf *mb = bc.get_membuf(); + assert(mb->is_inuse()); + assert(mb->is_locked()); + assert(mb->is_commit_pending()); + assert(mb->is_uptodate()); + + mb->clear_commit_pending(); + complete_commit_waiter_tasks(mb); + mb->set_dirty(); + mb->clear_locked(); + mb->clear_inuse(); + } + inode->set_stable_write(); + + task->free_rpc_task(); + #if 0 + if (task->get_fuse_req() != nullptr) { + task->reply_error(status); + } else { + inode->set_write_error(status); + task->free_rpc_task(); + } + #endif + } + + delete bc_vec_ptr; + + inode->clear_commit_in_progress(); +} + +void rpc_task::issue_commit_rpc() +{ + // Must only be called for a flush task. + assert(get_op_type() == FUSE_FLUSH); + + const fuse_ino_t ino = rpc_api->flush_task.get_ino(); + struct nfs_inode *inode = get_client()->get_nfs_inode_from_ino(ino); + assert(inode->is_stable_write() == false); + assert(inode->get_filecache()->is_flushing_in_progress() == false); + + COMMIT3args args; + ::memset(&args, 0, sizeof(args)); + bool rpc_retry = false; + + AZLogInfo("issue_commit_rpc"); + + /* + * Get the bcs marked for commit_pending. + */ + if (rpc_api->pvt == nullptr) { + rpc_api->pvt = static_cast(new std::vector(inode->get_filecache()->get_commit_pending_bcs())); + } + + args.file = inode->get_fh(); + args.offset = 0; + args.count = 0; + + do { + rpc_retry = false; + stats.on_rpc_issue(); + + if (rpc_nfs3_commit_task(get_rpc_ctx(), + commit_callback, &args, + this) == NULL) { + stats.on_rpc_cancel(); + /* + * Most common reason for this is memory allocation failure, + * hence wait for some time before retrying. Also block the + * current thread as we really want to slow down things. + * + * TODO: For soft mount should we fail this? + */ + rpc_retry = true; + + AZLogWarn("rpc_nfs3_write_task failed to issue, retrying " + "after 5 secs!"); + ::sleep(5); + } + } while (rpc_retry); +} + /** * Must be called when bytes_completed bytes are successfully read/written. */ @@ -967,7 +1155,7 @@ void bc_iovec::on_io_fail(int status) /* * This membuf has completed flushing albeit with failure, check if any - * task is waiting for this membuf to be flushed. + * task is waiting for this membuf to be flushed/commit. */ std::vector tvec = mb->get_flush_waiters(); for (auto& task : tvec) { @@ -986,6 +1174,29 @@ void bc_iovec::on_io_fail(int status) task->reply_error(status); } + /* + * This membuf has completed flushing albeit with failure, check if any + * task is waiting for this membuf to be committed. Now this membuf + * failed to flush, so we don't issue commit for this membuf. Hence + * we must complete the waiter tasks. 
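+     * They are completed with reply_error(status), mirroring the
+     * flush_waiters handling above.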
+ */ + std::vector tvec_commit = mb->get_commit_waiters(); + for (auto& task : tvec_commit) { + assert(task->magic == RPC_TASK_MAGIC); + assert(task->get_op_type() == FUSE_WRITE); + assert(task->rpc_api->write_task.is_fe()); + + AZLogError("Completing flush waiter task {} for [{}, {}), " + "failed: {}", + fmt::ptr(task), + task->rpc_api->write_task.get_offset(), + task->rpc_api->write_task.get_offset() + + task->rpc_api->write_task.get_size(), + status); + + task->reply_error(status); + } + mb->clear_locked(); mb->clear_inuse(); iov++; @@ -2308,6 +2519,141 @@ void rpc_task::run_access() } while (rpc_retry); } +static void perform_inline_writes(struct rpc_task *task, + struct nfs_inode *inode) +{ + const size_t length = task->rpc_api->write_task.get_size(); + const off_t offset = task->rpc_api->write_task.get_offset(); + const fuse_ino_t ino = task->rpc_api->write_task.get_ino(); + std::vector bc_vec; + + /* + * Grab flush_lock to get exclusive list of dirty chunks, which are not + * already being flushed. This also protects us racing with a truncate + * call and growing the file size after truncate shrinks the file. + */ + inode->flush_lock(); + + /* + * prune_bytes is the number of bytes that need to be flushed to + * make space for the new writes. If it's 0, then we don't need to + * flush more data we can just wait for the ongoing flush/commit + * to complete. + */ + uint64_t prune_bytes = inode->get_filecache()->get_bytes_to_prune(); + + if (prune_bytes == 0 && !inode->is_stable_write()) { + AZLogDebug("No bytes to prune, Wait for ongoing flush/commit" + "to complete."); + /* + * In case of inline_write, we want to block the writes till the + * enough space is made for new writes. Every write thread will + * come here and try to take the lock, only one thread succeeds + * and wait for the ongoing flush/commit to complete. Rest of the + * threads will wait for the lock to be released by the first + * thread. After first thread completes the wait, check if any ongoing + * flush/commit is in progress and if it's still true. Mostly it will + * be false as new writes are blocked. + * + * Note: flush_lock() is held so that no new flush or commit task + * is started. + */ + inode->wait_for_ongoing_flush_commit(0, UINT64_MAX, task); + } else { + uint64_t size = 0; + if (inode->is_stable_write()) { + /* + * If the write is stable, we can directly write to the + * membufs and issue the writes to the backend. + */ + bc_vec = inode->get_filecache()->get_dirty_nonflushing_bcs_range(); + if (bc_vec.empty()) { + /* + * Another application write raced with us and it got to do the + * inline write before us. Try adding this task to the flush_waiters + * list for the membuf where we copied the application data. If + * we are able to add successfully, then we will call the fuse + * callback when the flush completes at the backend. This will + * slow down the writer application, thus relieving the memory + * pressure. + * If add_flush_waiter() returns false, that means this membuf + * is already flushed by that other thread and we can complete + * the fuse callback in this context. 
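+                 *
+                 * Note: here we wait on this task's own [offset,
+                 * offset + length) range, unlike the unstable-write path
+                 * below which waits on last_flushing_offset/length via
+                 * add_commit_waiter().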
+ */ + inode->flush_unlock(); + + if (inode->get_filecache()->add_flush_waiter(offset, + length, + task)) { + AZLogDebug("[{}] Inline write, membuf not flushed, write " + "[{}, {}) will be completed when membuf is flushed", + ino, offset, offset+length); + return; + } else { + AZLogDebug("[{}] Inline write, membuf already flushed, " + "completing fuse write [{}, {})", + ino, offset, offset+length); + task->reply_write(length); + return; + } + } + } else { + /* + * If the write is not stable, we need to copy the application + * data into the membufs and issue the writes to the backend. + */ + bc_vec = inode->get_filecache()->get_contiguous_dirty_bcs(size); + if (!bc_vec.empty()) { + /* + * Perform inline sync. + * In case of unstable writes, we need to complete the write + * on commit complete. So, we don't pass the 3rd arg to + * sync_membufs(), but add the task to the commit_waiters list. + */ + inode->sync_membufs(bc_vec, false /* is_flush */, task); + } + inode->flush_unlock(); + + /* + * Set the commit_in_progress flag to ensure that the commit + * task is started after the flush completes. + */ + inode->mark_commit_in_progress(); + + if (inode->get_filecache()->add_commit_waiter( + inode->last_flushing_offset, + inode->last_flushing_length, + task)) { + AZLogDebug("[{}] Inline write, membuf not flushed, write " + "[{}, {}) will be completed when membuf is flushed", + ino, inode->last_flushing_offset, + inode->last_flushing_length); + return; + } else { + AZLogDebug("[{}] Inline write, membuf already flushed, " + "completing fuse write [{}, {})", + ino, inode->last_flushing_offset, + inode->last_flushing_length); + task->reply_write(length); + return; + } + } + } + + /* + * Perform inline sync. + * Since we pass the 3rd arg to sync_membufs, it tells sync_membufs() + * to call the fuse callback after all the issued backend writes + * complete. This will be done asynchronously while the sync_membufs() + * call will return after issuing the writes. + */ + inode->sync_membufs(bc_vec, false /* is_flush */, task); + inode->flush_unlock(); + + // Free up the fuse thread without completing the application write. + return; +} + void rpc_task::run_write() { // This must be called only for front end tasks. @@ -2464,69 +2810,25 @@ void rpc_task::run_write() ino, sparse_write, (extent_right - extent_left), extent_left, extent_right); - /* - * Grab flush_lock to get exclusive list of dirty chunks, which are not - * already being flushed. This also protects us racing with a truncate - * call and growing the file size after truncate shrinks the file. - */ - inode->flush_lock(); - - std::vector bc_vec = - inode->get_filecache()->get_dirty_nonflushing_bcs_range(); - - if (bc_vec.empty()) { - /* - * Another application write raced with us and it got to do the - * inline write before us. Try adding this task to the flush_waiters - * list for the membuf where we copied the application data. If - * we are able to add successfully, then we will call the fuse - * callback when the flush completes at the backend. This will - * slow down the writer application, thus relieving the memory - * pressure. - * If add_flush_waiter() returns false, that means this membuf - * is already flushed by that other thread and we can complete - * the fuse callback in this context. 
- */ - inode->flush_unlock(); - - if (inode->get_filecache()->add_flush_waiter(offset, - length, - this)) { - AZLogDebug("[{}] Inline write, membuf not flushed, write " - "[{}, {}) will be completed when membuf is flushed", - ino, offset, offset+length); - return; - } else { - AZLogDebug("[{}] Inline write, membuf already flushed, " - "completing fuse write [{}, {})", - ino, offset, offset+length); - reply_write(length); - return; - } - } - - /* - * Perform inline sync. - * Since we pass the 3rd arg to sync_membufs, it tells sync_membufs() - * to call the fuse callback after all the issued backend writes - * complete. This will be done asynchronously while the sync_membufs() - * call will return after issuing the writes. - * - * Note: sync_membufs() can free this rpc_task if all issued backend - * writes complete before sync_membufs() can return. - * DO NOT access rpc_task after sync_membufs() call. - */ - inode->sync_membufs(bc_vec, false /* is_flush */, this); - inode->flush_unlock(); - // Free up the fuse thread without completing the application write. - return; + return perform_inline_writes(this, inode); } /* - * Ok, we don't need to do inline writes. See if we have enough dirty - * data and we need to start async flush. + * We don't need to do inline writes. See if we have enough bytes to + * commit to the backend. */ + const bool needs_commit = inode->get_filecache()->commit_required(); + if (needs_commit) { + /* + * Set the commit_in_progress flag to ensure that the commit + * task is startet. + */ + inode->mark_commit_in_progress(); + + assert(inode->is_commit_in_progress()); + assert(!need_inline_write); + } /* * If the extent size exceeds the max allowed dirty size as returned by @@ -2557,6 +2859,54 @@ void rpc_task::run_write() bytes_to_flush, max_dirty_extent); + if (inode->is_commit_in_progress()) { + /* + * Commit is in progress, we can't start new flush to the BLOB. + * Complete the write if it doesn't need flush, otherwise add to + * commit_waiters list. + * Note: It make sense to complete the write only after commit is done + * if we already reached the flush limit. As otherwise it will + * trigger inline writes. + */ + if (bytes_to_flush < max_dirty_extent) { + AZLogDebug("[{}] Commit in progress, skipping write, inline_write is false", + ino); + reply_write(length); + return; + } else { + /* + * Add this task to the commit_waiters list, so that it can be + * completed after the commit is done. + */ + if(inode->get_filecache()->add_commit_waiter( + inode->last_flushing_offset, + inode->last_flushing_length, + this)) { + AZLogDebug("[{}] Commit in progress, added to commit_waiters list", + ino); + return; + } else { + /* + * Commit is done, but this task is not added to commit_waiters + * list, so complete the write. + */ + AZLogDebug("[{}] Commit in progress, but not added to commit_waiters list, " + "completing write", ino); + reply_write(length); + return; + } + } + + // We should never reach here. + assert(false); + return; + } + + /* + * Ok, we neither need inline writes nor commit in progress. See if we have + * enough dirty data and we need to start async flush. 
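+     * For stable writes the flush below picks the dirty membufs in
+     * [extent_left, extent_right), while for unstable writes it picks
+     * the contiguous dirty run via get_contiguous_dirty_bcs().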
+ */ + if ((extent_right - extent_left) < max_dirty_extent) { /* * Current extent is not large enough to be flushed, see if we have @@ -2588,9 +2938,19 @@ void rpc_task::run_write() */ inode->flush_lock(); - std::vector bc_vec = - inode->get_filecache()->get_dirty_nonflushing_bcs_range(extent_left, - extent_right); + uint64_t size = 0; + std::vector bc_vec; + /* + * If the inode is stable write, then get the dirty membufs in the range. + * otherwise get the dirty membufs contigious range. + */ + if (inode->is_stable_write()) { + bc_vec = + inode->get_filecache()->get_dirty_nonflushing_bcs_range(extent_left, + extent_right); + } else { + bc_vec = inode->get_filecache()->get_contiguous_dirty_bcs(size); + } if (bc_vec.size() == 0) { inode->flush_unlock(); From 9d545d11a90c8fd135e6df4e378e8cc7e88fec2d Mon Sep 17 00:00:00 2001 From: Nitin Singla Date: Sun, 26 Jan 2025 14:12:11 +0000 Subject: [PATCH 2/9] Some fixes. --- turbonfs/inc/file_cache.h | 21 +- turbonfs/inc/nfs_inode.h | 10 +- turbonfs/inc/rpc_stats.h | 9 - turbonfs/inc/rpc_task.h | 8 + turbonfs/src/file_cache.cpp | 28 +- turbonfs/src/nfs_inode.cpp | 222 ++++++++++---- turbonfs/src/rpc_stats.cpp | 1 + turbonfs/src/rpc_task.cpp | 575 +++++++++++++++++++----------------- 8 files changed, 539 insertions(+), 335 deletions(-) diff --git a/turbonfs/inc/file_cache.h b/turbonfs/inc/file_cache.h index be7a8ac0..39ddee58 100644 --- a/turbonfs/inc/file_cache.h +++ b/turbonfs/inc/file_cache.h @@ -1239,6 +1239,8 @@ class bytes_chunk_cache uint64_t length, struct rpc_task *task); + void cleanup_on_error(); + /* * Returns all dirty chunks for a given range in chunkmap. @@ -1400,6 +1402,9 @@ class bytes_chunk_cache * holding more data than the Blob NFS server's scheduler cache size. * We want to send as prompt as possible to utilize the n/w b/w but slow * enough to give the write scheduler an opportunity to merge better. + * + * Note: max_dirty_extent is static as it doesn't change after it's + * queried for the first time. */ uint64_t max_dirty_extent_bytes() const { @@ -1407,9 +1412,12 @@ class bytes_chunk_cache static const uint64_t max_total = (aznfsc_cfg.cache.data.user.max_size_mb * 1024 * 1024ULL); assert(max_total != 0); - static const uint64_t max_dirty_extent = (max_total * 0.6); + static const uint64_t max_dirty_extent = std::min( + (uint64_t)(max_total * 0.6), (uint64_t)(1024 * 1024 * 1024ULL)); - return std::min(max_dirty_extent, uint64_t(1024 * 1024 * 1024ULL)); + assert(max_dirty_extent != 0); + + return max_dirty_extent; } /* @@ -1499,6 +1507,15 @@ class bytes_chunk_cache return (bytes_commit_pending >= max_commit_bytes()); } + /* + * Check if bytes_to_flush is equal or more than + * max_dirty_extent_bytes limit. + */ + bool flush_required() const + { + return (get_bytes_to_flush() >= max_dirty_extent_bytes()); + } + /* * get_bytes_to_prune() returns the number of bytes that need to be flushed * inline to free up the space. If there are enough bytes_flushing then we diff --git a/turbonfs/inc/nfs_inode.h b/turbonfs/inc/nfs_inode.h index 959e88b5..ea3a2c95 100644 --- a/turbonfs/inc/nfs_inode.h +++ b/turbonfs/inc/nfs_inode.h @@ -323,7 +323,7 @@ struct nfs_inode * Note: As of now, we are not using this flag as commit changes not yet * integrated, so we are setting this flag to true. 
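+     * (With commit support now integrated, the default below is false,
+     * i.e. unstable writes are used until something forces a switch to
+     * stable writes.)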
*/ - bool stable_write = true; + bool stable_write = false; public: /* @@ -1169,7 +1169,8 @@ struct nfs_inode bool is_commit_in_progress() const { assert(commit_state != commit_state_t::INVALID); - return (commit_state == commit_state_t::COMMIT_IN_PROGRESS); + return ((commit_state == commit_state_t::COMMIT_IN_PROGRESS) || + (commit_state == commit_state_t::NEEDS_COMMIT)); } /** @@ -1355,10 +1356,11 @@ struct nfs_inode * progress (which must have held the is_flushing lock). */ int flush_cache_and_wait(uint64_t start_off = 0, - uint64_t end_off = UINT64_MAX, - struct rpc_task *parent_task = nullptr); + uint64_t end_off = UINT64_MAX); void mark_commit_in_progress(); + void add_task_to_commit_waiters(struct rpc_task *task); + void add_task_to_flush_waiters(struct rpc_task *task); /** * Wait for currently flushing membufs to complete. diff --git a/turbonfs/inc/rpc_stats.h b/turbonfs/inc/rpc_stats.h index 626914b1..d0a7bbed 100644 --- a/turbonfs/inc/rpc_stats.h +++ b/turbonfs/inc/rpc_stats.h @@ -139,9 +139,6 @@ class rpc_stats_az */ void on_rpc_issue() { - // FUSE_FLUSH is never issued as an RPC task to the server. FUSE_WRITE is issued instead. - assert(optype != FUSE_FLUSH); - assert(stamp.issue == 0); stamp.issue = get_current_usecs(); assert(stamp.issue >= stamp.create); @@ -156,9 +153,6 @@ class rpc_stats_az */ void on_rpc_cancel() { - // FUSE_FLUSH is never issued as an RPC task to the server. FUSE_WRITE is issued instead. - assert(optype != FUSE_FLUSH); - assert(stamp.issue != 0); assert((int64_t) stamp.issue <= get_current_usecs()); assert(stamp.dispatch == 0); @@ -178,9 +172,6 @@ class rpc_stats_az */ void on_rpc_complete(struct rpc_pdu *pdu, nfsstat3 status) { - // FUSE_FLUSH is never issued as an RPC task to the server. FUSE_WRITE is issued instead. - assert(optype != FUSE_FLUSH); - assert((status == NFS3ERR_RPC_ERROR) || (nfsstat3_to_errno(status) != -ERANGE)); diff --git a/turbonfs/inc/rpc_task.h b/turbonfs/inc/rpc_task.h index c9fd4303..72c77788 100644 --- a/turbonfs/inc/rpc_task.h +++ b/turbonfs/inc/rpc_task.h @@ -2000,6 +2000,14 @@ struct rpc_task csched = _csched; } + void set_task_csched(bool stable_write) + { + if (client->mnt_options.nfs_port == 2048) { + csched = stable_write ? CONN_SCHED_FH_HASH : CONN_SCHED_RR; + } + return; + } + conn_sched_t get_csched() const { assert(csched > CONN_SCHED_INVALID && diff --git a/turbonfs/src/file_cache.cpp b/turbonfs/src/file_cache.cpp index d133ed7e..a6ff86be 100644 --- a/turbonfs/src/file_cache.cpp +++ b/turbonfs/src/file_cache.cpp @@ -483,7 +483,7 @@ void membuf::clear_flushing() * TODO: Remove me once we have enough testing. * Don't release it to production. 
*/ - assert(!is_dirty()); + // assert(!is_dirty()); flag &= ~MB_Flag::Flushing; @@ -2268,7 +2268,7 @@ void bytes_chunk_cache::clear_nolock() assert(bytes_cached == 0); if (bytes_allocated != 0) { - AZLogWarn("[{}] Cache purge: bytes_allocated is still {}, some user " + AZLogDebug("[{}] Cache purge: bytes_allocated is still {}, some user " "is still holding on to the bytes_chunk/membuf even after " "dropping the inuse count: backing_file_name={}", CACHE_TAG, bytes_allocated.load(), backing_file_name); @@ -2523,6 +2523,30 @@ bool bytes_chunk_cache::add_commit_waiter(uint64_t offset, return false; } +void bytes_chunk_cache::cleanup_on_error() +{ + const std::unique_lock _lock(chunkmap_lock_43); + + for (auto it = chunkmap.begin(); it != chunkmap.cend(); ++it) { + struct bytes_chunk& bc = it->second; + struct membuf *mb = bc.get_membuf(); + if (!mb->is_inuse() && !mb->is_locked()) { + mb->set_inuse(); + mb->set_locked(); + if (mb->is_dirty()) { + mb->clear_dirty(); + } + + if (mb->is_flushing()) { + mb->clear_flushing(); + } + + if (mb->is_commit_pending()) { + mb->clear_commit_pending(); + } + } + } +} std::vector bytes_chunk_cache::get_dirty_bc_range( uint64_t start_off, uint64_t end_off) const diff --git a/turbonfs/src/nfs_inode.cpp b/turbonfs/src/nfs_inode.cpp index dcb8b18d..a6eeba51 100644 --- a/turbonfs/src/nfs_inode.cpp +++ b/turbonfs/src/nfs_inode.cpp @@ -34,7 +34,7 @@ nfs_inode::nfs_inode(const struct nfs_fh3 *filehandle, assert(write_error == 0); // TODO: Revert this to false once commit changes integrated. - assert(stable_write == true); + assert(stable_write == false); assert(commit_state == commit_state_t::COMMIT_NOT_NEEDED); #ifndef ENABLE_NON_AZURE_NFS @@ -547,7 +547,7 @@ void nfs_inode::sync_membufs(std::vector &bc_vec, * New flush should not be started if flush/commit is in progress. * But it may happen commit_is_pending() is set by another thread. */ - assert(!is_commit_in_progress() || is_commit_pending()); + assert(!is_commit_in_progress()); if (bc_vec.empty()) { return; @@ -649,11 +649,13 @@ void nfs_inode::sync_membufs(std::vector &bc_vec, continue; } + #if 0 /* * Update the last flushing offset and length. */ last_flushing_offset = bc.offset; last_flushing_length = bc.length; + #endif if (write_task == nullptr) { write_task = @@ -678,6 +680,10 @@ void nfs_inode::sync_membufs(std::vector &bc_vec, * Once packed completely, then dispatch the write. */ if (write_task->add_bc(bc)) { + assert(is_stable_write() || ((uint64_t) putblock_filesize == bc.offset)); + putblock_filesize += bc.length; + last_flushing_offset = bc.offset; + last_flushing_length = bc.length; continue; } else { /* @@ -703,6 +709,10 @@ void nfs_inode::sync_membufs(std::vector &bc_vec, // Single bc addition should not fail. [[maybe_unused]] bool res = write_task->add_bc(bc); assert(res == true); + assert(is_stable_write() || ((uint64_t) putblock_filesize == bc.offset)); + putblock_filesize += bc.length; + last_flushing_offset = bc.offset; + last_flushing_length = bc.length; } } @@ -929,6 +939,8 @@ int nfs_inode::copy_to_cache(const struct fuse_bufvec* bufv, void nfs_inode::mark_commit_in_progress() { + assert(is_stable_write() == false); + /* * We want to commit, but we need to wait for current flushing to complete. * 1. 
If flushing is going on, set commit_pending to inode, when last membuf flushed, @@ -977,22 +989,90 @@ void nfs_inode::mark_commit_in_progress() } } +void nfs_inode::add_task_to_flush_waiters(struct rpc_task *task) +{ + /* + * Add this task to the flush_waiter list, so that it can be + * completed after the flush is done. + */ + if(get_filecache()->add_flush_waiter(last_flushing_offset, + last_flushing_length, + task)) { + AZLogDebug("[{}] flush in progress, added to flush_waiter list" + " for [{}, {})", ino, last_flushing_offset, + last_flushing_length); + return; + } else { + /* + * flush is done, but this task is not added to flush_waiter + * list, so complete the write. + */ + AZLogDebug("[{}] flush is done, not added to flush_waiter list, " + "completing write", ino); + task->reply_write(task->rpc_api->write_task.get_size()); + return; + } +} + +void nfs_inode::add_task_to_commit_waiters(struct rpc_task *task) +{ + /* + * This should be called only for unstable writes. + */ + assert(is_stable_write() == false); + + /* + * Add this task to the commit_waiters list, so that it can be + * completed after the commit is done. + */ + if(get_filecache()->add_commit_waiter(last_flushing_offset, + last_flushing_length, + task)) { + AZLogDebug("[{}] Commit in progress, added to commit_waiters list" + " for [{}, {})", ino, last_flushing_offset, + last_flushing_length); + return; + } else { + /* + * Commit is done, but this task is not added to commit_waiters + * list, so complete the write. + */ + AZLogDebug("[{}] Commit in progress, but not added to commit_waiters list, " + "completing write", ino); + task->reply_write(task->rpc_api->write_task.get_size()); + return; + } +} + /** * Note: Caller should call with flush_lock() held. * On return there is no ongoing flush/commit in progress. + * + * flush_cache_and_wait() checks for return value, as it's + * called on close() and need to know final status of the writes. */ int nfs_inode::wait_for_ongoing_flush_commit(uint64_t start_off, uint64_t end_off, struct rpc_task *task) { + // Caller must call us with is_flushing lock held. + assert(is_flushing); assert(start_off < end_off); - assert(!is_stable_write()); - // NAND Gate - assert(!(is_commit_in_progress() && get_filecache()->is_flushing_in_progress())); + if (is_stable_write()) { + assert(!is_commit_in_progress()); + assert(get_filecache()->bytes_commit_pending == 0); + } - // Caller must call us with is_flushing lock held. - assert(is_flushing); + /* + * If commit_pending limit reached, we need to start the commit, + * but we need to wait for the current flushing to complete, so + * we set the commit_pending flag and once flusing completes it + * will start the commit task. + */ + if (get_filecache()->is_flushing_in_progress()) { + assert(!is_commit_in_progress() || is_commit_pending()); + } /* * MUST be called only for regular files. @@ -1014,13 +1094,13 @@ int nfs_inode::wait_for_ongoing_flush_commit(uint64_t start_off, * Flushing not in progress and no new flushing can be started as we hold * the flush_lock(). * It may be possible that the flush is not in progress and but we may have - * commit_in_progress or commit_pending_bytes more than max_commit_bytes. + * commit_in_progress or commit_pending_bytes to commit. * In that case we should start the commit task if it's not already in * progress. And wait for the commit to complete. 
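+     *
+     * Informal summary of the cases handled below:
+     * - Nothing flushing, no commit in progress and nothing commit
+     *   pending: complete 'task' (if passed) and return.
+     * - task == nullptr: wait inline for the flushing membufs and then,
+     *   for unstable writes, for the commit.
+     * - task != nullptr: queue it on the flush_waiters (stable write) or
+     *   commit_waiters (unstable write) list and return without blocking.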
*/ if (!get_filecache()->is_flushing_in_progress() && !is_commit_in_progress() && - !get_filecache()->commit_required()) { + get_filecache()->bytes_commit_pending == 0) { AZLogDebug("[{}] No flush in progress, returning", ino); /* @@ -1037,7 +1117,7 @@ int nfs_inode::wait_for_ongoing_flush_commit(uint64_t start_off, * flush is in progress. */ std::vector bc_vec = {}; - if (get_filecache()->is_flushing_in_progress()) { + if (task == nullptr && get_filecache()->is_flushing_in_progress()) { /* * Get the flushing bytes_chunk from the filecache handle. * This will grab an exclusive lock on the file cache and return the list @@ -1048,11 +1128,13 @@ int nfs_inode::wait_for_ongoing_flush_commit(uint64_t start_off, filecache_handle->get_flushing_bc_range(start_off, end_off); } - /* * If there is no task passed, then our caller expects us to return * only after the flush completes (f.e. truncate, switch_to_stable_write). * Wait for all the membufs to flush and get result back. + * Otherwise add the task to the flush_waiter list for stable write and + * commit_waiter list for unstable write. + * Task will wait on last flushed offset and length. */ if (task == nullptr) { for (bytes_chunk &bc : bc_vec) { @@ -1092,32 +1174,47 @@ int nfs_inode::wait_for_ongoing_flush_commit(uint64_t start_off, */ filecache_handle->release(bc.offset, bc.length); } - - mark_commit_in_progress(); } else { - /* - * Add task to commit_waiters list. - */ - get_filecache()->add_commit_waiter(last_flushing_offset, - last_flushing_length, task); - mark_commit_in_progress(); - return 0; + if (is_stable_write()) { + /* + * Add task to flush_waiters list. + */ + add_task_to_flush_waiters(task); + } else { + /* + * Add task to commit_waiters list. + */ + add_task_to_commit_waiters(task); + } } /* - * Wait for the ongoing commit to complete. + * Unstable write case, we need to wait for the commit to complete. + * So, mark the commit in progress and wait for the commit to complete. */ - wait_for_ongoing_commit(); + if (!is_stable_write()) { + mark_commit_in_progress(); + if (task) { + return 0; + } + + /* + * Wait for the ongoing commit to complete. + */ + wait_for_ongoing_commit(); + } /* - * Note: There can be bytes_commit_pending(), but that's fine - * as if bytes_commit_pending() reached max_commit_bytes() - * we would have started the commit task. As we held the - * flush_lock() no new flush can be started, which increases - * the bytes_commit_pending(). + * As we have flush_lock held, no new flush can be started. + * We already waited for the ongoing flush and commit to complete. + * For stable write case there should not be any commit pending. + * So, following conditions should be true for both stable and + * unstable writes.: */ assert(is_commit_in_progress() == false); - assert(get_filecache()->get_bytes_to_flush() == 0); + assert(get_filecache()->bytes_commit_pending == 0); + assert(get_filecache()->is_flushing_in_progress() == 0); + return get_write_error(); } @@ -1134,16 +1231,14 @@ int nfs_inode::wait_for_ongoing_flush_commit(uint64_t start_off, * - Returns the error code if any. * * Note: Flush_cache_and_wait() block the fuse thread till the flush completes. - * It's called from the release(), flush() and getattr() calls. getattr() - * - * TODO: Need to add the support of completion of different calls - * (flush and release). + * It's called from the release(), flush() and getattr() calls. It's ok + * as of now as it's not very often called. We can optimize to complete + * the flush in background and return immediately. 
For that we need to add + * special handling for the getattr() call. */ int nfs_inode::flush_cache_and_wait(uint64_t start_off, - uint64_t end_off, - struct rpc_task *parent_task) + uint64_t end_off) { - assert(is_flushing); assert(start_off < end_off); assert(!is_stable_write() || get_filecache()->get_bytes_to_commit() == 0); assert(!is_stable_write() || is_commit_in_progress() == false); @@ -1163,6 +1258,9 @@ int nfs_inode::flush_cache_and_wait(uint64_t start_off, */ const int error_code = get_write_error(); if (error_code != 0) { + // Cleanup of the membufs if any left. + get_filecache()->cleanup_on_error(); + AZLogWarn("[{}] Previous write to this Blob failed with error={}, " "skipping new flush!", ino, error_code); @@ -1207,7 +1305,23 @@ int nfs_inode::flush_cache_and_wait(uint64_t start_off, } } - if (get_filecache()->get_bytes_to_flush() == 0) { + /* + * For unstable write, If no bytes to flush, then return. + * For stable write, If no bytes to flush and no flush in + * progress, then return. As for stable write, we didn't + * wait for the flush to complete as above. + * + * TODO: Need to check getattr() call. As that can cause this + * assert to fail. + */ + if ((get_filecache()->get_bytes_to_flush() == 0) && + (get_filecache()->is_flushing_in_progress() == false)) { + assert(is_commit_in_progress() == false); + assert(get_filecache()->is_flushing_in_progress() == false); + assert(get_filecache()->bytes_dirty == 0); + assert(get_filecache()->bytes_commit_pending == 0); + assert(get_filecache()->bytes_flushing == 0); + AZLogDebug("[{}] Nothing to flush, returning", ino); flush_unlock(); return 0; @@ -1296,28 +1410,13 @@ int nfs_inode::flush_cache_and_wait(uint64_t start_off, * Flush_cache_and_wait called from a release call only, * we need to commit all the dirty membufs which are not yet committed. */ - bool start_commit_task = false; - if (get_filecache()->get_bytes_to_commit() > 0) { assert(!is_stable_write()); assert(!is_commit_in_progress()); + assert(get_filecache()->get_bytes_to_flush() == 0 || + get_write_error() != 0); - /* - * In case of release, there is no thread which starts the commit - * operation. So, we need to start the commit operation here. - */ - std::unique_lock commit_lock(commit_lock_5); - set_commit_in_progress(); - commit_lock.unlock(); - start_commit_task = true; - } - - if(start_commit_task) { - assert(!is_stable_write()); - assert(!get_filecache()->is_flushing_in_progress()); - assert(is_commit_in_progress()); - - commit_membufs(); + mark_commit_in_progress(); } /* @@ -1326,6 +1425,21 @@ int nfs_inode::flush_cache_and_wait(uint64_t start_off, wait_for_ongoing_commit(); flush_unlock(); + /* + * Cleanup on error. + */ + if (get_write_error()) { + AZLogError("[{}] Failed to flush cache to stable storage, " + "error={}", ino, get_write_error()); + get_filecache()->cleanup_on_error(); + } + + assert(is_commit_in_progress() == false); + assert(get_filecache()->is_flushing_in_progress() == false); + assert(get_filecache()->bytes_dirty == 0); + assert(get_filecache()->bytes_commit_pending == 0); + assert(get_filecache()->bytes_flushing == 0); + return get_write_error(); } diff --git a/turbonfs/src/rpc_stats.cpp b/turbonfs/src/rpc_stats.cpp index ad26697a..c3fe7be4 100644 --- a/turbonfs/src/rpc_stats.cpp +++ b/turbonfs/src/rpc_stats.cpp @@ -303,6 +303,7 @@ do { \ DUMP_OP(FUSE_READDIRPLUS); DUMP_OP(FUSE_READ); DUMP_OP(FUSE_WRITE); + DUMP_OP(FUSE_FLUSH); /* * TODO: Add more ops. 
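The nfs_inode.cpp hunks above route a blocked application write to one of two waiter lists: with stable (FILE_SYNC) writes the task is completed by the flush callback (add_task_to_flush_waiters), while with unstable writes only the later COMMIT makes the data durable, so the task is parked on the commit waiters instead (add_task_to_commit_waiters). Below is a minimal, self-contained sketch of that routing; the types and fields are simplified stand-ins for illustration only, not the actual nfs_inode/rpc_task/membuf API.

#include <functional>
#include <vector>

// Illustrative stand-ins for rpc_task and the per-inode flush/commit state;
// the real code keeps per-membuf waiter lists guarded by their own locks.
struct write_task { std::function<void()> reply_write; };

struct flush_state {
    bool stable_write = false;        // FILE_SYNC writes vs UNSTABLE+COMMIT
    bool flushing = false;            // WRITE RPCs outstanding
    bool commit_pending = false;      // flushed UNSTABLE data not yet committed
    std::vector<write_task*> flush_waiters;
    std::vector<write_task*> commit_waiters;
};

// Park a blocked application write until its data is durable enough:
// after the flush for stable writes, after the commit for unstable writes.
void park_blocked_write(flush_state& fs, write_task* task)
{
    if (fs.stable_write && fs.flushing) {
        fs.flush_waiters.push_back(task);      // completed by the flush callback
    } else if (!fs.stable_write && (fs.flushing || fs.commit_pending)) {
        fs.commit_waiters.push_back(task);     // completed by the commit callback
    } else {
        task->reply_write();                   // nothing outstanding, reply now
    }
}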
diff --git a/turbonfs/src/rpc_task.cpp b/turbonfs/src/rpc_task.cpp index b63abb57..145493d6 100644 --- a/turbonfs/src/rpc_task.cpp +++ b/turbonfs/src/rpc_task.cpp @@ -847,7 +847,7 @@ static void commit_callback( const int status = task->status(rpc_status, NFS_STATUS(res)); UPDATE_INODE_WCC(inode, res->COMMIT3res_u.resok.file_wcc); - AZLogInfo("commit_callback"); + AZLogDebug("commit_callback"); /* * Now that the request has completed, we can query libnfs for the * dispatch time. @@ -881,7 +881,7 @@ static void commit_callback( length += bc.length; } else { auto release = inode->get_filecache()->release(offset, length); - AZLogDebug("BC released {}, asked for {}", release, length); + AZLogInfo("BC released {}, asked for {}", release, length); offset = bc.offset; length = bc.length; } @@ -891,7 +891,7 @@ static void commit_callback( assert(length != 0); auto release = inode->get_filecache()->release(offset, length); - AZLogDebug("BC offset {}, asked for {}, release {}", (offset/1048576), (length/1048576), release); + AZLogInfo("BC offset {}, asked for {}, release {}", (offset/1048576), (length/1048576), release); } task->free_rpc_task(); @@ -925,6 +925,12 @@ static void commit_callback( mb->clear_locked(); mb->clear_inuse(); } + /* + * Set the inode to stable write, so that next write will initiate + * the flush again with stable write. + * There should be no flush in progress as this moment. + */ + assert(inode->get_filecache()->is_flushing_in_progress() == false); inode->set_stable_write(); task->free_rpc_task(); @@ -957,7 +963,7 @@ void rpc_task::issue_commit_rpc() ::memset(&args, 0, sizeof(args)); bool rpc_retry = false; - AZLogInfo("issue_commit_rpc"); + AZLogDebug("issue_commit_rpc"); /* * Get the bcs marked for commit_pending. @@ -1314,7 +1320,7 @@ static void write_iov_callback( bciov->orig_offset + bciov->orig_length); // Update bciov after the current write. - bciov->on_io_complete(res->WRITE3res_u.resok.count); + bciov->on_io_complete(res->WRITE3res_u.resok.count, !inode->is_stable_write()); // Create a new write_task for the remaining bc_iovec. struct rpc_task *write_task = @@ -1352,7 +1358,7 @@ static void write_iov_callback( return; } else { // Complete bc_iovec IO completed. - bciov->on_io_complete(res->WRITE3res_u.resok.count); + bciov->on_io_complete(res->WRITE3res_u.resok.count, !inode->is_stable_write()); // Complete data writen to blob. AZLogDebug("[{}] <{}> Completed write, off: {}, len: {}", @@ -1384,8 +1390,28 @@ static void write_iov_callback( bciov->on_io_fail(status); } - delete bciov; - task->rpc_api->pvt = nullptr; + // check if commit is pending. + if (!inode->get_filecache()->is_flushing_in_progress()) { + bool create_commit_task = false; + + std::unique_lock lock(inode->commit_lock_5); + { + if (inode->is_commit_pending()) { + assert(inode->is_stable_write() == false); + inode->set_commit_in_progress(); + create_commit_task = true; + } + } + lock.unlock(); + + if (create_commit_task) { + // Create a new flush_task for the remaining bc_iovec. + struct rpc_task *commit_task = + client->get_rpc_task_helper()->alloc_rpc_task_reserved(FUSE_FLUSH); + commit_task->init_flush(nullptr /* fuse_req */, ino); + commit_task->issue_commit_rpc(); + } + } /* * If this write_rpc was issued as part of a parent write task, then @@ -1411,6 +1437,9 @@ static void write_iov_callback( } } + delete bciov; + task->rpc_api->pvt = nullptr; + // Release the task. 
task->free_rpc_task(); } @@ -1469,7 +1498,8 @@ void rpc_task::issue_write_rpc() args.file = inode->get_fh(); args.offset = offset; args.count = length; - args.stable = FILE_SYNC; + args.stable = inode->is_stable_write() ? FILE_SYNC : UNSTABLE; + set_task_csched(inode->is_stable_write()); do { rpc_retry = false; @@ -2522,9 +2552,8 @@ void rpc_task::run_access() static void perform_inline_writes(struct rpc_task *task, struct nfs_inode *inode) { + AZLogInfo("Performing inline writes for inode {}", inode->get_fuse_ino()); const size_t length = task->rpc_api->write_task.get_size(); - const off_t offset = task->rpc_api->write_task.get_offset(); - const fuse_ino_t ino = task->rpc_api->write_task.get_ino(); std::vector bc_vec; /* @@ -2534,6 +2563,12 @@ static void perform_inline_writes(struct rpc_task *task, */ inode->flush_lock(); + if (!inode->get_filecache()->do_inline_write()) { + inode->flush_unlock(); + task->reply_write(length); + return; + } + /* * prune_bytes is the number of bytes that need to be flushed to * make space for the new writes. If it's 0, then we don't need to @@ -2543,7 +2578,7 @@ static void perform_inline_writes(struct rpc_task *task, uint64_t prune_bytes = inode->get_filecache()->get_bytes_to_prune(); if (prune_bytes == 0 && !inode->is_stable_write()) { - AZLogDebug("No bytes to prune, Wait for ongoing flush/commit" + AZLogInfo("No bytes to prune, Wait for ongoing flush/commit" "to complete."); /* * In case of inline_write, we want to block the writes till the @@ -2559,99 +2594,291 @@ static void perform_inline_writes(struct rpc_task *task, * is started. */ inode->wait_for_ongoing_flush_commit(0, UINT64_MAX, task); + inode->flush_unlock(); + return; } else { uint64_t size = 0; - if (inode->is_stable_write()) { + if (!inode->is_stable_write()) { /* - * If the write is stable, we can directly write to the - * membufs and issue the writes to the backend. + * We need to wait for the ongoing flush/commit to complete. + * This is only for the case where we have to prune some data + * to make space for the new writes. Usually this will not be + * happen so often, it's ok to wait here. */ - bc_vec = inode->get_filecache()->get_dirty_nonflushing_bcs_range(); - if (bc_vec.empty()) { - /* - * Another application write raced with us and it got to do the - * inline write before us. Try adding this task to the flush_waiters - * list for the membuf where we copied the application data. If - * we are able to add successfully, then we will call the fuse - * callback when the flush completes at the backend. This will - * slow down the writer application, thus relieving the memory - * pressure. - * If add_flush_waiter() returns false, that means this membuf - * is already flushed by that other thread and we can complete - * the fuse callback in this context. 
- */ - inode->flush_unlock(); + AZLogDebug("Wait for ongoing flush/commit to complete."); + inode->wait_for_ongoing_flush_commit(0, UINT64_MAX); + + assert(!inode->is_commit_in_progress()); + assert(!inode->get_filecache()->is_flushing_in_progress()); - if (inode->get_filecache()->add_flush_waiter(offset, - length, - task)) { - AZLogDebug("[{}] Inline write, membuf not flushed, write " - "[{}, {}) will be completed when membuf is flushed", - ino, offset, offset+length); - return; - } else { - AZLogDebug("[{}] Inline write, membuf already flushed, " - "completing fuse write [{}, {})", - ino, offset, offset+length); - task->reply_write(length); - return; - } - } - } else { /* * If the write is not stable, we need to copy the application * data into the membufs and issue the writes to the backend. */ bc_vec = inode->get_filecache()->get_contiguous_dirty_bcs(size); - if (!bc_vec.empty()) { + if (size >= prune_bytes) { + assert(!bc_vec.empty()); /* * Perform inline sync. * In case of unstable writes, we need to complete the write * on commit complete. So, we don't pass the 3rd arg to * sync_membufs(), but add the task to the commit_waiters list. */ - inode->sync_membufs(bc_vec, false /* is_flush */, task); + inode->sync_membufs(bc_vec, false /* is_flush */, nullptr); + inode->flush_unlock(); + + /* + * Set the commit_in_progress flag to ensure that the commit + * task is started after the flush completes. + * Whether we hit commit limit or not, we need to start commit + * as we need to release the memory. + */ + inode->mark_commit_in_progress(); + + inode->add_task_to_commit_waiters(task); + return; } - inode->flush_unlock(); + } + + if (!inode->is_stable_write()) { + AZLogWarn("Not enough contiguous dirty data to prune, may be" + "pattern is not sequential. Prune_bytes={}, size={}", + prune_bytes, size); /* - * Set the commit_in_progress flag to ensure that the commit - * task is started after the flush completes. + * Let's switch to stable write mode. */ - inode->mark_commit_in_progress(); - - if (inode->get_filecache()->add_commit_waiter( - inode->last_flushing_offset, - inode->last_flushing_length, - task)) { - AZLogDebug("[{}] Inline write, membuf not flushed, write " - "[{}, {}) will be completed when membuf is flushed", - ino, inode->last_flushing_offset, - inode->last_flushing_length); - return; - } else { - AZLogDebug("[{}] Inline write, membuf already flushed, " - "completing fuse write [{}, {})", - ino, inode->last_flushing_offset, - inode->last_flushing_length); - task->reply_write(length); - return; - } + inode->switch_to_stable_write(); + } + + /* + * If the write is stable, we can directly write to the + * membufs and issue the writes to the backend. + */ + bc_vec = inode->get_filecache()->get_dirty_nonflushing_bcs_range(); + if (!bc_vec.empty()) { + /* + * Perform inline sync. + * Since we pass the 3rd arg to sync_membufs, it tells sync_membufs() + * to call the fuse callback after all the issued backend writes + * complete. This will be done asynchronously while the sync_membufs() + * call will return after issuing the writes. + */ + inode->sync_membufs(bc_vec, false /* is_flush */, task); + inode->flush_unlock(); + + // Free up the fuse thread without completing the application write. + return; } + + /* + * Another application write raced with us and it got to do the + * inline write before us. Try adding this task to the flush_waiters + * list for the membuf where we copied the application data. 
If + * we are able to add successfully, then we will call the fuse + * callback when the flush completes at the backend. This will + * slow down the writer application, thus relieving the memory + * pressure. + * If add_flush_waiter() returns false, that means this membuf + * is already flushed by that other thread and we can complete + * the fuse callback in this context. + */ + inode->flush_unlock(); + + inode->add_task_to_flush_waiters(task); + return; + } + + assert(false); +} + +static bool handle_writes(struct nfs_inode *inode, + struct rpc_task *task, + const fuse_ino_t ino, + const bool sparse_write, + uint64_t extent_left, + uint64_t extent_right) +{ + assert(task->magic == RPC_TASK_MAGIC); + assert(task->rpc_api->write_task.is_fe()); + assert(task->get_op_type() == FUSE_WRITE); + + const size_t length = task->rpc_api->write_task.get_size(); + + /* + * Check what kind of limit we have hit. + */ + const bool need_inline_write = + (sparse_write || inode->get_filecache()->do_inline_write()); + const bool need_commit = + inode->get_filecache()->commit_required(); + const bool need_flush = + inode->get_filecache()->flush_required(); + + AZLogDebug("[{}] handle write (sparse={}, need_inline_write={}, need_commit={}," + " need_flush={} )", ino, sparse_write, need_inline_write, + need_commit, need_flush); + + /* + * Nothing to do, we can complete the application write rightaway. + */ + if (!need_inline_write && !need_commit && !need_flush) { + task->reply_write(length); + return true; + } + + /* + * We have successfully copied the user data into the cache. + * We can have the following cases: + * 1. This new write caused a contiguous extent to be greater than + * max_dirty_extent_bytes(), aka MDEB. + * In this case we begin flush of this contiguous extent (in multiple + * parallel wsize sized blocks) since there's no benefit in waiting more + * as the data is sufficient for the server scheduler to effectively + * write, in optimal sized blocks. + * We complete the application write rightaway without waiting for the + * flush to complete as we are not under memory pressure. + * This will happen when user is sequentially writing to the file. + * 2. A single contiguous extent is not greater than MDEB but total dirty + * bytes waiting to be flushed is more than MDEB. + * In this case we begin flush of the entire dirty data from the cache + * as we have sufficient data for the server scheduler to perform + * batched writes effectively. + * We complete the application write rightaway without waiting for the + * flush to complete as we are not under memory pressure. + * This will happen when user is writing to the file in a random or + * pseudo-sequential fashion. + * 3. Cache has dirty data beyond the "inline write" threshold, ref + * do_inline_write(). + * In this case we begin flush of the entire dirty data. + * This is a memory pressure situation and hence we do not complete the + * application write till all the backend writes complete. + * This will happen when user is writing faster than our backend write + * throughput, eventually dirty data will grow beyond the "inline write" + * threshold and then we have to slow down the writers by delaying + * completion. + * + * Other than this we have a special case of "write beyond eof" (termed + * sparse write). In sparse write case also we perform "inline write" of + * all the dirty data. This is needed for correct read behaviour. Imagine + * a reader reading from the sparse part of the file which is not yet in + * the bytes_chunk_cache. 
This read will be issued to the server and since + * server doesn't know the updated file size (as the write may still be + * sitting in our bytes_chunk_cache) it will return eof. This is not correct + * as such reads issued after successful write, are valid and should return + * 0 bytes for the sparse range. + */ + + // Case 1: Inline writes + /* + * Do we need to perform "inline write"? + * Inline write implies, we flush all the dirty data and wait for all the + * corresponding backend writes to complete. + */ + if (need_inline_write) { + INC_GBL_STATS(inline_writes, 1); + + // Free up the fuse thread without completing the application write. + perform_inline_writes(task, inode); + return true; } + // Case 2: Commit /* - * Perform inline sync. - * Since we pass the 3rd arg to sync_membufs, it tells sync_membufs() - * to call the fuse callback after all the issued backend writes - * complete. This will be done asynchronously while the sync_membufs() - * call will return after issuing the writes. + * We don't need to do inline writes. See if we need to commit the dirty + * data to the backend */ - inode->sync_membufs(bc_vec, false /* is_flush */, task); - inode->flush_unlock(); + if (need_commit) { + /* + * Set the commit_in_progress flag to ensure that the commit + * task is startet. + */ + inode->mark_commit_in_progress(); + + assert(inode->is_commit_in_progress()); + assert(!need_inline_write); + } + + // Case 3: Flush + if (need_flush) { + /* + * We need to flush the dirty data in the range [extent_left, extent_right), + * get the membufs for the dirty data. + * We don't want to run over an inprogress truncate and resetting the file + * size set by truncate, so grab the is_flushing lock. + */ + inode->flush_lock(); + + /* + * If flush is required but flush/commit already in progress, then + * add this task to the respective waiters list and return. + * As we already 1GB worth of dirty data in the cache, we don't want + * to add more data to the cache. So we wait for the ongoing flush/commit. + */ + if (inode->is_commit_in_progress()) { + /* + * If commit is in progress, we need to wait for the commit + * to complete before we can proceed with the write. + */ + inode->add_task_to_commit_waiters(task); + inode->flush_unlock(); + return true; + } + + if (inode->get_filecache()->is_flushing_in_progress()) { + /* + * If flush is in progress, we need to wait for the flush to + * complete before we can proceed with the write. + */ + inode->add_task_to_flush_waiters(task); + inode->flush_unlock(); + return true; + } + + /* + * If we are here, then we need to flush the dirty data to the backend. + * We don't need to wait for the flush to complete, we can complete the + * application write rightaway. + */ + uint64_t size = 0; + std::vector bc_vec; + + /* + * If the inode is stable write, then get the dirty membufs in the range. + * otherwise get the dirty membufs contigious range. + */ + if (inode->is_stable_write()) { + bc_vec = + inode->get_filecache()->get_dirty_nonflushing_bcs_range(extent_left, + extent_right); + } else { + bc_vec = inode->get_filecache()->get_contiguous_dirty_bcs(size); + } + + if (bc_vec.size() == 0) { + inode->flush_unlock(); + task->reply_write(length); + return true; + } - // Free up the fuse thread without completing the application write. - return; + /* + * Pass is_flush as false, since we don't want the writes to complete + * before returning. 
+ */ + inode->sync_membufs(bc_vec, false /* is_flush */); + inode->flush_unlock(); + + // Send reply to original request without waiting for the backend write to complete. + task->reply_write(length); + return true; + } + + /* + * If we reached true, then we have need_commit true, but need_flush false. + */ + assert(need_commit && !need_flush); + task->reply_write(length); + return true; } void rpc_task::run_write() @@ -2664,7 +2891,7 @@ void rpc_task::run_write() const size_t length = rpc_api->write_task.get_size(); struct fuse_bufvec *const bufv = rpc_api->write_task.get_buffer_vector(); const off_t offset = rpc_api->write_task.get_offset(); - const bool sparse_write = (offset > inode->get_file_size(true)); + const bool sparse_write = false; uint64_t extent_left = 0; uint64_t extent_right = 0; @@ -2753,83 +2980,6 @@ void rpc_task::run_write() assert(extent_right >= (extent_left + length)); - /* - * We have successfully copied the user data into the cache. - * We can have the following cases: - * 1. This new write caused a contiguous extent to be greater than - * max_dirty_extent_bytes(), aka MDEB. - * In this case we begin flush of this contiguous extent (in multiple - * parallel wsize sized blocks) since there's no benefit in waiting more - * as the data is sufficient for the server scheduler to effectively - * write, in optimal sized blocks. - * We complete the application write rightaway without waiting for the - * flush to complete as we are not under memory pressure. - * This will happen when user is sequentially writing to the file. - * 2. A single contiguous extent is not greater than MDEB but total dirty - * bytes waiting to be flushed is more than MDEB. - * In this case we begin flush of the entire dirty data from the cache - * as we have sufficient data for the server scheduler to perform - * batched writes effectively. - * We complete the application write rightaway without waiting for the - * flush to complete as we are not under memory pressure. - * This will happen when user is writing to the file in a random or - * pseudo-sequential fashion. - * 3. Cache has dirty data beyond the "inline write" threshold, ref - * do_inline_write(). - * In this case we begin flush of the entire dirty data. - * This is a memory pressure situation and hence we do not complete the - * application write till all the backend writes complete. - * This will happen when user is writing faster than our backend write - * throughput, eventually dirty data will grow beyond the "inline write" - * threshold and then we have to slow down the writers by delaying - * completion. - * - * Other than this we have a special case of "write beyond eof" (termed - * sparse write). In sparse write case also we perform "inline write" of - * all the dirty data. This is needed for correct read behaviour. Imagine - * a reader reading from the sparse part of the file which is not yet in - * the bytes_chunk_cache. This read will be issued to the server and since - * server doesn't know the updated file size (as the write may still be - * sitting in our bytes_chunk_cache) it will return eof. This is not correct - * as such reads issued after successful write, are valid and should return - * 0 bytes for the sparse range. - */ - - /* - * Do we need to perform "inline write"? - * Inline write implies, we flush all the dirty data and wait for all the - * corresponding backend writes to complete. 
- */ - const bool need_inline_write = - (sparse_write || inode->get_filecache()->do_inline_write()); - - if (need_inline_write) { - INC_GBL_STATS(inline_writes, 1); - - AZLogDebug("[{}] Inline write (sparse={}), {} bytes, extent @ [{}, {})", - ino, sparse_write, (extent_right - extent_left), - extent_left, extent_right); - - // Free up the fuse thread without completing the application write. - return perform_inline_writes(this, inode); - } - - /* - * We don't need to do inline writes. See if we have enough bytes to - * commit to the backend. - */ - const bool needs_commit = inode->get_filecache()->commit_required(); - if (needs_commit) { - /* - * Set the commit_in_progress flag to ensure that the commit - * task is startet. - */ - inode->mark_commit_in_progress(); - - assert(inode->is_commit_in_progress()); - assert(!need_inline_write); - } - /* * If the extent size exceeds the max allowed dirty size as returned by * max_dirty_extent_bytes(), then it's time to flush the extent. @@ -2845,83 +2995,11 @@ void rpc_task::run_write() inode->get_filecache()->max_dirty_extent_bytes(); assert(max_dirty_extent > 0); - /* - * How many bytes in the cache need to be flushed. These are dirty chunks - * which have not started flushing yet. - */ - const uint64_t bytes_to_flush = - inode->get_filecache()->get_bytes_to_flush(); - - AZLogDebug("[{}] extent_left: {}, extent_right: {}, size: {}, " - "bytes_to_flush: {} (max_dirty_extent: {})", - ino, extent_left, extent_right, - (extent_right - extent_left), - bytes_to_flush, - max_dirty_extent); - - if (inode->is_commit_in_progress()) { - /* - * Commit is in progress, we can't start new flush to the BLOB. - * Complete the write if it doesn't need flush, otherwise add to - * commit_waiters list. - * Note: It make sense to complete the write only after commit is done - * if we already reached the flush limit. As otherwise it will - * trigger inline writes. - */ - if (bytes_to_flush < max_dirty_extent) { - AZLogDebug("[{}] Commit in progress, skipping write, inline_write is false", - ino); - reply_write(length); - return; - } else { - /* - * Add this task to the commit_waiters list, so that it can be - * completed after the commit is done. - */ - if(inode->get_filecache()->add_commit_waiter( - inode->last_flushing_offset, - inode->last_flushing_length, - this)) { - AZLogDebug("[{}] Commit in progress, added to commit_waiters list", - ino); - return; - } else { - /* - * Commit is done, but this task is not added to commit_waiters - * list, so complete the write. - */ - AZLogDebug("[{}] Commit in progress, but not added to commit_waiters list, " - "completing write", ino); - reply_write(length); - return; - } - } - - // We should never reach here. - assert(false); - return; - } - /* * Ok, we neither need inline writes nor commit in progress. See if we have * enough dirty data and we need to start async flush. */ - if ((extent_right - extent_left) < max_dirty_extent) { - /* - * Current extent is not large enough to be flushed, see if we have - * enough total dirty data that needs to be flushed. This is to cause - * random writes to be periodically flushed. - */ - if (bytes_to_flush < max_dirty_extent) { - /* - * No memory pressure. - */ - AZLogDebug("Reply write without syncing to Blob"); - reply_write(length); - return; - } - /* * This is the case of non-sequential writes causing enough dirty * data to be accumulated, need to flush all of that. 
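Taken together, handle_writes() above reduces the old run_write() decision logic to three independent checks: inline write under memory pressure (or for a sparse write), commit when enough UNSTABLE-written data is pending, and async flush when enough dirty data has accumulated. The sketch below is a rough illustration of that decision; the counters and thresholds are illustrative stand-ins for do_inline_write(), commit_required() and flush_required(), and the global memory-pressure inputs are omitted.

#include <cstdint>

// Illustrative thresholds; in the patch they come from max_dirty_extent_bytes(),
// max_commit_bytes() and the inline-write check, all derived from the cache size.
struct cache_counters {
    uint64_t bytes_dirty = 0;            // copied to cache, not yet flushed
    uint64_t bytes_commit_pending = 0;   // flushed UNSTABLE, not yet committed
    uint64_t flush_threshold  = 1000ULL << 20;       // ~max_dirty_extent_bytes()
    uint64_t commit_threshold = 2000ULL << 20;       // ~max_commit_bytes()
    uint64_t inline_threshold = 2 * (1000ULL << 20); // ~inline write threshold
};

struct write_decision {
    bool inline_write;   // block the application write until the flush completes
    bool start_commit;   // issue a COMMIT for the commit_pending data
    bool start_flush;    // issue async WRITE RPCs, ack the application now
};

// Mirrors the order of handling in handle_writes(): inline write wins (memory
// pressure or sparse write), otherwise commit and/or flush may be started.
write_decision classify_write(const cache_counters& c, bool sparse_write)
{
    write_decision d{};
    d.inline_write = sparse_write || (c.bytes_dirty >= c.inline_threshold);
    d.start_commit = !d.inline_write && (c.bytes_commit_pending >= c.commit_threshold);
    d.start_flush  = !d.inline_write && (c.bytes_dirty >= c.flush_threshold);
    return d;
}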
@@ -2930,43 +3008,12 @@ void rpc_task::run_write() extent_right = UINT64_MAX; } - /* - * We need to flush the dirty data in the range [extent_left, extent_right), - * get the membufs for the dirty data. - * We don't want to run over an inprogress truncate and resetting the file - * size set by truncate, so grab the is_flushing lock. - */ - inode->flush_lock(); - - uint64_t size = 0; - std::vector bc_vec; - /* - * If the inode is stable write, then get the dirty membufs in the range. - * otherwise get the dirty membufs contigious range. - */ - if (inode->is_stable_write()) { - bc_vec = - inode->get_filecache()->get_dirty_nonflushing_bcs_range(extent_left, - extent_right); - } else { - bc_vec = inode->get_filecache()->get_contiguous_dirty_bcs(size); - } - - if (bc_vec.size() == 0) { - inode->flush_unlock(); - reply_write(length); + if (handle_writes(inode, this, ino, sparse_write, + extent_left, extent_right)) { return; + } else { + assert(false); } - - /* - * Pass is_flush as false, since we don't want the writes to complete - * before returning. - */ - inode->sync_membufs(bc_vec, false /* is_flush */); - inode->flush_unlock(); - - // Send reply to original request without waiting for the backend write to complete. - reply_write(length); } void rpc_task::run_flush() From fff47d6f9866a94d5c609055514fb825b26c5bbe Mon Sep 17 00:00:00 2001 From: Nagendra Tomar Date: Tue, 28 Jan 2025 13:47:39 +0000 Subject: [PATCH 3/9] Audit changes. - audited file_cache.h --- turbonfs/inc/file_cache.h | 226 ++++++++++++++++++++++++++------------ turbonfs/src/rpc_task.cpp | 2 +- 2 files changed, 158 insertions(+), 70 deletions(-) diff --git a/turbonfs/inc/file_cache.h b/turbonfs/inc/file_cache.h index 39ddee58..6a6165d1 100644 --- a/turbonfs/inc/file_cache.h +++ b/turbonfs/inc/file_cache.h @@ -233,10 +233,34 @@ struct membuf * immediately so that it can handle other interactive requests (note that * fuse threads are very limited). Later when the flush/write to Blob * actually completes, it completes the fuse task(s) queued in this list. + * + * Note: Depending on whether a membuf is written using UNSTABLE or STABLE + * write, it will or won't need to be committed. A membuf written + * using UNSTABLE write will be just added to the TBL (Temporary Block + * List) of the Blob and won't be visible to other readers and + * writers, till it's committed (which will add it to the Committed + * Block List, the CBL). + * A membuf written using STABLE write will be committed, i.e., it'll + * be added to the CBL, and won't need an extra commit call. + * So, with UNSTABLE write a membuf will have a flush call to write + * the membuf data and then a commit call to commit it to the Blob, + * (usually) along with more such membufs. Various membufs can be + * flushed in parallel, which allows us to get higher write throughput + * than what a single connection can support. */ std::vector *flush_waiters = nullptr; std::mutex flush_waiters_lock_44; + /** + * Like flush_waiters, commit_waiters is the list of tasks(s) waiting for + * this membuf to be committed, i.e., data in this membuf to be added to + * the CBL (Committed Block List) of the backing Blob. Note that the membuf + * must have started flushing, but it may not have yet completed flushing. + * When the membuf successfully completes flushing, the flush callback will + * start the commit and on completion of commit, the commit_waiters will be + * called. + * Also see flush_waiters. 
+ */ std::vector *commit_waiters = nullptr; std::mutex commit_waiters_lock_44; @@ -1235,6 +1259,27 @@ class bytes_chunk_cache uint64_t length, struct rpc_task *task); + /** + * Add 'task' to the commit_waiters list for the membuf covering the region + * [offset, offset+length). If the region is covered by more than one membuf + * then 'task' is added to the commit_waiters list of the last membuf. + * + * Returns true if task was successfully added. This will happen when the + * following conditions are met: + * 1. There is a membuf with at least one byte overlapping with the given + * region [offset, offset+length). + * 2. The membuf is either in flushing or commit_pending state. + * If in flushing state, it'll first move to commit_pending state + * after a successful flush and then, after the commit completes, this + * task will be completed. + * + * If it returns true, caller need not call the fuse callback as it'll + * get called when the membuf is committed. + * If task cannot be added to the commit_waiters list, it returns false, + * and in that case caller must complete the fuse callback. + * + * LOCK: This takes chunkmap_lock_43 lock exclusively. + */ bool add_commit_waiter(uint64_t offset, uint64_t length, struct rpc_task *task); @@ -1396,58 +1441,65 @@ } /** - * Maximum size a dirty extent can grow before we should flush it. + * Maximum size a dirty extent can grow before we must flush it. * This is 60% of the allowed cache size or 1GB whichever is lower. * The reason for limiting it to 1GB is because there's not much value in * holding more data than the Blob NFS server's scheduler cache size. * We want to send as prompt as possible to utilize the n/w b/w but slow * enough to give the write scheduler an opportunity to merge better. * - * Note: max_dirty_extent is static as it doesn't change after it's - * queried for the first time. + * TODO: For unstable writes, it may not make sense to wait after we have + * a full block worth of data. */ - uint64_t max_dirty_extent_bytes() const + static uint64_t max_dirty_extent_bytes() { // Maximum cache size allowed in bytes. static const uint64_t max_total = (aznfsc_cfg.cache.data.user.max_size_mb * 1024 * 1024ULL); assert(max_total != 0); - static const uint64_t max_dirty_extent = std::min( - (uint64_t)(max_total * 0.6), (uint64_t)(1024 * 1024 * 1024ULL)); + /* + * Minimum of 60% of max cache and 1000MiB. + * 1000MiB, as it can be equally divided into 100MiB blocks. + */ + static const uint64_t max_dirty_extent = + std::min((uint64_t)(max_total * 0.6), 1000 * 1024 * 1024UL); + + // Must be non-zero. assert(max_dirty_extent != 0); return max_dirty_extent; } - /* - * Maximum size of commit bytes that can be pending in cache, before we - * should commit it to Blob. - * It should be greater than max_dirty_extent_bytes() and smaller than - * inline_dirty_threshold. So, that inline pruning can be avoided. - * This is 60% of the allowed cache size. - * f.e = Cache size of 4GB then max_commit_bytes = 2.4GB - * - Flush will start every 1GB dirty data and each 1GB dirty data - * converted to commit_pending_bytes. - * - * This is 60% of the allowed cache size. - */ - uint64_t max_commit_bytes() const + /** + * Maximum size of commit_pending data that can be in cache, before we + * must commit it to Blob. 
+ * It should be greater than or equal to the flush threshold (as returned + * by max_dirty_extent_bytes()) and smaller than the inline write threshold + * (as suggested by do_inline_write()), to minimize inline flush waits as + * much as possible, in steady state. + */ + static uint64_t max_commit_bytes() { // Maximum cache size allowed in bytes. static const uint64_t max_total = (aznfsc_cfg.cache.data.user.max_size_mb * 1024 * 1024ULL); assert(max_total != 0); + /* + * Minimum of 60% of max cache and 2 times the flush limit. + * We want to commit as soon as possible w/o affecting performance. + * If we commit too often, since commit is a serializing operation, + * it'll affect the write throughput, otoh, if we commit too late + * then we might hit the inline write threshold, which again would + * serialize writes, bringing down throughput. + */ static const uint64_t max_commit_bytes = - std::max((((uint64_t)(max_total * 0.6))/ max_dirty_extent_bytes()) * max_dirty_extent_bytes(), - (uint64_t) (2 * AZNFSCFG_WSIZE_MAX)); - - static const uint64_t min_commit_bytes = - std::max(((2 * max_dirty_extent_bytes()) - AZNFSCFG_WSIZE_MAX), - (uint64_t) (2 * AZNFSCFG_WSIZE_MAX)); + std::min((uint64_t)(max_total * 0.6), + 2 * max_dirty_extent_bytes()); + assert(max_commit_bytes > 0); - return std::max((max_commit_bytes - AZNFSCFG_WSIZE_MAX), min_commit_bytes); + return max_commit_bytes; } /** @@ -1498,59 +1550,61 @@ class bytes_chunk_cache return bytes_flushing > 0; } - /* - * Check if bytes_commit_pending is equal or more than - * max_commit_bytes limit. + /** + * Check if we must initiate a COMMIT RPC now. Note that the caller would + * just send the COMMIT RPC and not necessarily block the user write + * request till the COMMIT RPC completes, i.e., it's not an inline commit. + * + * We must start commit if: + * 1. We have enough commit_pending data for this file/cache, or, + * 2. Global memory pressure dictates that we commit now to free up + * memory. In this case we might be committing more frequently which + * won't necessarily be optimal, but we have no choice due to the + * memory pressure. */ bool commit_required() const { - return (bytes_commit_pending >= max_commit_bytes()); - } + const bool local_pressure = + (bytes_commit_pending >= max_commit_bytes()); - /* - * Check if bytes_to_flush is equal or more than - * max_dirty_extent_bytes limit. - */ - bool flush_required() const - { - return (get_bytes_to_flush() >= max_dirty_extent_bytes()); + if (local_pressure) { + return true; + } + + /* + * TODO: Take cue from global memory pressure. + */ + return false; } - /* - * get_bytes_to_prune() returns the number of bytes that need to be flushed - * inline to free up the space. If there are enough bytes_flushing then we - * can just wait for them to complete. - * - * Note : We are not considering bytes_commit_pending in this calculation. - * If bytes_commit_pending are high then commit already started and - * if bytes_flushing are high once flushing is done commit will be - * triggered. + /** + * Check if we must initiate flush of some cached data. Note that the caller + * would just send the corresponding WRITE RPC and not necessarily block the + * user write request till the WRITE RPC completes, i.e., it's not an inline + * write. + * + * We must start flush/write if: + * 1. We have enough bytes to flush so that we can write a full sized + * block, or for the case of stable write, we have enough data to fill + * the scheduler queue. + * 2. 
Global memory pressure dictates that we flush now to free up memory. + * In this case we might be flushing more frequently which won't + * necessarily be optimal, but we have no choice due to the memory + * pressure. */ - uint64_t get_bytes_to_prune() const + bool flush_required() const { - static const uint64_t max_dirty_allowed_per_cache = - max_dirty_extent_bytes() * 2; - int64_t total_bytes = - std::max(int64_t(bytes_dirty - bytes_flushing), int64_t(0)); - const bool local_pressure = total_bytes > (int64_t)max_dirty_allowed_per_cache; + const bool local_pressure = + (get_bytes_to_flush() >= max_dirty_extent_bytes()); if (local_pressure) { - return std::max(int64_t(0), (int64_t)(total_bytes - max_dirty_extent_bytes())); + return true; } /* - * Global pressure is when get_prune_goals() returns non-zero bytes - * to be pruned inline. + * TODO: Take cue from global memory pressure. */ - uint64_t inline_bytes; - - /* - * TODO: Noisy neighbor syndrome, where one file is hogging the cache, - * inline pruning will be triggered for other files. - */ - get_prune_goals(&inline_bytes, nullptr); - return std::max(int64_t(inline_bytes - (bytes_flushing + bytes_commit_pending)), - (int64_t)0); + return false; } /** @@ -1559,6 +1613,21 @@ class bytes_chunk_cache * global memory pressure. */ bool do_inline_write() const + { + return get_inline_flush_bytes() > 0; + } + + /** + * Inline write/flush means that we are under sufficient memory pressure + * that we want to slow down the application writes by not completing them + * after copying the data to cache (using copy_to_cache()) but instead + * complete application writes only after the flush completes. + * + * This function returns a non-zero number representing the number of bytes + * that we need to write inline, else if not under memory pressure returns + * zero. + */ + uint64_t get_inline_flush_bytes() const { /* * Allow two dirty extents before we force inline write. @@ -1566,11 +1635,18 @@ class bytes_chunk_cache * the second one. */ static const uint64_t max_dirty_allowed_per_cache = - max_dirty_extent_bytes() * 2; - const bool local_pressure = bytes_dirty > max_dirty_allowed_per_cache; + (max_dirty_extent_bytes() * 2); + const bool local_pressure = + ((int64_t) get_bytes_to_flush() > (int64_t) max_dirty_allowed_per_cache); if (local_pressure) { - return true; + /* + * Leave one max_dirty_extent_bytes worth of dirty bytes, and + * flush the rest. + */ + const int64_t flush_now = + (get_bytes_to_flush() - max_dirty_extent_bytes()); + return std::max((int64_t) 0, flush_now); } /* @@ -1579,8 +1655,20 @@ class bytes_chunk_cache */ uint64_t inline_bytes; + /* + * TODO: Noisy neighbor syndrome, where one file is hogging the cache, + * inline pruning will be triggered for other files. + */ get_prune_goals(&inline_bytes, nullptr); - return (inline_bytes > 0); + + /* + * (bytes_flushing + bytes_commit_pending) represents the data which + * is either already flushed or being flushed. Exclude that from the + * needs-to-be-flushed data. + */ + const int64_t flush_now = + (inline_bytes - (bytes_flushing + bytes_commit_pending)); + return std::max((int64_t) 0, flush_now); } /** diff --git a/turbonfs/src/rpc_task.cpp b/turbonfs/src/rpc_task.cpp index 145493d6..ba6117da 100644 --- a/turbonfs/src/rpc_task.cpp +++ b/turbonfs/src/rpc_task.cpp @@ -2575,7 +2575,7 @@ static void perform_inline_writes(struct rpc_task *task, * flush more data we can just wait for the ongoing flush/commit * to complete. 
*/ - uint64_t prune_bytes = inode->get_filecache()->get_bytes_to_prune(); + uint64_t prune_bytes = inode->get_filecache()->get_inline_flush_bytes(); if (prune_bytes == 0 && !inode->is_stable_write()) { AZLogInfo("No bytes to prune, Wait for ongoing flush/commit" From 00353ef4158e24c956c38765a2931b3eccbf74d5 Mon Sep 17 00:00:00 2001 From: Nagendra Tomar Date: Wed, 29 Jan 2025 05:14:14 +0000 Subject: [PATCH 4/9] Audit rpc_task.cpp --- turbonfs/src/rpc_task.cpp | 179 ++++++++++++++++++++++++-------------- 1 file changed, 112 insertions(+), 67 deletions(-) diff --git a/turbonfs/src/rpc_task.cpp b/turbonfs/src/rpc_task.cpp index ba6117da..6f776571 100644 --- a/turbonfs/src/rpc_task.cpp +++ b/turbonfs/src/rpc_task.cpp @@ -806,8 +806,20 @@ void access_callback( } } +/** + * Helper function to complete all write tasks waiting for this membuf to be + * committed. + */ static void complete_commit_waiter_tasks(struct membuf *mb) { + /* + * membuf must have finished committing. + * Obviously must not be dirty, as we will only ever commit a clean membuf + * (after it completes flushing). + */ + assert(!mb->is_commit_pending()); + assert(!mb->is_dirty()); + std::vector tvec = mb->get_commit_waiters(); for (auto& task : tvec) { assert(task->magic == RPC_TASK_MAGIC); @@ -815,7 +827,7 @@ static void complete_commit_waiter_tasks(struct membuf *mb) assert(task->rpc_api->write_task.is_fe()); assert(task->rpc_api->write_task.get_size() > 0); - AZLogDebug("Completing flush waiter task {} for [{}, {})", + AZLogDebug("Completing commit waiter task {} for [{}, {})", fmt::ptr(task), task->rpc_api->write_task.get_offset(), task->rpc_api->write_task.get_offset() + @@ -831,40 +843,61 @@ static void commit_callback( void *private_data) { rpc_task *task = (rpc_task*) private_data; + + /* + * Commit is issued as a FUSE_FLUSH. + * TODO: Maybe it should have a type of its own. + */ assert(task->magic == RPC_TASK_MAGIC); assert(task->get_op_type() == FUSE_FLUSH); assert(task->rpc_api->pvt != nullptr); + // Commit is never called for a fuse request. + assert(task->get_fuse_req() == nullptr); - auto res = (COMMIT3res*)data; + auto res = (COMMIT3res*) data; INJECT_JUKEBOX(res, task); - const fuse_ino_t ino = - task->rpc_api->flush_task.get_ino(); + const fuse_ino_t ino = task->rpc_api->flush_task.get_ino(); struct nfs_inode *inode = task->get_client()->get_nfs_inode_from_ino(ino); - auto bc_vec_ptr = (std::vector *)task->rpc_api->pvt; + // List of bcs committed by this commit call. + auto bc_vec_ptr = (std::vector *) task->rpc_api->pvt; + + // We should call commit only when inode is doing unstable+commit. + assert(!inode->is_stable_write()); + // Caller must be inprogress. + assert(inode->is_commit_in_progress()); const int status = task->status(rpc_status, NFS_STATUS(res)); UPDATE_INODE_WCC(inode, res->COMMIT3res_u.resok.file_wcc); - AZLogDebug("commit_callback"); + + AZLogDebug("[{}] commit_callback", ino); + /* * Now that the request has completed, we can query libnfs for the * dispatch time. */ task->get_stats().on_rpc_complete(rpc_get_pdu(rpc), NFS_STATUS(res)); + if (status == 0) { uint64_t offset = 0; uint64_t length = 0; - auto &bc_vec = *bc_vec_ptr; // get the vector from the pointer. - for (auto &bc : bc_vec) - { + /* + * Go over all the successfully committed bcs and release them from + * file cache (note that successful commit confirms that server has + * persisted the data and client can fee it). + * Also complete any tasks waiting for these membufs to be committed. 
+ */ + for (auto &bc : *bc_vec_ptr) { struct membuf *mb = bc.get_membuf(); assert(mb->is_inuse()); assert(mb->is_locked()); assert(mb->is_commit_pending()); assert(mb->is_uptodate()); + // Dirty membufs must not be committed. + assert(!mb->is_dirty()); mb->clear_commit_pending(); complete_commit_waiter_tasks(mb); @@ -880,44 +913,41 @@ static void commit_callback( } else if (offset + length == bc.offset) { length += bc.length; } else { - auto release = inode->get_filecache()->release(offset, length); - AZLogInfo("BC released {}, asked for {}", release, length); + const uint64_t released = + inode->get_filecache()->release(offset, length); + AZLogDebug("[{}] commit_callback releasing bc [{}, {}), " + "released {} bytes", + ino, offset, offset+length, released); offset = bc.offset; length = bc.length; } } + // Release the last bc not released by the above loop. if (length != 0) { - assert(length != 0); - auto release = inode->get_filecache()->release(offset, length); - - AZLogInfo("BC offset {}, asked for {}, release {}", (offset/1048576), (length/1048576), release); - } - - task->free_rpc_task(); - #if 0 - if (task->get_fuse_req() != nullptr) { - task->reply_error(status); - } else { - task->free_rpc_task(); + const uint64_t released = + inode->get_filecache()->release(offset, length); + AZLogDebug("[{}] commit_callback releasing bc [{}, {}), " + "released {} bytes", + ino, offset, offset+length, released); } - #endif } else if (NFS_STATUS(res) == NFS3ERR_JUKEBOX) { task->get_client()->jukebox_retry(task); return; } else { - auto &bc_vec = *bc_vec_ptr; // get the vector from the pointer. /* - * Mark the membufs as dirty and clear the commit_pending flag. + * Commit has failed. + * Go over all the bcs that this commit was targetting, mark the + * membufs back as dirty and clear the commit_pending flag. * Next write will initiate the flush again with stable write. */ - for (auto &bc : bc_vec) - { + for (auto &bc : *bc_vec_ptr) { struct membuf *mb = bc.get_membuf(); assert(mb->is_inuse()); assert(mb->is_locked()); assert(mb->is_commit_pending()); assert(mb->is_uptodate()); + assert(!mb->is_dirty()); mb->clear_commit_pending(); complete_commit_waiter_tasks(mb); @@ -932,45 +962,48 @@ static void commit_callback( */ assert(inode->get_filecache()->is_flushing_in_progress() == false); inode->set_stable_write(); - - task->free_rpc_task(); - #if 0 - if (task->get_fuse_req() != nullptr) { - task->reply_error(status); - } else { - inode->set_write_error(status); - task->free_rpc_task(); - } - #endif } - delete bc_vec_ptr; - + /* + * Clear commit inprogress for this inode. + * Only after any new flushes or commits can be sent for this file. + */ inode->clear_commit_in_progress(); + + delete bc_vec_ptr; + task->rpc_api->pvt = nullptr; + task->free_rpc_task(); } void rpc_task::issue_commit_rpc() { // Must only be called for a flush task. assert(get_op_type() == FUSE_FLUSH); + // Commit is never called for a fuse request. + assert(get_fuse_req() == nullptr); const fuse_ino_t ino = rpc_api->flush_task.get_ino(); struct nfs_inode *inode = get_client()->get_nfs_inode_from_ino(ino); - assert(inode->is_stable_write() == false); - assert(inode->get_filecache()->is_flushing_in_progress() == false); + + // Commit must not be called if we are doing stable writes. + assert(!inode->is_stable_write()); + // Commit must mot be sent with ongoing flushes. + assert(!inode->get_filecache()->is_flushing_in_progress()); + // Caller must have marked commit inprogress. 
+ assert(inode->is_commit_in_progress()); COMMIT3args args; ::memset(&args, 0, sizeof(args)); bool rpc_retry = false; - AZLogDebug("issue_commit_rpc"); + AZLogDebug("[{}] issue_commit_rpc", ino); /* * Get the bcs marked for commit_pending. */ - if (rpc_api->pvt == nullptr) { - rpc_api->pvt = static_cast(new std::vector(inode->get_filecache()->get_commit_pending_bcs())); - } + assert(rpc_api->pvt != nullptr); + rpc_api->pvt = static_cast(new std::vector( + inode->get_filecache()->get_commit_pending_bcs())); args.file = inode->get_fh(); args.offset = 0; @@ -981,8 +1014,7 @@ void rpc_task::issue_commit_rpc() stats.on_rpc_issue(); if (rpc_nfs3_commit_task(get_rpc_ctx(), - commit_callback, &args, - this) == NULL) { + commit_callback, &args, this) == NULL) { stats.on_rpc_cancel(); /* * Most common reason for this is memory allocation failure, @@ -1161,7 +1193,7 @@ void bc_iovec::on_io_fail(int status) /* * This membuf has completed flushing albeit with failure, check if any - * task is waiting for this membuf to be flushed/commit. + * task is waiting for this membuf to be flushed. */ std::vector tvec = mb->get_flush_waiters(); for (auto& task : tvec) { @@ -1184,7 +1216,7 @@ void bc_iovec::on_io_fail(int status) * This membuf has completed flushing albeit with failure, check if any * task is waiting for this membuf to be committed. Now this membuf * failed to flush, so we don't issue commit for this membuf. Hence - * we must complete the waiter tasks. + * we must complete the commit waiter tasks. */ std::vector tvec_commit = mb->get_commit_waiters(); for (auto& task : tvec_commit) { @@ -1192,7 +1224,7 @@ void bc_iovec::on_io_fail(int status) assert(task->get_op_type() == FUSE_WRITE); assert(task->rpc_api->write_task.is_fe()); - AZLogError("Completing flush waiter task {} for [{}, {}), " + AZLogError("Completing commit waiter task {} for [{}, {}), " "failed: {}", fmt::ptr(task), task->rpc_api->write_task.get_offset(), @@ -1390,22 +1422,26 @@ static void write_iov_callback( bciov->on_io_fail(status); } - // check if commit is pending. + /* + * Check if commit is pending. + * Any thread that decides to commit and finds that there's an ongoing + * flush, will mark the inode as "needing commit" and when the flush + * completes, we need to start the commit. + */ if (!inode->get_filecache()->is_flushing_in_progress()) { bool create_commit_task = false; - std::unique_lock lock(inode->commit_lock_5); { + std::unique_lock lock(inode->commit_lock_5); if (inode->is_commit_pending()) { - assert(inode->is_stable_write() == false); + assert(!inode->is_stable_write()); inode->set_commit_in_progress(); create_commit_task = true; } } - lock.unlock(); if (create_commit_task) { - // Create a new flush_task for the remaining bc_iovec. + // Create a commit task to commit all bcs needing commit. 
struct rpc_task *commit_task = client->get_rpc_task_helper()->alloc_rpc_task_reserved(FUSE_FLUSH); commit_task->init_flush(nullptr /* fuse_req */, ino); @@ -2552,7 +2588,8 @@ void rpc_task::run_access() static void perform_inline_writes(struct rpc_task *task, struct nfs_inode *inode) { - AZLogInfo("Performing inline writes for inode {}", inode->get_fuse_ino()); + AZLogInfo("[{}] Performing inline write", inode->get_fuse_ino()); + const size_t length = task->rpc_api->write_task.get_size(); std::vector bc_vec; @@ -2570,16 +2607,17 @@ static void perform_inline_writes(struct rpc_task *task, } /* - * prune_bytes is the number of bytes that need to be flushed to + * flush_now is the number of bytes that need to be flushed to * make space for the new writes. If it's 0, then we don't need to * flush more data we can just wait for the ongoing flush/commit * to complete. */ - uint64_t prune_bytes = inode->get_filecache()->get_inline_flush_bytes(); + const uint64_t flush_now = inode->get_filecache()->get_inline_flush_bytes(); + + if (flush_now == 0 && !inode->is_stable_write()) { + AZLogInfo("No bytes to flush, Wait for ongoing flush/commit " + "to complete"); - if (prune_bytes == 0 && !inode->is_stable_write()) { - AZLogInfo("No bytes to prune, Wait for ongoing flush/commit" - "to complete."); /* * In case of inline_write, we want to block the writes till the * enough space is made for new writes. Every write thread will @@ -2616,7 +2654,7 @@ static void perform_inline_writes(struct rpc_task *task, * data into the membufs and issue the writes to the backend. */ bc_vec = inode->get_filecache()->get_contiguous_dirty_bcs(size); - if (size >= prune_bytes) { + if (size >= flush_now) { assert(!bc_vec.empty()); /* * Perform inline sync. @@ -2641,9 +2679,9 @@ static void perform_inline_writes(struct rpc_task *task, } if (!inode->is_stable_write()) { - AZLogWarn("Not enough contiguous dirty data to prune, may be" - "pattern is not sequential. Prune_bytes={}, size={}", - prune_bytes, size); + AZLogWarn("Not enough contiguous dirty data to flush, may be " + "pattern is not sequential. flush_now={}, size={}", + flush_now, size); /* * Let's switch to stable write mode. @@ -2795,6 +2833,7 @@ static bool handle_writes(struct nfs_inode *inode, */ inode->mark_commit_in_progress(); + // XXX What if commit completes before we reach here? assert(inode->is_commit_in_progress()); assert(!need_inline_write); } @@ -2814,6 +2853,12 @@ static bool handle_writes(struct nfs_inode *inode, * add this task to the respective waiters list and return. * As we already 1GB worth of dirty data in the cache, we don't want * to add more data to the cache. So we wait for the ongoing flush/commit. + * + * XXX What if is_commit_in_progress() returns false but some other + * thread starts commit right after that? + * Note that we hold flush_lock which only protects us against some + * other thread starting a flush but commit is protected by + * commit_lock_5. */ if (inode->is_commit_in_progress()) { /* @@ -3009,7 +3054,7 @@ void rpc_task::run_write() } if (handle_writes(inode, this, ino, sparse_write, - extent_left, extent_right)) { + extent_left, extent_right)) { return; } else { assert(false); From 624924d48bf8f820695db37e608fe4610833d311 Mon Sep 17 00:00:00 2001 From: Nitin Singla Date: Wed, 29 Jan 2025 06:34:21 +0000 Subject: [PATCH 5/9] Removed the commit_lock. 
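Dropping commit_lock_5 relies on flush_lock() being the single serialization point for commit-state transitions: mark_commit_in_progress() now runs with the flush lock held, and the write (flush) completion path re-acquires it before promoting a pending commit. The sketch below illustrates that protocol under simplifying assumptions: a plain std::mutex stands in for flush_lock()/flush_unlock(), and unlike the real code, which asserts that the caller already holds the lock, the sketch acquires it internally so it stays self-contained.

#include <mutex>

enum class commit_state { not_needed, needs_commit, in_progress };

struct inode_sync {
    std::mutex flush_lock;              // stands in for flush_lock()/flush_unlock()
    bool flushing_in_progress = false;  // WRITE RPCs outstanding
    commit_state cstate = commit_state::not_needed;
};

// Writer side: wants a commit, but must not overlap an ongoing flush.
void mark_commit_in_progress(inode_sync& in)
{
    std::lock_guard<std::mutex> g(in.flush_lock);
    if (in.cstate == commit_state::in_progress)
        return;
    if (in.flushing_in_progress)
        in.cstate = commit_state::needs_commit;   // flush completion will promote
    else
        in.cstate = commit_state::in_progress;    // issue_commit_rpc() would go here
}

// Flush completion side: last WRITE finished, promote a deferred commit.
void on_flush_complete(inode_sync& in)
{
    std::lock_guard<std::mutex> g(in.flush_lock);
    in.flushing_in_progress = false;
    if (in.cstate == commit_state::needs_commit)
        in.cstate = commit_state::in_progress;    // issue_commit_rpc() would go here
}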
--- turbonfs/inc/nfs_inode.h | 6 ----- turbonfs/src/nfs_inode.cpp | 55 ++++++++++---------------------------- turbonfs/src/rpc_task.cpp | 35 +++++++++++------------- 3 files changed, 30 insertions(+), 66 deletions(-) diff --git a/turbonfs/inc/nfs_inode.h b/turbonfs/inc/nfs_inode.h index ea3a2c95..17b6c4e5 100644 --- a/turbonfs/inc/nfs_inode.h +++ b/turbonfs/inc/nfs_inode.h @@ -448,12 +448,6 @@ struct nfs_inode std::atomic commit_state = commit_state_t::COMMIT_NOT_NEEDED; - /* - * commit_lock_5 is used to synchronize flush thread and write thread - * for commit operation. - */ - std::mutex commit_lock_5; - /** * TODO: Initialize attr with postop attributes received in the RPC * response. diff --git a/turbonfs/src/nfs_inode.cpp b/turbonfs/src/nfs_inode.cpp index a6eeba51..de342b89 100644 --- a/turbonfs/src/nfs_inode.cpp +++ b/turbonfs/src/nfs_inode.cpp @@ -440,15 +440,8 @@ void nfs_inode::switch_to_stable_write() * If there is something to commit, then commit it and * wait for the commit to complete. */ - std::unique_lock commit_lock(commit_lock_5); assert(!is_commit_in_progress()); - set_commit_in_progress(); - commit_lock.unlock(); - - /* - * Issue the commit RPC to commit the dirty membufs. - */ - commit_membufs(); + mark_commit_in_progress(); /* * Wait for the commit to complete. @@ -497,9 +490,13 @@ bool nfs_inode::check_stable_write_required(off_t offset) /** * commit_membufs() is called by writer thread to commit flushed membufs. + * It's always issued under flush_lock(). */ void nfs_inode::commit_membufs() { + assert(is_flushing); + assert(is_commit_in_progress()); + /* * Create the commit task to carry out the write. */ @@ -937,8 +934,12 @@ int nfs_inode::copy_to_cache(const struct fuse_bufvec* bufv, return err; } +/* + * This function is called with flush_lock() held. + */ void nfs_inode::mark_commit_in_progress() { + assert(is_flushing); assert(is_stable_write() == false); /* @@ -950,42 +951,14 @@ void nfs_inode::mark_commit_in_progress() * Note: Ensure lock is held till we set the commit state depending on flushing * going on or not. This lock to synchronize the commit and flush task. */ - bool start_commit_task = false; - - /* - * Take the flush_lock to ensure no new flush or commit task is started. - * commit_lock is held to synchronize the flush thread and this thread - * to set the commit state depending on flushing going on or not. - * - * Note: flush_lock can't be used as flush task can be started under the - * flush_lock (flush_cache_and_wait()) and it can cause deadlock. - * flush_cache_and_wait() is called under flush_lock and it checks - * if commit required, then it starts the commit task. - * - * Note: commit_lock held for very small duration, so it's safe to hold in - * fuse context. - */ if (!is_commit_in_progress()) { - std::unique_lock lock(commit_lock_5 /*commit_lock*/); - /* - * It may be possible that commit is in progress, so check again. - * Might be set by other thread. 
- */ - if (!is_commit_in_progress()) - { - if (get_filecache()->is_flushing_in_progress()) { - set_commit_pending(); - } else { - set_commit_in_progress(); - start_commit_task = true; - } + if (get_filecache()->is_flushing_in_progress()) { + set_commit_pending(); + } else { + set_commit_in_progress(); + commit_membufs(); } - lock.unlock(); - } - - if (start_commit_task) { - commit_membufs(); } } diff --git a/turbonfs/src/rpc_task.cpp b/turbonfs/src/rpc_task.cpp index 6f776571..9228efdc 100644 --- a/turbonfs/src/rpc_task.cpp +++ b/turbonfs/src/rpc_task.cpp @@ -868,6 +868,7 @@ static void commit_callback( assert(!inode->is_stable_write()); // Caller must be inprogress. assert(inode->is_commit_in_progress()); + assert(inode->get_filecache()->is_flushing_in_progress() == false); const int status = task->status(rpc_status, NFS_STATUS(res)); UPDATE_INODE_WCC(inode, res->COMMIT3res_u.resok.file_wcc); @@ -1001,7 +1002,7 @@ void rpc_task::issue_commit_rpc() /* * Get the bcs marked for commit_pending. */ - assert(rpc_api->pvt != nullptr); + assert(rpc_api->pvt == nullptr); rpc_api->pvt = static_cast(new std::vector( inode->get_filecache()->get_commit_pending_bcs())); @@ -1425,28 +1426,21 @@ static void write_iov_callback( /* * Check if commit is pending. * Any thread that decides to commit and finds that there's an ongoing - * flush, will mark the inode as "needing commit" and when the flush + * flush, will mark the inode as 'NEEDS_COMMIT' and when the flush * completes, we need to start the commit. */ if (!inode->get_filecache()->is_flushing_in_progress()) { - bool create_commit_task = false; - - { - std::unique_lock lock(inode->commit_lock_5); - if (inode->is_commit_pending()) { - assert(!inode->is_stable_write()); - inode->set_commit_in_progress(); - create_commit_task = true; - } - } + inode->flush_lock(); + if (inode->is_commit_pending()) { + assert(!inode->is_stable_write()); - if (create_commit_task) { - // Create a commit task to commit all bcs needing commit. - struct rpc_task *commit_task = - client->get_rpc_task_helper()->alloc_rpc_task_reserved(FUSE_FLUSH); - commit_task->init_flush(nullptr /* fuse_req */, ino); - commit_task->issue_commit_rpc(); + /* + * We are issuing the commit rpc, so set the commit_in_progress flag. + */ + inode->set_commit_in_progress(); + inode->commit_membufs(); } + inode->flush_unlock(); } /* @@ -2663,7 +2657,6 @@ static void perform_inline_writes(struct rpc_task *task, * sync_membufs(), but add the task to the commit_waiters list. */ inode->sync_membufs(bc_vec, false /* is_flush */, nullptr); - inode->flush_unlock(); /* * Set the commit_in_progress flag to ensure that the commit @@ -2672,6 +2665,7 @@ static void perform_inline_writes(struct rpc_task *task, * as we need to release the memory. */ inode->mark_commit_in_progress(); + inode->flush_unlock(); inode->add_task_to_commit_waiters(task); return; @@ -2827,6 +2821,7 @@ static bool handle_writes(struct nfs_inode *inode, * data to the backend */ if (need_commit) { + inode->flush_lock(); /* * Set the commit_in_progress flag to ensure that the commit * task is startet. @@ -2835,6 +2830,8 @@ static bool handle_writes(struct nfs_inode *inode, // XXX What if commit completes before we reach here? 
assert(inode->is_commit_in_progress()); + + inode->flush_unlock(); assert(!need_inline_write); } From 86107211a76ef613f2ef1c6ddf958a0afd94ddd9 Mon Sep 17 00:00:00 2001 From: Nagendra Tomar Date: Wed, 29 Jan 2025 22:59:15 +0000 Subject: [PATCH 6/9] Audit changes Audit nfs_inode.h Audit rpc_task.h --- turbonfs/inc/file_cache.h | 4 +- turbonfs/inc/nfs_inode.h | 130 ++++++++++++++++++++++++++++++-------- turbonfs/inc/rpc_task.h | 8 --- 3 files changed, 106 insertions(+), 36 deletions(-) diff --git a/turbonfs/inc/file_cache.h b/turbonfs/inc/file_cache.h index 6a6165d1..ef2b65df 100644 --- a/turbonfs/inc/file_cache.h +++ b/turbonfs/inc/file_cache.h @@ -1349,8 +1349,8 @@ class bytes_chunk_cache * check for that after holding the membuf lock, before it tries to * commit those membuf(s). */ - std::vector get_flushing_bc_range(uint64_t st_off, - uint64_t end_off) const; + std::vector get_flushing_bc_range(uint64_t st_off = 0, + uint64_t end_off = UINT64_MAX) const; /* * Returns *all* commit pending chunks in chunkmap. diff --git a/turbonfs/inc/nfs_inode.h b/turbonfs/inc/nfs_inode.h index 17b6c4e5..9abfc0ce 100644 --- a/turbonfs/inc/nfs_inode.h +++ b/turbonfs/inc/nfs_inode.h @@ -127,8 +127,16 @@ struct nfs_inode * * See flush_lock()/flush_unlock() for the actual locking. * + * Note: While flush_lock is held it's guaranteed that no new flush can + * start, but there can be ongoing flushes at the time when flush_lock + * is called (and it returns with the lock held). If the caller wants + * to proceed only after all ongoing flushes complete (and no new + * flushes are started), it must hold the flush_lock and then wait + * for ongoing flushes to complete. Ref flush_cache_and_wait(). + * * Note: Though it's called flush lock, but it protects backend file size - * changes through both flush and/or commit. + * changes through both flush and/or commit, i.e., if flush_lock is + * held it's guaranteed that no new flush or commit can start. */ mutable std::atomic is_flushing = false; mutable std::condition_variable_any flush_cv; @@ -319,9 +327,6 @@ struct nfs_inode * writes (either overwrite or sparse write) must go as a stable write * (since server knows best how to allocate blocks for them). * Once set to true, it remains true for the life of the inode. - * - * Note: As of now, we are not using this flag as commit changes not yet - * integrated, so we are setting this flag to true. */ bool stable_write = false; @@ -408,8 +413,10 @@ struct nfs_inode int write_error = 0; /* - * Last offset and length of the last flushing operation. - * This is used add task waiting for flush/commit to complete. + * Offset and length of the latest flush operation. + * Tasks that want to wait for multiple membufs to be flushed/committed, + * will add themselves to flush_waiters/commit_waiters list for this + * membuf. See add_task_to_flush_waiters()/add_task_to_commit_waiters(). */ uint64_t last_flushing_offset = 0; uint64_t last_flushing_length = 0; @@ -1138,42 +1145,86 @@ struct nfs_inode } /** - * Is commit pending for this inode? + * Is commit pending/scheduled for this inode? + * + * It's called from write_iov_callback() when flush/write completes to see + * if some other thread has scheduled commit (as it could not start commit + * since flush was in progress) and it needs to start commit now. + * + * This MUST be called with flush_lock held. 
*/ bool is_commit_pending() const { + assert(is_flushing); assert(commit_state != commit_state_t::INVALID); return (commit_state == commit_state_t::NEEDS_COMMIT); } /** - * set needs_commit state for this inode. - * Note this is set to let flushing task know that commit is pending and start commit task. + * Schedule commit for this inode. + * + * This MUST be called with flush_lock held, as only one thread must + * schedule commit. Note that a thread wanting to commit some data, checks + * if some other thread is already flushing data from that cache, if yes it + * cannot start the commit till the flush completes (as we need consistent + * TBL at the time of commit), so it just schedules the commit and + * write_iov_callback() then starts the commit when the last ongoing flush + * completes. If there's no ongoing flush at the time of this call, then the + * thread that wants to commit must start the commit itself, in that case + * it'll call set_commit_in_progress() and not set_commit_pending(). + * See schedule_or_start_commit(). + * + * XXX Do not confuse it with similar function in membuf. + * That is used to set "commit needed" for a membuf after it's written + * using UNSTABLE write. This one is for the inode and conveys that one + * or more membufs in this inode's filecache need to be committed. */ void set_commit_pending() { - // Commit can be set to pending only if it is in commit_not_needed state. + assert(is_flushing); + /* + * Commit MUST be scheduled only if it is not already running or + * scheduled. + */ assert(commit_state == commit_state_t::COMMIT_NOT_NEEDED); commit_state = commit_state_t::NEEDS_COMMIT; } /** * Is commit in progress for this inode? + * Note that we consider commit to be in-progress even if it's scheduled + * (commit_state == NEEDS_COMMIT) but not yet running. This is because + * once scheduled the commit WILL DEFINITELY run (when the last flush + * completes) and we usually don't need to distinguish between that and + * whether it's actually running now. + * + * This can only be safely called with flush_lock held. + * Depending on other conditions, caller may be able to call it w/o the + * flush_lock but be careful and think through scenarios where some + * other thread can call set_commit_pending()/set_commit_in_progress(), + * right after or before our call to is_commit_in_progress(). */ bool is_commit_in_progress() const { assert(commit_state != commit_state_t::INVALID); return ((commit_state == commit_state_t::COMMIT_IN_PROGRESS) || - (commit_state == commit_state_t::NEEDS_COMMIT)); + (commit_state == commit_state_t::NEEDS_COMMIT)); } /** * Set commit_in_progress state for this inode. + * + * This MUST be called with flush_lock held, as only one thread must + * start commit. */ void set_commit_in_progress() { + assert(is_flushing); + assert(commit_state != commit_state_t::INVALID); + // Must not already be in-progress or scheduled. assert(commit_state != commit_state_t::COMMIT_IN_PROGRESS); + commit_state = commit_state_t::COMMIT_IN_PROGRESS; } @@ -1352,42 +1403,67 @@ struct nfs_inode int flush_cache_and_wait(uint64_t start_off = 0, uint64_t end_off = UINT64_MAX); - void mark_commit_in_progress(); - void add_task_to_commit_waiters(struct rpc_task *task); + /** + * Start commit of (all) uncommitted data. Since commit and flush cannot + * run in parallel, if there's an ongoing flush it schedules commit by + * calling set_commit_pending() which will cause the last completing flush + * to trigger the commit, else it starts the commit. 
+ * MUST be called only when doing unstable writes. + */ + void schedule_or_start_commit(); + + /** + * Helper method for calling file_cache's add_flush_waiter() method. + * If not able to add successfully, then it completes the task else the + * task will be completed when ongoing flush completes. + * Call to this function guarantees that task will be completed and + * caller need not complete the task. + */ void add_task_to_flush_waiters(struct rpc_task *task); + /** + * Helper method for calling file_cache's add_commit_waiter() method. + * If not able to add successfully, then it completes the task else the + * task will be completed when ongoing commit or commit initiated after + * the ongoing flush completes. + * Call to this function guarantees that task will be completed and + * caller need not complete the task. + */ + void add_task_to_commit_waiters(struct rpc_task *task); + /** * Wait for currently flushing membufs to complete. * If parent_task is non-null, then it's added to the commit_waiters list - * and returned. Otherwise it waits for the ongoing flush to complete. + * and returned, otherwise it waits for the ongoing flush (and subsequent + * commit) to complete. * * Returns 0 on success and a positive errno value on error. * * Note : Caller must hold the inode is_flushing lock to ensure that - * no new membufs are added till this call completes. + * no new flush is started till this call completes. */ - int wait_for_ongoing_flush_commit(uint64_t start_off = 0, - uint64_t end_off = UINT64_MAX, - struct rpc_task *parent_task = nullptr); + int wait_for_ongoing_flush_commit(struct rpc_task *parent_task = nullptr); - /* - * commit_membufs() is called to commit uncommitted membufs to the BLOB. + /** + * commit_membufs() is called to commit uncommitted membufs to the Blob. * It creates commit RPC and sends it to the NFS server. */ void commit_membufs(); - /* + /** * switch_to_stable_write() is called to switch the inode to stable write - * mode. There is should be no ongoing commit/flusing operation when this - * is called. It creates a commit RPC to commit all the uncopmmitted membufs - * to the BLOB. + * mode. It waits for all ongoing flush and subsequent commit to complete. + * If not already scheduled, it'll perform an explicit commit after the + * flush complete. + * Post that it'll mark inode for stable write and return. From then on + * any writes to this inode will be sent as stable writes. */ void switch_to_stable_write(); /** * Check if stable write is required for the given offset. - * Given offset is the start of contigious dirty membufs that need to be - * flushed to the BLOB. + * Given offset is the start of contiguous dirty membufs that need to be + * flushed to the Blob. */ bool check_stable_write_required(off_t offset); @@ -1405,6 +1481,8 @@ struct nfs_inode * caller in case of memory pressure when we want to delay fuse callbacks * to slow down writes which can cause more memory to be dirtied. * + * Note: Must be called with flush_lock held. + * * Note: sync_membufs() can free parent_task if all issued backend * writes complete before sync_membufs() could return. * DO NOT access parent_task after sync_membufs() returns. diff --git a/turbonfs/inc/rpc_task.h b/turbonfs/inc/rpc_task.h index 72c77788..c9fd4303 100644 --- a/turbonfs/inc/rpc_task.h +++ b/turbonfs/inc/rpc_task.h @@ -2000,14 +2000,6 @@ struct rpc_task csched = _csched; } - void set_task_csched(bool stable_write) - { - if (client->mnt_options.nfs_port == 2048) { - csched = stable_write ? 
CONN_SCHED_FH_HASH : CONN_SCHED_RR; - } - return; - } - conn_sched_t get_csched() const { assert(csched > CONN_SCHED_INVALID && From ba09ab205411c74d7592137d6d8682e2f19f7898 Mon Sep 17 00:00:00 2001 From: Nagendra Tomar Date: Thu, 30 Jan 2025 05:58:43 +0000 Subject: [PATCH 7/9] Audit changes --- turbonfs/inc/file_cache.h | 7 +- turbonfs/inc/nfs_inode.h | 7 +- turbonfs/src/file_cache.cpp | 25 -- turbonfs/src/nfs_client.cpp | 2 + turbonfs/src/nfs_inode.cpp | 338 ++++++++++++++----------- turbonfs/src/rpc_task.cpp | 488 ++++++++++++++++++------------------ 6 files changed, 448 insertions(+), 419 deletions(-) diff --git a/turbonfs/inc/file_cache.h b/turbonfs/inc/file_cache.h index ef2b65df..38afcc89 100644 --- a/turbonfs/inc/file_cache.h +++ b/turbonfs/inc/file_cache.h @@ -1284,9 +1284,6 @@ class bytes_chunk_cache uint64_t length, struct rpc_task *task); - void cleanup_on_error(); - - /* * Returns all dirty chunks for a given range in chunkmap. * Before returning it increases the inuse count of underlying membuf(s). @@ -1302,8 +1299,8 @@ class bytes_chunk_cache * check for that after holding the membuf lock, before it tries to * flush those membuf(s). */ - std::vector get_dirty_bc_range(uint64_t st_off, - uint64_t end_off) const; + std::vector get_dirty_bc_range(uint64_t st_off = 0, + uint64_t end_off = UINT64_MAX) const; /* * Returns dirty chunks which are not already flushing, in the given range, diff --git a/turbonfs/inc/nfs_inode.h b/turbonfs/inc/nfs_inode.h index 9abfc0ce..e33e61f9 100644 --- a/turbonfs/inc/nfs_inode.h +++ b/turbonfs/inc/nfs_inode.h @@ -1383,9 +1383,7 @@ struct nfs_inode /** * Flush the dirty file cache represented by filecache_handle and wait - * till all dirty data is sync'ed with the NFS server. Only dirty data - * in the given range is flushed if provided, else all dirty data is - * flushed. + * till all dirty data is sync'ed with the NFS server. * Note that filecache_handle is the only writeback cache that we have * and hence this only flushes that. * For a non-reg file inode this will be a no-op. @@ -1400,8 +1398,7 @@ struct nfs_inode * initiate any new flush operations while some truncate call is in * progress (which must have held the is_flushing lock). */ - int flush_cache_and_wait(uint64_t start_off = 0, - uint64_t end_off = UINT64_MAX); + int flush_cache_and_wait(); /** * Start commit of (all) uncommitted data. 
Since commit and flush cannot diff --git a/turbonfs/src/file_cache.cpp b/turbonfs/src/file_cache.cpp index a6ff86be..f948c3c1 100644 --- a/turbonfs/src/file_cache.cpp +++ b/turbonfs/src/file_cache.cpp @@ -2523,31 +2523,6 @@ bool bytes_chunk_cache::add_commit_waiter(uint64_t offset, return false; } -void bytes_chunk_cache::cleanup_on_error() -{ - const std::unique_lock _lock(chunkmap_lock_43); - - for (auto it = chunkmap.begin(); it != chunkmap.cend(); ++it) { - struct bytes_chunk& bc = it->second; - struct membuf *mb = bc.get_membuf(); - if (!mb->is_inuse() && !mb->is_locked()) { - mb->set_inuse(); - mb->set_locked(); - if (mb->is_dirty()) { - mb->clear_dirty(); - } - - if (mb->is_flushing()) { - mb->clear_flushing(); - } - - if (mb->is_commit_pending()) { - mb->clear_commit_pending(); - } - } - } -} - std::vector bytes_chunk_cache::get_dirty_bc_range( uint64_t start_off, uint64_t end_off) const { diff --git a/turbonfs/src/nfs_client.cpp b/turbonfs/src/nfs_client.cpp index 0589c5e4..d5ec131e 100644 --- a/turbonfs/src/nfs_client.cpp +++ b/turbonfs/src/nfs_client.cpp @@ -1180,6 +1180,8 @@ void nfs_client::getattr( * updating nfs_inode::attr during cached writes and then returning * attributes from that instead of making a getattr call here. * We need to think carefully though. + * + * TODO: flush_cache_and_wait() */ if (inode->is_regfile()) { AZLogDebug("[{}] Flushing file data ahead of getattr", diff --git a/turbonfs/src/nfs_inode.cpp b/turbonfs/src/nfs_inode.cpp index de342b89..4776069f 100644 --- a/turbonfs/src/nfs_inode.cpp +++ b/turbonfs/src/nfs_inode.cpp @@ -382,48 +382,47 @@ void nfs_inode::wait_for_ongoing_commit() { assert(is_flushing); + /* + * TODO: See if we can eliminate inline sleep. + */ + if (is_commit_in_progress()) { + AZLogWarn("[{}] wait_for_ongoing_commit() will sleep inline!!", + get_fuse_ino()); + } + while (is_commit_in_progress()) { assert(!get_filecache()->is_flushing_in_progress()); ::usleep(1000); } - assert(is_commit_in_progress() == false); + assert(!is_commit_in_progress()); } /* * This function is called with flush_lock() held. - * In normal case this function should be no-op as - * when we reach here all the previous flushes/commits - * should have been completed. But in case of committing - * at the close of file, we need to wait if any flush - * going on and commit all the dirty membufs which are - * not yet committed. + * This should be called whenever we figure out that we cannot proceed with + * unstable writes (most common reason being, next write is not an append + * write). Once this function returns, following is guaranteed: + * - There will be no flushes in progress. + * - There will be no commit_pending data and no commit inprogress. + * - inode->stable_write will be set to true. */ void nfs_inode::switch_to_stable_write() { assert(is_flushing); - AZLogInfo("[{}] Switching to stable write", ino); + assert(!is_stable_write()); - /* - * We should not be in commit in progress state. - * Switch_to_stable_write() is called by flush thread - * and it shouldn't be called when commit is in progress. - */ - assert(is_commit_in_progress() == false); + AZLogInfo("[{}] Switching to stable write", ino); /* - * switch_to_stable_write() called from following paths. - * 1. sync_membufs(), No new flush should be started if - * there is ongoing flush is in progress. - * 2. flush_cache_and_wait(), Wait for ongoing flush to - * complete. 
- * As we are allowing multiple flush in parallel in case - * of unstable write as well, we should wait for ongoing - * flush to complere before switching to stable write. - * switch_to_stable_write() is called by flush_lock() held - * so no new flush can be started. + * switch_to_stable_write() is called from places where we are about to + * start a flush operation. Before that we check to see if we need to + * change to stable write. Since we are not flushing yet and since we + * do not support multiple ongoing flushes, we are guaranteed that no + * flush should be in progress when we reach here. + * Similarly commit should not be in progress. */ - wait_for_ongoing_flush_commit(); + assert(!is_commit_in_progress()); assert(!get_filecache()->is_flushing_in_progress()); /* @@ -440,8 +439,7 @@ void nfs_inode::switch_to_stable_write() * If there is something to commit, then commit it and * wait for the commit to complete. */ - assert(!is_commit_in_progress()); - mark_commit_in_progress(); + schedule_or_start_commit(); /* * Wait for the commit to complete. @@ -495,7 +493,8 @@ bool nfs_inode::check_stable_write_required(off_t offset) void nfs_inode::commit_membufs() { assert(is_flushing); - assert(is_commit_in_progress()); + + set_commit_in_progress(); /* * Create the commit task to carry out the write. @@ -508,10 +507,29 @@ void nfs_inode::commit_membufs() commit_task->issue_commit_rpc(); } +/** + * sync_membufs() + */ void nfs_inode::sync_membufs(std::vector &bc_vec, bool is_flush, struct rpc_task *parent_task) { + assert(is_flushing); + + if (!is_stable_write()) { + /* + * We do not allow a new flush while there's an ongoing one, in case + * of unstable writes. + */ + assert(!get_filecache()->is_flushing_in_progress()); + } + + /* + * Stable won't have commit and for unstable we cannot flush while + * commit is going on. + */ + assert(!is_commit_in_progress()); + if (bc_vec.empty()) { return; } @@ -540,21 +558,11 @@ void nfs_inode::sync_membufs(std::vector &bc_vec, parent_task->num_ongoing_backend_writes = 1; } - /* - * New flush should not be started if flush/commit is in progress. - * But it may happen commit_is_pending() is set by another thread. - */ - assert(!is_commit_in_progress()); - - if (bc_vec.empty()) { - return; - } - /* * If new offset is not at the end of the file, * then we need to switch to stable write. */ - if(check_stable_write_required(bc_vec[0].offset)) { + if (check_stable_write_required(bc_vec[0].offset)) { switch_to_stable_write(); } @@ -563,7 +571,20 @@ void nfs_inode::sync_membufs(std::vector &bc_vec, */ struct rpc_task *write_task = nullptr; - // Flush dirty membufs to backend. + /* + * Flush dirty membufs to backend. + * We increment bytes_flushing by 1 before starting any of the flush to + * protect is against the condition where whatever flush we have issued + * completes till we could issue all the flushes. In write_iov_callback() + * we will wrongly treat it as "all flush completed" and start the commit, + * while this thread may still keep doing flush for membufs. + * + * Also assert that for unstable writes, we should never start a new flush + * while there is an ongoing flush. + */ + assert((get_filecache()->bytes_flushing == 0) || is_stable_write()); + get_filecache()->bytes_flushing++; + for (bytes_chunk& bc : bc_vec) { /* * Get the underlying membuf for bc. @@ -718,6 +739,22 @@ void nfs_inode::sync_membufs(std::vector &bc_vec, write_task->issue_write_rpc(); } + /* + * Drop the protective bytes_flushing count. 
+ * If bytes_flushing becomes 0 then all flushes issued above have completed + * by the time we reached here. Check if we need to start the commit. + * Any thread that decides to commit and finds that there's an ongoing + * flush, will mark the inode as 'NEEDS_COMMIT' and when the flush + * completes, we need to start the commit. + */ + if (--get_filecache()->bytes_flushing == 0) { + if (is_commit_pending()) { + assert(!is_stable_write()); + + commit_membufs(); + } + } + /* * Drop the protective num_ongoing_backend_writes count taken at the start * of this function, and if it's the only one remaining that means all @@ -936,27 +973,28 @@ int nfs_inode::copy_to_cache(const struct fuse_bufvec* bufv, /* * This function is called with flush_lock() held. + * Once this function is called it's guaranteed that a commit will be done, + * either from here or from the last flush (write_iov_callback()). */ -void nfs_inode::mark_commit_in_progress() +void nfs_inode::schedule_or_start_commit() { assert(is_flushing); - assert(is_stable_write() == false); + assert(!is_stable_write()); /* * We want to commit, but we need to wait for current flushing to complete. - * 1. If flushing is going on, set commit_pending to inode, when last membuf flushed, - * write_iov_callback(), start new task to commit the data. - * 2. If flushing is not going on, then create the task and start commit of data. + * 1. If flushing is going on, set commit_pending for the inode, when last + * membuf flush completes write_iov_callback() will commit the data. + * 2. If flushing is not going on, then start commit of data rightaway. * - * Note: Ensure lock is held till we set the commit state depending on flushing - * going on or not. This lock to synchronize the commit and flush task. + * Note: Ensure lock is held till we set the commit state depending on + * flushing going on or not. This lock is to synchronize the commit + * and flush task. */ - if (!is_commit_in_progress()) - { + if (!is_commit_in_progress()) { if (get_filecache()->is_flushing_in_progress()) { set_commit_pending(); } else { - set_commit_in_progress(); commit_membufs(); } } @@ -964,13 +1002,20 @@ void nfs_inode::mark_commit_in_progress() void nfs_inode::add_task_to_flush_waiters(struct rpc_task *task) { + // Must be called only for unstable writes. + assert(!is_stable_write()); + assert(task->magic == RPC_TASK_MAGIC); + // Must be a frontend write task. + assert(task->get_op_type() == FUSE_WRITE); + assert(task->rpc_api->write_task.is_fe()); + /* * Add this task to the flush_waiter list, so that it can be * completed after the flush is done. */ - if(get_filecache()->add_flush_waiter(last_flushing_offset, - last_flushing_length, - task)) { + if (get_filecache()->add_flush_waiter(last_flushing_offset, + last_flushing_length, + task)) { AZLogDebug("[{}] flush in progress, added to flush_waiter list" " for [{}, {})", ino, last_flushing_offset, last_flushing_length); @@ -980,8 +1025,8 @@ void nfs_inode::add_task_to_flush_waiters(struct rpc_task *task) * flush is done, but this task is not added to flush_waiter * list, so complete the write. 
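The bytes_flushing bump before the issue loop in sync_membufs(), and the matching decrement after it, follow the usual protective in-flight count idiom; num_ongoing_backend_writes is handled the same way. A generic sketch with illustrative names is below; in the patch the counters are byte counts and the "all done" action is the commit or parent-task completion, so treat this only as an illustration of the idiom.

    // Hold one extra reference across the issue loop so completions that
    // race with issuing cannot observe the count dropping to zero early.
    #include <atomic>
    #include <functional>

    struct inflight_guard
    {
        std::atomic<int> count{0};
        std::function<void()> on_all_done;

        void issue_all(int n, const std::function<void(int)>& issue_one)
        {
            count.fetch_add(1);            // protective reference
            for (int i = 0; i < n; i++) {
                count.fetch_add(1);        // one per async operation
                issue_one(i);              // completion must call complete_one()
            }
            // Drop the protective reference; if every completion already ran,
            // this call is the one that triggers on_all_done().
            complete_one();
        }

        void complete_one()
        {
            if (count.fetch_sub(1) == 1) {
                on_all_done();
            }
        }
    };

Whichever thread brings the count to zero, issuer or last completion, runs the final action exactly once.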
*/ - AZLogDebug("[{}] flush is done, not added to flush_waiter list, " - "completing write", ino); + AZLogDebug("[{}] Could not add to flush_waiters list, completing " + "write task {}", ino, fmt::ptr(task)); task->reply_write(task->rpc_api->write_task.get_size()); return; } @@ -989,63 +1034,55 @@ void nfs_inode::add_task_to_flush_waiters(struct rpc_task *task) void nfs_inode::add_task_to_commit_waiters(struct rpc_task *task) { - /* - * This should be called only for unstable writes. - */ - assert(is_stable_write() == false); + // Must be called only for unstable writes. + assert(!is_stable_write()); + assert(task->magic == RPC_TASK_MAGIC); + // Must be a frontend write task. + assert(task->get_op_type() == FUSE_WRITE); + assert(task->rpc_api->write_task.is_fe()); /* * Add this task to the commit_waiters list, so that it can be * completed after the commit is done. */ - if(get_filecache()->add_commit_waiter(last_flushing_offset, - last_flushing_length, - task)) { + if (get_filecache()->add_commit_waiter(last_flushing_offset, + last_flushing_length, + task)) { AZLogDebug("[{}] Commit in progress, added to commit_waiters list" " for [{}, {})", ino, last_flushing_offset, last_flushing_length); + /* + * Since we don't complete the write task here, add_commit_waiter() + * MUST ensure task will be completed if it returns true. + */ return; } else { /* * Commit is done, but this task is not added to commit_waiters * list, so complete the write. */ - AZLogDebug("[{}] Commit in progress, but not added to commit_waiters list, " - "completing write", ino); + AZLogDebug("[{}] Could not add to commit_waiters list, completing " + "write task {}", ino, fmt::ptr(task)); task->reply_write(task->rpc_api->write_task.get_size()); return; } } /** - * Note: Caller should call with flush_lock() held. - * On return there is no ongoing flush/commit in progress. + * Wait for ongoing flushes and commit all uncommitted data. + * Caller must call with flush_lock() held. + * On return there would be no ongoing flush/commit in progress and all + * uncommitted data would have been committed. * - * flush_cache_and_wait() checks for return value, as it's - * called on close() and need to know final status of the writes. + * If caller passes a valid frontend write task, the call itself will + * return immediately but the task will be completed after flush and commit + * have completed. If caller does not pass a tak pointer, then the call + * itself would block till all flush and commit complete. */ -int nfs_inode::wait_for_ongoing_flush_commit(uint64_t start_off, - uint64_t end_off, - struct rpc_task *task) +int nfs_inode::wait_for_ongoing_flush_commit(struct rpc_task *task) { - // Caller must call us with is_flushing lock held. + // Caller must call us with flush_lock held. assert(is_flushing); - assert(start_off < end_off); - - if (is_stable_write()) { - assert(!is_commit_in_progress()); - assert(get_filecache()->bytes_commit_pending == 0); - } - - /* - * If commit_pending limit reached, we need to start the commit, - * but we need to wait for the current flushing to complete, so - * we set the commit_pending flag and once flusing completes it - * will start the commit task. - */ - if (get_filecache()->is_flushing_in_progress()) { - assert(!is_commit_in_progress() || is_commit_pending()); - } /* * MUST be called only for regular files. 
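add_flush_waiter()/add_commit_waiter() follow a register-or-complete contract: a true return means the completion path now owns the task, a false return means the operation already finished and the caller must complete the task itself, so either way the task is completed exactly once. A simplified sketch of that contract follows; the types and names are illustrative, whereas the real waiter lists hang off the membuf covering [last_flushing_offset, last_flushing_length).

    #include <functional>
    #include <mutex>
    #include <vector>

    struct waiter_list
    {
        std::mutex lock;
        bool operation_done = false;               // e.g. commit already completed
        std::vector<std::function<void()>> waiters;

        // true  => queued; the flush/commit completion path will run it later.
        // false => operation already finished; caller completes the task itself.
        bool add_waiter(std::function<void()> completion)
        {
            std::lock_guard<std::mutex> g(lock);
            if (operation_done)
                return false;
            waiters.emplace_back(std::move(completion));
            return true;
        }

        // Called exactly once when the flush/commit completes.
        void complete_all()
        {
            std::vector<std::function<void()>> to_run;
            {
                std::lock_guard<std::mutex> g(lock);
                operation_done = true;
                to_run.swap(waiters);   // splice, like get_commit_waiters()
            }
            for (auto& fn : to_run)
                fn();
        }
    };

The decision of done-vs-pending and the enqueue happen under one lock, which is what lets the helper promise "the task will be completed" whenever it returns true.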
@@ -1056,6 +1093,14 @@ int nfs_inode::wait_for_ongoing_flush_commit(uint64_t start_off, return 0; } + if (task) { + assert(task->magic == RPC_TASK_MAGIC); + // Must be a frontend write task. + assert(task->get_op_type() == FUSE_WRITE); + assert(task->rpc_api->write_task.is_fe()); + assert(task->rpc_api->write_task.get_size() > 0); + } + /* * If flush() is called w/o open(), there won't be any cache, skip. */ @@ -1064,17 +1109,27 @@ int nfs_inode::wait_for_ongoing_flush_commit(uint64_t start_off, } /* - * Flushing not in progress and no new flushing can be started as we hold - * the flush_lock(). - * It may be possible that the flush is not in progress and but we may have - * commit_in_progress or commit_pending_bytes to commit. - * In that case we should start the commit task if it's not already in - * progress. And wait for the commit to complete. + * Stable writes do not need commit, so no commit inprogress and no pending + * commit data. */ - if (!get_filecache()->is_flushing_in_progress() && - !is_commit_in_progress() && - get_filecache()->bytes_commit_pending == 0) { - AZLogDebug("[{}] No flush in progress, returning", ino); + if (is_stable_write()) { + assert(!is_commit_in_progress()); + assert(get_filecache()->bytes_commit_pending == 0); + } + + if (get_filecache()->is_flushing_in_progress()) { + /* + * Flushing and committing cannot happen simultaneously. + * This is effectively asserting the following: + * assert(commit_state != commit_state_t::COMMIT_IN_PROGRESS); + */ + assert(!is_commit_in_progress() || is_commit_pending()); + } else if (!is_commit_in_progress() && + get_filecache()->bytes_commit_pending == 0) { + /* + * No flush or commit in progress and nothing to commit, return. + */ + AZLogDebug("[{}] No flush or commit in progress, returning", ino); /* * Complete the write task if it was passed. @@ -1086,19 +1141,18 @@ int nfs_inode::wait_for_ongoing_flush_commit(uint64_t start_off, } /* - * Try to get the flushing bytes_chunk from the filecache handle if the - * flush is in progress. + * Ok, so either flushing is going on or commit is going on or we have at + * least one byte to commit. */ std::vector bc_vec = {}; - if (task == nullptr && get_filecache()->is_flushing_in_progress()) { + if ((task == nullptr) && get_filecache()->is_flushing_in_progress()) { /* - * Get the flushing bytes_chunk from the filecache handle. - * This will grab an exclusive lock on the file cache and return the list - * of flushing bytes_chunks at that point. Note that we can have new dirty - * bytes_chunks created but we don't want to wait for those. + * Get the flushing bc from the filecache. + * We will need to wait for these inline. + * Note that we can have new dirty membufs created after the call but + * we don't want to wait for those. */ - bc_vec = - filecache_handle->get_flushing_bc_range(start_off, end_off); + bc_vec = filecache_handle->get_flushing_bc_range(); } /* @@ -1110,7 +1164,7 @@ int nfs_inode::wait_for_ongoing_flush_commit(uint64_t start_off, * Task will wait on last flushed offset and length. */ if (task == nullptr) { - for (bytes_chunk &bc : bc_vec) { + for (bytes_chunk& bc : bc_vec) { struct membuf *mb = bc.get_membuf(); assert(mb != nullptr); @@ -1119,15 +1173,16 @@ int nfs_inode::wait_for_ongoing_flush_commit(uint64_t start_off, /* * If still dirty after we get the lock, it may mean two things: - * - Write failed. + * - Write failed. We ignore the failure and pretend as if flush + * completed. 
* - Some other thread got the lock before us and it made the - * membuf dirty again. + * membuf dirty again. Since we wanted to wait only for ongoing + * flushes and not newly written data, we ignore this. */ if (mb->is_dirty() && get_write_error()) { AZLogError("[{}] Flush [{}, {}) failed with error: {}", - ino, - bc.offset, bc.offset + bc.length, - get_write_error()); + ino, bc.offset, bc.offset + bc.length, + get_write_error()); } mb->clear_locked(); @@ -1163,17 +1218,21 @@ int nfs_inode::wait_for_ongoing_flush_commit(uint64_t start_off, /* * Unstable write case, we need to wait for the commit to complete. - * So, mark the commit in progress and wait for the commit to complete. + * We must start/schedule commit. */ if (!is_stable_write()) { - mark_commit_in_progress(); + schedule_or_start_commit(); if (task) { return 0; } /* * Wait for the ongoing commit to complete. + * If task is not valid we would have waited for all ongoing flushes, + * and no new flush can start because we hold the flush_lock, so when + * we come here we should have flushing going on. */ + assert(!get_filecache()->is_flushing_in_progress()); wait_for_ongoing_commit(); } @@ -1184,9 +1243,9 @@ int nfs_inode::wait_for_ongoing_flush_commit(uint64_t start_off, * So, following conditions should be true for both stable and * unstable writes.: */ - assert(is_commit_in_progress() == false); + assert(!get_filecache()->is_flushing_in_progress()); + assert(!is_commit_in_progress()); assert(get_filecache()->bytes_commit_pending == 0); - assert(get_filecache()->is_flushing_in_progress() == 0); return get_write_error(); } @@ -1203,18 +1262,17 @@ int nfs_inode::wait_for_ongoing_flush_commit(uint64_t start_off, * - Wait for the commit to complete. * - Returns the error code if any. * - * Note: Flush_cache_and_wait() block the fuse thread till the flush completes. + * Note: Flush_cache_and_wait() blocks the fuse thread till the flush completes. * It's called from the release(), flush() and getattr() calls. It's ok * as of now as it's not very often called. We can optimize to complete * the flush in background and return immediately. For that we need to add * special handling for the getattr() call. */ -int nfs_inode::flush_cache_and_wait(uint64_t start_off, - uint64_t end_off) +int nfs_inode::flush_cache_and_wait() { - assert(start_off < end_off); - assert(!is_stable_write() || get_filecache()->get_bytes_to_commit() == 0); - assert(!is_stable_write() || is_commit_in_progress() == false); + assert(!is_stable_write() || + get_filecache()->bytes_commit_pending == 0); + assert(!is_stable_write() || !is_commit_in_progress()); /* * MUST be called only for regular files. @@ -1231,9 +1289,6 @@ int nfs_inode::flush_cache_and_wait(uint64_t start_off, */ const int error_code = get_write_error(); if (error_code != 0) { - // Cleanup of the membufs if any left. - get_filecache()->cleanup_on_error(); - AZLogWarn("[{}] Previous write to this Blob failed with error={}, " "skipping new flush!", ino, error_code); @@ -1264,7 +1319,8 @@ int nfs_inode::flush_cache_and_wait(uint64_t start_off, * As we have the flush_lock() we are guaranteed that no new flush can be * started till we release the flush_lock(). * - * wait_for_ongoing_flush_commit() will wait for the ongoing flush/commt to complete. + * wait_for_ongoing_flush_commit() will wait for the ongoing flush/commt to + * complete. * * Note: For stable write case, we don't need to wait for the ongoing flush, * as there is no commit required for stable write. 
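The wait loop in wait_for_ongoing_flush_commit() above relies on the flusher holding the membuf lock for the duration of the WRITE, so acquiring and releasing the lock of every membuf that was flushing when we looked is enough to let those flushes drain. A reduced sketch of that pattern follows, assuming a blocking set_locked(); the real membuf additionally tracks inuse counts, dirty/flushing flags and the write error.

    #include <condition_variable>
    #include <mutex>
    #include <vector>

    struct membuf_model
    {
        std::mutex mtx;
        std::condition_variable cv;
        bool locked = false;   // held by the flusher while the WRITE is in flight

        void set_locked()      // blocks until the current holder releases
        {
            std::unique_lock<std::mutex> l(mtx);
            cv.wait(l, [this] { return !locked; });
            locked = true;
        }

        void clear_locked()
        {
            { std::lock_guard<std::mutex> l(mtx); locked = false; }
            cv.notify_all();
        }
    };

    // Waiter side: once every buffer that was flushing has been locked and
    // released, all of those flushes have completed (success or failure).
    inline void wait_for_flushes(std::vector<membuf_model*>& flushing)
    {
        for (membuf_model *mb : flushing) {
            mb->set_locked();
            mb->clear_locked();
        }
    }

New dirty data created after the snapshot is deliberately not waited for, matching the "ongoing flushes only" semantics described above.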
@@ -1272,6 +1328,7 @@ int nfs_inode::flush_cache_and_wait(uint64_t start_off, if (!is_stable_write()) { const int err = wait_for_ongoing_flush_commit(); if (err != 0) { + flush_unlock(); AZLogError("[{}] Failed to flush cache to stable storage, " "error={}", ino, err); return err; @@ -1287,12 +1344,13 @@ int nfs_inode::flush_cache_and_wait(uint64_t start_off, * TODO: Need to check getattr() call. As that can cause this * assert to fail. */ - if ((get_filecache()->get_bytes_to_flush() == 0) && - (get_filecache()->is_flushing_in_progress() == false)) { - assert(is_commit_in_progress() == false); - assert(get_filecache()->is_flushing_in_progress() == false); + assert(!get_filecache()->is_flushing_in_progress() || + is_stable_write()); + assert(!is_commit_in_progress()); + assert(get_filecache()->bytes_commit_pending == 0); + + if (get_filecache()->get_bytes_to_flush() == 0) { assert(get_filecache()->bytes_dirty == 0); - assert(get_filecache()->bytes_commit_pending == 0); assert(get_filecache()->bytes_flushing == 0); AZLogDebug("[{}] Nothing to flush, returning", ino); @@ -1323,11 +1381,10 @@ int nfs_inode::flush_cache_and_wait(uint64_t start_off, for (auto& bc : bc_vec) { bc.get_membuf()->clear_inuse(); } - bc_vec = filecache_handle->get_dirty_bc_range(start_off, end_off); + bc_vec = filecache_handle->get_dirty_bc_range(); } } else { - assert(get_filecache()->get_bytes_to_commit() == 0); - bc_vec = filecache_handle->get_dirty_bc_range(start_off, end_off); + bc_vec = filecache_handle->get_dirty_bc_range(); } /* @@ -1335,13 +1392,13 @@ int nfs_inode::flush_cache_and_wait(uint64_t start_off, * membufs. It batches the contiguous dirty membufs and issues a single * write RPC for them. */ - sync_membufs(bc_vec, true); + sync_membufs(bc_vec, true /* is_flush */); /* * Our caller expects us to return only after the flush completes. * Wait for all the membufs to flush and get result back. */ - for (bytes_chunk &bc : bc_vec) { + for (bytes_chunk& bc : bc_vec) { struct membuf *mb = bc.get_membuf(); assert(mb != nullptr); @@ -1389,7 +1446,7 @@ int nfs_inode::flush_cache_and_wait(uint64_t start_off, assert(get_filecache()->get_bytes_to_flush() == 0 || get_write_error() != 0); - mark_commit_in_progress(); + schedule_or_start_commit(); } /* @@ -1404,11 +1461,10 @@ int nfs_inode::flush_cache_and_wait(uint64_t start_off, if (get_write_error()) { AZLogError("[{}] Failed to flush cache to stable storage, " "error={}", ino, get_write_error()); - get_filecache()->cleanup_on_error(); } - assert(is_commit_in_progress() == false); - assert(get_filecache()->is_flushing_in_progress() == false); + assert(!is_commit_in_progress()); + assert(!get_filecache()->is_flushing_in_progress()); assert(get_filecache()->bytes_dirty == 0); assert(get_filecache()->bytes_commit_pending == 0); assert(get_filecache()->bytes_flushing == 0); diff --git a/turbonfs/src/rpc_task.cpp b/turbonfs/src/rpc_task.cpp index 9228efdc..62d3f5fa 100644 --- a/turbonfs/src/rpc_task.cpp +++ b/turbonfs/src/rpc_task.cpp @@ -868,7 +868,7 @@ static void commit_callback( assert(!inode->is_stable_write()); // Caller must be inprogress. assert(inode->is_commit_in_progress()); - assert(inode->get_filecache()->is_flushing_in_progress() == false); + assert(!inode->get_filecache()->is_flushing_in_progress()); const int status = task->status(rpc_status, NFS_STATUS(res)); UPDATE_INODE_WCC(inode, res->COMMIT3res_u.resok.file_wcc); @@ -1437,7 +1437,6 @@ static void write_iov_callback( /* * We are issuing the commit rpc, so set the commit_in_progress flag. 
*/ - inode->set_commit_in_progress(); inode->commit_membufs(); } inode->flush_unlock(); @@ -1529,17 +1528,25 @@ void rpc_task::issue_write_rpc() args.offset = offset; args.count = length; args.stable = inode->is_stable_write() ? FILE_SYNC : UNSTABLE; - set_task_csched(inode->is_stable_write()); + + /* + * Unstable writes want to make use of multiple connections for better perf. + * Note that they aren't affected by the optimistic concurrency backoff + * issues as seen by stable writes. + */ + if (!inode->is_stable_write()) { + set_csched(CONN_SCHED_RR); + } do { rpc_retry = false; stats.on_rpc_issue(); if (rpc_nfs3_writev_task(get_rpc_ctx(), - write_iov_callback, &args, - bciov->iov, - bciov->iovcnt, - this) == NULL) { + write_iov_callback, &args, + bciov->iov, + bciov->iovcnt, + this) == NULL) { stats.on_rpc_cancel(); /* * Most common reason for this is memory allocation failure, @@ -2606,46 +2613,53 @@ static void perform_inline_writes(struct rpc_task *task, * flush more data we can just wait for the ongoing flush/commit * to complete. */ - const uint64_t flush_now = inode->get_filecache()->get_inline_flush_bytes(); + const uint64_t flush_now = + inode->get_filecache()->get_inline_flush_bytes(); - if (flush_now == 0 && !inode->is_stable_write()) { - AZLogInfo("No bytes to flush, Wait for ongoing flush/commit " + if ((flush_now == 0) && !inode->is_stable_write()) { + AZLogInfo("No new bytes to flush, Wait for ongoing flush/commit " "to complete"); /* - * In case of inline_write, we want to block the writes till the - * enough space is made for new writes. Every write thread will - * come here and try to take the lock, only one thread succeeds - * and wait for the ongoing flush/commit to complete. Rest of the - * threads will wait for the lock to be released by the first - * thread. After first thread completes the wait, check if any ongoing - * flush/commit is in progress and if it's still true. Mostly it will - * be false as new writes are blocked. - * - * Note: flush_lock() is held so that no new flush or commit task - * is started. - */ - inode->wait_for_ongoing_flush_commit(0, UINT64_MAX, task); + * get_inline_flush_bytes() tells us that we have no new data to flush + * inline, but since we are in memory pressure we need to slow down the + * writers by waiting till all the ongoing flush+commit complete. + * wait_for_ongoing_flush_commit() will complete task after all the + * ongoing flush and subsequent commit completes. + */ + inode->wait_for_ongoing_flush_commit(task); inode->flush_unlock(); return; } else { uint64_t size = 0; if (!inode->is_stable_write()) { + assert(flush_now > 0); /* - * We need to wait for the ongoing flush/commit to complete. + * We need to wait for the ongoing flush/commit to complete, as + * for unstable write case we only issue one active flush at a + * time. * This is only for the case where we have to prune some data * to make space for the new writes. Usually this will not be * happen so often, it's ok to wait here. + * + * TODO: See if we can eliminate this wait as it blocks the fuse + * thread. */ - AZLogDebug("Wait for ongoing flush/commit to complete."); - inode->wait_for_ongoing_flush_commit(0, UINT64_MAX); + AZLogWarn("Wait for ongoing flush/commit to complete!!"); + inode->wait_for_ongoing_flush_commit(); + /* + * We already waited for flush and commit to complete and we have + * the flush_lock held, so no new flush and commit can start. 
+ */ assert(!inode->is_commit_in_progress()); assert(!inode->get_filecache()->is_flushing_in_progress()); /* * If the write is not stable, we need to copy the application * data into the membufs and issue the writes to the backend. + * If we don't find flush_now contiguous bytes, then we switch + * to stable write. */ bc_vec = inode->get_filecache()->get_contiguous_dirty_bcs(size); if (size >= flush_now) { @@ -2657,14 +2671,8 @@ static void perform_inline_writes(struct rpc_task *task, * sync_membufs(), but add the task to the commit_waiters list. */ inode->sync_membufs(bc_vec, false /* is_flush */, nullptr); + inode->schedule_or_start_commit(); - /* - * Set the commit_in_progress flag to ensure that the commit - * task is started after the flush completes. - * Whether we hit commit limit or not, we need to start commit - * as we need to release the memory. - */ - inode->mark_commit_in_progress(); inode->flush_unlock(); inode->add_task_to_commit_waiters(task); @@ -2675,11 +2683,8 @@ static void perform_inline_writes(struct rpc_task *task, if (!inode->is_stable_write()) { AZLogWarn("Not enough contiguous dirty data to flush, may be " "pattern is not sequential. flush_now={}, size={}", - flush_now, size); + flush_now, size); - /* - * Let's switch to stable write mode. - */ inode->switch_to_stable_write(); } @@ -2687,6 +2692,7 @@ static void perform_inline_writes(struct rpc_task *task, * If the write is stable, we can directly write to the * membufs and issue the writes to the backend. */ + assert(inode->is_stable_write()); bc_vec = inode->get_filecache()->get_dirty_nonflushing_bcs_range(); if (!bc_vec.empty()) { /* @@ -2724,205 +2730,6 @@ static void perform_inline_writes(struct rpc_task *task, assert(false); } -static bool handle_writes(struct nfs_inode *inode, - struct rpc_task *task, - const fuse_ino_t ino, - const bool sparse_write, - uint64_t extent_left, - uint64_t extent_right) -{ - assert(task->magic == RPC_TASK_MAGIC); - assert(task->rpc_api->write_task.is_fe()); - assert(task->get_op_type() == FUSE_WRITE); - - const size_t length = task->rpc_api->write_task.get_size(); - - /* - * Check what kind of limit we have hit. - */ - const bool need_inline_write = - (sparse_write || inode->get_filecache()->do_inline_write()); - const bool need_commit = - inode->get_filecache()->commit_required(); - const bool need_flush = - inode->get_filecache()->flush_required(); - - AZLogDebug("[{}] handle write (sparse={}, need_inline_write={}, need_commit={}," - " need_flush={} )", ino, sparse_write, need_inline_write, - need_commit, need_flush); - - /* - * Nothing to do, we can complete the application write rightaway. - */ - if (!need_inline_write && !need_commit && !need_flush) { - task->reply_write(length); - return true; - } - - /* - * We have successfully copied the user data into the cache. - * We can have the following cases: - * 1. This new write caused a contiguous extent to be greater than - * max_dirty_extent_bytes(), aka MDEB. - * In this case we begin flush of this contiguous extent (in multiple - * parallel wsize sized blocks) since there's no benefit in waiting more - * as the data is sufficient for the server scheduler to effectively - * write, in optimal sized blocks. - * We complete the application write rightaway without waiting for the - * flush to complete as we are not under memory pressure. - * This will happen when user is sequentially writing to the file. - * 2. A single contiguous extent is not greater than MDEB but total dirty - * bytes waiting to be flushed is more than MDEB. 
- * In this case we begin flush of the entire dirty data from the cache - * as we have sufficient data for the server scheduler to perform - * batched writes effectively. - * We complete the application write rightaway without waiting for the - * flush to complete as we are not under memory pressure. - * This will happen when user is writing to the file in a random or - * pseudo-sequential fashion. - * 3. Cache has dirty data beyond the "inline write" threshold, ref - * do_inline_write(). - * In this case we begin flush of the entire dirty data. - * This is a memory pressure situation and hence we do not complete the - * application write till all the backend writes complete. - * This will happen when user is writing faster than our backend write - * throughput, eventually dirty data will grow beyond the "inline write" - * threshold and then we have to slow down the writers by delaying - * completion. - * - * Other than this we have a special case of "write beyond eof" (termed - * sparse write). In sparse write case also we perform "inline write" of - * all the dirty data. This is needed for correct read behaviour. Imagine - * a reader reading from the sparse part of the file which is not yet in - * the bytes_chunk_cache. This read will be issued to the server and since - * server doesn't know the updated file size (as the write may still be - * sitting in our bytes_chunk_cache) it will return eof. This is not correct - * as such reads issued after successful write, are valid and should return - * 0 bytes for the sparse range. - */ - - // Case 1: Inline writes - /* - * Do we need to perform "inline write"? - * Inline write implies, we flush all the dirty data and wait for all the - * corresponding backend writes to complete. - */ - if (need_inline_write) { - INC_GBL_STATS(inline_writes, 1); - - // Free up the fuse thread without completing the application write. - perform_inline_writes(task, inode); - return true; - } - - // Case 2: Commit - /* - * We don't need to do inline writes. See if we need to commit the dirty - * data to the backend - */ - if (need_commit) { - inode->flush_lock(); - /* - * Set the commit_in_progress flag to ensure that the commit - * task is startet. - */ - inode->mark_commit_in_progress(); - - // XXX What if commit completes before we reach here? - assert(inode->is_commit_in_progress()); - - inode->flush_unlock(); - assert(!need_inline_write); - } - - // Case 3: Flush - if (need_flush) { - /* - * We need to flush the dirty data in the range [extent_left, extent_right), - * get the membufs for the dirty data. - * We don't want to run over an inprogress truncate and resetting the file - * size set by truncate, so grab the is_flushing lock. - */ - inode->flush_lock(); - - /* - * If flush is required but flush/commit already in progress, then - * add this task to the respective waiters list and return. - * As we already 1GB worth of dirty data in the cache, we don't want - * to add more data to the cache. So we wait for the ongoing flush/commit. - * - * XXX What if is_commit_in_progress() returns false but some other - * thread starts commit right after that? - * Note that we hold flush_lock which only protects us against some - * other thread starting a flush but commit is protected by - * commit_lock_5. - */ - if (inode->is_commit_in_progress()) { - /* - * If commit is in progress, we need to wait for the commit - * to complete before we can proceed with the write. 
- */ - inode->add_task_to_commit_waiters(task); - inode->flush_unlock(); - return true; - } - - if (inode->get_filecache()->is_flushing_in_progress()) { - /* - * If flush is in progress, we need to wait for the flush to - * complete before we can proceed with the write. - */ - inode->add_task_to_flush_waiters(task); - inode->flush_unlock(); - return true; - } - - /* - * If we are here, then we need to flush the dirty data to the backend. - * We don't need to wait for the flush to complete, we can complete the - * application write rightaway. - */ - uint64_t size = 0; - std::vector bc_vec; - - /* - * If the inode is stable write, then get the dirty membufs in the range. - * otherwise get the dirty membufs contigious range. - */ - if (inode->is_stable_write()) { - bc_vec = - inode->get_filecache()->get_dirty_nonflushing_bcs_range(extent_left, - extent_right); - } else { - bc_vec = inode->get_filecache()->get_contiguous_dirty_bcs(size); - } - - if (bc_vec.size() == 0) { - inode->flush_unlock(); - task->reply_write(length); - return true; - } - - /* - * Pass is_flush as false, since we don't want the writes to complete - * before returning. - */ - inode->sync_membufs(bc_vec, false /* is_flush */); - inode->flush_unlock(); - - // Send reply to original request without waiting for the backend write to complete. - task->reply_write(length); - return true; - } - - /* - * If we reached true, then we have need_commit true, but need_flush false. - */ - assert(need_commit && !need_flush); - task->reply_write(length); - return true; -} - void rpc_task::run_write() { // This must be called only for front end tasks. @@ -2933,6 +2740,9 @@ void rpc_task::run_write() const size_t length = rpc_api->write_task.get_size(); struct fuse_bufvec *const bufv = rpc_api->write_task.get_buffer_vector(); const off_t offset = rpc_api->write_task.get_offset(); + /* + * TODO: Fix sparse writes. + */ const bool sparse_write = false; uint64_t extent_left = 0; uint64_t extent_right = 0; @@ -3022,6 +2832,50 @@ void rpc_task::run_write() assert(extent_right >= (extent_left + length)); + /* + * We have successfully copied the user data into the cache. + * We can have the following cases (in decreasing order of criticality): + * 1. Cache has dirty+uncommitted data beyond the "inline write threshold", + * ref do_inline_write(). + * In this case we begin flush of the dirty data and/or commit of the + * uncommitted data. + * This is a memory pressure situation and hence we do not complete the + * application write till all the above backend writes complete. + * This will happen when application is writing faster than our backend + * write throughput, eventually dirty data will grow beyond the "inline + * write threshold" and then we have to slow down the writers by delaying + * completion. + * 2. Cache has uncommitted data beyond the "commit threshold", ref + * commit_required(). + * In this case we free up space in the cache by committing data. + * We just initiate a commit while the current write request is + * completed. Note that we want to delay commits so that we can reduce + * the commit calls (commit more data per call) as commit calls sort + * of serialize the writes as we cannot send any other write to the + * server while commit is on. + * 3. Cache has enough dirty data that we can flush. + * For sequential writes, this means the new write caused a contiguous + * extent to be greater than max_dirty_extent_bytes(), aka MDEB, while + * for non-seq writes it would mean that the total dirty data grew beyond + * MDEB. 
+ * In this case we begin flush of this contiguous extent (in multiple
+ * parallel wsize sized blocks) since there's no benefit in waiting longer,
+ * as the data is sufficient for the server scheduler to write effectively
+ * in optimal sized blocks.
+ * We complete the application write right away without waiting for the
+ * flush to complete as we are not under memory pressure.
+ *
+ * Other than this we have a special case of "write beyond eof" (termed
+ * sparse write). In the sparse write case also we perform "inline write" of
+ * all the dirty data. This is needed for correct read behaviour. Imagine
+ * a reader reading from the sparse part of the file which is not yet in
+ * the bytes_chunk_cache. This read will be issued to the server and since
+ * the server doesn't know the updated file size (as the write may still be
+ * sitting in our bytes_chunk_cache) it will return eof. This is not correct
+ * as such reads issued after a successful write are valid and should return
+ * 0 bytes for the sparse range.
+ */
+
 /*
 * If the extent size exceeds the max allowed dirty size as returned by
 * max_dirty_extent_bytes(), then it's time to flush the extent.
@@ -3038,10 +2892,31 @@ void rpc_task::run_write()
 assert(max_dirty_extent > 0);
 
 /*
- * Ok, we neither need inline writes nor commit in progress. See if we have
- * enough dirty data and we need to start async flush.
+ * Check what kind of limit we have hit.
 */
- if ((extent_right - extent_left) < max_dirty_extent) {
+ const bool need_inline_write =
+ (sparse_write || inode->get_filecache()->do_inline_write());
+ const bool need_commit =
+ inode->get_filecache()->commit_required();
+ const bool need_flush =
+ ((extent_right - extent_left) >= max_dirty_extent) ||
+ inode->get_filecache()->flush_required();
+
+ AZLogDebug("[{}] handle write (sparse={}, need_inline_write={}, "
+ "need_commit={}, need_flush={}, extent=[{}, {}))",
+ inode->get_fuse_ino(), sparse_write, need_inline_write,
+ need_commit, need_flush, extent_left, extent_right);
+
+ /*
+ * Nothing to do, we can complete the application write right away.
+ * This should be the happy path!
+ */
+ if (!need_inline_write && !need_commit && !need_flush) {
+ reply_write(length);
+ return;
+ }
+
+ if (need_flush && ((extent_right - extent_left) < max_dirty_extent)) {
 /*
 * This is the case of non-sequential writes causing enough dirty
 * data to be accumulated, need to flush all of that.
@@ -3050,12 +2925,139 @@ void rpc_task::run_write()
 extent_right = UINT64_MAX;
 }
 
- if (handle_writes(inode, this, ino, sparse_write,
- extent_left, extent_right)) {
+
+ // Case 1: Inline writes
+ /*
+ * Do we need to perform "inline write"?
+ * Inline write implies we flush dirty data and commit uncommitted data
+ * and wait for all the corresponding backend writes to complete.
+ */
+ if (need_inline_write) {
+ INC_GBL_STATS(inline_writes, 1);
+
+ // Free up the fuse thread without completing the application write.
+ perform_inline_writes(this, inode);
 return;
- } else {
- assert(false);
 }
+
+ // Case 2: Commit
+ /*
+ * We don't need to do inline writes. See if we need to commit the
+ * uncommitted data to the backend. We just need to start the commit
+ * and not hold the current write task till the commit completes.
+ */
+ if (need_commit) {
+ inode->flush_lock();
+
+ /*
+ * Start commit if no flushing is in progress, else mark commit
+ * pending, so that the last flush can then start the commit.
+ */
+ inode->schedule_or_start_commit();
+
+ /*
+ * Note: Commit request sent by the above call can potentially
+ * complete before we reach here. In that case the following
+ * assert will fail. It's very unlikely, so leave the assert
+ * as it's useful.
+ */
+ assert(inode->is_commit_in_progress());
+
+ inode->flush_unlock();
+ assert(!need_inline_write);
+ }
+
+ // Case 3: Flush
+ if (need_flush) {
+ /*
+ * We need to flush the dirty data in the range [extent_left, extent_right),
+ * so get the membufs for the dirty data.
+ * We don't want to run over an in-progress truncate and reset the file
+ * size set by truncate, so grab the is_flushing lock.
+ */
+ inode->flush_lock();
+
+ /*
+ * If flush is required but flush/commit already in progress, then
+ * add this task to the respective waiters list and return.
+ * As we already have 1GB worth of dirty data in the cache, we don't
+ * want to add more data to the cache. So we wait for the ongoing
+ * flush/commit.
+ *
+ * XXX What if is_commit_in_progress() returns false but some other
+ * thread starts a commit right after that?
+ * Note that we hold flush_lock which only protects us against some
+ * other thread starting a flush, but commit is protected by
+ * commit_lock_5.
+ */
+ if (inode->is_commit_in_progress()) {
+ inode->flush_unlock();
+ /*
+ * If commit is in progress, we need to wait for the commit
+ * to complete before we can complete the write.
+ */
+ inode->add_task_to_commit_waiters(this);
+ return;
+ }
+
+ if (!inode->is_stable_write() &&
+ inode->get_filecache()->is_flushing_in_progress()) {
+ inode->flush_unlock();
+ /*
+ * If flush is in progress, we need to wait for the flush to
+ * complete before we can proceed with the write.
+ * Note that for unstable writes we don't start a new flush if
+ * there's an ongoing one, but for stable writes it's fine to start
+ * another flush.
+ */
+ inode->add_task_to_flush_waiters(this);
+ return;
+ }
+
+ /*
+ * If we are here, then we need to flush the dirty data to the backend.
+ * We don't need to wait for the flush to complete, we can complete the
+ * application write right away.
+ */
+ uint64_t size = 0;
+ std::vector bc_vec;
+
+ /*
+ * If the inode is stable write, then get the dirty membufs in the range,
+ * otherwise get the contiguous range of dirty membufs.
+ */
+ if (inode->is_stable_write()) {
+ bc_vec =
+ inode->get_filecache()->get_dirty_nonflushing_bcs_range(extent_left,
+ extent_right);
+ } else {
+ bc_vec = inode->get_filecache()->get_contiguous_dirty_bcs(size);
+ }
+
+ if (bc_vec.size() == 0) {
+ inode->flush_unlock();
+ reply_write(length);
+ return;
+ }
+
+ /*
+ * Pass is_flush as false, since we don't want to wait for the writes to
+ * complete before returning.
+ */
+ inode->sync_membufs(bc_vec, false /* is_flush */);
+ inode->flush_unlock();
+
+ // Send reply to the original request without waiting for the backend write to complete.
+ reply_write(length);
+ return;
+ }
+
+ /*
+ * If we reached here, then we have need_commit true, but need_flush false.
+ */ + assert(need_commit && !need_flush); + reply_write(length); + return; } void rpc_task::run_flush() From 2a9f3b686ccb725c1ce4e6971ca9765c1ff1997c Mon Sep 17 00:00:00 2001 From: Nitin Singla Date: Thu, 30 Jan 2025 14:57:53 +0000 Subject: [PATCH 8/9] Some fixes about deadlck --- turbonfs/inc/file_cache.h | 2 +- turbonfs/inc/nfs_inode.h | 11 ++++++++--- turbonfs/src/nfs_inode.cpp | 14 ++++++++++---- turbonfs/src/rpc_task.cpp | 15 ++++++--------- 4 files changed, 25 insertions(+), 17 deletions(-) diff --git a/turbonfs/inc/file_cache.h b/turbonfs/inc/file_cache.h index 38afcc89..867abe50 100644 --- a/turbonfs/inc/file_cache.h +++ b/turbonfs/inc/file_cache.h @@ -1632,7 +1632,7 @@ class bytes_chunk_cache * the second one. */ static const uint64_t max_dirty_allowed_per_cache = - (max_dirty_extent_bytes() * 2); + (max_dirty_extent_bytes() * 4); const bool local_pressure = ((int64_t) get_bytes_to_flush() > (int64_t) max_dirty_allowed_per_cache); diff --git a/turbonfs/inc/nfs_inode.h b/turbonfs/inc/nfs_inode.h index e33e61f9..35e8ff54 100644 --- a/turbonfs/inc/nfs_inode.h +++ b/turbonfs/inc/nfs_inode.h @@ -559,7 +559,7 @@ struct nfs_inode * change the file size after truncate sets it. */ bool truncate_start(size_t size); - void truncate_end() const; + void truncate_end(); /** * This MUST be called only after has_filecache() returns true, else @@ -989,7 +989,7 @@ struct nfs_inode */ if (!non_append_writes_seen && (offset != attr.st_size)) { non_append_writes_seen = true; - AZLogInfo("[{}] Non-append write seen [{}, {}), file size: {}", + AZLogDebug("[{}] Non-append write seen [{}, {}), file size: {}", ino, offset, offset+length, attr.st_size); } @@ -1512,7 +1512,12 @@ struct nfs_inode * Lock the inode for flushing. */ void flush_lock() const; - void flush_unlock() const; + void flush_unlock(); + + bool flushtry_lock() + { + return !is_flushing.exchange(true); + } /** * Revalidate the inode. diff --git a/turbonfs/src/nfs_inode.cpp b/turbonfs/src/nfs_inode.cpp index 4776069f..bd7ce930 100644 --- a/turbonfs/src/nfs_inode.cpp +++ b/turbonfs/src/nfs_inode.cpp @@ -980,7 +980,6 @@ void nfs_inode::schedule_or_start_commit() { assert(is_flushing); assert(!is_stable_write()); - /* * We want to commit, but we need to wait for current flushing to complete. * 1. If flushing is going on, set commit_pending for the inode, when last @@ -993,6 +992,7 @@ void nfs_inode::schedule_or_start_commit() */ if (!is_commit_in_progress()) { if (get_filecache()->is_flushing_in_progress()) { + AZLogWarn("[{}] Flushing in progress, setting commit_pending", ino); set_commit_pending(); } else { commit_membufs(); @@ -1233,6 +1233,10 @@ int nfs_inode::wait_for_ongoing_flush_commit(struct rpc_task *task) * we come here we should have flushing going on. 
*/ assert(!get_filecache()->is_flushing_in_progress()); + if (is_commit_pending()) { + AZLogInfo("[{}] Commit is pending, we need to start the commit", ino); + commit_membufs(); + } wait_for_ongoing_commit(); } @@ -1497,7 +1501,7 @@ void nfs_inode::flush_lock() const return; } -void nfs_inode::flush_unlock() const +void nfs_inode::flush_unlock() { AZLogDebug("[{}] flush_unlock() called", ino); @@ -1506,7 +1510,9 @@ void nfs_inode::flush_unlock() const */ assert(has_filecache()); assert(is_flushing == true); - + if (!filecache_handle->is_flushing_in_progress() && is_commit_pending()) { + commit_membufs(); + } { std::unique_lock _lock(iflush_lock_3); is_flushing = false; @@ -1516,7 +1522,7 @@ void nfs_inode::flush_unlock() const flush_cv.notify_one(); } -void nfs_inode::truncate_end() const +void nfs_inode::truncate_end() { AZLogDebug("[{}] truncate_end() called", ino); diff --git a/turbonfs/src/rpc_task.cpp b/turbonfs/src/rpc_task.cpp index 62d3f5fa..c458955d 100644 --- a/turbonfs/src/rpc_task.cpp +++ b/turbonfs/src/rpc_task.cpp @@ -1430,16 +1430,13 @@ static void write_iov_callback( * completes, we need to start the commit. */ if (!inode->get_filecache()->is_flushing_in_progress()) { - inode->flush_lock(); - if (inode->is_commit_pending()) { - assert(!inode->is_stable_write()); - - /* - * We are issuing the commit rpc, so set the commit_in_progress flag. - */ - inode->commit_membufs(); + if(inode->flushtry_lock()) { + if (!inode->get_filecache()->is_flushing_in_progress() && + inode->is_commit_pending()) { + inode->commit_membufs(); + } + inode->flush_unlock(); } - inode->flush_unlock(); } /* From e7fa84484a13a6306a90948b20425ab5ffaa5864 Mon Sep 17 00:00:00 2001 From: Nagendra Tomar Date: Fri, 31 Jan 2025 10:31:01 +0000 Subject: [PATCH 9/9] Some comment changes. --- turbonfs/src/nfs_inode.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/turbonfs/src/nfs_inode.cpp b/turbonfs/src/nfs_inode.cpp index bd7ce930..50886418 100644 --- a/turbonfs/src/nfs_inode.cpp +++ b/turbonfs/src/nfs_inode.cpp @@ -1307,12 +1307,11 @@ int nfs_inode::flush_cache_and_wait() } /* - * Grab the inode is_flushing lock to ensure that we doen't initiate - * any new flush operation while some truncate call is in progress - * (which must have taken the is_flushing lock). - * Once flush_lock() returns we have the is_flushing lock and we are + * Grab the flush_lock to ensure that we don't initiate any new flushes + * while some truncate call is in progress (which must have taken the + * flush_lock). Once flush_lock() returns we have the lock and we are * guaranteed that no new truncate operation can start till we release - * the is_flushing lock. We can safely start the flush then. + * the flush_lock. We can safely start the flush then. */ flush_lock();