From 77fdade84cef837b7423414984cc11ebbe16880c Mon Sep 17 00:00:00 2001 From: Nagendra Tomar Date: Sun, 9 Feb 2025 02:48:59 +0000 Subject: [PATCH 1/9] Unaudited change. --- turbonfs/inc/fcsm.h | 27 +- turbonfs/inc/file_cache.h | 4 +- turbonfs/inc/nfs_inode.h | 62 +++- turbonfs/src/fcsm.cpp | 672 +++++++++++++++++++++++++++++++++---- turbonfs/src/nfs_inode.cpp | 4 +- turbonfs/src/rpc_task.cpp | 4 +- 6 files changed, 690 insertions(+), 83 deletions(-) diff --git a/turbonfs/inc/fcsm.h b/turbonfs/inc/fcsm.h index 09ce10be..db034b4b 100644 --- a/turbonfs/inc/fcsm.h +++ b/turbonfs/inc/fcsm.h @@ -95,8 +95,7 @@ class fcsm * * LOCKS: flush_lock. */ - void ensure_flush(uint64_t flush_bytes, - uint64_t write_off, + void ensure_flush(uint64_t write_off, uint64_t write_len, struct rpc_task *task = nullptr); @@ -107,8 +106,7 @@ class fcsm * it'll add a blocking commit target for completing task when given commit * goal is met. */ - void ensure_commit(uint64_t commit_bytes, - struct rpc_task *task = nullptr); + void ensure_commit(struct rpc_task *task = nullptr); /** * Callbacks to be called when flush/commit successfully complete. @@ -116,7 +114,7 @@ class fcsm * targets from ftgtq/ctgtq as appropriate. */ void on_flush_complete(uint64_t flush_bytes); - void on_commit_complete(); + void on_commit_complete(uint64_t commit_bytes); /** * Is the state machine currently running, i.e. it has sent (one or more) @@ -148,6 +146,10 @@ class fcsm void mark_running(); void clear_running(); + void run(struct rpc_task *task, + uint64_t extent_left, + uint64_t extent_right); + /** * Call when more writes are dispatched, or prepared to be dispatched. * This MUST be called before the write callback can be called. @@ -180,6 +182,21 @@ class fcsm return (fc_cb_count() > 0); } + /** + * Call when more commit are dispatched, or prepared to be dispatched. + * This MUST be called before the commit_callback can be called. + */ + void add_committing(uint64_t bytes) + { + assert(committed_seq_num <= committing_seq_num); + committing_seq_num += bytes; + } + + /* + * ctgtq_cleanup() is called when we switch to stable writes. + */ + void ctgtq_cleanup(); + private: /* * The singleton nfs_client, for convenience. diff --git a/turbonfs/inc/file_cache.h b/turbonfs/inc/file_cache.h index f91c7767..7b07ced8 100644 --- a/turbonfs/inc/file_cache.h +++ b/turbonfs/inc/file_cache.h @@ -1200,8 +1200,8 @@ class bytes_chunk_cache * check for that after holding the membuf lock, before it tries to * flush those membuf(s). */ - std::vector get_dirty_bc_range(uint64_t st_off, - uint64_t end_off) const; + std::vector get_dirty_bc_range( + uint64_t st_off = 0, uint64_t end_off = UINT64_MAX) const; /* * Returns dirty chunks which are not already flushing, in the given range, diff --git a/turbonfs/inc/nfs_inode.h b/turbonfs/inc/nfs_inode.h index f374f98c..2b16b1b7 100644 --- a/turbonfs/inc/nfs_inode.h +++ b/turbonfs/inc/nfs_inode.h @@ -289,6 +289,29 @@ struct nfs_inode */ struct stat attr; + /** + * We maintain following multiple views of the file and thus multiple file + * sizes for those views. + * - Cached. + * This is the view of the file that comprises of data that has been + * written by the application and saved in file cache. It may or may not + * have been flushed and/or committed. This is the most uptodate view of + * the file and applications must use this view. + * cached_filesize tracks the file size for this view. + * - Uncommited. 
+ * This is the view of the file that tracks data that has been flushed + * using UNSTABLE writes but not yet COMMITted to the Blob. This view of + * the file is only used to see if the next PB call will write after the + * last PB'ed byte and thus can be appended. + * putblock_filesize tracks the file size for this view. + * - Committed. + * This is the view of the file that tracks data committed to the Blob. + * Other clients will see this view. + * attr.st_size tracks the file size for this view. + */ + off_t cached_filesize = 0; + mutable off_t putblock_filesize = 0; + /* * Has this inode seen any non-append write? * This starts as false and remains false as long as copy_to_cache() only @@ -310,7 +333,7 @@ struct nfs_inode * Note: As of now, we are not using this flag as commit changes not yet * integrated, so we are setting this flag to true. */ - bool stable_write = true; + bool stable_write = false; public: /* @@ -1400,11 +1423,10 @@ struct nfs_inode * initiate any new flush operations while some truncate call is in * progress (which must have held the flush_lock). */ - int flush_cache_and_wait(uint64_t start_off = 0, - uint64_t end_off = UINT64_MAX); + int flush_cache_and_wait(); /** - * Wait for currently flushing membufs to complete. + * Wait for currently flushing/committing membufs to complete. * Returns 0 on success and a positive errno value on error. * * Note : Caller must hold the inode flush_lock to ensure that @@ -1416,6 +1438,38 @@ struct nfs_inode int wait_for_ongoing_flush(uint64_t start_off = 0, uint64_t end_off = UINT64_MAX); + /** + * commit_membufs() is called to commit uncommitted membufs to the Blob. + * It creates commit RPC and sends it to the NFS server. + */ + void commit_membufs(std::vector &bcs) + { + // TODO: Provide implementation in nfs_inode.cpp. + assert(0); + } + + /** + * switch_to_stable_write() is called to switch the inode to stable write + * mode. It waits for all ongoing flush and subsequent commit to complete. + * If not already scheduled, it'll perform an explicit commit after the + * flush complete. + * Post that it'll mark inode for stable write and return. From then on + * any writes to this inode will be sent as stable writes. + */ + void switch_to_stable_write(); + + /** + * Check if stable write is required for the given offset. + * Given offset is the start of contiguous dirty membufs that need to be + * flushed to the Blob. + */ + bool check_stable_write_required(off_t offset); + + /** + * Wait for ongoing commit operation to complete. + */ + void wait_for_ongoing_commit(); + /** * Sync the dirty membufs in the file cache to the NFS server. * All contiguous dirty membufs are clubbed together and sent to the diff --git a/turbonfs/src/fcsm.cpp b/turbonfs/src/fcsm.cpp index 7231591d..76ebb043 100644 --- a/turbonfs/src/fcsm.cpp +++ b/turbonfs/src/fcsm.cpp @@ -45,7 +45,7 @@ fcsm::fctgt::fctgt(struct fcsm *fcsm, } #endif - AZLogDebug("[{}] [FCSM] {} fctgt queued (F: {}, C: {}, T: {})", + AZLogInfo("[{}] [FCSM] {} fctgt queued (F: {}, C: {}, T: {})", fcsm->get_inode()->get_fuse_ino(), task ? 
"Blocking" : "Non-blocking", flush_seq, @@ -88,12 +88,358 @@ void fcsm::add_flushing(uint64_t bytes) flushing_seq_num += bytes; } -void fcsm::ensure_flush(uint64_t flush_bytes, - uint64_t write_off, +void fcsm::run(struct rpc_task *task, + uint64_t extent_left, + uint64_t extent_right) +{ +#ifdef NDEBUG + assert(task->magic == RPC_TASK_MAGIC); + assert(task->get_op_type() == FUSE_WRITE); + assert(task->rpc_api->write_task.is_fe()); + assert(task->rpc_api->write_task.get_size() > 0); +#endif + + const size_t length = task->rpc_api->write_task.get_size(); + const off_t offset = task->rpc_api->write_task.get_offset(); + const bool sparse_write = false; + const fuse_ino_t ino = task->rpc_api->write_task.get_ino(); + + /* + * We have successfully copied the user data into the cache. + * We can have the following cases (in decreasing order of criticality): + * 1. Cache has dirty+uncommitted data beyond the "inline write threshold", + * ref do_inline_write(). + * In this case we begin flush of the dirty data and/or commit of the + * uncommitted data. + * This is a memory pressure situation and hence we do not complete the + * application write till all the above backend writes complete. + * This will happen when application is writing faster than our backend + * write throughput, eventually dirty data will grow beyond the "inline + * write threshold" and then we have to slow down the writers by delaying + * completion. + * 2. Cache has uncommitted data beyond the "commit threshold", ref + * commit_required(). + * In this case we free up space in the cache by committing data. + * We just initiate a commit while the current write request is + * completed. Note that we want to delay commits so that we can reduce + * the commit calls (commit more data per call) as commit calls sort + * of serialize the writes as we cannot send any other write to the + * server while commit is on. + * 3. Cache has enough dirty data that we can flush. + * For sequential writes, this means the new write caused a contiguous + * extent to be greater than max_dirty_extent_bytes(), aka MDEB, while + * for non-seq writes it would mean that the total dirty data grew beyond + * MDEB. + * In this case we begin flush of this contiguous extent (in multiple + * parallel wsize sized blocks) since there's no benefit in waiting more + * as the data is sufficient for the server scheduler to effectively + * write, in optimal sized blocks. + * We complete the application write rightaway without waiting for the + * flush to complete as we are not under memory pressure. + * + * Other than this we have a special case of "write beyond eof" (termed + * sparse write). In sparse write case also we perform "inline write" of + * all the dirty data. This is needed for correct read behaviour. Imagine + * a reader reading from the sparse part of the file which is not yet in + * the bytes_chunk_cache. This read will be issued to the server and since + * server doesn't know the updated file size (as the write may still be + * sitting in our bytes_chunk_cache) it will return eof. This is not correct + * as such reads issued after successful write, are valid and should return + * 0 bytes for the sparse range. + */ + + /* + * If the extent size exceeds the max allowed dirty size as returned by + * max_dirty_extent_bytes(), then it's time to flush the extent. + * Note that this will cause sequential writes to be flushed at just the + * right intervals to optimize fewer write calls and also allowing the + * server scheduler to merge better. 
+ * See bytes_to_flush for how random writes are flushed. + * + * Note: max_dirty_extent is static as it doesn't change after it's + * queried for the first time. + */ + static const uint64_t max_dirty_extent = + inode->get_filecache()->max_dirty_extent_bytes(); + assert(max_dirty_extent > 0); + + /* + * Check what kind of limit we have hit. + */ + const bool need_inline_write = + (sparse_write || inode->get_filecache()->do_inline_write()); + const bool need_commit = + inode->get_filecache()->commit_required(); + const bool need_flush = + ((extent_right - extent_left) >= max_dirty_extent) || + inode->get_filecache()->flush_required(); + + AZLogDebug("[{}] handle write (sparse={}, need_inline_write={}, " + "need_commit={}, need_flush={}, extent=[{}, {}))", + inode->get_fuse_ino(), sparse_write, need_inline_write, + need_commit, need_flush, extent_left, extent_right); + + /* + * Nothing to do, we can complete the application write rightaway. + * This should be the happy path! + */ + if (!need_inline_write && !need_commit && !need_flush) { + task->reply_write(length); + return; + } + + /* + * Do we need to perform "inline write"? + * Inline write implies, we flush all the dirty data and wait for all the + * corresponding backend writes to complete. + */ + if (need_inline_write) { + INC_GBL_STATS(inline_writes, 1); + + AZLogDebug("[{}] Inline write (sparse={}), {} bytes, extent @ [{}, {})", + ino, sparse_write, (extent_right - extent_left), + extent_left, extent_right); + + /* + * ensure_flush() will arrange to flush all the dirty data and complete + * the write task when the flush completes. + */ + inode->flush_lock(); + if (inode->is_stable_write()) { + inode->get_fcsm()->ensure_flush(offset, length, task); + } else { + inode->get_fcsm()->ensure_commit(task); + } + inode->flush_unlock(); + + // Free up the fuse thread without completing the application write. + return; + } + + // Case 2: Commit + /* + * We don't need to do inline writes. See if we need to commit the + * uncommitted data to the backend. We just need to stat the commit + * and not hold the current write task till the commit completes. + */ + if (need_commit) { + inode->flush_lock(); + if (inode->is_commit_in_progress()) { + AZLogDebug("[{}] Commit already in progress, skipping commit", + ino); + } else { + AZLogDebug("[{}] Committing {} bytes", ino, + inode->get_filecache()->max_commit_bytes()); + inode->get_fcsm()->ensure_commit(nullptr); + } + inode->flush_unlock(); + + assert(!need_inline_write); + } + + /* + * Ok, we don't need to do inline writes. See if we have enough dirty + * data and we need to start async flush. + */ + + // Case 3: Flush + if ((extent_right - extent_left) < max_dirty_extent) { + /* + * This is the case of non-sequential writes causing enough dirty + * data to be accumulated, need to flush all of that. + */ + extent_left = 0; + extent_right = UINT64_MAX; + } + + /* + * Queue a non-blocking flush target for flushing all the dirty data. + */ + inode->flush_lock(); + inode->get_fcsm()->ensure_flush(offset, length, nullptr); + inode->flush_unlock(); + + /* + * Complete the write request without waiting for the backend flush to + * complete. 
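+     * To summarize: need_inline_write queued a blocking target above and
+     * returned without replying; need_commit kicked off a non-blocking
+     * commit; and the non-blocking flush target queued above lets the
+     * reply below complete the write while the flush proceeds in the
+     * background.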
+ */ + task->reply_write(length); +} + +void fcsm::ctgtq_cleanup() +{ + assert(inode->is_flushing); + assert(inode->is_stable_write()); + + AZLogDebug("[FCSM][{}] ctgtq_cleanup()", inode->get_fuse_ino()); + while (!ctgtq.empty()) { + struct fctgt &ctgt = ctgtq.front(); + assert(ctgt.fcsm == this); + + struct rpc_task *task = ctgt.task; + if (task) { + #ifdef NDEBUG + // Complete the commit task as we switch to stable writes. + assert(task->magic == RPC_TASK_MAGIC); + assert(task->get_op_type() == FUSE_WRITE); + assert(task->rpc_api->write_task.is_fe()); + assert(task->rpc_api->write_task.get_size() > 0); + #endif + task->reply_write(task->rpc_api->write_task.get_size()); + } + + AZLogInfo("[FCSM][{}] ctgtq_cleanup(): completed task: {} commit_seq: {}", + inode->get_fuse_ino(), + fmt::ptr(task), + ctgt.commit_seq); + ctgtq.pop(); + } + + assert(ctgtq.empty()); +} + +void fcsm::ensure_commit(struct rpc_task *task) +{ + uint64_t commit_bytes = 0; + + assert(inode->is_flushing == true); + AZLogDebug("[{}] [FCSM] ensure_commit<{}> task: {}", + inode->get_fuse_ino(), + task ? "blocking" : "non-blocking", + fmt::ptr(task)); + + assert(!inode->is_stable_write()); + + // committed_seq_num can never be more than committing_seq_num. + assert(committed_seq_num <= committing_seq_num); + +#ifdef NDEBUG + if (task) { + // task provided must be a frontend write task. + assert(task->magic == RPC_TASK_MAGIC); + assert(task->get_op_type() == FUSE_WRITE); + assert(task->rpc_api->write_task.is_fe()); + assert(task->rpc_api->write_task.get_size() > 0); + } +#endif + + /* + * What will be the committed_seq_num value after commit_bytes are committed? + */ + if (inode->get_filecache()->get_bytes_to_commit() == 0) { + int64_t bytes = inode->get_filecache()->get_bytes_to_flush() - + inode->get_filecache()->max_dirty_extent_bytes(); + commit_bytes = std::max(bytes, (int64_t) 0); + } else { + commit_bytes = inode->get_filecache()->get_bytes_to_commit(); + } + + if (commit_bytes == 0) { + AZLogDebug("COMMIT BYTES ZERO"); + } + + const uint64_t target_committed_seq_num = committed_seq_num + commit_bytes; + + /* + * If the state machine is already running, we just need to add an + * appropriate commit target and return. When the ongoing operation + * completes, this commit would be dispatched. + */ + if (is_running()) { +#ifdef NDEBUG + /* + * Make sure commit targets are always added in an increasing commit_seq. + */ + if (!ctgtq.empty()) { + assert(ctgtq.front().commit_seq <= target_committed_seq_num); + assert(ctgtq.front().flush_seq == 0); + } +#endif + ctgtq.emplace(this, + 0 /* target flush_seq */, + target_committed_seq_num /* target commit_seq */, + task); + assert(is_running()); + assert(inode->is_flushing); + assert(!inode->is_stable_write()); + return; + } + + /* + * FCSM not running. + * Flushed_seq_num tells us how much data is already flushed, If it's less + * than the target_committed_seq_num, we need to schedule a flush to catch up + * with the target_committed_seq_num. + */ + if (flushed_seq_num < target_committed_seq_num) { + AZLogDebug("[{}] [FCSM] not running, schedule a new flush to catch up, " + "flushed_seq_num: {}, target_committed_seq_num: {}", + inode->get_fuse_ino(), + flushed_seq_num.load(), + target_committed_seq_num); + + ensure_flush(0, 0, nullptr); + assert(is_running()); + assert(flushing_seq_num >= target_committed_seq_num); + + if (!inode->is_stable_write()) { + /** + * Enqueue a commit target to be triggered once the flush completes. 
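+             * For example (hypothetical numbers): if flushed_seq_num is
+             * 100 and target_committed_seq_num is 300, the flush scheduled
+             * above raises flushing_seq_num to at least 300; once
+             * on_flush_complete() sees flushed_seq_num reach 300 it finds
+             * this target at the head of ctgtq and dispatches the commit
+             * via commit_membufs().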
+ */ + ctgtq.emplace(this, + 0 /* target flush_seq */, + target_committed_seq_num /* target commit_seq */, + task); + } else { + task->reply_write(task->rpc_api->write_task.get_size()); + } + + return; + } else { + AZLogDebug("[{}] [FCSM] not running, schedule a new commit," + " flushed_seq_num: {}, " + "target_committed_seq_num: {}", + inode->get_fuse_ino(), + flushed_seq_num.load(), + target_committed_seq_num); + + uint64_t bytes; + std::vector bc_vec = + inode->get_filecache()->get_commit_pending_bcs(&bytes); + assert(bc_vec.empty() == (bytes == 0)); + + /* + * FCSM not running and nothing to commit, complete the task and return. + * It may happen due writes are changed to stable write. + */ + if (bytes == 0) { + assert(inode->is_stable_write() || commit_bytes == 0); + if (task) { + AZLogDebug("[{}] [FCSM] not running and nothing to commit, " + "completing fuse write rightaway", + inode->get_fuse_ino()); + task->reply_write(task->rpc_api->write_task.get_size()); + } + return; + } + + inode->commit_membufs(bc_vec); + assert(is_running()); + assert(committing_seq_num >= commit_bytes); + } +} + +/** + * Must be called with flush_lock() held. + */ +void fcsm::ensure_flush(uint64_t write_off, uint64_t write_len, struct rpc_task *task) { - AZLogDebug("[{}] [FCSM] ensure_flush<{}>({}), write req [{}, {}), task: {}", + uint64_t flush_bytes = 0; + + assert(inode->is_flushing == true); + AZLogDebug("[{}] [FCSM] ensure_flush<{}>({}), write req [{}, {}], task: {}", inode->get_fuse_ino(), task ? "blocking" : "non-blocking", flush_bytes, @@ -113,30 +459,29 @@ void fcsm::ensure_flush(uint64_t flush_bytes, if (task) { // task provided must be a frontend write task. assert(task->magic == RPC_TASK_MAGIC); - assert(task->get_op_type() == FUSE_WRITE); - assert(task->rpc_api->write_task.is_fe()); - assert(task->rpc_api->write_task.get_size() > 0); - - // write_len and write_off must match that of the task. - assert(task->rpc_api->write_task.get_size() == write_len); - assert(task->rpc_api->write_task.get_offset() == (off_t) write_off); + if (task->get_op_type() == FUSE_WRITE) { + assert(task->rpc_api->write_task.is_fe()); + assert(task->rpc_api->write_task.get_size() > 0); + // write_len and write_off must match that of the task. + assert(task->rpc_api->write_task.get_size() == write_len); + assert((uint64_t) task->rpc_api->write_task.get_offset() == + write_off); + } else { + assert(0); + } } #endif - /* - * Grab flush_lock to atomically get list of dirty chunks, which are not - * already being flushed. This also protects us racing with a truncate - * call and growing the file size after truncate shrinks the file. - */ - inode->flush_lock(); - /* * What will be the flushed_seq_num value after all current dirty bytes are * flushed? That becomes our target flushed_seq_num. */ const uint64_t bytes_to_flush = inode->get_filecache()->get_bytes_to_flush(); - const uint64_t target_flushed_seq_num = flushing_seq_num + bytes_to_flush; + const uint64_t last_flush_seq = + !ftgtq.empty() ? 
ftgtq.front().flush_seq : 0; + const uint64_t target_flushed_seq_num = + std::max((flushing_seq_num + bytes_to_flush), last_flush_seq); /* * If the state machine is already running, we just need to add an @@ -171,11 +516,17 @@ void fcsm::ensure_flush(uint64_t flush_bytes, } } #endif + if (task == nullptr && + (target_flushed_seq_num == last_flush_seq)) { + assert(is_running()); + return; + } + ftgtq.emplace(this, target_flushed_seq_num /* target flush_seq */, 0 /* commit_seq */, task); - inode->flush_unlock(); + assert(is_running()); return; } @@ -183,27 +534,16 @@ void fcsm::ensure_flush(uint64_t flush_bytes, * FCSM not running. */ uint64_t bytes; - std::vector bc_vec = - inode->get_filecache()->get_dirty_nonflushing_bcs_range(0, UINT64_MAX, - &bytes); - assert(bc_vec.empty() == (bytes == 0)); + std::vector bc_vec; - /* - * FCSM not running and nothing to flush, complete the task and return. - * Note that moe dirty data can be added after we get the list of dirty - * and non-flushing bcs above, but they can be flushed at a later point. - */ - if (bytes == 0) { - inode->flush_unlock(); - - if (task) { - AZLogDebug("[{}] [FCSM] not running and nothing to flush, " - "completing fuse write [{}, {}) rightaway", - inode->get_fuse_ino(), write_off, write_off+write_len); - task->reply_write(write_len); - } - return; + if (inode->is_stable_write()) { + bc_vec = inode->get_filecache()->get_dirty_nonflushing_bcs_range( + 0, UINT64_MAX, &bytes); + } else { + bc_vec = inode->get_filecache()->get_contiguous_dirty_bcs(&bytes); } + assert(bc_vec.empty() == (bytes == 0)); + assert(bytes > 0); /* * Kickstart the state machine. @@ -231,6 +571,152 @@ void fcsm::ensure_flush(uint64_t flush_bytes, assert(is_running()); assert(flushing_seq_num == (flushing_seq_num_before + bytes)); assert(flushed_seq_num <= flushing_seq_num); +} + +/** + * TODO: We MUST ensure that on_commit_complete() doesn't block else it'll + * block a libnfs thread which may stall further request processing + * which may cause deadlock. + */ +void fcsm::on_commit_complete(uint64_t commit_bytes) +{ + // Must be called only for success. + assert(commit_bytes > 0); + assert(!inode->get_filecache()->is_flushing_in_progress()); + + // Commit callback can only be called if FCSM is running. + assert(is_running()); + + // Update committed_seq_num to account for the commit_bytes. + committed_seq_num += commit_bytes; + + // committed_seq_num can never go more than committing_seq_num. + assert(committed_seq_num == committing_seq_num); + + AZLogDebug("[{}] [FCSM] on_commit_complete, committed_seq_num now: {}, " + "committing_seq_num: {}", + inode->get_fuse_ino(), + committed_seq_num.load(), + committing_seq_num.load()); + + inode->flush_lock(); + + /* + * Go over all queued commit targets to see if any can be completed after + * the latest commit completed. + */ + while (!ctgtq.empty()) { + struct fctgt& tgt = ctgtq.front(); + + assert(tgt.fcsm == this); + + /* + * ftgtq has commit targets in increasing order of committed_seq_num, so + * as soon as we find one that's greater than committed_seq_num, we can + * safely skip the rest. 
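+         * For example (hypothetical numbers): with committed_seq_num now
+         * 300 and ctgtq holding targets with commit_seq 200, 250 and 400,
+         * the first two targets are completed (replying to their waiting
+         * write tasks, if any) and popped, and the scan stops at 400.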
+ */ + if (tgt.commit_seq > committed_seq_num) { + break; + } + + if (tgt.task) { + assert(tgt.task->magic == RPC_TASK_MAGIC); + if (tgt.task->get_op_type() == FUSE_WRITE) { + assert(tgt.task->rpc_api->write_task.is_fe()); + assert(tgt.task->rpc_api->write_task.get_size() > 0); + + AZLogInfo("[{}] [FCSM] completing blocking commit target: {}, " + "committed_seq_num: {}, write task: [{}, {})", + inode->get_fuse_ino(), + tgt.commit_seq, + committed_seq_num.load(), + tgt.task->rpc_api->write_task.get_offset(), + tgt.task->rpc_api->write_task.get_offset() + + tgt.task->rpc_api->write_task.get_size()); + + tgt.task->reply_write( + tgt.task->rpc_api->write_task.get_size()); + } else { + assert(0); + } + } else { + AZLogDebug("[{}] [FCSM] completing non-blocking flush target: {}, " + "flushed_seq_num: {}", + inode->get_fuse_ino(), + tgt.flush_seq, + flushed_seq_num.load()); + } + + // Flush target accomplished, remove from queue. + ctgtq.pop(); + } + + assert(!inode->get_filecache()->is_flushing_in_progress()); + assert(!inode->is_commit_in_progress()); + assert(committed_seq_num == committing_seq_num); + assert(flushed_seq_num == committed_seq_num); + assert(!inode->is_stable_write() || ctgtq.empty()); + + /* + * See if we have more commit targets and issue flush for the same. + */ + if (!ftgtq.empty() || !ctgtq.empty()) { + assert(ftgtq.empty() || ftgtq.front().flush_seq > flushed_seq_num); + assert(ctgtq.empty() || ctgtq.front().commit_seq > committed_seq_num); + /* + * See if we have more flush targets and issue them. + */ + uint64_t bytes; + std::vector bc_vec; + if (inode->is_stable_write()) { + bc_vec = + inode->get_filecache()->get_dirty_nonflushing_bcs_range( + 0, UINT64_MAX, &bytes); + } else { + bc_vec = + inode->get_filecache()->get_contiguous_dirty_bcs(&bytes); + } + + /* + * Since we have a flush target asking more data to be flushed, we must + * have the corresponding bcs in the file cache. + */ + assert(!bc_vec.empty()); + assert(bytes > 0); + + // flushed_seq_num can never be more than flushing_seq_num. + assert(flushed_seq_num <= flushing_seq_num); + + AZLogDebug("[{}] [FCSM] continuing, flushing_seq_num now: {}, " + "flushed_seq_num: {}", + inode->get_fuse_ino(), + flushing_seq_num.load(), + flushed_seq_num.load()); + + // sync_membufs() will update flushing_seq_num() and mark fcsm running. + inode->sync_membufs(bc_vec, false /* is_flush */); + assert(flushing_seq_num >= bytes); + } else { + AZLogDebug("[{}] [FCSM] idling, flushed_seq_num now: {}, " + "committed_seq_num: {}", + inode->get_fuse_ino(), + flushed_seq_num.load(), + committed_seq_num.load()); + + // FCSM should not idle when there's any ongoing flush. + assert(flushing_seq_num == flushed_seq_num); + assert(committed_seq_num == committing_seq_num); + assert(flushed_seq_num == committed_seq_num); + + /* + * It may happen flush initiated from flush_cache_and_wait(), so + * we should not clear the running flag. + */ + if(!inode->get_filecache()->is_flushing_in_progress()) { + assert(ctgtq.empty() && ftgtq.empty()); + clear_running(); + } + } inode->flush_unlock(); } @@ -252,7 +738,7 @@ void fcsm::on_flush_complete(uint64_t flush_bytes) assert(fc_cb_running()); // Flush callback can only be called if FCSM is running. - assert(is_running()); + // assert(is_running()); // Update flushed_seq_num to account for the newly flushed bytes. 
flushed_seq_num += flush_bytes; @@ -285,7 +771,14 @@ void fcsm::on_flush_complete(uint64_t flush_bytes) * queued flush targets which includes completing the waiting tasks and/or * trigger pending flush/commit. */ - if (inode->get_filecache()->is_flushing_in_progress()) { + if (inode->get_filecache()->is_flushing_in_progress() || + !is_running() || + inode->is_commit_in_progress()) { + AZLogDebug("[{}] [FCSM] not running, flushing_seq_num now: {}, " + "flushed_seq_num: {}", + inode->get_fuse_ino(), + flushing_seq_num.load(), + flushed_seq_num.load()); inode->flush_unlock(); return; } @@ -310,21 +803,24 @@ void fcsm::on_flush_complete(uint64_t flush_bytes) if (tgt.task) { assert(tgt.task->magic == RPC_TASK_MAGIC); - assert(tgt.task->get_op_type() == FUSE_WRITE); - assert(tgt.task->rpc_api->write_task.is_fe()); - assert(tgt.task->rpc_api->write_task.get_size() > 0); - - AZLogDebug("[{}] [FCSM] completing blocking flush target: {}, " - "flushed_seq_num: {}, write task: [{}, {})", - inode->get_fuse_ino(), - tgt.flush_seq, - flushed_seq_num.load(), - tgt.task->rpc_api->write_task.get_offset(), - tgt.task->rpc_api->write_task.get_offset() + - tgt.task->rpc_api->write_task.get_size()); - - tgt.task->reply_write( - tgt.task->rpc_api->write_task.get_size()); + if (tgt.task->get_op_type() == FUSE_WRITE) { + assert(tgt.task->rpc_api->write_task.is_fe()); + assert(tgt.task->rpc_api->write_task.get_size() > 0); + + AZLogInfo("[{}] [FCSM] completing blocking flush target: {}, " + "flushed_seq_num: {}, write task: [{}, {})", + inode->get_fuse_ino(), + tgt.flush_seq, + flushed_seq_num.load(), + tgt.task->rpc_api->write_task.get_offset(), + tgt.task->rpc_api->write_task.get_offset() + + tgt.task->rpc_api->write_task.get_size()); + + tgt.task->reply_write( + tgt.task->rpc_api->write_task.get_size()); + } else { + assert(0); + } } else { AZLogDebug("[{}] [FCSM] completing non-blocking flush target: {}, " "flushed_seq_num: {}", @@ -338,22 +834,56 @@ void fcsm::on_flush_complete(uint64_t flush_bytes) } /* - * See if we have more flush targets and check if the next flush target - * has its flush issued. If yes, then we need to wait for this flush to - * complete and we will take stock of flush targets when that completes. + * Check if there commit target to be triggered. + * If we flushed, we should trigger commit so that memory is released. */ - if (!ftgtq.empty() && (ftgtq.front().flush_seq > flushing_seq_num)) { + if (!ctgtq.empty() && (ctgtq.front().commit_seq < flushed_seq_num)) { + assert(!inode->is_stable_write()); + uint64_t bytes; std::vector bc_vec = - inode->get_filecache()->get_dirty_nonflushing_bcs_range( - 0, UINT64_MAX, &bytes); + inode->get_filecache()->get_commit_pending_bcs(&bytes); + assert(bc_vec.empty() == (bytes == 0)); + + /* + * Since we have a commit target asking more data to be committed, we + * must have the corresponding bcs in the file cache. + */ + assert(!bc_vec.empty()); + assert(bytes > 0); + + // commit_membufs() will update committing_seq_num() and mark fcsm running. + inode->commit_membufs(bc_vec); + assert(committing_seq_num >= bytes); + } else if ((!ftgtq.empty() && (ftgtq.front().flush_seq > flushed_seq_num)) || + (!ctgtq.empty() && (ctgtq.front().commit_seq > committed_seq_num))) { + /* + * See if we have more flush targets and check if the next flush target + * has its flush issued. If yes, then we need to wait for this flush to + * complete and we will take stock of flush targets when that completes. 
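+         * For example (hypothetical numbers): a flush target with
+         * flush_seq 900 needs no new flush if a flush taking
+         * flushing_seq_num to 950 is already in flight; we just wait for
+         * it. If flushing_seq_num is only 800, a fresh flush must be
+         * dispatched below.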
+ */ + uint64_t bytes; + std::vector bc_vec; + if (inode->is_stable_write()) { + bc_vec = inode->get_filecache()->get_dirty_nonflushing_bcs_range( + 0, UINT64_MAX, + &bytes); + } else { + bc_vec = inode->get_filecache()->get_contiguous_dirty_bcs(&bytes); + } /* * Since we have a flush target asking more data to be flushed, we must * have the corresponding bcs in the file cache. */ assert(!bc_vec.empty()); - assert(bytes >= (ftgtq.front().flush_seq - flushing_seq_num)); + + if (inode->is_stable_write()) { + // We should have all the dirty data in the chunkmap. + assert(bytes >= (ftgtq.front().flush_seq - flushed_seq_num)); + } else { + assert(bytes > 0); + } // flushed_seq_num can never be more than flushing_seq_num. assert(flushed_seq_num <= flushing_seq_num); @@ -367,7 +897,7 @@ void fcsm::on_flush_complete(uint64_t flush_bytes) // sync_membufs() will update flushing_seq_num() and mark fcsm running. inode->sync_membufs(bc_vec, false /* is_flush */); assert(flushing_seq_num >= bytes); - } else if (ftgtq.empty()) { + } else if (ftgtq.empty() && ctgtq.empty()) { AZLogDebug("[{}] [FCSM] idling, flushing_seq_num now: {}, " "flushed_seq_num: {}", inode->get_fuse_ino(), @@ -375,13 +905,19 @@ void fcsm::on_flush_complete(uint64_t flush_bytes) flushed_seq_num.load()); // FCSM should not idle when there's any ongoing flush. - assert(flushing_seq_num == flushed_seq_num); + assert(flushing_seq_num >= flushed_seq_num); /* - * No more flush targets, pause the state machine. - * TODO: We need to clear running when flush fails. + * If commit is in progress, then we should not clear + * the running flag. Most likely it's issued from flush_cache_and_wait(). */ - clear_running(); + if (!inode->is_commit_in_progress()) { + assert(ftgtq.empty() || ctgtq.empty()); + clear_running(); + } + } else { + assert(0); + AZLogDebug("SHould not reach here"); } inode->flush_unlock(); diff --git a/turbonfs/src/nfs_inode.cpp b/turbonfs/src/nfs_inode.cpp index d7468eff..6c95de6e 100644 --- a/turbonfs/src/nfs_inode.cpp +++ b/turbonfs/src/nfs_inode.cpp @@ -985,7 +985,7 @@ int nfs_inode::wait_for_ongoing_flush(uint64_t start_off, uint64_t end_off) /** * Note: This takes shared lock on ilock_1. */ -int nfs_inode::flush_cache_and_wait(uint64_t start_off, uint64_t end_off) +int nfs_inode::flush_cache_and_wait() { /* * MUST be called only for regular files. @@ -1032,7 +1032,7 @@ int nfs_inode::flush_cache_and_wait(uint64_t start_off, uint64_t end_off) * bytes_chunks created but we don't want to wait for those. */ std::vector bc_vec = - filecache_handle->get_dirty_bc_range(start_off, end_off); + filecache_handle->get_dirty_bc_range(); /* * sync_membufs() iterate over the bc_vec and starts flushing the dirty diff --git a/turbonfs/src/rpc_task.cpp b/turbonfs/src/rpc_task.cpp index 8632e631..34a79715 100644 --- a/turbonfs/src/rpc_task.cpp +++ b/turbonfs/src/rpc_task.cpp @@ -2490,7 +2490,7 @@ void rpc_task::run_write() * ensure_flush() will arrange to flush all the dirty data and complete * the write task when the flush completes. */ - inode->get_fcsm()->ensure_flush(0, offset, length, this); + inode->get_fcsm()->ensure_flush(offset, length, this); // Free up the fuse thread without completing the application write. return; @@ -2556,7 +2556,7 @@ void rpc_task::run_write() /* * Queue a non-blocking flush target for flushing all the dirty data. 
*/ - inode->get_fcsm()->ensure_flush(0, offset, length, nullptr); + inode->get_fcsm()->ensure_flush(offset, length, nullptr); /* * Complete the write request without waiting for the backend flush to From a19657cf9f0cd72a9d0aac1c29ccef85544477e8 Mon Sep 17 00:00:00 2001 From: Nagendra Tomar Date: Sun, 9 Feb 2025 03:20:20 +0000 Subject: [PATCH 2/9] Audit changes fcsm.h fcsm.cpp nfs_inode.h --- turbonfs/inc/fcsm.h | 64 +++--- turbonfs/inc/nfs_inode.h | 2 +- turbonfs/src/fcsm.cpp | 435 +++++++++++++++++++++++---------------- 3 files changed, 303 insertions(+), 198 deletions(-) diff --git a/turbonfs/inc/fcsm.h b/turbonfs/inc/fcsm.h index db034b4b..4c40596b 100644 --- a/turbonfs/inc/fcsm.h +++ b/turbonfs/inc/fcsm.h @@ -69,9 +69,8 @@ class fcsm struct nfs_inode *_inode); /** - * Ensure 'flush_bytes' additional bytes are flushed from the cache, above - * and beyond what's already flushed or flushing now. flush_bytes==0 implies - * flush all dirty bytes. + * Ensure all or some dirty bytes are flushed or scheduled for flushing (if + * a flush or commit is already ongoing). * If the state machine is currently not running, it'll kick off the state * machine by calling sync_membufs(), else it'll add a new flush target to * ftgtq, which will be run by on_flush_complete() when the ongoing flush @@ -79,34 +78,35 @@ class fcsm * initiated this flush and in that case a blocking target will be added * which means the specified task will block till the requested target * completes, else a non-blocking target will be added which just requests - * the specific amount of bytes to be flushed w/o having the application - * wait. + * dirty bytes to be flushed w/o having the application wait. * * ensure_flush() provides the following guarantees: - * - Additional flush_bytes bytes will be flushed. This is beyond what's - * already flushed or scheduled for flush. + * - It'll flush all or part of cached dirty bytes starting from the lowest + * offset. The actual number of bytes flushed is decided based on various + * config settings and the memory pressure. * - On completion of that flush, task will be completed. - * If after grabbing flush_lock it figures out that the requested flush - * target is already met, it completes the task rightaway. * * write_off and write_len describe the current application write call. - * write_len is needed for completing the write rightaway for non-blocking - * cases, where 'task' is null. + * They are needed for logging when task is nullptr. * - * LOCKS: flush_lock. + * Caller MUST hold the flush_lock. */ void ensure_flush(uint64_t write_off, uint64_t write_len, struct rpc_task *task = nullptr); /** - * Ensure 'commit_bytes' additional bytes are committed from the cache, - * above and beyond what's already committed or committing now. + * Ensure all or some commit-pending bytes are committed or scheduled for + * commit (if a flush or commit is already ongoing). * If 'task' is null it'll add a non-blocking commit target to ctgtq, else * it'll add a blocking commit target for completing task when given commit * goal is met. + * + * Caller MUST hold the flush_lock. */ - void ensure_commit(struct rpc_task *task = nullptr); + void ensure_commit(uint64_t write_off, + uint64_t write_len, + struct rpc_task *task = nullptr); /** * Callbacks to be called when flush/commit successfully complete. @@ -146,13 +146,27 @@ class fcsm void mark_running(); void clear_running(); + /** + * Nudge the flush-commit state machine. 
+ * After the fuse thread copies the application data into the cache, it + * must call this to let FCSM know that some more dirty data has been + * added. It checks the dirty data against the configured limits and + * decides which of the following action to take: + * - Do nothing, as dirty data is still within limits. + * - Start flushing. + * - Start committing. + * - Flush/commit while blocking the task till flush/commit completes. + */ void run(struct rpc_task *task, uint64_t extent_left, uint64_t extent_right); /** * Call when more writes are dispatched, or prepared to be dispatched. - * This MUST be called before the write callback can be called. + * These must correspond to membufs which are dirty and not already + * flushing. + * This MUST be called before the write_iov_callback() can be called, i.e., + * before the actual write call is issued. */ void add_flushing(uint64_t bytes); @@ -184,16 +198,17 @@ class fcsm /** * Call when more commit are dispatched, or prepared to be dispatched. - * This MUST be called before the commit_callback can be called. + * These must correspond to membufs which are already flushed, i.e., they + * must not be dirty or flushing and must be commit pending. + * This MUST be called before the commit_callback can be called, i.e., + * before the actual commit call is issued. */ - void add_committing(uint64_t bytes) - { - assert(committed_seq_num <= committing_seq_num); - committing_seq_num += bytes; - } + void add_committing(uint64_t bytes); - /* + /** * ctgtq_cleanup() is called when we switch to stable writes. + * It clears up all the queued commit targets as stable writes will not + * cause a commit and those targets would never normally complete. */ void ctgtq_cleanup(); @@ -279,7 +294,8 @@ class fcsm /* * Continually increasing seq number of the last byte successfully flushed * and committed. Flush-commit targets (fctgt) are expressed in terms of - * these. + * these. These are actually numerically one more than the last *ing and + * *ed byte's seq number. * Note that these are *not* offsets in the file but these are cumulative * values for total bytes flushed/committed till now since the file was * opened. In case of overwrite same byte(s) may be repeatedly flushed diff --git a/turbonfs/inc/nfs_inode.h b/turbonfs/inc/nfs_inode.h index 2b16b1b7..6119272e 100644 --- a/turbonfs/inc/nfs_inode.h +++ b/turbonfs/inc/nfs_inode.h @@ -333,7 +333,7 @@ struct nfs_inode * Note: As of now, we are not using this flag as commit changes not yet * integrated, so we are setting this flag to true. */ - bool stable_write = false; + bool stable_write = true; public: /* diff --git a/turbonfs/src/fcsm.cpp b/turbonfs/src/fcsm.cpp index 76ebb043..2481f748 100644 --- a/turbonfs/src/fcsm.cpp +++ b/turbonfs/src/fcsm.cpp @@ -35,7 +35,6 @@ fcsm::fctgt::fctgt(struct fcsm *fcsm, // At least one of flush/commit goals must be set. assert((flush_seq != 0) || (commit_seq != 0)); -#ifndef NDEBUG if (task) { // Only frontend write tasks must be specified. 
assert(task->magic == RPC_TASK_MAGIC); @@ -43,7 +42,6 @@ fcsm::fctgt::fctgt(struct fcsm *fcsm, assert(task->rpc_api->write_task.is_fe()); assert(task->rpc_api->write_task.get_size() > 0); } -#endif AZLogInfo("[{}] [FCSM] {} fctgt queued (F: {}, C: {}, T: {})", fcsm->get_inode()->get_fuse_ino(), @@ -88,16 +86,24 @@ void fcsm::add_flushing(uint64_t bytes) flushing_seq_num += bytes; } +void fcsm::add_committing(uint64_t bytes) +{ + assert(inode->is_flushing); + assert(committed_seq_num <= committing_seq_num); + committing_seq_num += bytes; + // We can only commit a byte that's flushed. + assert(flushed_seq_num <= flushing_seq_num); + assert(committing_seq_num <= flushed_seq_num); +} + void fcsm::run(struct rpc_task *task, uint64_t extent_left, uint64_t extent_right) { -#ifdef NDEBUG assert(task->magic == RPC_TASK_MAGIC); assert(task->get_op_type() == FUSE_WRITE); assert(task->rpc_api->write_task.is_fe()); assert(task->rpc_api->write_task.get_size() > 0); -#endif const size_t length = task->rpc_api->write_task.get_size(); const off_t offset = task->rpc_api->write_task.get_offset(); @@ -105,8 +111,9 @@ void fcsm::run(struct rpc_task *task, const fuse_ino_t ino = task->rpc_api->write_task.get_ino(); /* - * We have successfully copied the user data into the cache. - * We can have the following cases (in decreasing order of criticality): + * fcsm::run() is called after fuse thread successfully copies user data + * into the cache. We can have the following cases (in decreasing order of + * criticality): * 1. Cache has dirty+uncommitted data beyond the "inline write threshold", * ref do_inline_write(). * In this case we begin flush of the dirty data and/or commit of the @@ -174,7 +181,7 @@ void fcsm::run(struct rpc_task *task, ((extent_right - extent_left) >= max_dirty_extent) || inode->get_filecache()->flush_required(); - AZLogDebug("[{}] handle write (sparse={}, need_inline_write={}, " + AZLogDebug("[{}] fcsm::run() (sparse={}, need_inline_write={}, " "need_commit={}, need_flush={}, extent=[{}, {}))", inode->get_fuse_ino(), sparse_write, need_inline_write, need_commit, need_flush, extent_left, extent_right); @@ -201,14 +208,17 @@ void fcsm::run(struct rpc_task *task, extent_left, extent_right); /* - * ensure_flush() will arrange to flush all the dirty data and complete - * the write task when the flush completes. + * Queue a blocking flush/commit target, which will complete the fuse + * write after flush/commit completes. + * In case of stable writes we queue a flush target while in case of + * unstable writes we queue a commit target. Commit target implicitly + * performs flush before the commit. */ inode->flush_lock(); if (inode->is_stable_write()) { inode->get_fcsm()->ensure_flush(offset, length, task); } else { - inode->get_fcsm()->ensure_commit(task); + inode->get_fcsm()->ensure_commit(offset, length, task); } inode->flush_unlock(); @@ -223,18 +233,24 @@ void fcsm::run(struct rpc_task *task, * and not hold the current write task till the commit completes. */ if (need_commit) { + assert(!inode->is_stable_write()); + inode->flush_lock(); + /* + * Commit will only start after all ongoing flush complete and no new + * flush can start (as have the flush_lock). This means no new + * commit-pending data can be added and hence the current inprogress + * commit will finish committing all commit-pending bytes. 
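+         * E.g. (illustrative): if writer A dispatched a commit that is
+         * still in flight, a later writer B sees is_commit_in_progress()
+         * and skips; whatever B flushed before that commit was dispatched
+         * is covered by A's in-flight commit.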
+ */ if (inode->is_commit_in_progress()) { AZLogDebug("[{}] Commit already in progress, skipping commit", ino); } else { AZLogDebug("[{}] Committing {} bytes", ino, inode->get_filecache()->max_commit_bytes()); - inode->get_fcsm()->ensure_commit(nullptr); + inode->get_fcsm()->ensure_commit(offset, length, nullptr); } inode->flush_unlock(); - - assert(!need_inline_write); } /* @@ -255,14 +271,17 @@ void fcsm::run(struct rpc_task *task, /* * Queue a non-blocking flush target for flushing all the dirty data. */ - inode->flush_lock(); - inode->get_fcsm()->ensure_flush(offset, length, nullptr); - inode->flush_unlock(); + if (need_flush) { + inode->flush_lock(); + inode->get_fcsm()->ensure_flush(offset, length, nullptr); + inode->flush_unlock(); + } /* - * Complete the write request without waiting for the backend flush to - * complete. + * Complete the write request without waiting for the backend flush/commit + * to complete. For need_inline_write we should not complete the task now. */ + assert(!need_inline_write); task->reply_write(length); } @@ -272,48 +291,57 @@ void fcsm::ctgtq_cleanup() assert(inode->is_stable_write()); AZLogDebug("[FCSM][{}] ctgtq_cleanup()", inode->get_fuse_ino()); + while (!ctgtq.empty()) { struct fctgt &ctgt = ctgtq.front(); assert(ctgt.fcsm == this); struct rpc_task *task = ctgt.task; if (task) { - #ifdef NDEBUG - // Complete the commit task as we switch to stable writes. + /* + * ctgtq_cleanup() is called when we have decided to swith to + * stable writes. Since commit is never called for stable writes, + * these tasks waiting for commit must be completed now. + */ assert(task->magic == RPC_TASK_MAGIC); assert(task->get_op_type() == FUSE_WRITE); assert(task->rpc_api->write_task.is_fe()); assert(task->rpc_api->write_task.get_size() > 0); - #endif + task->reply_write(task->rpc_api->write_task.get_size()); } - AZLogInfo("[FCSM][{}] ctgtq_cleanup(): completed task: {} commit_seq: {}", + AZLogInfo("[FCSM][{}] ctgtq_cleanup(): completed write task: {} " + "commit_seq: {}", inode->get_fuse_ino(), fmt::ptr(task), ctgt.commit_seq); + ctgtq.pop(); } assert(ctgtq.empty()); } -void fcsm::ensure_commit(struct rpc_task *task) +void fcsm::ensure_commit(uint64_t write_off, + uint64_t write_len, + struct rpc_task *task) { - uint64_t commit_bytes = 0; + assert(inode->is_flushing); + assert(!inode->is_stable_write()); - assert(inode->is_flushing == true); - AZLogDebug("[{}] [FCSM] ensure_commit<{}> task: {}", + AZLogDebug("[{}] [FCSM] ensure_commit<{}> write req [{}, {}], task: {}", inode->get_fuse_ino(), task ? "blocking" : "non-blocking", + write_off, write_off + write_len, fmt::ptr(task)); - assert(!inode->is_stable_write()); - // committed_seq_num can never be more than committing_seq_num. assert(committed_seq_num <= committing_seq_num); -#ifdef NDEBUG + // we can only commit bytes which are flushed. + assert(committing_seq_num <= flushed_seq_num); + if (task) { // task provided must be a frontend write task. assert(task->magic == RPC_TASK_MAGIC); @@ -321,23 +349,40 @@ void fcsm::ensure_commit(struct rpc_task *task) assert(task->rpc_api->write_task.is_fe()); assert(task->rpc_api->write_task.get_size() > 0); } -#endif /* - * What will be the committed_seq_num value after commit_bytes are committed? + * Find how many bytes we would like to commit. + * If there are some commit-pending bytes we commit all of those, else + * we set a commit target large enough to flush+commit all leaving + * one full sized dirty extent. 
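+     * Worked example (hypothetical numbers): with no commit-pending
+     * bytes, get_bytes_to_flush() = 300MiB and max_dirty_extent_bytes() =
+     * 100MiB, commit_bytes becomes 200MiB, i.e., everything except the
+     * last full sized dirty extent is scheduled to be flushed and
+     * committed.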
*/ - if (inode->get_filecache()->get_bytes_to_commit() == 0) { - int64_t bytes = inode->get_filecache()->get_bytes_to_flush() - - inode->get_filecache()->max_dirty_extent_bytes(); + uint64_t commit_bytes = + inode->get_filecache()->get_bytes_to_commit();; + + if (commit_bytes == 0) { + /* + * TODO: Make sure this doesn't result in small-blocks being written. + */ + const int64_t bytes = + inode->get_filecache()->get_bytes_to_flush() - + inode->get_filecache()->max_dirty_extent_bytes(); commit_bytes = std::max(bytes, (int64_t) 0); - } else { - commit_bytes = inode->get_filecache()->get_bytes_to_commit(); } + /* + * No new bytes to commit, complete the task if it was a blocking call. + */ if (commit_bytes == 0) { AZLogDebug("COMMIT BYTES ZERO"); + if (task) { + task->reply_write(task->rpc_api->write_task.get_size()); + } + return; } + /* + * What will be the committed_seq_num value after commit_bytes are committed? + */ const uint64_t target_committed_seq_num = committed_seq_num + commit_bytes; /* @@ -346,7 +391,7 @@ void fcsm::ensure_commit(struct rpc_task *task) * completes, this commit would be dispatched. */ if (is_running()) { -#ifdef NDEBUG +#ifndef NDEBUG /* * Make sure commit targets are always added in an increasing commit_seq. */ @@ -378,8 +423,14 @@ void fcsm::ensure_commit(struct rpc_task *task) flushed_seq_num.load(), target_committed_seq_num); - ensure_flush(0, 0, nullptr); + ensure_flush(task ? task->rpc_api->write_task.get_offset() : 0, + task ? task->rpc_api->write_task.get_size() : 0, + nullptr); assert(is_running()); + /* + * ensure_flush() must have scheduled flushing till + * target_committed_seq_num. + */ assert(flushing_seq_num >= target_committed_seq_num); if (!inode->is_stable_write()) { @@ -436,45 +487,35 @@ void fcsm::ensure_flush(uint64_t write_off, uint64_t write_len, struct rpc_task *task) { - uint64_t flush_bytes = 0; + assert(inode->is_flushing); - assert(inode->is_flushing == true); - AZLogDebug("[{}] [FCSM] ensure_flush<{}>({}), write req [{}, {}], task: {}", + AZLogDebug("[{}] [FCSM] ensure_flush<{}> write req [{}, {}], task: {}", inode->get_fuse_ino(), task ? "blocking" : "non-blocking", - flush_bytes, write_off, write_off + write_len, fmt::ptr(task)); - /* - * TODO: Do we have a usecase for caller specifying how many bytes to - * flush. - */ - assert(flush_bytes == 0); - // flushed_seq_num can never be more than flushing_seq_num. assert(flushed_seq_num <= flushing_seq_num); -#ifndef NDEBUG if (task) { // task provided must be a frontend write task. assert(task->magic == RPC_TASK_MAGIC); - if (task->get_op_type() == FUSE_WRITE) { - assert(task->rpc_api->write_task.is_fe()); - assert(task->rpc_api->write_task.get_size() > 0); - // write_len and write_off must match that of the task. - assert(task->rpc_api->write_task.get_size() == write_len); - assert((uint64_t) task->rpc_api->write_task.get_offset() == - write_off); - } else { - assert(0); - } + assert(task->get_op_type() == FUSE_WRITE); + assert(task->rpc_api->write_task.is_fe()); + assert(task->rpc_api->write_task.get_size() > 0); + // write_len and write_off must match that of the task. + assert(task->rpc_api->write_task.get_size() == write_len); + assert(task->rpc_api->write_task.get_offset() == (off_t) write_off); } -#endif /* * What will be the flushed_seq_num value after all current dirty bytes are * flushed? That becomes our target flushed_seq_num. 
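+     * E.g. (illustrative): flushing_seq_num 500 and bytes_to_flush 200
+     * give a target of 700, but if the last queued flush target already
+     * asks for 750, the std::max() below keeps the larger goal of 750.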
+ * Since bytes_chunk_cache::{bytes_dirty,bytes_flushing} are not updated + * inside flush_lock, we can have race conditions where later values of + * target_flushed_seq_num may be less than what we have already queued in + * the latest flush target. In such case, just wait for the larger value. */ const uint64_t bytes_to_flush = inode->get_filecache()->get_bytes_to_flush(); @@ -516,9 +557,8 @@ void fcsm::ensure_flush(uint64_t write_off, } } #endif - if (task == nullptr && - (target_flushed_seq_num == last_flush_seq)) { - assert(is_running()); + + if (!task && (target_flushed_seq_num == last_flush_seq)) { return; } @@ -526,7 +566,6 @@ void fcsm::ensure_flush(uint64_t write_off, target_flushed_seq_num /* target flush_seq */, 0 /* commit_seq */, task); - assert(is_running()); return; } @@ -580,24 +619,54 @@ void fcsm::ensure_flush(uint64_t write_off, */ void fcsm::on_commit_complete(uint64_t commit_bytes) { + // Commit must be called only for unstable writes. + assert(!inode->is_stable_write()); + // Must be called only for success. + assert(inode->get_write_error() == 0); assert(commit_bytes > 0); - assert(!inode->get_filecache()->is_flushing_in_progress()); + + // Must be called from flush/write callback. + assert(fc_cb_running()); // Commit callback can only be called if FCSM is running. assert(is_running()); + /* + * Commit callback can be called only when commit is in progress, clear + * it now. + */ + assert(inode->is_commit_in_progress()); + inode->clear_commit_in_progress(); + + // If commit is running, flush cannot be running. + assert(!inode->get_filecache()->is_flushing_in_progress()); + + // a byte can only be committed after it's flushed successfully. + assert(committing_seq_num <= flushed_seq_num); + assert(committing_seq_num <= flushing_seq_num); + // Update committed_seq_num to account for the commit_bytes. committed_seq_num += commit_bytes; - // committed_seq_num can never go more than committing_seq_num. + /* + * When a commit completes it commits everything that has been flushed + * till now also whatever has been scheduled for commit. 
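+     * In other words, a commit RPC covers every byte flushed so far plus
+     * everything scheduled for commit, which is why both equalities
+     * asserted below must hold when it completes.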
+ */ + assert(flushed_seq_num == committed_seq_num); assert(committed_seq_num == committing_seq_num); - AZLogDebug("[{}] [FCSM] on_commit_complete, committed_seq_num now: {}, " - "committing_seq_num: {}", + AZLogDebug("[{}] [FCSM] on_commit_complete({}), Fd: {}, Fing: {}, " + "Cd: {}, Cing: {}, Fq: {}, Cq: {}, bytes_flushing: {}", inode->get_fuse_ino(), + commit_bytes, + flushed_seq_num.load(), + flushing_seq_num.load(), committed_seq_num.load(), - committing_seq_num.load()); + committing_seq_num.load(), + ftgtq.size(), + ctgtq.size(), + inode->get_filecache()->bytes_flushing.load()); inode->flush_lock(); @@ -621,51 +690,48 @@ void fcsm::on_commit_complete(uint64_t commit_bytes) if (tgt.task) { assert(tgt.task->magic == RPC_TASK_MAGIC); - if (tgt.task->get_op_type() == FUSE_WRITE) { - assert(tgt.task->rpc_api->write_task.is_fe()); - assert(tgt.task->rpc_api->write_task.get_size() > 0); - - AZLogInfo("[{}] [FCSM] completing blocking commit target: {}, " - "committed_seq_num: {}, write task: [{}, {})", - inode->get_fuse_ino(), - tgt.commit_seq, - committed_seq_num.load(), - tgt.task->rpc_api->write_task.get_offset(), - tgt.task->rpc_api->write_task.get_offset() + - tgt.task->rpc_api->write_task.get_size()); - - tgt.task->reply_write( - tgt.task->rpc_api->write_task.get_size()); - } else { - assert(0); - } + assert(tgt.task->get_op_type() == FUSE_WRITE); + assert(tgt.task->rpc_api->write_task.is_fe()); + assert(tgt.task->rpc_api->write_task.get_size() > 0); + + AZLogInfo("[{}] [FCSM] completing blocking commit target: {}, " + "committed_seq_num: {}, write task: [{}, {})", + inode->get_fuse_ino(), + tgt.commit_seq, + committed_seq_num.load(), + tgt.task->rpc_api->write_task.get_offset(), + tgt.task->rpc_api->write_task.get_offset() + + tgt.task->rpc_api->write_task.get_size()); + + tgt.task->reply_write( + tgt.task->rpc_api->write_task.get_size()); } else { - AZLogDebug("[{}] [FCSM] completing non-blocking flush target: {}, " - "flushed_seq_num: {}", + AZLogDebug("[{}] [FCSM] completing non-blocking commit target: {}, " + "committed_seq_num: {}", inode->get_fuse_ino(), - tgt.flush_seq, - flushed_seq_num.load()); + tgt.commit_seq, + committed_seq_num.load()); } - // Flush target accomplished, remove from queue. + // Commit target accomplished, remove from queue. ctgtq.pop(); } - assert(!inode->get_filecache()->is_flushing_in_progress()); - assert(!inode->is_commit_in_progress()); - assert(committed_seq_num == committing_seq_num); - assert(flushed_seq_num == committed_seq_num); - assert(!inode->is_stable_write() || ctgtq.empty()); - /* * See if we have more commit targets and issue flush for the same. */ if (!ftgtq.empty() || !ctgtq.empty()) { + /* + * If we have any commit target here it must have commit_seq greater + * than committed_seq_num, else it would have been completed by the + * above loop. + * If we have any flush target it must have flush_seq greate than + * flushed_seq_num. This is because commit would have started after + * the flush and we would have completed all eligible flush targets. + */ assert(ftgtq.empty() || ftgtq.front().flush_seq > flushed_seq_num); assert(ctgtq.empty() || ctgtq.front().commit_seq > committed_seq_num); - /* - * See if we have more flush targets and issue them. - */ + uint64_t bytes; std::vector bc_vec; if (inode->is_stable_write()) { @@ -694,8 +760,10 @@ void fcsm::on_commit_complete(uint64_t commit_bytes) flushed_seq_num.load()); // sync_membufs() will update flushing_seq_num() and mark fcsm running. 
+ [[maybe_unused]] + const uint64_t prev_flushing_seq_num = flushing_seq_num; inode->sync_membufs(bc_vec, false /* is_flush */); - assert(flushing_seq_num >= bytes); + assert(flushing_seq_num == (prev_flushing_seq_num + bytes)); } else { AZLogDebug("[{}] [FCSM] idling, flushed_seq_num now: {}, " "committed_seq_num: {}", @@ -703,19 +771,14 @@ void fcsm::on_commit_complete(uint64_t commit_bytes) flushed_seq_num.load(), committed_seq_num.load()); - // FCSM should not idle when there's any ongoing flush. + // FCSM should not idle when there's any ongoing flush or commit. assert(flushing_seq_num == flushed_seq_num); - assert(committed_seq_num == committing_seq_num); + assert(committing_seq_num == committed_seq_num); assert(flushed_seq_num == committed_seq_num); - /* - * It may happen flush initiated from flush_cache_and_wait(), so - * we should not clear the running flag. - */ - if(!inode->get_filecache()->is_flushing_in_progress()) { - assert(ctgtq.empty() && ftgtq.empty()); - clear_running(); - } + assert(!inode->get_filecache()->is_flushing_in_progress()); + + clear_running(); } inode->flush_unlock(); @@ -734,11 +797,24 @@ void fcsm::on_flush_complete(uint64_t flush_bytes) // Must be called only for success. assert(inode->get_write_error() == 0); assert(flush_bytes > 0); + // Must be called from flush/write callback. assert(fc_cb_running()); // Flush callback can only be called if FCSM is running. - // assert(is_running()); + assert(is_running()); + + /* + * Commit will only be run after current flush completes. + * Since we are inside flush completion callback, commit cannot be + * running yet. + */ + assert(!inode->is_commit_in_progress()); + + // a byte can only be committed after it's flushed successfully. + assert(committing_seq_num <= flushed_seq_num); + assert(committed_seq_num <= committing_seq_num); + assert(committing_seq_num <= flushing_seq_num); // Update flushed_seq_num to account for the newly flushed bytes. flushed_seq_num += flush_bytes; @@ -746,12 +822,17 @@ void fcsm::on_flush_complete(uint64_t flush_bytes) // flushed_seq_num can never go more than flushing_seq_num. assert(flushed_seq_num <= flushing_seq_num); - AZLogDebug("[{}] [FCSM] on_flush_complete({}), flushed_seq_num now: {}, " - "flushing_in_progress: {}", + AZLogDebug("[{}] [FCSM] on_flush_complete({}), Fd: {}, Fing: {}, " + "Cd: {}, Cing: {}, Fq: {}, Cq: {}, bytes_flushing: {}", inode->get_fuse_ino(), flush_bytes, flushed_seq_num.load(), - inode->get_filecache()->is_flushing_in_progress()); + flushing_seq_num.load(), + committed_seq_num.load(), + committing_seq_num.load(), + ftgtq.size(), + ctgtq.size(), + inode->get_filecache()->bytes_flushing.load()); /* * If this is not the last completing flush (of the multiple parallel @@ -771,14 +852,7 @@ void fcsm::on_flush_complete(uint64_t flush_bytes) * queued flush targets which includes completing the waiting tasks and/or * trigger pending flush/commit. 
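+     * The sequence below is: (1) complete all flush targets whose
+     * flush_seq has been reached, (2) prefer dispatching an eligible
+     * commit target as committing frees memory, (3) otherwise dispatch
+     * the next needed flush, (4) else idle the state machine.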
*/ - if (inode->get_filecache()->is_flushing_in_progress() || - !is_running() || - inode->is_commit_in_progress()) { - AZLogDebug("[{}] [FCSM] not running, flushing_seq_num now: {}, " - "flushed_seq_num: {}", - inode->get_fuse_ino(), - flushing_seq_num.load(), - flushed_seq_num.load()); + if (inode->get_filecache()->is_flushing_in_progress()) { inode->flush_unlock(); return; } @@ -803,24 +877,21 @@ void fcsm::on_flush_complete(uint64_t flush_bytes) if (tgt.task) { assert(tgt.task->magic == RPC_TASK_MAGIC); - if (tgt.task->get_op_type() == FUSE_WRITE) { - assert(tgt.task->rpc_api->write_task.is_fe()); - assert(tgt.task->rpc_api->write_task.get_size() > 0); - - AZLogInfo("[{}] [FCSM] completing blocking flush target: {}, " - "flushed_seq_num: {}, write task: [{}, {})", - inode->get_fuse_ino(), - tgt.flush_seq, - flushed_seq_num.load(), - tgt.task->rpc_api->write_task.get_offset(), - tgt.task->rpc_api->write_task.get_offset() + - tgt.task->rpc_api->write_task.get_size()); - - tgt.task->reply_write( - tgt.task->rpc_api->write_task.get_size()); - } else { - assert(0); - } + assert(tgt.task->get_op_type() == FUSE_WRITE); + assert(tgt.task->rpc_api->write_task.is_fe()); + assert(tgt.task->rpc_api->write_task.get_size() > 0); + + AZLogInfo("[{}] [FCSM] completing blocking flush target: {}, " + "flushed_seq_num: {}, write task: [{}, {})", + inode->get_fuse_ino(), + tgt.flush_seq, + flushed_seq_num.load(), + tgt.task->rpc_api->write_task.get_offset(), + tgt.task->rpc_api->write_task.get_offset() + + tgt.task->rpc_api->write_task.get_size()); + + tgt.task->reply_write( + tgt.task->rpc_api->write_task.get_size()); } else { AZLogDebug("[{}] [FCSM] completing non-blocking flush target: {}, " "flushed_seq_num: {}", @@ -834,10 +905,13 @@ void fcsm::on_flush_complete(uint64_t flush_bytes) } /* - * Check if there commit target to be triggered. - * If we flushed, we should trigger commit so that memory is released. + * We just completed a flush. See if we have some commit targets that we + * should trigger now. A commit target can only be triggered if we have + * flushed all bytes till the commit target. + * We check commit target before any other flush targets as committing + * helps us free memory. */ - if (!ctgtq.empty() && (ctgtq.front().commit_seq < flushed_seq_num)) { + if (!ctgtq.empty() && (flushed_seq_num >= ctgtq.front().commit_seq)) { assert(!inode->is_stable_write()); uint64_t bytes; @@ -852,15 +926,27 @@ void fcsm::on_flush_complete(uint64_t flush_bytes) assert(!bc_vec.empty()); assert(bytes > 0); - // commit_membufs() will update committing_seq_num() and mark fcsm running. + /* + * commit_membufs() must increase committing_seq_num exactly by bytes, + * as all the bcs in bc_vec should be committed. + */ + [[maybe_unused]] + const uint64_t prev_committing_seq_num = committing_seq_num; inode->commit_membufs(bc_vec); - assert(committing_seq_num >= bytes); - } else if ((!ftgtq.empty() && (ftgtq.front().flush_seq > flushed_seq_num)) || - (!ctgtq.empty() && (ctgtq.front().commit_seq > committed_seq_num))) { + assert(committing_seq_num == (prev_committing_seq_num + bytes)); + + } else if ((!ftgtq.empty() && (ftgtq.front().flush_seq > flushing_seq_num)) || + (!ctgtq.empty() && (ctgtq.front().commit_seq > flushing_seq_num))) { /* - * See if we have more flush targets and check if the next flush target - * has its flush issued. If yes, then we need to wait for this flush to - * complete and we will take stock of flush targets when that completes. 
+     * Nothing to commit, or what we want to commit has not yet flushed
+     * successfully. Do we want to flush more? We check two things:
+     * 1. Is there an explicit flush target which has not yet started
+     *    flushing?
+     * 2. Is there a commit target which implies flushing?
+     *
+     * If the next flush or commit target has its flush issued, then we
+     * just have to wait for that flush to complete and then we will decide
+     * the next action, else issue it now.
      */
        uint64_t bytes;
        std::vector bc_vec;
 
@@ -868,20 +954,16 @@ void fcsm::on_flush_complete(uint64_t flush_bytes)
             bc_vec = inode->get_filecache()->get_dirty_nonflushing_bcs_range(
                             0, UINT64_MAX, &bytes);
+        /*
+         * Since we have a flush target asking more data to be flushed, we must
+         * have the corresponding bcs in the file cache.
+         */
+        assert(!bc_vec.empty());
+        // We should flush all the dirty data in the chunkmap.
+        assert(bytes >= (ftgtq.front().flush_seq - flushing_seq_num));
     } else {
         bc_vec = inode->get_filecache()->get_contiguous_dirty_bcs(&bytes);
-    }
-
-    /*
-     * Since we have a flush target asking more data to be flushed, we must
-     * have the corresponding bcs in the file cache.
-     */
-    assert(!bc_vec.empty());
-
-    if (inode->is_stable_write()) {
-        // We should have all the dirty data in the chunkmap.
-        assert(bytes >= (ftgtq.front().flush_seq - flushed_seq_num));
-    } else {
+        assert(!bc_vec.empty());
         assert(bytes > 0);
     }
 
@@ -894,10 +976,16 @@ void fcsm::on_flush_complete(uint64_t flush_bytes)
                    flushing_seq_num.load(),
                    flushed_seq_num.load());
 
-        // sync_membufs() will update flushing_seq_num() and mark fcsm running.
+        // sync_membufs() will update flushing_seq_num.
+        [[maybe_unused]]
+        const uint64_t prev_flushing_seq_num = flushing_seq_num;
         inode->sync_membufs(bc_vec, false /* is_flush */);
-        assert(flushing_seq_num >= bytes);
+        assert(flushing_seq_num == (prev_flushing_seq_num + bytes));
     } else if (ftgtq.empty() && ctgtq.empty()) {
+        /*
+         * No flush to issue; if we don't have any to wait for, then we can
+         * stop the state machine.
+         */
         AZLogDebug("[{}] [FCSM] idling, flushing_seq_num now: {}, "
                    "flushed_seq_num: {}",
                    inode->get_fuse_ino(),
@@ -908,16 +996,17 @@ void fcsm::on_flush_complete(uint64_t flush_bytes)
         assert(flushing_seq_num >= flushed_seq_num);
 
         /*
-         * If commit is in progress, then we should not clear
-         * the running flag. Most likely it's issued from flush_cache_and_wait().
+         * TODO: Modify flush_cache_and_wait() to also use the FCSM for
+         *       performing the flush. Then any flush or commit is
+         *       performed only by the state machine.
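The branch structure above (commit first, then flush more, then idle) is easier to see as a pure decision function. A condensed sketch with illustrative names; the goal pointers stand in for ctgtq.front().commit_seq and ftgtq.front().flush_seq, null when the respective queue is empty:

#include <cstdint>

enum class next_step { kWait, kCommit, kFlushMore, kIdle };

static next_step decide(bool flush_still_in_progress,
                        uint64_t flushed, uint64_t flushing,
                        const uint64_t *commit_goal,
                        const uint64_t *flush_goal)
{
    if (flush_still_in_progress)
        return next_step::kWait;          // a later completion will decide
    if (commit_goal && flushed >= *commit_goal)
        return next_step::kCommit;        // commit first, it frees memory
    if ((flush_goal && *flush_goal > flushing) ||
        (commit_goal && *commit_goal > flushing))
        return next_step::kFlushMore;     // some goal still needs a new flush
    if (!flush_goal && !commit_goal)
        return next_step::kIdle;          // clear_running()
    return next_step::kWait;              // unreachable in fcsm (assert(0))
}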
*/ - if (!inode->is_commit_in_progress()) { - assert(ftgtq.empty() || ctgtq.empty()); - clear_running(); - } + assert(!inode->get_filecache()->is_flushing_in_progress()); + assert(!inode->is_commit_in_progress()); + + clear_running(); } else { + AZLogCrit("Should not reach here"); assert(0); - AZLogDebug("SHould not reach here"); } inode->flush_unlock(); From d31198fb6fe6c875e1a814af3bacdce72a9ac885 Mon Sep 17 00:00:00 2001 From: Nagendra Tomar Date: Sun, 9 Feb 2025 11:33:28 +0000 Subject: [PATCH 3/9] Original version of rpc_task.h rpc_task.cpp --- turbonfs/inc/rpc_task.h | 1 + turbonfs/src/rpc_task.cpp | 361 +++++++++++++++++++++++--------------- 2 files changed, 224 insertions(+), 138 deletions(-) diff --git a/turbonfs/inc/rpc_task.h b/turbonfs/inc/rpc_task.h index 2468d60d..94f09d18 100644 --- a/turbonfs/inc/rpc_task.h +++ b/turbonfs/inc/rpc_task.h @@ -2428,6 +2428,7 @@ struct rpc_task */ bool add_bc(const bytes_chunk& bc); void issue_write_rpc(); + void issue_commit_rpc(); #ifdef ENABLE_NO_FUSE /* diff --git a/turbonfs/src/rpc_task.cpp b/turbonfs/src/rpc_task.cpp index 34a79715..f7e5b3ef 100644 --- a/turbonfs/src/rpc_task.cpp +++ b/turbonfs/src/rpc_task.cpp @@ -828,6 +828,212 @@ void access_callback( } } +static void commit_callback( + struct rpc_context *rpc, + int rpc_status, + void *data, + void *private_data) +{ + rpc_task *task = (rpc_task*) private_data; + + /* + * Commit is issued as a FUSE_FLUSH. + * TODO: Maybe it should have a type of its own. + */ + assert(task->magic == RPC_TASK_MAGIC); + assert(task->get_op_type() == FUSE_FLUSH); + assert(task->rpc_api->pvt != nullptr); + // Commit is never called for a fuse request. + assert(task->get_fuse_req() == nullptr); + + auto res = (COMMIT3res*) data; + + INJECT_JUKEBOX(res, task); + + const fuse_ino_t ino = task->rpc_api->flush_task.get_ino(); + struct nfs_inode *inode = + task->get_client()->get_nfs_inode_from_ino(ino); + // List of bcs committed by this commit call. + auto bc_vec_ptr = (std::vector *) task->rpc_api->pvt; + + // We should call commit only when inode is doing unstable+commit. + assert(!inode->is_stable_write()); + // Caller must be inprogress. + assert(inode->is_commit_in_progress()); + assert(!inode->get_filecache()->is_flushing_in_progress()); + + const int status = task->status(rpc_status, NFS_STATUS(res)); + UPDATE_INODE_WCC(inode, res->COMMIT3res_u.resok.file_wcc); + FC_CB_TRACKER fccbt(inode); + + AZLogDebug("[{}] commit_callback", ino); + + uint64_t commit_bytes = 0; + /* + * Now that the request has completed, we can query libnfs for the + * dispatch time. + */ + task->get_stats().on_rpc_complete(rpc_get_pdu(rpc), NFS_STATUS(res)); + + if (status == 0) { + uint64_t offset = 0; + uint64_t length = 0; + + /* + * Go over all the successfully committed bcs and release them from + * file cache (note that successful commit confirms that server has + * persisted the data and client can fee it). + * Also complete any tasks waiting for these membufs to be committed. + */ + for (auto &bc : *bc_vec_ptr) { + struct membuf *mb = bc.get_membuf(); + assert(mb->is_inuse()); + assert(mb->is_locked()); + assert(mb->is_commit_pending()); + assert(mb->is_uptodate()); + // Dirty membufs must not be committed. + assert(!mb->is_dirty()); + + mb->clear_commit_pending(); + mb->clear_locked(); + mb->clear_inuse(); + + /** + * Collect the contiguous range of bc's to release. 
+             */
+            if (offset == 0 && length == 0) {
+                offset = bc.offset;
+                length = bc.length;
+            } else if (offset + length == bc.offset) {
+                length += bc.length;
+            } else {
+                commit_bytes += length;
+                const uint64_t released =
+                    inode->get_filecache()->release(offset, length);
+                AZLogInfo("[{}] commit_callback releasing bc [{}, {}), "
+                          "released {} bytes",
+                          ino, offset, offset+length, released);
+                offset = bc.offset;
+                length = bc.length;
+            }
+        }
+
+        // Release the last bc not released by the above loop.
+        if (length != 0) {
+            commit_bytes += length;
+            const uint64_t released =
+                inode->get_filecache()->release(offset, length);
+            AZLogInfo("[{}] commit_callback releasing bc [{}, {}), "
+                      "released {} bytes",
+                      ino, offset, offset+length, released);
+        }
+
+    } else if (NFS_STATUS(res) == NFS3ERR_JUKEBOX) {
+        task->get_client()->jukebox_retry(task);
+        return;
+    } else {
+        /*
+         * Commit has failed.
+         * Go over all the bcs that this commit was targeting, mark the
+         * membufs back as dirty and clear the commit_pending flag.
+         * Next write will initiate the flush again with stable write.
+         */
+        for (auto &bc : *bc_vec_ptr) {
+            struct membuf *mb = bc.get_membuf();
+            assert(mb->is_inuse());
+            assert(mb->is_locked());
+            assert(mb->is_commit_pending());
+            assert(mb->is_uptodate());
+            assert(!mb->is_dirty());
+
+            mb->clear_commit_pending();
+            mb->set_dirty();
+            mb->clear_locked();
+            mb->clear_inuse();
+        }
+        /*
+         * Set the inode to stable write, so that next write will initiate
+         * the flush again with stable write.
+         * There should be no flush in progress as this moment.
+         */
+        assert(inode->get_filecache()->is_flushing_in_progress() == false);
+        inode->set_stable_write();
+    }
+
+    /*
+     * Clear the commit in progress flag.
+     */
+    inode->clear_commit_in_progress();
+
+    if (status == 0) {
+        assert(commit_bytes > 0);
+
+        // Update the commit bytes in the inode.
+        inode->get_fcsm()->on_commit_complete(commit_bytes);
+    } else {
+        assert(0);
+    }
+
+    delete bc_vec_ptr;
+    task->rpc_api->pvt = nullptr;
+    task->free_rpc_task();
+}
+
+void rpc_task::issue_commit_rpc()
+{
+    // Must only be called for a flush task.
+    assert(get_op_type() == FUSE_FLUSH);
+    // Commit is never called for a fuse request.
+    assert(get_fuse_req() == nullptr);
+
+    const fuse_ino_t ino = rpc_api->flush_task.get_ino();
+    struct nfs_inode *inode = get_client()->get_nfs_inode_from_ino(ino);
+
+    // Commit must not be called if we are doing stable writes.
+    assert(!inode->is_stable_write());
+    // Commit must not be sent with ongoing flushes.
+    assert(!inode->get_filecache()->is_flushing_in_progress());
+    // Caller must have marked commit inprogress.
+    assert(inode->is_commit_in_progress());
+
+    COMMIT3args args;
+    ::memset(&args, 0, sizeof(args));
+    bool rpc_retry = false;
+
+    AZLogDebug("[{}] issue_commit_rpc", ino);
+
+    /*
+     * Get the bcs marked for commit_pending.
+     */
+    assert(rpc_api->pvt != nullptr);
+
+    args.file = inode->get_fh();
+    args.offset = 0;
+    args.count = 0;
+
+    do {
+        rpc_retry = false;
+        stats.on_rpc_issue();
+
+        if (rpc_nfs3_commit_task(get_rpc_ctx(),
+                                 commit_callback, &args, this) == NULL) {
+            stats.on_rpc_cancel();
+            /*
+             * Most common reason for this is memory allocation failure,
+             * hence wait for some time before retrying. Also block the
+             * current thread as we really want to slow down things.
+             *
+             * TODO: For soft mount should we fail this?
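For reference, the contiguous-range release loop earlier in commit_callback() coalesces adjacent byte ranges so release() is called once per contiguous run rather than once per bc. The same pattern in isolation; chunk and the release callback are simplified stand-ins for bytes_chunk and bytes_chunk_cache::release(), and this sketch uses len == 0 as the first-iteration sentinel where the code above uses offset == 0 && length == 0:

#include <cstdint>
#include <vector>

struct chunk { uint64_t offset; uint64_t length; };

static uint64_t release_contiguous(const std::vector<chunk> &chunks,
                                   uint64_t (*release)(uint64_t off, uint64_t len))
{
    uint64_t released = 0, off = 0, len = 0;
    for (const chunk &c : chunks) {
        if (len == 0) {                      // first run
            off = c.offset;
            len = c.length;
        } else if (off + len == c.offset) {  // extends the current run
            len += c.length;
        } else {                             // gap: release the run, restart
            released += release(off, len);
            off = c.offset;
            len = c.length;
        }
    }
    if (len != 0) {
        released += release(off, len);       // trailing run
    }
    return released;
}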
+             */
+            rpc_retry = true;
+
+            AZLogWarn("rpc_nfs3_commit_task failed to issue, retrying "
+                      "after 5 secs!");
+            ::sleep(5);
+        }
+    } while (rpc_retry);
+}
+
 /**
  * Must be called when bytes_completed bytes are successfully read/written.
  */
@@ -1085,7 +1291,7 @@ static void write_iov_callback(
                    bciov->orig_offset + bciov->orig_length);
 
         // Update bciov after the current write.
-        bciov->on_io_complete(res->WRITE3res_u.resok.count);
+        bciov->on_io_complete(res->WRITE3res_u.resok.count, !inode->is_stable_write());
 
         // Create a new write_task for the remaining bc_iovec.
         struct rpc_task *write_task =
@@ -1122,7 +1328,7 @@ static void write_iov_callback(
         return;
     } else {
         // Complete bc_iovec IO completed.
-        bciov->on_io_complete(res->WRITE3res_u.resok.count);
+        bciov->on_io_complete(res->WRITE3res_u.resok.count, !inode->is_stable_write());
 
         // Complete data writen to blob.
         AZLogDebug("[{}] <{}> Completed write, off: {}, len: {}",
@@ -1205,6 +1411,8 @@ static void write_iov_callback(
      */
     if (inode->get_write_error() == 0) {
         inode->get_fcsm()->on_flush_complete(bciov->orig_length);
+    } else {
+        assert(0);
     }
 
     delete bciov;
@@ -1278,7 +1486,16 @@ void rpc_task::issue_write_rpc()
     args.file = inode->get_fh();
     args.offset = offset;
     args.count = length;
-    args.stable = FILE_SYNC;
+    args.stable = inode->is_stable_write() ? FILE_SYNC : UNSTABLE;
+
+    /*
+     * Unstable writes want to make use of multiple connections for better
+     * perf. Note that they aren't affected by the optimistic concurrency
+     * backoff issues seen with stable writes.
+     */
+    if (!inode->is_stable_write()) {
+        set_csched(CONN_SCHED_RR);
+    }
 
     do {
         rpc_retry = false;
@@ -2339,7 +2556,6 @@ void rpc_task::run_write()
     const size_t length = rpc_api->write_task.get_size();
     struct fuse_bufvec *const bufv = rpc_api->write_task.get_buffer_vector();
     const off_t offset = rpc_api->write_task.get_offset();
-    const bool sparse_write = (offset > inode->get_file_size(true));
     uint64_t extent_left = 0;
     uint64_t extent_right = 0;
 
@@ -2429,140 +2645,8 @@ void rpc_task::run_write()
 
     assert(extent_right >= (extent_left + length));
 
-    /*
-     * We have successfully copied the user data into the cache.
-     * We can have the following cases:
-     * 1. This new write caused a contiguous extent to be greater than
-     *    max_dirty_extent_bytes(), aka MDEB.
-     *    In this case we begin flush of this contiguous extent (in multiple
-     *    parallel wsize sized blocks) since there's no benefit in waiting more
-     *    as the data is sufficient for the server scheduler to effectively
-     *    write, in optimal sized blocks.
-     *    We complete the application write rightaway without waiting for the
-     *    flush to complete as we are not under memory pressure.
-     *    This will happen when user is sequentially writing to the file.
-     * 2. A single contiguous extent is not greater than MDEB but total dirty
-     *    bytes waiting to be flushed is more than MDEB.
-     *    In this case we begin flush of the entire dirty data from the cache
-     *    as we have sufficient data for the server scheduler to perform
-     *    batched writes effectively.
-     *    We complete the application write rightaway without waiting for the
-     *    flush to complete as we are not under memory pressure.
-     *    This will happen when user is writing to the file in a random or
-     *    pseudo-sequential fashion.
-     * 3. Cache has dirty data beyond the "inline write" threshold, ref
-     *    do_inline_write().
-     *    In this case we begin flush of the entire dirty data.
- * This is a memory pressure situation and hence we do not complete the - * application write till all the backend writes complete. - * This will happen when user is writing faster than our backend write - * throughput, eventually dirty data will grow beyond the "inline write" - * threshold and then we have to slow down the writers by delaying - * completion. - * - * Other than this we have a special case of "write beyond eof" (termed - * sparse write). In sparse write case also we perform "inline write" of - * all the dirty data. This is needed for correct read behaviour. Imagine - * a reader reading from the sparse part of the file which is not yet in - * the bytes_chunk_cache. This read will be issued to the server and since - * server doesn't know the updated file size (as the write may still be - * sitting in our bytes_chunk_cache) it will return eof. This is not correct - * as such reads issued after successful write, are valid and should return - * 0 bytes for the sparse range. - */ - - /* - * Do we need to perform "inline write"? - * Inline write implies, we flush all the dirty data and wait for all the - * corresponding backend writes to complete. - */ - const bool need_inline_write = - (sparse_write || inode->get_filecache()->do_inline_write()); - - if (need_inline_write) { - INC_GBL_STATS(inline_writes, 1); - - AZLogDebug("[{}] Inline write (sparse={}), {} bytes, extent @ [{}, {})", - ino, sparse_write, (extent_right - extent_left), - extent_left, extent_right); - - /* - * ensure_flush() will arrange to flush all the dirty data and complete - * the write task when the flush completes. - */ - inode->get_fcsm()->ensure_flush(offset, length, this); - - // Free up the fuse thread without completing the application write. - return; - } - - /* - * Ok, we don't need to do inline writes. See if we have enough dirty - * data and we need to start async flush. - */ - - /* - * If the extent size exceeds the max allowed dirty size as returned by - * max_dirty_extent_bytes(), then it's time to flush the extent. - * Note that this will cause sequential writes to be flushed at just the - * right intervals to optimize fewer write calls and also allowing the - * server scheduler to merge better. - * See bytes_to_flush for how random writes are flushed. - * - * Note: max_dirty_extent is static as it doesn't change after it's - * queried for the first time. - */ - static const uint64_t max_dirty_extent = - inode->get_filecache()->max_dirty_extent_bytes(); - assert(max_dirty_extent > 0); - - /* - * How many bytes in the cache need to be flushed. These are dirty chunks - * which have not started flushing yet. - */ - const uint64_t bytes_to_flush = - inode->get_filecache()->get_bytes_to_flush(); - - AZLogDebug("[{}] extent_left: {}, extent_right: {}, size: {}, " - "bytes_to_flush: {} (max_dirty_extent: {})", - ino, extent_left, extent_right, - (extent_right - extent_left), - bytes_to_flush, - max_dirty_extent); - - if ((extent_right - extent_left) < max_dirty_extent) { - /* - * Current extent is not large enough to be flushed, see if we have - * enough total dirty data that needs to be flushed. This is to cause - * random writes to be periodically flushed. - */ - if (bytes_to_flush < max_dirty_extent) { - /* - * No memory pressure. - */ - AZLogDebug("Reply write without syncing to Blob"); - reply_write(length); - return; - } - - /* - * This is the case of non-sequential writes causing enough dirty - * data to be accumulated, need to flush all of that. 
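The three cases spelled out in the removed comment above, plus the sparse-write special case, reduce to a small classification. A condensed sketch of that policy; in this patch series the equivalent decision moves into fcsm::run(), so the names and exact thresholds here are illustrative only:

#include <cstdint>

enum class write_policy {
    kInlineFlush,       // flush everything, ack the write only when done
    kAsyncFlushExtent,  // flush this extent, ack immediately
    kAsyncFlushAll,     // flush all dirty data, ack immediately
    kNoFlush            // just ack, not enough dirty data yet
};

static write_policy classify_write(bool sparse_write, bool inline_needed,
                                   uint64_t extent_bytes, uint64_t dirty_bytes,
                                   uint64_t mdeb /* max_dirty_extent_bytes() */)
{
    if (sparse_write || inline_needed)
        return write_policy::kInlineFlush;      // memory pressure / beyond-eof
    if (extent_bytes >= mdeb)
        return write_policy::kAsyncFlushExtent; // sequential writer hit MDEB
    if (dirty_bytes >= mdeb)
        return write_policy::kAsyncFlushAll;    // random writers accumulated MDEB
    return write_policy::kNoFlush;
}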
- */ - extent_left = 0; - extent_right = UINT64_MAX; - } - - /* - * Queue a non-blocking flush target for flushing all the dirty data. - */ - inode->get_fcsm()->ensure_flush(offset, length, nullptr); - - /* - * Complete the write request without waiting for the backend flush to - * complete. - */ - reply_write(length); + // Run this task through flush commit state machine + inode->get_fcsm()->run(this, extent_left, extent_right); } void rpc_task::run_flush() @@ -2575,6 +2659,7 @@ void rpc_task::run_flush() * to wait. Waiting in libnfs thread context will cause deadlocks. */ assert(get_fuse_req() != nullptr); + reply_error(inode->flush_cache_and_wait()); } From f11346f145029c3383f14b431c0bbd02343f4d67 Mon Sep 17 00:00:00 2001 From: Nagendra Tomar Date: Mon, 10 Feb 2025 00:25:40 +0000 Subject: [PATCH 4/9] Audited rpc_stats.cpp --- turbonfs/inc/file_cache.h | 5 ++ turbonfs/src/file_cache.cpp | 8 +++ turbonfs/src/rpc_task.cpp | 109 ++++++++++++++++++++++++++++-------- 3 files changed, 98 insertions(+), 24 deletions(-) diff --git a/turbonfs/inc/file_cache.h b/turbonfs/inc/file_cache.h index 7b07ced8..e408d00a 100644 --- a/turbonfs/inc/file_cache.h +++ b/turbonfs/inc/file_cache.h @@ -694,6 +694,11 @@ struct bytes_chunk */ bool is_new = false; + /** + * Get the inode corresponding to this bc. + */ + struct nfs_inode *get_inode() const; + /** * Return membuf corresponding to this bytes_chunk. * This will be used by caller to synchronize operations on the membuf. diff --git a/turbonfs/src/file_cache.cpp b/turbonfs/src/file_cache.cpp index 46040037..ab78ac22 100644 --- a/turbonfs/src/file_cache.cpp +++ b/turbonfs/src/file_cache.cpp @@ -923,6 +923,14 @@ bytes_chunk::bytes_chunk(bytes_chunk_cache *_bcc, assert(get_buffer() != nullptr); } +/** + * Get the inode corresponding to this bc. + */ +struct nfs_inode *bytes_chunk::get_inode() const +{ + return bcc->get_inode(); +} + bytes_chunk_cache::bytes_chunk_cache(struct nfs_inode *_inode, const char *_backing_file_name) : inode(_inode), diff --git a/turbonfs/src/rpc_task.cpp b/turbonfs/src/rpc_task.cpp index f7e5b3ef..335a7503 100644 --- a/turbonfs/src/rpc_task.cpp +++ b/turbonfs/src/rpc_task.cpp @@ -839,9 +839,12 @@ static void commit_callback( /* * Commit is issued as a FUSE_FLUSH. * TODO: Maybe it should have a type of its own. + * + * Note that flush/write is issued as FUSE_WRITE. */ assert(task->magic == RPC_TASK_MAGIC); assert(task->get_op_type() == FUSE_FLUSH); + // rpc_api->pvt must contain the list of bcs which were committed. assert(task->rpc_api->pvt != nullptr); // Commit is never called for a fuse request. assert(task->get_fuse_req() == nullptr); @@ -858,16 +861,34 @@ static void commit_callback( // We should call commit only when inode is doing unstable+commit. assert(!inode->is_stable_write()); - // Caller must be inprogress. + // Inode must be marked "commit in progress". assert(inode->is_commit_in_progress()); + // Commit and flush/write are exclusive. assert(!inode->get_filecache()->is_flushing_in_progress()); const int status = task->status(rpc_status, NFS_STATUS(res)); UPDATE_INODE_WCC(inode, res->COMMIT3res_u.resok.file_wcc); + + // Set "in commit callback". 
     FC_CB_TRACKER fccbt(inode);
 
-    AZLogDebug("[{}] commit_callback", ino);
+    AZLogDebug("[{}] commit_callback, number of bc committed: {}",
+               ino, bc_vec_ptr->size());
 
+#ifdef ENABLE_PARANOID
+    /*
+     * We always pick *all* commit-pending bcs for committing, and while commit
+     * is going on no other flush can run, hence new commit pending bcs cannot
+     * be added, so when an ongoing commit completes we must not have any
+     * commit pending bcs in the cache.
+     */
+    {
+        std::vector bc_vec =
+            inode->get_filecache()->get_commit_pending_bcs();
+        assert(bc_vec.empty());
+    }
+#endif
+
     uint64_t commit_bytes = 0;
     /*
      * Now that the request has completed, we can query libnfs for the
      * dispatch time.
      */
     task->get_stats().on_rpc_complete(rpc_get_pdu(rpc), NFS_STATUS(res));
 
@@ -885,7 +906,7 @@ static void commit_callback(
          * persisted the data and client can fee it).
          * Also complete any tasks waiting for these membufs to be committed.
          */
-        for (auto &bc : *bc_vec_ptr) {
+        for (auto& bc : *bc_vec_ptr) {
             struct membuf *mb = bc.get_membuf();
             assert(mb->is_inuse());
             assert(mb->is_locked());
@@ -894,18 +915,25 @@ static void commit_callback(
             // Dirty membufs must not be committed.
             assert(!mb->is_dirty());
 
+            // We should not have a zero length bc.
+            assert(bc.length > 0);
+            assert(bc.length <= AZNFSC_MAX_CHUNK_SIZE);
+            assert((bc.offset + bc.length) <= AZNFSC_MAX_FILE_SIZE);
+
             mb->clear_commit_pending();
             mb->clear_locked();
             mb->clear_inuse();
 
             /**
-             * Collect the contiguous range of bc's to release.
+             * Release committed data from file cache, one contiguous range at
+             * a time.
              */
             if (offset == 0 && length == 0) {
                 offset = bc.offset;
                 length = bc.length;
             } else if (offset + length == bc.offset) {
                 length += bc.length;
+                assert(length <= AZNFSC_MAX_FILE_SIZE);
             } else {
                 commit_bytes += length;
                 const uint64_t released =
@@ -927,7 +955,6 @@ static void commit_callback(
                       "released {} bytes",
                       ino, offset, offset+length, released);
         }
-
     } else if (NFS_STATUS(res) == NFS3ERR_JUKEBOX) {
         task->get_client()->jukebox_retry(task);
         return;
@@ -938,12 +965,16 @@ static void commit_callback(
          * membufs back as dirty and clear the commit_pending flag.
          * Next write will initiate the flush again with stable write.
          */
-        for (auto &bc : *bc_vec_ptr) {
+        for (auto& bc : *bc_vec_ptr) {
             struct membuf *mb = bc.get_membuf();
             assert(mb->is_inuse());
             assert(mb->is_locked());
             assert(mb->is_commit_pending());
             assert(mb->is_uptodate());
+            /*
+             * TODO: What happens if application writes over these
+             *       commit_pending bcs?
+             */
             assert(!mb->is_dirty());
 
             mb->clear_commit_pending();
@@ -951,30 +982,37 @@ static void commit_callback(
             mb->clear_locked();
             mb->clear_inuse();
         }
+
         /*
          * Set the inode to stable write, so that next write will initiate
          * the flush again with stable write.
-         * There should be no flush in progress as this moment.
+         * There should be no flush in progress at this moment, also since
+         * we always commit *all* the commit pending bcs, there should not
+         * be any more commit pending bcs, so we can safely just enable
+         * stable writes for the inode w/o the elaborate
+         * switch_to_stable_write().
          */
         assert(inode->get_filecache()->is_flushing_in_progress() == false);
         inode->set_stable_write();
     }
 
-    /*
-     * Clear the commit in progress flag.
-     */
-    inode->clear_commit_in_progress();
-
     if (status == 0) {
         assert(commit_bytes > 0);
-
-        // Update the commit bytes in the inode.
+        /*
+         * Update the commit bytes in the inode.
+         * This will also clear the commit-in-progress flag in the inode as
+         * the very first thing.
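To summarize the failure path audited above: a failed COMMIT does not lose data, because the bytes are still present in the file cache. A minimal model of that recovery contract; the flag names mirror the membuf API used above, but the struct itself is a simplified stand-in:

#include <cassert>

struct membuf_model {
    bool dirty = false;
    bool commit_pending = true;   // flushed as UNSTABLE, awaiting COMMIT
};

static void on_commit_failed(membuf_model &mb, bool &inode_stable_write)
{
    assert(mb.commit_pending && !mb.dirty);
    mb.commit_pending = false;
    mb.dirty = true;              // next flush retries the data
    inode_stable_write = true;    // one-way switch: all future writes are
                                  // FILE_SYNC and need no commit
}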
+ */ inode->get_fcsm()->on_commit_complete(commit_bytes); } else { + // TODO: Add fcsm::on_commit_fail() and call it from here. + // Clear the commit in progress flag. + inode->clear_commit_in_progress(); assert(0); } delete bc_vec_ptr; + task->rpc_api->pvt = nullptr; task->free_rpc_task(); } @@ -996,17 +1034,30 @@ void rpc_task::issue_commit_rpc() // Caller must have marked commit inprogress. assert(inode->is_commit_in_progress()); + // List of bcs to be committed by this commit call. + assert(rpc_api->pvt != nullptr); + auto bc_vec_ptr = (std::vector *) rpc_api->pvt; + + assert(bc_vec_ptr->size() > 0); + AZLogDebug("[{}] issue_commit_rpc, num bc: {}", ino, bc_vec_ptr->size()); + +#ifdef ENABLE_PARANOID + // Make sure bcs being committed have correct state. + for (auto& bc : *bc_vec_ptr) { + struct membuf *mb = bc.get_membuf(); + assert(mb->is_inuse()); + assert(mb->is_locked()); + assert(mb->is_commit_pending()); + assert(mb->is_uptodate()); + // Dirty membufs must not be committed. + assert(!mb->is_dirty()); + } +#endif + COMMIT3args args; ::memset(&args, 0, sizeof(args)); bool rpc_retry = false; - AZLogDebug("[{}] issue_commit_rpc", ino); - - /* - * Get the bcs marked for commit_pending. - */ - assert(rpc_api->pvt != nullptr); - args.file = inode->get_fh(); args.offset = 0; args.count = 0; @@ -1059,6 +1110,9 @@ void bc_iovec::on_io_complete(uint64_t bytes_completed, bool is_unstable_write) assert(!bcq.empty()); struct bytes_chunk& bc = bcq.front(); + // Caller must not send incorrect value for unstable write. + assert(bc.get_inode()->is_stable_write() == !is_unstable_write); + /* * Absolute offset and length represented by this bytes_chunk. * bc->pvt is adjusted as partial reads/writes complete part of @@ -1291,7 +1345,8 @@ static void write_iov_callback( bciov->orig_offset + bciov->orig_length); // Update bciov after the current write. - bciov->on_io_complete(res->WRITE3res_u.resok.count, !inode->is_stable_write()); + bciov->on_io_complete(res->WRITE3res_u.resok.count, + !inode->is_stable_write()); // Create a new write_task for the remaining bc_iovec. struct rpc_task *write_task = @@ -1328,7 +1383,8 @@ static void write_iov_callback( return; } else { // Complete bc_iovec IO completed. - bciov->on_io_complete(res->WRITE3res_u.resok.count, !inode->is_stable_write()); + bciov->on_io_complete(res->WRITE3res_u.resok.count, + !inode->is_stable_write()); // Complete data writen to blob. AZLogDebug("[{}] <{}> Completed write, off: {}, len: {}", @@ -1412,6 +1468,7 @@ static void write_iov_callback( if (inode->get_write_error() == 0) { inode->get_fcsm()->on_flush_complete(bciov->orig_length); } else { + // TODO: Add fcsm::on_flush_fail() and call it from here. assert(0); } @@ -2645,7 +2702,11 @@ void rpc_task::run_write() assert(extent_right >= (extent_left + length)); - // Run this task through flush commit state machine + /* + * copy_to_cache() has added some more dirty data to the file cache, + * let FCSM know about it. It may want to flush and/or commit if we are + * above some configured limit. 
+ */ inode->get_fcsm()->run(this, extent_left, extent_right); } From baf9bb537cd36a6d47dddfa49dce74a674b913c4 Mon Sep 17 00:00:00 2001 From: Nagendra Tomar Date: Mon, 10 Feb 2025 00:31:48 +0000 Subject: [PATCH 5/9] Original version of nfs_inode.cpp --- turbonfs/src/nfs_inode.cpp | 444 +++++++++++++++++++++++++++++++------ 1 file changed, 382 insertions(+), 62 deletions(-) diff --git a/turbonfs/src/nfs_inode.cpp b/turbonfs/src/nfs_inode.cpp index 6c95de6e..55f5c3f9 100644 --- a/turbonfs/src/nfs_inode.cpp +++ b/turbonfs/src/nfs_inode.cpp @@ -34,7 +34,7 @@ nfs_inode::nfs_inode(const struct nfs_fh3 *filehandle, assert(write_error == 0); // TODO: Revert this to false once commit changes integrated. - assert(stable_write == true); + assert(stable_write == false); assert(commit_state == commit_state_t::COMMIT_NOT_NEEDED); #ifndef ENABLE_NON_AZURE_NFS @@ -380,6 +380,175 @@ int nfs_inode::get_actimeo_max() const } } +/* + * Caller should hold flush_lock(). + */ +void nfs_inode::wait_for_ongoing_commit() +{ + /* + * TODO: See if we can eliminate inline sleep. + */ + if (is_commit_in_progress()) { + AZLogWarn("[{}] wait_for_ongoing_commit() will sleep inline!!", + get_fuse_ino()); + } + + while (is_commit_in_progress()) { + assert(!get_filecache()->is_flushing_in_progress()); + ::usleep(1000); + } + + assert(!is_commit_in_progress()); +} + +/* + * This function is called with flush_lock() held. + * This should be called whenever we figure out that we cannot proceed with + * unstable writes (most common reason being, next write is not an append + * write). Once this function returns, following is guaranteed: + * - There will be no flushes in progress. + * - There will be no commit_pending data and no commit inprogress. + * - inode->stable_write will be set to true. + */ +void nfs_inode::switch_to_stable_write() +{ + assert(is_flushing); + assert(!is_stable_write()); + + AZLogInfo("[{}] Switching to stable write", ino); + + /* + * switch_to_stable_write() is called from places where we are about to + * start a flush operation. Before that we check to see if we need to + * change to stable write. Since we are not flushing yet and since we + * do not support multiple ongoing flushes, we are guaranteed that no + * flush should be in progress when we reach here. + * Similarly commit should not be in progress. + */ + assert(!is_commit_in_progress()); + assert(!get_filecache()->is_flushing_in_progress()); + + /* + * Check if there is anything to commit, if not then + * switch to stable write. + */ + if (get_filecache()->get_bytes_to_commit() == 0) { + AZLogDebug("[{}] Nothing to commit, switching to stable write", ino); + set_stable_write(); + + /* + * Now we moved to stable write, cleanup the commit target queue. + */ + get_fcsm()->ctgtq_cleanup(); + return; + } + + uint64_t bytes; + std::vector bc_vec = + get_filecache()->get_commit_pending_bcs(&bytes); + assert(bc_vec.empty() == (bytes == 0)); + assert(bytes > 0); + + /* + * Issue the commit RPC to commit the pending data. + */ + commit_membufs(bc_vec); + + /* + * Wait for the commit to complete. + */ + wait_for_ongoing_commit(); + + assert(get_filecache()->get_bytes_to_commit() == 0); + assert(!get_filecache()->is_flushing_in_progress()); + assert(!is_commit_in_progress()); + + set_stable_write(); + + /* + * Now we moved to stable write, cleanup the commit target queue. + */ + get_fcsm()->ctgtq_cleanup(); + return; +} + +/* + * This function checks, whether switch to stable write or not. 
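switch_to_stable_write() above relies on wait_for_ongoing_commit(), whose inline 1 ms polling sleep is flagged with a TODO. One conventional alternative, sketched with a condition variable; this is not what the patch does, and the members shown here are hypothetical:

#include <condition_variable>
#include <mutex>

struct commit_gate {
    std::mutex mtx;
    std::condition_variable cv;
    bool commit_in_progress = false;

    void wait_for_ongoing_commit() {
        std::unique_lock<std::mutex> lk(mtx);
        cv.wait(lk, [this] { return !commit_in_progress; });
    }

    void on_commit_complete() {
        {
            std::lock_guard<std::mutex> lg(mtx);
            commit_in_progress = false;
        }
        cv.notify_all();
    }
};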
+ */
+bool nfs_inode::check_stable_write_required(off_t offset)
+{
+    /*
+     * If stable_write is already set, we don't need to do anything.
+     * We don't need lock here as once stable_write is set it's never
+     * unset.
+     */
+    if (is_stable_write()) {
+        return false;
+    }
+
+    /*
+     * If the current write is not an append write, then we can't go for
+     * unstable writes. It may overwrite existing data, and since we don't
+     * have knowledge of the existing block list, it may require a
+     * read-modify-write. So, we can't go for unstable writes. Similarly, if
+     * the offset is beyond the current end of the file, we would need to
+     * write zero blocks between the current end of the file and the
+     * offset.
+     */
+    if (putblock_filesize != offset) {
+        AZLogInfo("Stable write required as putblock_filesize:{} is not at the"
+                  "offset:{}", putblock_filesize, offset);
+
+        return true;
+    }
+
+    return false;
+}
+
+/**
+ * commit_membufs() is called by writer thread to commit flushed membufs.
+ * It's always issued under flush_lock().
+ */
+void nfs_inode::commit_membufs(std::vector &bc_vec)
+{
+    assert(is_flushing);
+
+    set_commit_in_progress();
+
+    uint64_t prev_offset = 0;
+    for (bytes_chunk& bc : bc_vec) {
+        [[maybe_unused]] struct membuf *mb = bc.get_membuf();
+        assert(mb != nullptr);
+        assert(mb->is_inuse());
+        assert(mb->is_commit_pending());
+
+        if (prev_offset == 0) {
+            prev_offset = bc.offset + bc.length;
+        } else {
+            assert(prev_offset == bc.offset);
+            prev_offset += bc.length;
+        }
+
+        /*
+         * We have at least one commit to issue, mark fcsm as running,
+         * if not already marked.
+         */
+        get_fcsm()->mark_running();
+        get_fcsm()->add_committing(bc.length);
+    }
+
+    /*
+     * Create the commit task to carry out the commit.
+     */
+    struct rpc_task *commit_task =
+        get_client()->get_rpc_task_helper()->alloc_rpc_task(FUSE_FLUSH);
+    commit_task->init_flush(nullptr /* fuse_req */, ino);
+    assert(commit_task->rpc_api->pvt == nullptr);
+
+    commit_task->rpc_api->pvt = static_cast(new std::vector(bc_vec));
+
+    commit_task->issue_commit_rpc();
+}
+
 /**
  * Note: We dispatch WRITE RPCs as we gather full wsize sized data bytes,
  * while there may be more bcs that we have not yet processed. This means
@@ -394,6 +563,20 @@ void nfs_inode::sync_membufs(std::vector &bc_vec,
     // Caller must hold the flush_lock.
     assert(is_flushing);
 
+    if (!is_stable_write()) {
+        /*
+         * We do not allow a new flush while there's an ongoing one, in case
+         * of unstable writes.
+         */
+        assert(!get_filecache()->is_flushing_in_progress());
+    }
+
+    /*
+     * Stable writes won't have a commit, and for unstable writes we cannot
+     * flush while a commit is going on.
+     */
+    assert(!is_commit_in_progress());
+
     if (bc_vec.empty()) {
         return;
     }
@@ -422,6 +605,14 @@
         parent_task->num_ongoing_backend_writes = 1;
     }
 
+    /*
+     * If new offset is not at the end of the file,
+     * then we need to switch to stable write.
+     */
+    if (check_stable_write_required(bc_vec[0].offset)) {
+        switch_to_stable_write();
+    }
+
     /*
      * Create the flush task to carry out the write.
      */
@@ -564,6 +755,7 @@ void nfs_inode::sync_membufs(std::vector &bc_vec,
      * Once packed completely, then dispatch the write.
      */
     if (write_task->add_bc(bc)) {
+        putblock_filesize += bc.length;
         continue;
     } else {
         /*
@@ -589,6 +781,8 @@
     // Single bc addition should not fail.
[[maybe_unused]] bool res = write_task->add_bc(bc); assert(res == true); + + putblock_filesize += bc.length; } } @@ -845,9 +1039,25 @@ int nfs_inode::wait_for_ongoing_flush(uint64_t start_off, uint64_t end_off) * Flushing not in progress and no new flushing can be started as we hold * the flush_lock(), and callback drained. */ - if (!get_filecache()->is_flushing_in_progress() && - !get_fcsm()->fc_cb_running()) { - AZLogDebug("[{}] No flush in progress, returning", ino); + /* + * Stable writes do not need commit, so no commit inprogress and no pending + * commit data. + */ + if (is_stable_write()) { + assert(!is_commit_in_progress()); + assert(get_filecache()->bytes_commit_pending == 0); + } + + if (get_filecache()->is_flushing_in_progress()) { + assert(!is_commit_in_progress()); + } else if (!is_commit_in_progress() && !get_fcsm()->fc_cb_running() && + get_filecache()->bytes_commit_pending == 0) { + /* + * Flushing not in progress and no new flushing can be started as we hold + * the flush_lock(), and callback drained. + * No commit inprogress and no pending commit data, return. + */ + AZLogDebug("[{}] No flush or commit in progress, returning", ino); return 0; } @@ -979,14 +1189,57 @@ int nfs_inode::wait_for_ongoing_flush(uint64_t start_off, uint64_t end_off) assert(is_flushing); assert(!get_fcsm()->fc_cb_running()); - return err; + /* + * Unstable write case, we need to wait for the commit to complete. + */ + if (get_filecache()->get_bytes_to_commit() > 0) { + assert(!is_stable_write()); + + if (!is_commit_in_progress()) { + uint64_t bytes = 0; + std::vector bc_vec = + get_filecache()->get_commit_pending_bcs(&bytes); + assert(bc_vec.empty() == (bytes == 0)); + assert(bytes > 0); + + /* + * Issue the commit RPC to commit the pending data. + */ + commit_membufs(bc_vec); + } + } + + if (is_commit_in_progress()) { + wait_for_ongoing_commit(); + } + + return get_write_error(); } /** - * Note: This takes shared lock on ilock_1. + * flush_cache_and_wait() is called only from the release/flush call. + * It's called with flush_lock() held. + * Things it does: + * - Before flushing it needs to wait for any ongoing commit to complete. + * - First it try to get membufs through get_contiguous_dirty_bcs() or + * get_dirty_bc_range() based on the unstable write or stable write. + * - Issue flush for the dirty membufs and wait for the flush to complete. + * - If it's unstable write, it issue commit for commit pending membufs. + * - Wait for the commit to complete. + * - Returns the error code if any. + * + * Note: Flush_cache_and_wait() blocks the fuse thread till the flush completes. + * It's called from the release(), flush() and getattr() calls. It's ok + * as of now as it's not very often called. We can optimize to complete + * the flush in background and return immediately. For that we need to add + * special handling for the getattr() call. */ int nfs_inode::flush_cache_and_wait() { + assert(!is_stable_write() || + get_filecache()->bytes_commit_pending == 0); + assert(!is_stable_write() || !is_commit_in_progress()); + /* * MUST be called only for regular files. * Leave the assert to catch if fuse ever calls flush() on non-reg files. @@ -1016,85 +1269,147 @@ int nfs_inode::flush_cache_and_wait() } /* - * Grab the inode flush_lock to ensure that we don't initiate any new flush - * operation while some truncate call is in progress (which must have taken - * the flush_lock). 
- * Once flush_lock() returns we have the flush_lock and we are + * Grab the flush_lock to ensure that we don't initiate any new flushes + * while some truncate call is in progress (which must have taken the + * flush_lock). Once flush_lock() returns we have the lock and we are * guaranteed that no new truncate operation can start till we release * the flush_lock. We can safely start the flush then. */ flush_lock(); /* - * Get the dirty bytes_chunk from the filecache handle. - * This will grab an exclusive lock on the file cache and return the list - * of dirty bytes_chunks at that point. Note that we can have new dirty - * bytes_chunks created but we don't want to wait for those. + * If flush is in progress, wait for it to complete. + * It may happen that last flush is still in progress and we need to wait + * for it to complete before we start the new flush. + * As we have the flush_lock() we are guaranteed that no new flush can be + * started till we release the flush_lock(). + * + * wait_for_ongoing_flush() will wait for the ongoing flush/commt to + * complete. + * + * Note: For stable write case, we don't need to wait for the ongoing flush, + * as there is no commit required for stable write. */ - std::vector bc_vec = - filecache_handle->get_dirty_bc_range(); + if (!is_stable_write()) { + int retry, err = 0; + const int max_retry = 200; + for (retry = 0; retry < max_retry; retry++) { + assert(is_flushing); + err = wait_for_ongoing_flush(); + assert(is_flushing); + + if (!get_fcsm()->fc_cb_running()) { + break; + } + flush_unlock(); + + /* + * We issue commit as part of wait_for_ongoing_flush() + * with flush_lock() held. This cause on_commit_complete() + * not run completely. + * Give 10ms to the callback to drain completely. + */ + if (get_fcsm()->fc_cb_running()) { + ::usleep(10 * 1000); + } + + /* + * Lock again to check if something flushing again, + * and wait for it to complete in wait_for_ongoing_flush(). + * Repeat again to check if on_commit_complete() completed. + */ + flush_lock(); + } + + if (err != 0) { + flush_unlock(); + AZLogError("[{}] Failed to flush cache to stable storage, " + "error={}", ino, err); + return err; + } + } + assert(is_flushing); /* - * sync_membufs() iterate over the bc_vec and starts flushing the dirty - * membufs. It batches the contiguous dirty membufs and issues a single - * write RPC for them. + * For unstable write, If no bytes to flush, then return. + * For stable write, If no bytes to flush and no flush in + * progress, then return. As for stable write, we didn't + * wait for the flush to complete as above. + * + * TODO: Need to check getattr() call. As that can cause this + * assert to fail. */ - sync_membufs(bc_vec, true); - - flush_unlock(); + assert(!get_filecache()->is_flushing_in_progress() || + is_stable_write()); + assert(!is_commit_in_progress()); /* - * Our caller expects us to return only after the flush completes. - * Wait for all the membufs to flush and get result back. + * For stable case we didn't wait for ongoing flush to complete. + * And, it may be last flush initiated to BLOB, we need to wait + * for this flush to complete. Application not writing anymore + * get_bytes_to_flush() can return 0. 
*/ - for (bytes_chunk &bc : bc_vec) { - struct membuf *mb = bc.get_membuf(); + if (!get_filecache()->is_flushing_in_progress() && + get_filecache()->get_bytes_to_flush() == 0) { + assert(get_filecache()->bytes_dirty == 0); + assert(get_filecache()->bytes_flushing == 0); - assert(mb != nullptr); - assert(mb->is_inuse()); - mb->set_locked(); + AZLogDebug("[{}] Nothing to flush, returning", ino); + flush_unlock(); + return 0; + } - /* - * If still dirty after we get the lock, it may mean two things: - * - Write failed. - * - Some other thread got the lock before us and it made the - * membuf dirty again. - */ - if (mb->is_dirty() && get_write_error()) { - AZLogError("[{}] Flush [{}, {}) failed with error: {}", - ino, - bc.offset, bc.offset + bc.length, - get_write_error()); - } + /* + * Get the dirty bytes_chunk from the filecache handle. + * This will grab an exclusive lock on the file cache and return the list + * of dirty bytes_chunks at that point. + */ + std::vector bc_vec; + uint64_t size = 0; + if (!is_stable_write()) { + bc_vec = filecache_handle->get_contiguous_dirty_bcs(&size); - mb->clear_locked(); - mb->clear_inuse(); + if (size != get_filecache()->get_bytes_to_flush()) { + AZLogInfo("[{}] Flushed size: {} != bytes to flush: {}", + ino, size, get_filecache()->get_bytes_to_flush()); + AZLogInfo("SWITCHING TO STABLE WRITE"); - /* - * Release the bytes_chunk back to the filecache. - * These bytes_chunks are not needed anymore as the flush is done. - * - * Note: We come here for bytes_chunks which were found dirty by the - * above loop. These writes may or may not have been issued by - * us (if not issued by us it was because some other thread, - * mostly the writer issued the write so we found it flushing - * and hence didn't issue). In any case since we have an inuse - * count, release() called from write_callback() would not have - * released it, so we need to release it now. - */ - filecache_handle->release(bc.offset, bc.length); + /* + * This is rare and no-op as we already completed the pending + * flush and commit. + */ + switch_to_stable_write(); + for (auto& bc : bc_vec) { + bc.get_membuf()->clear_inuse(); + } + bc_vec = filecache_handle->get_dirty_bc_range(0, UINT64_MAX); + } + } else { + bc_vec = filecache_handle->get_dirty_bc_range(0, UINT64_MAX); } /* - * If the file is deleted while we still have data in the cache, don't - * treat it as a failure to flush. The file has gone and we don't really - * care about the unwritten data. + * sync_membufs() iterate over the bc_vec and starts flushing the dirty + * membufs. It batches the contiguous dirty membufs and issues a single + * write RPC for them. */ - if (get_write_error() == ENOENT || get_write_error() == ESTALE) { - return 0; + sync_membufs(bc_vec, false /* is_flush */); + + if (wait_for_ongoing_flush()) { + AZLogError("[{}] Failed to flush cache to stable storage, " + "error={}", ino, get_write_error()); + flush_unlock(); + return get_write_error(); } - return get_write_error(); + assert(!is_commit_in_progress()); + assert(!get_filecache()->is_flushing_in_progress()); + assert(get_filecache()->bytes_dirty == 0); + assert(get_filecache()->bytes_commit_pending == 0); + assert(get_filecache()->bytes_flushing == 0); + + flush_unlock(); + return 0; } void nfs_inode::flush_lock() const @@ -1152,6 +1467,11 @@ void nfs_inode::truncate_end(size_t size) const [[maybe_unused]] const uint64_t bytes_truncated = filecache_handle->truncate(size, true /* post */); + /* + * Update the in cache putblock_filesize to reflect the new size. 
+     */
+    putblock_filesize = size;
+
     AZLogDebug("[{}] Filecache truncated to size={} "
                "(bytes truncated: {})",
                ino, size, bytes_truncated);

From 8965daea2a42a9ccc84e1134b01452ee12c5e750 Mon Sep 17 00:00:00 2001
From: Nagendra Tomar
Date: Mon, 10 Feb 2025 01:03:36 +0000
Subject: [PATCH 6/9] Partially audited nfs_inode.cpp

---
 turbonfs/inc/nfs_inode.h   |  6 +-----
 turbonfs/src/fcsm.cpp      |  8 +++++++-
 turbonfs/src/nfs_inode.cpp | 42 +++++++++++++++++++++++++++---------
 turbonfs/src/rpc_task.cpp  |  2 ++
 4 files changed, 42 insertions(+), 16 deletions(-)

diff --git a/turbonfs/inc/nfs_inode.h b/turbonfs/inc/nfs_inode.h
index 6119272e..6a163357 100644
--- a/turbonfs/inc/nfs_inode.h
+++ b/turbonfs/inc/nfs_inode.h
@@ -1442,11 +1442,7 @@ struct nfs_inode
      * commit_membufs() is called to commit uncommitted membufs to the Blob.
      * It creates commit RPC and sends it to the NFS server.
      */
-    void commit_membufs(std::vector &bcs)
-    {
-        // TODO: Provide implementation in nfs_inode.cpp.
-        assert(0);
-    }
+    void commit_membufs(std::vector &bcs);
 
     /**
      * switch_to_stable_write() is called to switch the inode to stable write
diff --git a/turbonfs/src/fcsm.cpp b/turbonfs/src/fcsm.cpp
index 2481f748..05b52234 100644
--- a/turbonfs/src/fcsm.cpp
+++ b/turbonfs/src/fcsm.cpp
@@ -288,6 +288,7 @@ void fcsm::run(struct rpc_task *task,
 void fcsm::ctgtq_cleanup()
 {
     assert(inode->is_flushing);
+    // Must be called after switching the inode to stable write.
     assert(inode->is_stable_write());
 
     AZLogDebug("[FCSM][{}] ctgtq_cleanup()", inode->get_fuse_ino());
@@ -634,7 +635,9 @@ void fcsm::on_commit_complete(uint64_t commit_bytes)
 
     /*
      * Commit callback can be called only when commit is in progress, clear
-     * it now.
+     * it now. Must do it before grabbing the flush_lock, note that
+     * wait_for_ongoing_commit() is waiting for commit-in-progress to be
+     * cleared, with flush_lock held.
      */
     assert(inode->is_commit_in_progress());
     inode->clear_commit_in_progress();
@@ -642,6 +645,9 @@ void fcsm::on_commit_complete(uint64_t commit_bytes)
     // If commit is running, flush cannot be running.
     assert(!inode->get_filecache()->is_flushing_in_progress());
 
+    // commit_pending_bytes must be 0 here.
+    assert(inode->get_filecache()->get_bytes_to_commit() == 0);
+
     // a byte can only be committed after it's flushed successfully.
     assert(committing_seq_num <= flushed_seq_num);
     assert(committing_seq_num <= flushing_seq_num);
diff --git a/turbonfs/src/nfs_inode.cpp b/turbonfs/src/nfs_inode.cpp
index 55f5c3f9..c419e40c 100644
--- a/turbonfs/src/nfs_inode.cpp
+++ b/turbonfs/src/nfs_inode.cpp
@@ -33,7 +33,7 @@ nfs_inode::nfs_inode(const struct nfs_fh3 *filehandle,
     assert(client->magic == NFS_CLIENT_MAGIC);
     assert(write_error == 0);
 
-    // TODO: Revert this to false once commit changes integrated.
+    // We start doing unstable writes until proven otherwise.
     assert(stable_write == false);
     assert(commit_state == commit_state_t::COMMIT_NOT_NEEDED);
 
@@ -385,6 +385,8 @@ int nfs_inode::get_actimeo_max() const
  */
 void nfs_inode::wait_for_ongoing_commit()
 {
+    assert(is_flushing);
+
    /*
     * TODO: See if we can eliminate inline sleep.
     */
@@ -393,8 +395,15 @@ void nfs_inode::wait_for_ongoing_commit()
                   get_fuse_ino());
    }
 
+    int iter = 0;
    while (is_commit_in_progress()) {
+        // Flush and commit are mutually exclusive operations.
assert(!get_filecache()->is_flushing_in_progress()); + + if (++iter % 1000 == 0) { + AZLogWarn("[{}] wait_for_ongoing_commit() still waiting, iter: {}", + get_fuse_ino(), iter); + } ::usleep(1000); } @@ -423,14 +432,15 @@ void nfs_inode::switch_to_stable_write() * change to stable write. Since we are not flushing yet and since we * do not support multiple ongoing flushes, we are guaranteed that no * flush should be in progress when we reach here. - * Similarly commit should not be in progress. + * Similarly commit should not be in progress as flush and commit are + * mutually exclusive. */ assert(!is_commit_in_progress()); assert(!get_filecache()->is_flushing_in_progress()); /* - * Check if there is anything to commit, if not then - * switch to stable write. + * Check if there is anything to commit, if not then simply update the + * inode state to "doing stable writes". */ if (get_filecache()->get_bytes_to_commit() == 0) { AZLogDebug("[{}] Nothing to commit, switching to stable write", ino); @@ -473,10 +483,12 @@ void nfs_inode::switch_to_stable_write() } /* - * This function checks, whether switch to stable write or not. + * This function checks whether we need to switch to stable write or not. */ bool nfs_inode::check_stable_write_required(off_t offset) { + assert(offset <= (off_t) AZNFSC_MAX_FILE_SIZE); + /* * If stable_write is already set, we don't need to do anything. * We don't need lock here as once stable_write is set it's never @@ -495,8 +507,9 @@ bool nfs_inode::check_stable_write_required(off_t offset) * offset. */ if (putblock_filesize != offset) { - AZLogInfo("Stable write required as putblock_filesize:{} is not at the" - "offset:{}", putblock_filesize, offset); + AZLogInfo("[{}] Non-append write detected (expected: {}, got: {}), " + "will switch to stable writes", + get_fuse_ino(), putblock_filesize, offset); return true; } @@ -508,7 +521,7 @@ bool nfs_inode::check_stable_write_required(off_t offset) * commit_membufs() is called by writer thread to commit flushed membufs. * It's always issued under flush_lock(). */ -void nfs_inode::commit_membufs(std::vector &bc_vec) +void nfs_inode::commit_membufs(std::vector& bc_vec) { assert(is_flushing); @@ -524,6 +537,7 @@ void nfs_inode::commit_membufs(std::vector &bc_vec) if (prev_offset == 0) { prev_offset = bc.offset + bc.length; } else { + // Caller must pass us contiguous membufs for committing. assert(prev_offset == bc.offset); prev_offset += bc.length; } @@ -531,6 +545,8 @@ void nfs_inode::commit_membufs(std::vector &bc_vec) /* * We have at least one commit to issue, mark fcsm as running, * if not already marked. + * + * XXX Shouldn't it be running already? */ get_fcsm()->mark_running(); get_fcsm()->add_committing(bc.length); @@ -541,6 +557,7 @@ void nfs_inode::commit_membufs(std::vector &bc_vec) */ struct rpc_task *commit_task = get_client()->get_rpc_task_helper()->alloc_rpc_task(FUSE_FLUSH); + // XXX Do we need to every call flush with fuse_req? commit_task->init_flush(nullptr /* fuse_req */, ino); assert(commit_task->rpc_api->pvt == nullptr); @@ -606,8 +623,8 @@ void nfs_inode::sync_membufs(std::vector &bc_vec, } /* - * If new offset is not at the end of the file, - * then we need to switch to stable write. + * If the new data being written is not right after the last one written + * we need to switch to stable write. 
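A worked example of the append check above: putblock_filesize tracks the byte just past the last data handed to an unstable WRITE, so only an exact append keeps the inode in unstable mode. The helper below restates the check as a pure function (illustrative only):

#include <cstdint>

static bool needs_stable_write(uint64_t putblock_filesize, uint64_t offset)
{
    // offset <  putblock_filesize: overwrite, may need read-modify-write.
    // offset >  putblock_filesize: hole, would need zero-fill in between.
    // offset == putblock_filesize: pure append, unstable write is fine.
    return offset != putblock_filesize;
}

// E.g. after flushing [0, 4MiB): putblock_filesize == 4MiB, so a write at
// 4MiB stays unstable while a write at 1MiB or 6MiB forces stable mode.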
*/ if (check_stable_write_required(bc_vec[0].offset)) { switch_to_stable_write(); @@ -750,6 +767,11 @@ void nfs_inode::sync_membufs(std::vector &bc_vec, get_fcsm()->mark_running(); } + /* + * XXX Add an assert that unstable writes should only have contiguous + * bcs . + */ + /* * Add as many bytes_chunk to the write_task as it allows. * Once packed completely, then dispatch the write. diff --git a/turbonfs/src/rpc_task.cpp b/turbonfs/src/rpc_task.cpp index 335a7503..15194680 100644 --- a/turbonfs/src/rpc_task.cpp +++ b/turbonfs/src/rpc_task.cpp @@ -865,6 +865,8 @@ static void commit_callback( assert(inode->is_commit_in_progress()); // Commit and flush/write are exclusive. assert(!inode->get_filecache()->is_flushing_in_progress()); + // bytes_commit_pending will be reduced later below. + assert(inode->get_filecache()->get_bytes_to_commit() > 0); const int status = task->status(rpc_status, NFS_STATUS(res)); UPDATE_INODE_WCC(inode, res->COMMIT3res_u.resok.file_wcc); From 935a777738d9ee02aede337a46261b382b2c2a65 Mon Sep 17 00:00:00 2001 From: Nagendra Tomar Date: Mon, 10 Feb 2025 08:30:09 +0000 Subject: [PATCH 7/9] Joint audit Nitin and Tomar fcsm.h fcsm.cpp --- turbonfs/inc/fcsm.h | 6 +- turbonfs/inc/file_cache.h | 2 +- turbonfs/src/fcsm.cpp | 189 ++++++++++++++++++++++++++---------- turbonfs/src/file_cache.cpp | 3 +- turbonfs/src/rpc_task.cpp | 2 +- 5 files changed, 146 insertions(+), 56 deletions(-) diff --git a/turbonfs/inc/fcsm.h b/turbonfs/inc/fcsm.h index 4c40596b..aaae0080 100644 --- a/turbonfs/inc/fcsm.h +++ b/turbonfs/inc/fcsm.h @@ -294,8 +294,10 @@ class fcsm /* * Continually increasing seq number of the last byte successfully flushed * and committed. Flush-commit targets (fctgt) are expressed in terms of - * these. These are actually numerically one more than the last *ing and - * *ed byte's seq number. + * these. These are actually numerically one more than the last + * flushing/commiting and flushed/committed byte's seq number, e.g., if the + * last byte flushed was byte #1 (0 and 1 are flushed), then flushed_seq_num + * would be 2. * Note that these are *not* offsets in the file but these are cumulative * values for total bytes flushed/committed till now since the file was * opened. In case of overwrite same byte(s) may be repeatedly flushed diff --git a/turbonfs/inc/file_cache.h b/turbonfs/inc/file_cache.h index e408d00a..713cc412 100644 --- a/turbonfs/inc/file_cache.h +++ b/turbonfs/inc/file_cache.h @@ -1531,7 +1531,7 @@ class bytes_chunk_cache * e.g., * if the max_dirty_extent_bytes() is 1GB, then we have * flush_required() @ 1GB - * commit_required() @ 1GB + * commit_required() @ 2GB * do_inline_write() @ 4GB. * * Assuming backend flush speed of 1GB/s and memory write speed of diff --git a/turbonfs/src/fcsm.cpp b/turbonfs/src/fcsm.cpp index 05b52234..10abea60 100644 --- a/turbonfs/src/fcsm.cpp +++ b/turbonfs/src/fcsm.cpp @@ -83,14 +83,21 @@ void fcsm::add_flushing(uint64_t bytes) { assert(inode->is_flushing); assert(flushed_seq_num <= flushing_seq_num); + assert(committed_seq_num <= committing_seq_num); + assert(committing_seq_num <= flushed_seq_num); + flushing_seq_num += bytes; } void fcsm::add_committing(uint64_t bytes) { assert(inode->is_flushing); + // Must be called only for unstable writes. + assert(!inode->is_stable_write()); assert(committed_seq_num <= committing_seq_num); + committing_seq_num += bytes; + // We can only commit a byte that's flushed. 
     assert(flushed_seq_num <= flushing_seq_num);
     assert(committing_seq_num <= flushed_seq_num);
 
@@ -238,11 +245,13 @@ void fcsm::run(struct rpc_task *task,
     inode->flush_lock();
     /*
      * Commit will only start after all ongoing flush complete and no new
-     * flush can start (as have the flush_lock). This means no new
+     * flush can start (as we have the flush_lock). This means no new
      * commit-pending data can be added and hence the current inprogress
      * commit will finish committing all commit-pending bytes.
      */
     if (inode->is_commit_in_progress()) {
+        assert(!inode->get_filecache()->is_flushing_in_progress());
+
         AZLogDebug("[{}] Commit already in progress, skipping commit",
                    ino);
     } else {
@@ -331,6 +340,14 @@ void fcsm::ensure_commit(uint64_t write_off,
     assert(inode->is_flushing);
     assert(!inode->is_stable_write());
 
+    /*
+     * If any of the flush/commit targets are waiting completion, state machine
+     * must be running.
+     */
+    assert(is_running() || (ctgtq.empty() && ftgtq.empty()));
+    assert(is_running() || (flushed_seq_num == flushing_seq_num));
+    assert(is_running() || (committed_seq_num == committing_seq_num));
+
     AZLogDebug("[{}] [FCSM] ensure_commit<{}> write req [{}, {}], task: {}",
                inode->get_fuse_ino(),
                task ? "blocking" : "non-blocking",
@@ -358,7 +375,7 @@ void fcsm::ensure_commit(uint64_t write_off,
      * one full sized dirty extent.
      */
     uint64_t commit_bytes =
-        inode->get_filecache()->get_bytes_to_commit();;
+        inode->get_filecache()->get_bytes_to_commit();
 
     if (commit_bytes == 0) {
         /*
@@ -383,8 +400,14 @@ void fcsm::ensure_commit(uint64_t write_off,
 
     /*
      * What will be the committed_seq_num value after commit_bytes are committed?
+     * Since commit_pending_bytes can reduce while another thread is running
+     * commit completion in parallel, we may set target_committed_seq_num lower
+     * than the last queued commit_seq, so take the max.
      */
-    const uint64_t target_committed_seq_num = committed_seq_num + commit_bytes;
+    const uint64_t last_commit_seq =
+        !ctgtq.empty() ? ctgtq.front().commit_seq : 0;
+    const uint64_t target_committed_seq_num =
+        std::max((committed_seq_num + commit_bytes), last_commit_seq);
 
     /*
      * If the state machine is already running, we just need to add an
@@ -405,9 +428,6 @@ void fcsm::ensure_commit(uint64_t write_off,
                       0 /* target flush_seq */,
                       target_committed_seq_num /* target commit_seq */,
                       task);
-        assert(is_running());
-        assert(inode->is_flushing);
-        assert(!inode->is_stable_write());
         return;
     }
 
@@ -427,13 +447,19 @@ void fcsm::ensure_commit(uint64_t write_off,
         ensure_flush(task ? task->rpc_api->write_task.get_offset() : 0,
                      task ? task->rpc_api->write_task.get_size() : 0,
                      nullptr);
-        assert(is_running());
 
         /*
          * ensure_flush() must have scheduled flushing till
          * target_committed_seq_num.
+         *
+         * TODO: We need to ensure that ensure_flush() does indeed flush till
+         *       target_committed_seq_num, maybe convey it to ensure_flush().
         */
         assert(flushing_seq_num >= target_committed_seq_num);
 
+        /*
+         * ensure_flush()->sync_membufs() might have converted this inode to
+         * stable writes.
+         */
         if (!inode->is_stable_write()) {
             /**
              * Enqueue a commit target to be triggered once the flush completes.
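The goal computation above, restated as a pure function: the new commit goal must never regress below an already-queued target, because commit-pending bytes can shrink while a parallel commit completion is running. A sketch:

#include <algorithm>
#include <cstdint>

static uint64_t commit_goal(uint64_t committed_seq_num,
                            uint64_t commit_pending_bytes,
                            uint64_t last_queued_commit_seq /* 0 if ctgtq empty */)
{
    return std::max(committed_seq_num + commit_pending_bytes,
                    last_queued_commit_seq);
}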
@@ -443,13 +469,21 @@ void fcsm::ensure_commit(uint64_t write_off,
                           target_committed_seq_num /* target commit_seq */,
                           task);
         } else {
-            task->reply_write(task->rpc_api->write_task.get_size());
+            if (task) {
+                task->reply_write(task->rpc_api->write_task.get_size());
+            }
         }
 
         return;
     } else {
-        AZLogDebug("[{}] [FCSM] not running, schedule a new commit,"
-                   " flushed_seq_num: {}, "
+        /*
+         * No new data to flush for the current commit goal, just add a commit
+         * target and we are done.
+         * Since FCSM is not running and we discovered that we have one or more
+         * bytes to be committed, get_commit_pending_bcs() MUST return those.
+         */
+        AZLogDebug("[{}] [FCSM] not running, schedule a new commit, "
+                   "flushed_seq_num: {}, "
                    "target_committed_seq_num: {}",
                    inode->get_fuse_ino(),
                    flushed_seq_num.load(),
@@ -458,26 +492,18 @@ void fcsm::ensure_commit(uint64_t write_off,
         uint64_t bytes;
         std::vector<bytes_chunk> bc_vec =
             inode->get_filecache()->get_commit_pending_bcs(&bytes);
-        assert(bc_vec.empty() == (bytes == 0));
-
-        /*
-         * FCSM not running and nothing to commit, complete the task and return.
-         * It may happen due writes are changed to stable write.
-         */
-        if (bytes == 0) {
-            assert(inode->is_stable_write() || commit_bytes == 0);
-            if (task) {
-                AZLogDebug("[{}] [FCSM] not running and nothing to commit, "
-                           "completing fuse write rightaway",
-                           inode->get_fuse_ino());
-                task->reply_write(task->rpc_api->write_task.get_size());
-            }
-            return;
-        }
+        assert(!bc_vec.empty());
+        assert(bytes > 0);
 
+        // With FCSM not running, these should be same.
+        assert(committing_seq_num == committed_seq_num);
+
+        [[maybe_unused]]
+        const uint64_t prev_committing_seq_num = committing_seq_num;
         inode->commit_membufs(bc_vec);
         assert(is_running());
-        assert(committing_seq_num >= commit_bytes);
+
+        assert(committing_seq_num == (prev_committing_seq_num + bytes));
+        assert(committing_seq_num > committed_seq_num);
     }
 }
 
@@ -489,6 +515,13 @@ void fcsm::ensure_flush(uint64_t write_off,
                         struct rpc_task *task)
 {
     assert(inode->is_flushing);
+    /*
+     * If any of the flush/commit targets are awaiting completion, the state
+     * machine must be running.
+     */
+    assert(is_running() || (ctgtq.empty() && ftgtq.empty()));
+    assert(is_running() || (flushed_seq_num == flushing_seq_num));
+    assert(is_running() || (committed_seq_num == committing_seq_num));
 
     AZLogDebug("[{}] [FCSM] ensure_flush<{}> write req [{}, {}], task: {}",
                inode->get_fuse_ino(),
@@ -559,6 +592,9 @@ void fcsm::ensure_flush(uint64_t write_off,
     }
 #endif
 
+    /*
+     * If no task and no new flush target, don't add a dup target.
+     */
     if (!task && (target_flushed_seq_num == last_flush_seq)) {
         return;
     }
@@ -573,6 +609,17 @@ void fcsm::ensure_flush(uint64_t write_off,
     /*
      * FCSM not running.
     */
+    assert(flushed_seq_num == flushing_seq_num);
+    assert(target_flushed_seq_num >= flushing_seq_num);
+
+    // No new data to flush.
+    if (target_flushed_seq_num == flushed_seq_num) {
+        if (task) {
+            task->reply_write(task->rpc_api->write_task.get_size());
+        }
+        return;
+    }
+
     uint64_t bytes;
     std::vector<bytes_chunk> bc_vec;
 
@@ -596,13 +643,14 @@ void fcsm::ensure_flush(uint64_t write_off,
      * writes complete before sync_membufs() can return.
      * DO NOT access rpc_task after sync_membufs() call.
 */
-    AZLogDebug("[{}] [FCSM] kicking, flushing_seq_num now: {}",
+    AZLogDebug("[{}] [FCSM] kicking, flushing_seq_num now: {} "
+               "flushed_seq_num: {}",
                inode->get_fuse_ino(),
-               flushing_seq_num.load());
+               flushing_seq_num.load(),
+               flushed_seq_num.load());
 
-#ifndef NDEBUG
+    [[maybe_unused]]
     const uint64_t flushing_seq_num_before = flushing_seq_num;
-#endif
     assert(flushed_seq_num <= flushing_seq_num);
 
     // sync_membufs() will update flushing_seq_num() and mark fcsm running.
@@ -676,6 +724,14 @@ void fcsm::on_commit_complete(uint64_t commit_bytes)
 
     inode->flush_lock();
 
+    /*
+     * This can only come here with stable write true when
+     * switch_to_stable_write() was waiting for ongoing commits to complete
+     * and it went ahead and set inode stable write after we cleared the
+     * commit_in_progress above.
+     */
+    assert(!inode->is_stable_write() || ctgtq.empty());
+
     /*
      * Go over all queued commit targets to see if any can be completed after
      * the latest commit completed.
@@ -731,7 +787,7 @@ void fcsm::on_commit_complete(uint64_t commit_bytes)
      * If we have any commit target here it must have commit_seq greater
      * than committed_seq_num, else it would have been completed by the
      * above loop.
-     * If we have any flush target it must have flush_seq greate than
+     * If we have any flush target it must have flush_seq greater than
      * flushed_seq_num. This is because commit would have started after
      * the flush and we would have completed all eligible flush targets.
      */
@@ -740,22 +796,33 @@ void fcsm::on_commit_complete(uint64_t commit_bytes)
 
         uint64_t bytes;
         std::vector<bytes_chunk> bc_vec;
+
+        /*
+         * This means we are here after switch_to_stable_write() switched to
+         * stable, we need to handle that.
+         */
         if (inode->is_stable_write()) {
+            assert(ctgtq.empty());
             bc_vec = inode->get_filecache()->get_dirty_nonflushing_bcs_range(
                             0, UINT64_MAX, &bytes);
+            // Here bc_vec can be empty, sync_membufs() can handle that.
         } else {
             bc_vec = inode->get_filecache()->get_contiguous_dirty_bcs(&bytes);
+
+            /*
+             * Since we have a flush target asking more data to be flushed, we
+             * must have the corresponding bcs in the file cache.
+             *
+             * Note: We cannot have this assert for the stable write case, as
+             *       sync_membufs() that called switch_to_stable_write(), might
+             *       have consumed all these bcs and marked them flushing. When
+             *       we come here we won't find any dirty-and-not-flushing bcs.
+             */
+            assert(!bc_vec.empty());
+            assert(bytes > 0);
         }
 
-        /*
-         * Since we have a flush target asking more data to be flushed, we must
-         * have the corresponding bcs in the file cache.
-         */
-        assert(!bc_vec.empty());
-        assert(bytes > 0);
-
         // flushed_seq_num can never be more than flushing_seq_num.
         assert(flushed_seq_num <= flushing_seq_num);
 
@@ -783,6 +850,7 @@ void fcsm::on_commit_complete(uint64_t commit_bytes)
     assert(flushed_seq_num == committed_seq_num);
 
     assert(!inode->get_filecache()->is_flushing_in_progress());
+    assert(!inode->is_commit_in_progress());
 
     clear_running();
 }
@@ -807,8 +875,11 @@ void fcsm::on_flush_complete(uint64_t flush_bytes)
     // Must be called from flush/write callback.
     assert(fc_cb_running());
 
+    // See below why we cannot assert this.
+#if 0
     // Flush callback can only be called if FCSM is running.
-    assert(is_running);
+    assert(is_running());
+#endif
 
     /*
      * Commit will only be run after current flush completes.
@@ -856,13 +927,27 @@ void fcsm::on_flush_complete(uint64_t flush_bytes)
      * Multiple libnfs (callback) threads can find is_flushing_in_progress()
     * return false. The first one to get the flush_lock gets to run the
     * queued flush targets which includes completing the waiting tasks and/or
-     * trigger pending flush/commit.
+     * trigger pending flush/commit. Other flush callback threads that get
+     * the lock after the first one should simply return. They check for
+     * one of the following conditions to avoid duplicating work:
+     * 1. The first one didn't find anything to do, so it stopped the FCSM.
+     * 2. The first one triggered a flush target.
+     * 3. The first one triggered a commit target.
      */
-    if (inode->get_filecache()->is_flushing_in_progress()) {
+    if (inode->get_filecache()->is_flushing_in_progress() ||
+        inode->is_commit_in_progress() ||
+        !is_running()) {
+        assert(is_running() || ftgtq.empty());
         inode->flush_unlock();
         return;
     }
 
+    /*
+     * Entire flush is done and no new flush can start, so flushed_seq_num must
+     * match flushing_seq_num.
+     */
+    assert(flushed_seq_num == flushing_seq_num);
+
     /*
      * Go over all queued flush targets to see if any can be completed after
      * the latest flush completed.
@@ -960,19 +1045,21 @@ void fcsm::on_flush_complete(uint64_t flush_bytes)
             bc_vec = inode->get_filecache()->get_dirty_nonflushing_bcs_range(
                             0, UINT64_MAX, &bytes);
 
-            /*
-             * Since we have a flush target asking more data to be flushed, we must
-             * have the corresponding bcs in the file cache.
-             */
-            assert(!bc_vec.empty());
-            // We should flush all the dirty data in the chunkmap.
-            assert(bytes >= (ftgtq.front().flush_seq - flushing_seq_num));
         } else {
             bc_vec = inode->get_filecache()->get_contiguous_dirty_bcs(&bytes);
-            assert(!bc_vec.empty());
-            assert(bytes > 0);
         }
 
+        /*
+         * Since we have a flush target asking more data to be flushed, we must
+         * have the corresponding bcs in the file cache.
+         */
+        assert(!bc_vec.empty());
+        // We should flush all the dirty data in the chunkmap.
+        const uint64_t next_goal =
+            std::max((ftgtq.empty() ? 0 : ftgtq.front().flush_seq),
+                     (ctgtq.empty() ? 0 : ctgtq.front().commit_seq));
+        assert(bytes >= (next_goal - flushing_seq_num));
+
         // flushed_seq_num can never be more than flushing_seq_num.
         assert(flushed_seq_num <= flushing_seq_num);
 
diff --git a/turbonfs/src/file_cache.cpp b/turbonfs/src/file_cache.cpp
index ab78ac22..30c488c9 100644
--- a/turbonfs/src/file_cache.cpp
+++ b/turbonfs/src/file_cache.cpp
@@ -601,7 +601,8 @@ void membuf::clear_flushing()
     bcc->bytes_flushing_g -= length;
 
     AZLogDebug("Clear flushing membuf [{}, {}), fd={}",
-               offset.load(), offset.load()+length.load(), backing_file_fd);
+               offset.load(), offset.load()+length.load(),
+               backing_file_fd);
 }
 
 /**
diff --git a/turbonfs/src/rpc_task.cpp b/turbonfs/src/rpc_task.cpp
index 15194680..63239661 100644
--- a/turbonfs/src/rpc_task.cpp
+++ b/turbonfs/src/rpc_task.cpp
@@ -988,7 +988,7 @@ static void commit_callback(
         /*
          * Set the inode to stable write, so that next write will initiate
          * the flush again with stable write.
-         * There should be no flush in progress as this moment, also since
+         * There should be no flush in progress at this moment, also since
          * we always commit *all* the commit pending bcs, there should not
          * be any more commit pending bcs, so we can safely just enable
          * stable writes for the inode w/o the elaborate

From ca237c748907c0c17093e8a203f9e990468823d1 Mon Sep 17 00:00:00 2001
From: Nagendra Tomar
Date: Mon, 10 Feb 2025 10:02:58 +0000
Subject: [PATCH 8/9] Joint review by Nitin and Tomar: nfs_inode.cpp

---
 turbonfs/inc/file_cache.h  |  4 +--
 turbonfs/inc/nfs_inode.h   |  8 +++--
 turbonfs/src/nfs_inode.cpp | 72 ++++++++++++++++++++++++++------------
 turbonfs/src/rpc_task.cpp  |  3 ++
 4 files changed, 61 insertions(+), 26 deletions(-)

diff --git a/turbonfs/inc/file_cache.h b/turbonfs/inc/file_cache.h
index 713cc412..5d63bd02 100644
--- a/turbonfs/inc/file_cache.h
+++ b/turbonfs/inc/file_cache.h
@@ -1244,8 +1244,8 @@ class bytes_chunk_cache
      * check for that after holding the membuf lock, before it tries to
      * commit those membuf(s).
      */
-    std::vector<bytes_chunk> get_flushing_bc_range(uint64_t st_off,
-                                                   uint64_t end_off) const;
+    std::vector<bytes_chunk> get_flushing_bc_range(
+            uint64_t st_off = 0, uint64_t end_off = UINT64_MAX) const;
 
     /**
      * Returns contiguous dirty (and not flushing) chunks from chunkmap, starting
diff --git a/turbonfs/inc/nfs_inode.h b/turbonfs/inc/nfs_inode.h
index 6a163357..d3b194fc 100644
--- a/turbonfs/inc/nfs_inode.h
+++ b/turbonfs/inc/nfs_inode.h
@@ -1427,7 +1427,12 @@ struct nfs_inode
 
     /**
      * Wait for currently flushing/committing membufs to complete.
+     * It will wait till the currently flushing membufs complete and then
+     * issue a commit and wait for that. If no flush is ongoing but there's
+     * commit_pending data, it'll commit that and return after the commit
+     * completes.
      * Returns 0 on success and a positive errno value on error.
+     * Once it returns, commit_pending will be 0.
      *
      * Note : Caller must hold the inode flush_lock to ensure that
      *        no new membufs are added till this call completes.
@@ -1435,8 +1440,7 @@ struct nfs_inode
      *        flush/write requests to complete, but it'll exit with flush_lock
      *        held.
      */
-    int wait_for_ongoing_flush(uint64_t start_off = 0,
-                               uint64_t end_off = UINT64_MAX);
+    int wait_for_ongoing_flush();
 
     /**
      * commit_membufs() is called to commit uncommitted membufs to the Blob.
diff --git a/turbonfs/src/nfs_inode.cpp b/turbonfs/src/nfs_inode.cpp
index c419e40c..141a7af7 100644
--- a/turbonfs/src/nfs_inode.cpp
+++ b/turbonfs/src/nfs_inode.cpp
@@ -33,7 +33,7 @@ nfs_inode::nfs_inode(const struct nfs_fh3 *filehandle,
     assert(client->magic == NFS_CLIENT_MAGIC);
     assert(write_error == 0);
 
-    // We start doing unstable writes untill proven o/w.
+    // We start doing unstable writes until proven o/w.
     assert(stable_write == false);
     assert(commit_state == commit_state_t::COMMIT_NOT_NEEDED);
 
@@ -386,6 +386,11 @@ int nfs_inode::get_actimeo_max() const
 void nfs_inode::wait_for_ongoing_commit()
 {
     assert(is_flushing);
+    /*
+     * FCSM must be running, as only the FCSM can complete the commit that
+     * we are waiting for.
+     */
+    assert(get_fcsm()->is_running());
 
     /*
      * TODO: See if we can eliminate inline sleep.
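One possible shape for the inline-sleep TODO above is a condition-variable wait instead of the 1 ms poll. A minimal sketch only; the commit_mtx/commit_cv members and the notify in clear_commit_in_progress() are assumptions, not part of this patch:

    // Hypothetical sketch: assumes nfs_inode gains std::mutex commit_mtx and
    // std::condition_variable commit_cv, and that clear_commit_in_progress()
    // calls commit_cv.notify_all() after clearing the flag.
    void nfs_inode::wait_for_ongoing_commit()
    {
        std::unique_lock<std::mutex> lock(commit_mtx);
        commit_cv.wait(lock, [this] { return !is_commit_in_progress(); });
    }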
@@ -402,7 +407,7 @@ void nfs_inode::wait_for_ongoing_commit()
 
         if (++iter % 1000 == 0) {
             AZLogWarn("[{}] wait_for_ongoing_commit() still waiting, iter: {}",
-                        get_fuse_ino(), iter);
+                      get_fuse_ino(), iter);
         }
         ::usleep(1000);
     }
@@ -444,6 +449,7 @@ void nfs_inode::switch_to_stable_write()
      */
     if (get_filecache()->get_bytes_to_commit() == 0) {
         AZLogDebug("[{}] Nothing to commit, switching to stable write", ino);
+
         set_stable_write();
 
         /*
@@ -453,6 +459,11 @@ void nfs_inode::switch_to_stable_write()
         return;
     }
 
+    /*
+     * There is some commit_pending data that we need to commit before we can
+     * make the switch to stable writes.
+     */
+
     uint64_t bytes;
     std::vector<bytes_chunk> bc_vec =
         get_filecache()->get_commit_pending_bcs(&bytes);
@@ -466,6 +477,11 @@ void nfs_inode::switch_to_stable_write()
 
     /*
      * Wait for the commit to complete.
+     *
+     * TODO: Since switch_to_stable_write() can be called from libnfs threads
+     *       too, we have a risk that it may block the only libnfs thread and
+     *       everything will stall. To avoid this, let's not try to commit, but
+     *       instead turn all the commit_pending data back to dirty.
      */
     wait_for_ongoing_commit();
 
@@ -475,6 +491,8 @@ void nfs_inode::switch_to_stable_write()
 
     set_stable_write();
 
+    putblock_filesize = 0;
+
     /*
      * Now we moved to stable write, cleanup the commit target queue.
      */
@@ -487,6 +505,8 @@ void nfs_inode::switch_to_stable_write()
  */
 bool nfs_inode::check_stable_write_required(off_t offset)
 {
+    // Caller must hold the flush_lock.
+    assert(is_flushing);
     assert(offset <= (off_t) AZNFSC_MAX_FILE_SIZE);
 
     /*
@@ -524,6 +544,8 @@ bool nfs_inode::check_stable_write_required(off_t offset)
 void nfs_inode::commit_membufs(std::vector<bytes_chunk>& bc_vec)
 {
     assert(is_flushing);
+    // Commit is only called by FCSM, so it must be running.
+    assert(get_fcsm()->is_running());
 
     set_commit_in_progress();
 
@@ -542,13 +564,6 @@ void nfs_inode::commit_membufs(std::vector<bytes_chunk>& bc_vec)
             prev_offset += bc.length;
         }
 
-        /*
-         * We have at least one commit to issue, mark fcsm as running,
-         * if not already marked.
-         *
-         * XXX Shouldn't it be running already?
-         */
-        get_fcsm()->mark_running();
         get_fcsm()->add_committing(bc.length);
     }
 
@@ -557,7 +572,7 @@ void nfs_inode::commit_membufs(std::vector<bytes_chunk>& bc_vec)
      */
     struct rpc_task *commit_task =
         get_client()->get_rpc_task_helper()->alloc_rpc_task(FUSE_FLUSH);
-    // XXX Do we need to every call flush with fuse_req?
+    // XXX Do we need to ever call flush with fuse_req?
     commit_task->init_flush(nullptr /* fuse_req */, ino);
     assert(commit_task->rpc_api->pvt == nullptr);
 
@@ -777,7 +792,11 @@ void nfs_inode::sync_membufs(std::vector<bytes_chunk> &bc_vec,
          * Once packed completely, then dispatch the write.
          */
         if (write_task->add_bc(bc)) {
-            putblock_filesize += bc.length;
+            if (!is_stable_write()) {
+                putblock_filesize += bc.length;
+            } else {
+                assert(putblock_filesize == 0);
+            }
             continue;
         } else {
             /*
@@ -804,7 +823,11 @@ void nfs_inode::sync_membufs(std::vector<bytes_chunk> &bc_vec,
             [[maybe_unused]]
             bool res = write_task->add_bc(bc);
             assert(res == true);
-            putblock_filesize += bc.length;
+            if (!is_stable_write()) {
+                putblock_filesize += bc.length;
+            } else {
+                assert(putblock_filesize == 0);
+            }
         }
     }
 
@@ -1034,10 +1057,8 @@ int nfs_inode::copy_to_cache(const struct fuse_bufvec* bufv,
  * It will release the flush_lock if it has to wait for flush to
 * complete. Before returning it'll re-acquire the flush_lock.
 */
-int nfs_inode::wait_for_ongoing_flush(uint64_t start_off, uint64_t end_off)
+int nfs_inode::wait_for_ongoing_flush()
 {
-    assert(start_off < end_off);
-
     // Caller must call us with flush_lock held.
     assert(is_flushing);
 
@@ -1072,8 +1093,9 @@ int nfs_inode::wait_for_ongoing_flush()
 
     if (get_filecache()->is_flushing_in_progress()) {
         assert(!is_commit_in_progress());
-    } else if (!is_commit_in_progress() && !get_fcsm()->fc_cb_running() &&
-               get_filecache()->bytes_commit_pending == 0) {
+    } else if (!is_commit_in_progress() &&
+               !get_fcsm()->fc_cb_running() &&
+               (get_filecache()->bytes_commit_pending == 0)) {
         /*
          * Flushing not in progress and no new flushing can be started as we hold
          * the flush_lock(), and callback drained.
@@ -1105,7 +1127,7 @@ int nfs_inode::wait_for_ongoing_flush()
      * new dirty bytes_chunks created but we don't want to wait for those.
      */
     std::vector<bytes_chunk> bc_vec =
-        filecache_handle->get_flushing_bc_range(start_off, end_off);
+        filecache_handle->get_flushing_bc_range();
 
     /*
      * Nothing to flush and callback drained, job done!
@@ -1258,9 +1280,15 @@ int nfs_inode::wait_for_ongoing_flush()
  */
 int nfs_inode::flush_cache_and_wait()
 {
-    assert(!is_stable_write() ||
-           get_filecache()->bytes_commit_pending == 0);
-    assert(!is_stable_write() || !is_commit_in_progress());
+    /*
+     * TODO: Let's use ensure_flush()/ensure_commit() here to perform the
+     *       flush+commit using the FCSM. It can convey a pointer to an atomic
+     *       boolean which the callback can set and it can wait here.
+     */
+    if (is_stable_write()) {
+        assert(get_filecache()->bytes_commit_pending == 0);
+        assert(!is_commit_in_progress());
+    }
 
     /*
      * MUST be called only for regular files.
@@ -1549,7 +1577,7 @@ bool nfs_inode::truncate_start(size_t size)
      */
     flush_lock();
 
-    wait_for_ongoing_flush(0, UINT64_MAX);
+    wait_for_ongoing_flush();
 
     AZLogDebug("[{}] Ongoing flush operations completed", ino);
 
diff --git a/turbonfs/src/rpc_task.cpp b/turbonfs/src/rpc_task.cpp
index 63239661..b47fae69 100644
--- a/turbonfs/src/rpc_task.cpp
+++ b/turbonfs/src/rpc_task.cpp
@@ -1036,6 +1036,9 @@ void rpc_task::issue_commit_rpc()
     // Caller must have marked commit inprogress.
     assert(inode->is_commit_in_progress());
 
+    // Must be called by FCSM.
+    assert(inode->get_fcsm()->is_running());
+
     // List of bcs to be committed by this commit call.
     assert(rpc_api->pvt != nullptr);
     auto bc_vec_ptr = (std::vector<bytes_chunk> *) rpc_api->pvt;

From 76f82d3772832d2e4197d56c97b1152204582831 Mon Sep 17 00:00:00 2001
From: Nitin Singla
Date: Mon, 10 Feb 2025 13:10:38 +0000
Subject: [PATCH 9/9] Addressed the review comments.

---
 turbonfs/inc/fcsm.h        |  16 ++-
 turbonfs/inc/nfs_inode.h   |   2 +-
 turbonfs/inc/rpc_stats.h   |   9 --
 turbonfs/src/fcsm.cpp      |  98 +++++++++++++++---
 turbonfs/src/nfs_inode.cpp | 204 +++++++++++--------------------------
 turbonfs/src/rpc_task.cpp  |  28 ++---
 6 files changed, 173 insertions(+), 184 deletions(-)

diff --git a/turbonfs/inc/fcsm.h b/turbonfs/inc/fcsm.h
index aaae0080..0582ce25 100644
--- a/turbonfs/inc/fcsm.h
+++ b/turbonfs/inc/fcsm.h
@@ -93,7 +93,8 @@ class fcsm
      */
     void ensure_flush(uint64_t write_off,
                       uint64_t write_len,
-                      struct rpc_task *task = nullptr);
+                      struct rpc_task *task = nullptr,
+                      std::atomic<bool> *conditional_variable = nullptr);
 
     /**
      * Ensure all or some commit-pending bytes are committed or scheduled for
@@ -106,7 +107,9 @@ class fcsm
      */
     void ensure_commit(uint64_t write_off,
                        uint64_t write_len,
-                       struct rpc_task *task = nullptr);
+                       struct rpc_task *task = nullptr,
+                       std::atomic<bool> *conditional_variable = nullptr,
+                       bool commit_full = false);
 
     /**
      * Callbacks to be called when flush/commit successfully complete.
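The new conditional_variable parameter gives synchronous callers, which have no frontend write task, a way to block until a target completes. A minimal caller-side sketch of the pattern, mirroring the flush_cache_and_wait() rewrite later in this patch:

    // The FCSM completion path (on_flush_complete()/on_commit_complete())
    // stores true into the flag once the queued target is met.
    std::atomic<bool> complete = false;

    inode->flush_lock();
    inode->get_fcsm()->ensure_flush(0, 0, nullptr, &complete);
    inode->flush_unlock();

    while (!complete) {
        ::usleep(1000);
    }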
@@ -248,7 +251,8 @@ class fcsm
         fctgt(struct fcsm *fcsm,
               uint64_t _flush_seq,
               uint64_t _commit_seq,
-              struct rpc_task *_task = nullptr);
+              struct rpc_task *_task = nullptr,
+              std::atomic<bool> *conditional_variable = nullptr);
 
         /*
          * Flush and commit targets (in terms of flushed_seq_num/committed_seq_num)
@@ -267,6 +271,12 @@ class fcsm
          * Pointer to the containing fcsm.
          */
         struct fcsm *const fcsm = nullptr;
+
+        /*
+         * If non-null, its initial value is false.
+         * The caller that enqueues the target waits for it to become true.
+         */
+        std::atomic<bool> *conditional_variable = nullptr;
 #if 0
         /*
          * Has the required flush/commit task started?
diff --git a/turbonfs/inc/nfs_inode.h b/turbonfs/inc/nfs_inode.h
index d3b194fc..bae13533 100644
--- a/turbonfs/inc/nfs_inode.h
+++ b/turbonfs/inc/nfs_inode.h
@@ -333,7 +333,7 @@ struct nfs_inode
      * Note: As of now, we are not using this flag as commit changes not yet
      *       integrated, so we are setting this flag to true.
      */
-    bool stable_write = true;
+    bool stable_write = false;
 
 public:
     /*
diff --git a/turbonfs/inc/rpc_stats.h b/turbonfs/inc/rpc_stats.h
index 626914b1..d0a7bbed 100644
--- a/turbonfs/inc/rpc_stats.h
+++ b/turbonfs/inc/rpc_stats.h
@@ -139,9 +139,6 @@ class rpc_stats_az
      */
     void on_rpc_issue()
     {
-        // FUSE_FLUSH is never issued as an RPC task to the server. FUSE_WRITE is issued instead.
-        assert(optype != FUSE_FLUSH);
-
         assert(stamp.issue == 0);
         stamp.issue = get_current_usecs();
         assert(stamp.issue >= stamp.create);
@@ -156,9 +153,6 @@ class rpc_stats_az
      */
     void on_rpc_cancel()
     {
-        // FUSE_FLUSH is never issued as an RPC task to the server. FUSE_WRITE is issued instead.
-        assert(optype != FUSE_FLUSH);
-
         assert(stamp.issue != 0);
         assert((int64_t) stamp.issue <= get_current_usecs());
         assert(stamp.dispatch == 0);
@@ -178,9 +172,6 @@ class rpc_stats_az
      */
     void on_rpc_complete(struct rpc_pdu *pdu, nfsstat3 status)
     {
-        // FUSE_FLUSH is never issued as an RPC task to the server. FUSE_WRITE is issued instead.
-        assert(optype != FUSE_FLUSH);
-
         assert((status == NFS3ERR_RPC_ERROR) ||
                (nfsstat3_to_errno(status) != -ERANGE));
 
diff --git a/turbonfs/src/fcsm.cpp b/turbonfs/src/fcsm.cpp
index 10abea60..c86a4e93 100644
--- a/turbonfs/src/fcsm.cpp
+++ b/turbonfs/src/fcsm.cpp
@@ -25,16 +25,21 @@ fcsm::fcsm(struct nfs_client *_client,
 fcsm::fctgt::fctgt(struct fcsm *fcsm,
                    uint64_t _flush_seq,
                    uint64_t _commit_seq,
-                   struct rpc_task *_task) :
+                   struct rpc_task *_task,
+                   std::atomic<bool> *_cv) :
     flush_seq(_flush_seq),
     commit_seq(_commit_seq),
     task(_task),
-    fcsm(fcsm)
+    fcsm(fcsm),
+    conditional_variable(_cv)
 {
     assert(fcsm->magic == FCSM_MAGIC);
     // At least one of flush/commit goals must be set.
     assert((flush_seq != 0) || (commit_seq != 0));
 
+    // If a conditional variable is passed, its initial value should be false.
+    assert(!conditional_variable || *conditional_variable == false);
+
     if (task) {
         // Only frontend write tasks must be specified.
         assert(task->magic == RPC_TASK_MAGIC);
@@ -43,7 +48,7 @@ fcsm::fctgt::fctgt(struct fcsm *fcsm,
         assert(task->rpc_api->write_task.get_size() > 0);
     }
 
-    AZLogInfo("[{}] [FCSM] {} fctgt queued (F: {}, C: {}, T: {})",
+    AZLogDebug("[{}] [FCSM] {} fctgt queued (F: {}, C: {}, T: {})",
               fcsm->get_inode()->get_fuse_ino(),
               task ? "Blocking" : "Non-blocking",
"Blocking" : "Non-blocking", flush_seq, @@ -335,10 +340,13 @@ void fcsm::ctgtq_cleanup() void fcsm::ensure_commit(uint64_t write_off, uint64_t write_len, - struct rpc_task *task) + struct rpc_task *task, + std::atomic *conditional_variable, + bool commit_full) { assert(inode->is_flushing); assert(!inode->is_stable_write()); + assert(!commit_full || task == nullptr); /* * If any of the flush/commit targets are waiting completion, state machine @@ -377,13 +385,23 @@ void fcsm::ensure_commit(uint64_t write_off, uint64_t commit_bytes = inode->get_filecache()->get_bytes_to_commit(); + /* + * If commit_full flag is true, wait_for_ongoing_flush() + * committed the commit_pending bytes. Now we need to flush + * dirty bytes and commit them. + */ + if (commit_full) { + assert(commit_bytes == 0); + commit_bytes = inode->get_filecache()->get_bytes_to_flush(); + } + if (commit_bytes == 0) { /* * TODO: Make sure this doesn't result in small-blocks being written. */ - const int64_t bytes = - inode->get_filecache()->get_bytes_to_flush() - - inode->get_filecache()->max_dirty_extent_bytes(); + const int64_t bytes = (inode->get_filecache()->get_bytes_to_flush() - + inode->get_filecache()->max_dirty_extent_bytes()); + commit_bytes = std::max(bytes, (int64_t) 0); } @@ -395,6 +413,9 @@ void fcsm::ensure_commit(uint64_t write_off, if (task) { task->reply_write(task->rpc_api->write_task.get_size()); } + if (conditional_variable) { + *conditional_variable = true; + } return; } @@ -427,7 +448,8 @@ void fcsm::ensure_commit(uint64_t write_off, ctgtq.emplace(this, 0 /* target flush_seq */, target_committed_seq_num /* target commit_seq */, - task); + task, + conditional_variable); return; } @@ -467,11 +489,21 @@ void fcsm::ensure_commit(uint64_t write_off, ctgtq.emplace(this, 0 /* target flush_seq */, target_committed_seq_num /* target commit_seq */, - task); + task, + conditional_variable); } else { if (task) { task->reply_write(task->rpc_api->write_task.get_size()); } + + /* + * Flush_cache_and_wait() waiting for dirty_bytes to flushed, + * can't complete until all dirty bytes flushed. so add the + * flush target. + */ + if (commit_full) { + ensure_flush(0, 0, nullptr, conditional_variable); + } } return; @@ -504,6 +536,17 @@ void fcsm::ensure_commit(uint64_t write_off, assert(committing_seq_num == (prev_committing_seq_num + bytes)); assert(committing_seq_num > committed_seq_num); + + /* + * Enqueue this target, on_commit_callback() completes + * this target. Otherwise task/conditional_variable + * waiting for this to complete never called. + */ + ctgtq.emplace(this, + 0 /* target flush_seq */, + target_committed_seq_num /* target commit_seq */, + task, + conditional_variable); } } @@ -512,7 +555,8 @@ void fcsm::ensure_commit(uint64_t write_off, */ void fcsm::ensure_flush(uint64_t write_off, uint64_t write_len, - struct rpc_task *task) + struct rpc_task *task, + std::atomic *conditional_variable) { assert(inode->is_flushing); /* @@ -595,14 +639,16 @@ void fcsm::ensure_flush(uint64_t write_off, /* * If no task and no new flush target, don't add a dup target. 
      */
-    if (!task && (target_flushed_seq_num == last_flush_seq)) {
+    if (!task && !conditional_variable &&
+        (target_flushed_seq_num == last_flush_seq)) {
         return;
     }
 
     ftgtq.emplace(this,
                   target_flushed_seq_num /* target flush_seq */,
                   0 /* commit_seq */,
-                  task);
+                  task,
+                  conditional_variable);
 
     return;
 }
@@ -617,6 +663,10 @@ void fcsm::ensure_flush(uint64_t write_off,
         if (task) {
             task->reply_write(task->rpc_api->write_task.get_size());
         }
+
+        if (conditional_variable) {
+            *conditional_variable = true;
+        }
         return;
     }
 
@@ -653,12 +703,26 @@ void fcsm::ensure_flush(uint64_t write_off,
     const uint64_t flushing_seq_num_before = flushing_seq_num;
     assert(flushed_seq_num <= flushing_seq_num);
 
-    // sync_membufs() will update flushing_seq_num() and mark fcsm running.
-    inode->sync_membufs(bc_vec, false /* is_flush */, task);
+    /*
+     * sync_membufs() will update flushing_seq_num() and mark fcsm running.
+     * Task is not passed to sync_membufs(), but enqueued to ftgtq.
+     */
+    inode->sync_membufs(bc_vec, false /* is_flush */, nullptr);
 
     assert(is_running());
     assert(flushing_seq_num == (flushing_seq_num_before + bytes));
     assert(flushed_seq_num <= flushing_seq_num);
+
+    /*
+     * Enqueue this target; on_flush_complete() completes it. Otherwise the
+     * task/conditional_variable waiting for it would never be signaled.
+     */
+    ftgtq.emplace(this,
+                  target_flushed_seq_num /* target flush_seq */,
+                  0 /* commit_seq */,
+                  task,
+                  conditional_variable);
 }
 
 /**
@@ -767,6 +831,9 @@ void fcsm::on_commit_complete(uint64_t commit_bytes)
 
                 tgt.task->reply_write(
                         tgt.task->rpc_api->write_task.get_size());
+            } else if (tgt.conditional_variable) {
+                assert(*tgt.conditional_variable == false);
+                *tgt.conditional_variable = true;
             } else {
                 AZLogDebug("[{}] [FCSM] completing non-blocking commit target: {}, "
                            "committed_seq_num: {}",
@@ -983,6 +1050,9 @@ void fcsm::on_flush_complete(uint64_t flush_bytes)
 
                 tgt.task->reply_write(
                         tgt.task->rpc_api->write_task.get_size());
+            } else if (tgt.conditional_variable) {
+                assert(*tgt.conditional_variable == false);
+                *tgt.conditional_variable = true;
             } else {
                 AZLogDebug("[{}] [FCSM] completing non-blocking flush target: {}, "
                            "flushed_seq_num: {}",
diff --git a/turbonfs/src/nfs_inode.cpp b/turbonfs/src/nfs_inode.cpp
index 141a7af7..312ed324 100644
--- a/turbonfs/src/nfs_inode.cpp
+++ b/turbonfs/src/nfs_inode.cpp
@@ -471,20 +471,30 @@ void nfs_inode::switch_to_stable_write()
     assert(bytes > 0);
 
     /*
-     * Issue the commit RPC to commit the pending data.
+     * Since switch_to_stable_write() can be called from libnfs threads
+     * too, we have a risk that it may block the only libnfs thread and
+     * everything will stall. To avoid this, let's not try to commit, but
+     * instead turn all the commit_pending data back to dirty.
      */
-    commit_membufs(bc_vec);
-
-    /*
-     * Wait for the commit to complete.
-     *
-     * TODO: Since switch_to_stable_write() can be called from libnfs threads
-     *       too, we have a risk that it may block the only libnfs thread and
-     *       everything will stall. To avoid this, let's not try to commit, but
-     *       instead turn all the commit_pending data back to dirty.
-     */
-    wait_for_ongoing_commit();
+    for (bytes_chunk& bc : bc_vec) {
+        [[maybe_unused]] struct membuf *mb = bc.get_membuf();
+        assert(mb != nullptr);
+        assert(mb->is_inuse());
+        assert(mb->is_locked());
+        assert(mb->is_uptodate());
+        assert(mb->is_commit_pending());
+        assert(!mb->is_dirty());
 
+        /*
+         * Clear the commit_pending bit; these bytes will now be rewritten
+         * as stable writes.
+         */
+        mb->clear_commit_pending();
+        mb->set_dirty();
+        mb->clear_locked();
+        mb->clear_inuse();
+    }
 
+    assert(get_filecache()->get_bytes_to_commit() == 0);
     assert(!get_filecache()->is_flushing_in_progress());
     assert(!is_commit_in_progress());
@@ -544,8 +554,12 @@ bool nfs_inode::check_stable_write_required(off_t offset)
 void nfs_inode::commit_membufs(std::vector<bytes_chunk>& bc_vec)
 {
     assert(is_flushing);
-    // Commit is only called by FCSM, so it must be running.
-    assert(get_fcsm()->is_running());
+
+    /*
+     * Commit is only called by the FCSM, either from a completion callback
+     * or inline as part of ensure_commit().
+     */
+    assert(get_fcsm()->is_running() || !get_fcsm()->fc_cb_count());
 
     set_commit_in_progress();
 
@@ -565,6 +579,7 @@ void nfs_inode::commit_membufs(std::vector<bytes_chunk>& bc_vec)
         }
 
         get_fcsm()->add_committing(bc.length);
+        get_fcsm()->mark_running();
     }
 
     /*
@@ -1257,6 +1272,9 @@ int nfs_inode::wait_for_ongoing_flush()
         wait_for_ongoing_commit();
     }
 
+    assert(!is_commit_in_progress());
+    assert(!get_filecache()->is_flushing_in_progress());
+
     return get_write_error();
 }
 
@@ -1280,6 +1298,13 @@ int nfs_inode::wait_for_ongoing_flush()
  */
 int nfs_inode::flush_cache_and_wait()
 {
+    /*
+     * If flush() is called w/o open(), there won't be any cache, skip.
+     */
+    if (!has_filecache()) {
+        return 0;
+    }
+
     /*
      * TODO: Let's use ensure_flush()/ensure_commit() here to perform the
      *       flush+commit using the FCSM. It can convey a pointer to an atomic
      *       boolean which the callback can set and it can wait here.
      */
     if (is_stable_write()) {
         assert(get_filecache()->bytes_commit_pending == 0);
         assert(!is_commit_in_progress());
     }
 
     /*
      * MUST be called only for regular files.
@@ -1311,13 +1336,6 @@ int nfs_inode::flush_cache_and_wait()
         return error_code;
     }
 
-    /*
-     * If flush() is called w/o open(), there won't be any cache, skip.
-     */
-    if (!has_filecache()) {
-        return 0;
-    }
-
     /*
      * Grab the flush_lock to ensure that we don't initiate any new flushes
      * while some truncate call is in progress (which must have taken the
@@ -1327,139 +1345,39 @@ int nfs_inode::flush_cache_and_wait()
      */
     flush_lock();
 
-    /*
-     * If flush is in progress, wait for it to complete.
-     * It may happen that last flush is still in progress and we need to wait
-     * for it to complete before we start the new flush.
-     * As we have the flush_lock() we are guaranteed that no new flush can be
-     * started till we release the flush_lock().
-     *
-     * wait_for_ongoing_flush() will wait for the ongoing flush/commit to
-     * complete.
-     *
-     * Note: For stable write case, we don't need to wait for the ongoing flush,
-     *       as there is no commit required for stable write.
-     */
-    if (!is_stable_write()) {
-        int retry, err = 0;
-        const int max_retry = 200;
-        for (retry = 0; retry < max_retry; retry++) {
-            assert(is_flushing);
-            err = wait_for_ongoing_flush();
-            assert(is_flushing);
-
-            if (!get_fcsm()->fc_cb_running()) {
-                break;
-            }
-            flush_unlock();
-
-            /*
-             * We issue commit as part of wait_for_ongoing_flush()
-             * with flush_lock() held. This cause on_commit_complete()
-             * not run completely.
-             * Give 10ms to the callback to drain completely.
-             */
-            if (get_fcsm()->fc_cb_running()) {
-                ::usleep(10 * 1000);
-            }
-
-            /*
-             * Lock again to check if something flushing again,
-             * and wait for it to complete in wait_for_ongoing_flush().
-             * Repeat again to check if on_commit_complete() completed.
-             */
-            flush_lock();
-        }
-
-        if (err != 0) {
-            flush_unlock();
-            AZLogError("[{}] Failed to flush cache to stable storage, "
-                       "error={}", ino, err);
-            return err;
-        }
-    }
-
     assert(is_flushing);
 
-    /*
-     * For unstable write, If no bytes to flush, then return.
-     * For stable write, If no bytes to flush and no flush in
-     * progress, then return. As for stable write, we didn't
-     * wait for the flush to complete as above.
-     *
-     * TODO: Need to check getattr() call. As that can cause this
-     *       assert to fail.
-     */
-    assert(!get_filecache()->is_flushing_in_progress() ||
-           is_stable_write());
-    assert(!is_commit_in_progress());
 
     /*
-     * For stable case we didn't wait for ongoing flush to complete.
-     * And, it may be last flush initiated to BLOB, we need to wait
-     * for this flush to complete. Application not writing anymore
-     * get_bytes_to_flush() can return 0.
+     * Wait for ongoing flush/commit to complete, so that ensure_flush()
+     * sees the right dirty_bytes (with bytes_flushing == 0) and nobody
+     * changes those values underneath it.
      */
-    if (!get_filecache()->is_flushing_in_progress() &&
-        get_filecache()->get_bytes_to_flush() == 0) {
-        assert(get_filecache()->bytes_dirty == 0);
-        assert(get_filecache()->bytes_flushing == 0);
-
-        AZLogDebug("[{}] Nothing to flush, returning", ino);
-        flush_unlock();
-        return 0;
-    }
+    wait_for_ongoing_flush();
 
+    std::atomic_bool complete = false;
     /*
-     * Get the dirty bytes_chunk from the filecache handle.
-     * This will grab an exclusive lock on the file cache and return the list
-     * of dirty bytes_chunks at that point.
+     * Set the completion target for both the stable and unstable write
+     * cases. Plain ensure_commit() doesn't set a target covering all the
+     * unflushed bytes, hence commit_full=true.
      */
-    std::vector<bytes_chunk> bc_vec;
-    uint64_t size = 0;
-    if (!is_stable_write()) {
-        bc_vec = filecache_handle->get_contiguous_dirty_bcs(&size);
-
-        if (size != get_filecache()->get_bytes_to_flush()) {
-            AZLogInfo("[{}] Flushed size: {} != bytes to flush: {}",
-                      ino, size, get_filecache()->get_bytes_to_flush());
-            AZLogInfo("SWITCHING TO STABLE WRITE");
-
-            /*
-             * This is rare and no-op as we already completed the pending
-             * flush and commit.
-             */
-            switch_to_stable_write();
-            for (auto& bc : bc_vec) {
-                bc.get_membuf()->clear_inuse();
-            }
-            bc_vec = filecache_handle->get_dirty_bc_range(0, UINT64_MAX);
-        }
+    if (is_stable_write()) {
+        get_fcsm()->ensure_flush(0, 0, nullptr, &complete);
     } else {
-        bc_vec = filecache_handle->get_dirty_bc_range(0, UINT64_MAX);
+        get_fcsm()->ensure_commit(0, 0, nullptr, &complete, true);
     }
+    flush_unlock();
 
-    /*
-     * sync_membufs() iterates over the bc_vec and starts flushing the dirty
-     * membufs. It batches the contiguous dirty membufs and issues a single
-     * write RPC for them.
-     */
-    sync_membufs(bc_vec, false /* is_flush */);
-
-    if (wait_for_ongoing_flush()) {
-        AZLogError("[{}] Failed to flush cache to stable storage, "
-                   "error={}", ino, get_write_error());
-        flush_unlock();
-        return get_write_error();
+    // Wait for flush to complete.
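+    /*
+     * Note: with C++20 this poll could become complete.wait(false), i.e.
+     *       std::atomic<bool>::wait(), provided the FCSM completion path
+     *       also calls notify_all() after storing true. Sketch only, not
+     *       done here.
+     */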
+    int iter = 0;
+    while (complete != true) {
+        if (++iter % 1000 == 0) {
+            AZLogWarn("[{}] flush_cache_and_wait() still waiting for flush "
+                      "to complete, iter: {}",
+                      get_fuse_ino(), iter);
+        }
+        ::usleep(1000);
     }
 
-    assert(!is_commit_in_progress());
-    assert(!get_filecache()->is_flushing_in_progress());
-    assert(get_filecache()->bytes_dirty == 0);
-    assert(get_filecache()->bytes_commit_pending == 0);
-    assert(get_filecache()->bytes_flushing == 0);
-
-    flush_unlock();
-    return 0;
+    return get_write_error();
 }
 
 void nfs_inode::flush_lock() const
diff --git a/turbonfs/src/rpc_task.cpp b/turbonfs/src/rpc_task.cpp
index b47fae69..3836d20e 100644
--- a/turbonfs/src/rpc_task.cpp
+++ b/turbonfs/src/rpc_task.cpp
@@ -877,20 +877,6 @@ static void commit_callback(
     AZLogDebug("[{}] commit_callback, number of bc committed: {}",
                ino, bc_vec_ptr->size());
 
-#ifdef ENABLE_PARANOID
-    /*
-     * We always pick *all* commit-pending bcs for committing, and while commit
-     * is going on no other flush can run, hence new commit pending bcs cannot
-     * be added, so when an ongoing commit completes we must not have any
-     * commit pending bcs in the cache.
-     */
-    {
-        std::vector<bytes_chunk> bc_vec =
-            inode->get_filecache()->get_commit_pending_bcs();
-        assert(bc_vec.empty());
-    }
-#endif
-
     uint64_t commit_bytes = 0;
     /*
      * Now that the request has completed, we can query libnfs for the
@@ -998,6 +984,20 @@ static void commit_callback(
         inode->set_stable_write();
     }
 
+#ifdef ENABLE_PARANOID
+    /*
+     * We always pick *all* commit-pending bcs for committing, and while commit
+     * is going on no other flush can run, hence new commit pending bcs cannot
+     * be added, so when an ongoing commit completes we must not have any
+     * commit pending bcs in the cache.
+     */
+    {
+        std::vector<bytes_chunk> bc_vec =
+            inode->get_filecache()->get_commit_pending_bcs();
+        assert(bc_vec.empty());
+    }
+#endif
+
     if (status == 0) {
         assert(commit_bytes > 0);
         /*