27 changes: 22 additions & 5 deletions turbonfs/inc/fcsm.h
@@ -95,8 +95,7 @@ class fcsm
*
* LOCKS: flush_lock.
*/
void ensure_flush(uint64_t flush_bytes,
uint64_t write_off,
void ensure_flush(uint64_t write_off,
uint64_t write_len,
struct rpc_task *task = nullptr);

@@ -107,16 +106,15 @@
* it'll add a blocking commit target for completing the task when the given
* commit goal is met.
*/
void ensure_commit(uint64_t commit_bytes,
struct rpc_task *task = nullptr);
void ensure_commit(struct rpc_task *task = nullptr);

/**
* Callbacks to be called when flush/commit successfully complete.
* These will update flushed_seq_num/committed_seq_num and run flush/commit
* targets from ftgtq/ctgtq as appropriate.
*/
void on_flush_complete(uint64_t flush_bytes);
void on_commit_complete();
void on_commit_complete(uint64_t commit_bytes);
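A minimal sketch of the bookkeeping described above, assuming ctgtq holds commit targets keyed by a goal sequence number; the element layout and the complete_target() helper are assumptions, not part of this diff:

// Illustrative only: advance the committed sequence number and run any
// commit targets whose goal has now been reached.
void fcsm::on_commit_complete(uint64_t commit_bytes)
{
    committed_seq_num += commit_bytes;
    // add_committing() must have accounted for these bytes already.
    assert(committed_seq_num <= committing_seq_num);

    // Complete commit targets (and their blocked rpc_tasks) whose commit
    // goal has been met.
    while (!ctgtq.empty() && ctgtq.front().commit_seq <= committed_seq_num) {
        complete_target(ctgtq.front());   // hypothetical helper
        ctgtq.pop();
    }
}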

/**
* Is the state machine currently running, i.e. it has sent (one or more)
@@ -148,6 +146,10 @@ class fcsm
void mark_running();
void clear_running();

void run(struct rpc_task *task,
uint64_t extent_left,
uint64_t extent_right);

/**
* Call when more writes are dispatched, or prepared to be dispatched.
* This MUST be called before the write callback can be called.
Expand Down Expand Up @@ -180,6 +182,21 @@ class fcsm
return (fc_cb_count() > 0);
}

/**
* Call when more commits are dispatched, or prepared to be dispatched.
* This MUST be called before the commit_callback can be called.
*/
void add_committing(uint64_t bytes)
{
assert(committed_seq_num <= committing_seq_num);
committing_seq_num += bytes;
}

/*
* ctgtq_cleanup() is called when we switch to stable writes.
*/
void ctgtq_cleanup();

private:
/*
* The singleton nfs_client, for convenience.
166 changes: 160 additions & 6 deletions turbonfs/inc/file_cache.h
@@ -1239,6 +1239,16 @@ class bytes_chunk_cache
std::vector<bytes_chunk> get_flushing_bc_range(uint64_t st_off,
uint64_t end_off) const;

/**
* Returns contiguous dirty (and not flushing) chunks from chunkmap, starting
* with the lowest dirty offset, and sets *bytes to the total number of dirty
* bytes contained in the returned chunks.
* Before returning it increases the inuse count of underlying membuf(s).
* Caller will typically flush these to the backing Blob as UNSTABLE
* writes.
*/
std::vector<bytes_chunk> get_contiguous_dirty_bcs(uint64_t *bytes) const;
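A hedged usage sketch of the contract described above; issue_unstable_writes() stands in for the actual flush path and the membuf accessor names are assumptions:

uint64_t dirty_bytes = 0;
std::vector<bytes_chunk> bcs = cache.get_contiguous_dirty_bcs(&dirty_bytes);

// Flush the contiguous dirty range to the backing Blob as UNSTABLE writes.
if (!bcs.empty()) {
    issue_unstable_writes(bcs);               // placeholder for the flush path
}

// The caller owns the inuse count taken by get_contiguous_dirty_bcs() and
// must drop it once done with the chunks (accessor names assumed).
for (auto& bc : bcs) {
    bc.get_membuf()->clear_inuse();
}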

/*
* Returns *all* commit pending chunks in chunkmap.
* Before returning it increases the inuse count of underlying membuf(s)
@@ -1257,7 +1267,7 @@
* MUST check for that after holding the membuf lock, before it tries
* to commit those membuf(s).
*/
std::vector<bytes_chunk> get_commit_pending_bcs() const;
std::vector<bytes_chunk> get_commit_pending_bcs(uint64_t *bytes) const;

/**
* Drop cached data in the given range.
@@ -1341,7 +1351,7 @@
* We want to send as promptly as possible to utilize the n/w b/w but slowly
* enough to give the write scheduler an opportunity to merge better.
*/
uint64_t max_dirty_extent_bytes() const
static uint64_t max_dirty_extent_bytes()
{
// Maximum cache size allowed in bytes.
static const uint64_t max_total =
@@ -1407,24 +1417,156 @@ class bytes_chunk_cache
return bytes_flushing > 0;
}

/**
* Maximum size of commit_pending data that can be in cache, before we
* must commit it to Blob.
* It should be greater than or equal to the flush threshold (as returned
* by max_dirty_extent_bytes()) and smaller than the inline write threshold
* (as suggested by do_inline_write()), to minimize inline flush waits as
* much as possible, in steady state.
*/
static uint64_t max_commit_bytes()
{
// Maximum cache size allowed in bytes.
static const uint64_t max_total =
(aznfsc_cfg.cache.data.user.max_size_mb * 1024 * 1024ULL);
assert(max_total != 0);

/*
* Minimum of 60% of max cache and 2 times the flush limit.
* We want to commit as soon as possible w/o affecting performance.
* If we commit too often, since commit is a serializing operation,
* it'll affect the write throughput; otoh, if we commit too late
* then we might hit the inline write threshold, which again would
* serialize writes, bringing down throughput.
*/
static const uint64_t max_commit_bytes =
std::min((uint64_t)(max_total * 0.6),
2 * max_dirty_extent_bytes());
assert(max_commit_bytes > 0);

return max_commit_bytes;
}
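For concreteness, a purely hypothetical configuration: with max_size_mb = 4096 (so max_total is 4 GiB) and max_dirty_extent_bytes() returning 1 GiB, the threshold works out as:

// Hypothetical numbers, for illustration only.
// max_total          = 4 GiB
// 60% of max_total   = 2.4 GiB
// 2 * flush limit    = 2 * 1 GiB = 2 GiB
// max_commit_bytes() = min(2.4 GiB, 2 GiB) = 2 GiB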

/**
* Check if we must initiate a COMMIT RPC now. Note that the caller would
* just send the COMMIT RPC and not necessarily block the user write
* request till the COMMIT RPC completes, i.e., it's not an inline commit.
*
* We must start commit if:
* 1. We have enough commit_pending data for this file/cache, or,
* 2. Global memory pressure dictates that we commit now to free up
* memory. In this case we might be committing more frequently which
* won't necessarily be optimal, but we have no choice due to the
* memory pressure.
*/
bool commit_required() const
{
const bool local_pressure =
(bytes_commit_pending >= max_commit_bytes());

if (local_pressure) {
return true;
}

/*
* TODO: Take cue from global memory pressure.
*/
return false;
}

/**
* Check if we must initiate flush of some cached data. Note that the caller
* would just send the corresponding WRITE RPC and not necessarily block the
* user write request till the WRITE RPC completes, i.e., it's not an inline
* write.
*
* We must start flush/write if:
* 1. We have enough bytes to flush so that we can write a full sized
* block, or for the case of stable write, we have enough data to fill
* the scheduler queue.
* 2. Global memory pressure dictates that we flush now to free up memory.
* In this case we might be flushing more frequently which won't
* necessarily be optimal, but we have no choice due to the memory
* pressure.
*/
bool flush_required() const
{
const bool local_pressure =
(get_bytes_to_flush() >= max_dirty_extent_bytes());

if (local_pressure) {
return true;
}

/*
* TODO: Take cue from global memory pressure.
*/
return false;
}

/**
* This should be called by writer threads to find out if they must wait
* for the write to complete. This will check both the cache specific and
* global memory pressure.
*/
bool do_inline_write() const
{
/*
* Allow four dirty extents before we force inline write.
* This way 2 extents can be commit_pending and 1 dirty. We can issue commit
* for the commit_pending extents and accumulate new writes in the dirty one.
* Before we hit the inline limit, 2GB worth of space is freed up. This cycle
* should be good enough to keep the cache size in check.
*/
static const uint64_t max_dirty_allowed_per_cache =
max_dirty_extent_bytes() * 4;
const bool local_pressure = (bytes_dirty + bytes_commit_pending) > max_dirty_allowed_per_cache;

if (local_pressure) {
return true;
}

/*
* Global pressure is when get_prune_goals() returns non-zero bytes
* to be pruned inline.
*/
uint64_t inline_bytes;

get_prune_goals(&inline_bytes, nullptr);
return (inline_bytes > 0);
}
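A sketch of how a writer thread might combine the three predicates above; dispatch_commit(), dispatch_flush() and wait_for_inline_flush() are hypothetical helpers standing in for the actual flush/commit machinery:

// Illustrative write path, after the application data has been copied into
// the cache with copy_to_cache().
if (cache.commit_required()) {
    dispatch_commit();        // kick off a COMMIT RPC, don't block the write
}
if (cache.flush_required()) {
    dispatch_flush();         // kick off UNSTABLE WRITE RPCs, don't block
}
if (cache.do_inline_write()) {
    // Under memory pressure: complete the application write only after
    // the flush completes.
    wait_for_inline_flush();
}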

/**
* Inline write/flush means that we are under sufficient memory pressure
* that we want to slow down the application writes by not completing them
* after copying the data to cache (using copy_to_cache()) but instead
* complete application writes only after the flush completes.
*
* This function returns the number of bytes that we need to write inline,
* or zero if we are not under memory pressure.
*/
uint64_t get_inline_flush_bytes() const
Owner: we have do_inline_write() and get_inline_flush_bytes(), both of which answer "should we do inline writes" separately and may return different results, which is not good. We should have just one function.

Owner: I see that we don't use it. I'll remove it.

{
/*
* Allow two dirty extents before we force inline write.
* This way one of the extent can be getting flushed and we can populate
* the second one.
*/
static const uint64_t max_dirty_allowed_per_cache =
max_dirty_extent_bytes() * 2;
const bool local_pressure = bytes_dirty > max_dirty_allowed_per_cache;
(max_dirty_extent_bytes() * 2);
const bool local_pressure =
((int64_t) bytes_dirty > (int64_t) max_dirty_allowed_per_cache);

if (local_pressure) {
return true;
/*
* Leave one max_dirty_extent_bytes worth of dirty bytes, and
* flush the rest.
*/
const int64_t flush_now =
(bytes_dirty - max_dirty_extent_bytes());
return flush_now;
}

/*
@@ -1433,8 +1575,20 @@
*/
uint64_t inline_bytes;

/*
* TODO: Noisy neighbor syndrome: when one file is hogging the cache,
* inline pruning will be triggered for other files.
*/
get_prune_goals(&inline_bytes, nullptr);
return (inline_bytes > 0);

/*
* (bytes_flushing + bytes_commit_pending) represents the data which
* is either already flushed or being flushed. Exclude that from the
* needs-to-be-flushed data.
*/
const int64_t flush_now =
(inline_bytes - (bytes_flushing + bytes_commit_pending));
return std::max((int64_t) 0, flush_now);
}

/**
2 changes: 2 additions & 0 deletions turbonfs/inc/nfs_client.h
@@ -547,6 +547,8 @@ struct nfs_client

void jukebox_write(struct api_task_info *rpc_api);

void jukebox_flush(struct api_task_info *rpc_api);

/**
* Convert between NFS fattr3 and POSIX struct stat.
*/
58 changes: 54 additions & 4 deletions turbonfs/inc/nfs_inode.h
@@ -289,6 +289,29 @@ struct nfs_inode
*/
struct stat attr;

/**
* We maintain the following multiple views of the file and thus multiple file
* sizes for those views.
* - Cached.
* This is the view of the file that comprises the data that has been
* written by the application and saved in file cache. It may or may not
* have been flushed and/or committed. This is the most up-to-date view of
* the file and applications must use this view.
* cached_filesize tracks the file size for this view.
* - Uncommitted.
* This is the view of the file that tracks data that has been flushed
* using UNSTABLE writes but not yet COMMITted to the Blob. This view of
* the file is only used to see if the next PB call will write after the
* last PB'ed byte and thus can be appended.
* putblock_filesize tracks the file size for this view.
* - Committed.
* This is the view of the file that tracks data committed to the Blob.
* Other clients will see this view.
* attr.st_size tracks the file size for this view.
*/
off_t cached_filesize = 0;
mutable off_t putblock_filesize = 0;
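From the description above, for append-style writes in steady state the three views are expected to be ordered as sketched below; this is an assumption drawn from the text, not an assert present in this PR:

// committed <= flushed-but-uncommitted <= cached
assert(attr.st_size <= putblock_filesize);      // committed vs. PB'ed data
assert(putblock_filesize <= cached_filesize);   // PB'ed vs. written to cache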

/*
* Has this inode seen any non-append write?
* This starts as false and remains false as long as copy_to_cache() only
Expand All @@ -310,7 +333,7 @@ struct nfs_inode
* Note: As of now, we are not using this flag as commit changes not yet
* integrated, so we are setting this flag to true.
*/
bool stable_write = true;
bool stable_write = false;

public:
/*
@@ -1400,11 +1423,10 @@
* initiate any new flush operations while some truncate call is in
* progress (which must have held the flush_lock).
*/
int flush_cache_and_wait(uint64_t start_off = 0,
uint64_t end_off = UINT64_MAX);
int flush_cache_and_wait();

/**
* Wait for currently flushing membufs to complete.
* Wait for currently flushing/committing membufs to complete.
* Returns 0 on success and a positive errno value on error.
*
* Note : Caller must hold the inode flush_lock to ensure that
Expand All @@ -1416,6 +1438,34 @@ struct nfs_inode
int wait_for_ongoing_flush(uint64_t start_off = 0,
uint64_t end_off = UINT64_MAX);

/**
* commit_membufs() is called to commit uncommitted membufs to the Blob.
* It creates a COMMIT RPC and sends it to the NFS server.
*/
void commit_membufs(std::vector<bytes_chunk> &bcs);

/**
* switch_to_stable_write() is called to switch the inode to stable write
* mode. It waits for all ongoing flush and subsequent commit to complete.
* If not already scheduled, it'll perform an explicit commit after the
* flush completes.
* After that it'll mark the inode for stable writes and return. From then on
* any writes to this inode will be sent as stable writes.
*/
void switch_to_stable_write();
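A hedged sketch of the sequence described above; the helpers called are the ones declared in this header, but the body is illustrative and 'filecache_handle' is an assumed accessor for the inode's bytes_chunk_cache:

// Illustrative flow for switch_to_stable_write().
void nfs_inode::switch_to_stable_write()
{
    // 1. Wait for all in-flight (unstable) flushes to complete.
    wait_for_ongoing_flush();

    // 2. Commit whatever was flushed unstable but not yet committed.
    uint64_t commit_bytes = 0;
    std::vector<bytes_chunk> bcs =
        filecache_handle->get_commit_pending_bcs(&commit_bytes);
    if (!bcs.empty()) {
        commit_membufs(bcs);
        wait_for_ongoing_commit();
    }

    // 3. From now on all writes for this inode go out as stable writes.
    stable_write = true;
}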

/**
* Check if stable write is required for the given offset.
* Given offset is the start of contiguous dirty membufs that need to be
* flushed to the Blob.
*/
bool check_stable_write_required(off_t offset);
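Given that putblock_filesize tracks the last PB'ed byte and unstable (PB-style) writes can only append after it, a plausible sketch of this check is simply the following; the real implementation may differ:

bool nfs_inode::check_stable_write_required(off_t offset)
{
    // Flushing at any offset other than the current PB'ed end of file cannot
    // be sent as an append, so it must go out as a stable write.
    return (offset != putblock_filesize);
}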

/**
* Wait for ongoing commit operation to complete.
*/
void wait_for_ongoing_commit();

/**
* Sync the dirty membufs in the file cache to the NFS server.
* All contiguous dirty membufs are clubbed together and sent to the