2 changes: 1 addition & 1 deletion turbonfs/extern/libnfs
Submodule libnfs updated 1 file
+48 −24 lib/socket.c
1 change: 1 addition & 0 deletions turbonfs/inc/aznfsc.h
@@ -48,6 +48,7 @@ static_assert(AZNFSCFG_WSIZE_MAX == AZNFSCFG_RSIZE_MAX);
#define AZNFSCFG_READDIR_MAX 4194304
#define AZNFSCFG_READAHEAD_KB_MIN 128
#define AZNFSCFG_READAHEAD_KB_MAX 1048576
#define AZNFSCFG_READAHEAD_KB_DEF 16384
#define AZNFSCFG_FUSE_MAX_BG_MIN 1
#define AZNFSCFG_FUSE_MAX_BG_MAX 65536
#define AZNFSCFG_FUSE_MAX_BG_DEF 4096
28 changes: 0 additions & 28 deletions turbonfs/inc/fcsm.h
@@ -225,17 +225,6 @@ class fcsm
void ctgtq_cleanup();
void ftgtq_cleanup();

/**
* Update fc_scale_factor according to the current cache pressure.
* When global cache utilization is high, it reduces fc_scale_factor so
* that all writers flush/commit early, for easing global memory pressure.
*/
static void update_fc_scale_factor();
static double get_fc_scale_factor()
{
return fc_scale_factor;
}

private:
/*
* The singleton nfs_client, for convenience.
@@ -345,23 +334,6 @@ class fcsm
* The state machine starts in an idle state.
*/
std::atomic<bool> running = false;

/*
* Value returned by max_dirty_extent_bytes() is scaled down by this much
* before it's used by:
* - flush_required()
* - commit_required()
* - do_inline_write()
*
* fc_scale_factor is computed by update_fc_scale_factor() according to
* the global cache pressure. If global cache pressure is high we want the
* local flush/commit limits to be reduced so that each file flushes/commits
* faster thus easing the global cache pressure. This promotes fair sharing
* of global cache space while also maintaining enough contiguous data to
* the server, needed for better write throughput. Stable and unstable
* write may use this scale factor differently.
*/
static std::atomic<double> fc_scale_factor;
};

struct FC_CB_TRACKER
44 changes: 7 additions & 37 deletions turbonfs/inc/file_cache.h
@@ -1389,37 +1389,7 @@ class bytes_chunk_cache
* scheduler an opportunity to merge better.
* For unstable writes this allows us enough PB parallelism.
*/
static uint64_t max_dirty_extent_bytes()
{
// Maximum cache size allowed in bytes.
static const uint64_t max_total =
(aznfsc_cfg.cache.data.user.max_size_mb * 1024 * 1024ULL);
assert(max_total != 0);

/*
* Capped due to global cache size. One single file should not use
* more than 60% of the cache.
*/
static const uint64_t max_dirty_extent_g = (max_total * 0.6);

/*
* Capped due to per-file cache discipline.
* Every file wants to keep 10 full sized blocks but that can be
* reduced as per the current cache pressure, but never less than
* one full size block.
*/
static const uint64_t max_dirty_extent_l =
(10 * AZNFSC_MAX_BLOCK_SIZE) * fcsm::get_fc_scale_factor();
assert(max_dirty_extent_l >= AZNFSC_MAX_BLOCK_SIZE);

const uint64_t max_dirty_extent =
std::min(max_dirty_extent_g, max_dirty_extent_l);

// At least one full sized block.
assert(max_dirty_extent >= AZNFSC_MAX_BLOCK_SIZE);

return max_dirty_extent;
}
static uint64_t max_dirty_extent_bytes();

/**
* Get the amount of dirty data that needs to be flushed.
@@ -1684,8 +1654,8 @@ class bytes_chunk_cache
assert(max_total != 0);

/*
* If cache usage grows to 80% of max, we enforce inline pruning for
* writers. When cache usage grows more than 60% we recommend periodic
* If cache usage grows to 90% of max, we enforce inline pruning for
* writers. When cache usage grows more than 70% we recommend periodic
* pruning. If the cache size is sufficient, hopefully we will not need
* inline pruning too often, as it hurts application write performance.
* Once curr_bytes_total exceeds inline_threshold we need to perform
@@ -1696,10 +1666,10 @@
* Following also means that at any time, half of the cache_max_mb
* can be safely present in the cache.
*/
static const uint64_t inline_threshold = (max_total * 0.8);
static const uint64_t inline_target = (max_total * 0.7);
static const uint64_t periodic_threshold = (max_total * 0.6);
static const uint64_t periodic_target = (max_total * 0.5);
static const uint64_t inline_threshold = (max_total * 0.9);
static const uint64_t inline_target = (max_total * 0.8);
static const uint64_t periodic_threshold = (max_total * 0.7);
static const uint64_t periodic_target = (max_total * 0.6);

/*
* Current total cache size in bytes. Save it once to avoid issues
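To make the cache-pressure logic above easier to follow, here is a minimal self-contained sketch of how max_dirty_extent_bytes() (now declared out of line) and the 90%/70% pruning thresholds could fit together. The block size, cache size, the clamping (in place of the original asserts) and the prune_action/pruning_needed names are illustrative assumptions, not the code this PR adds to the .cpp files.

```cpp
#include <algorithm>
#include <cstdint>

// Illustrative stand-ins; real values come from aznfsc_cfg and
// AZNFSC_MAX_BLOCK_SIZE (both assumed here).
static constexpr uint64_t kBlockSize     = 16ULL << 20;   // one full-sized block
static constexpr uint64_t kCacheMaxBytes = 4096ULL << 20; // cache.data.user.max_size_mb

// Per-file dirty-data cap: the smaller of 60% of the global cache and
// 10 full blocks scaled by fc_scale_factor, never below one full block.
uint64_t max_dirty_extent_bytes(double fc_scale_factor)
{
    const uint64_t cap_global = (uint64_t)(kCacheMaxBytes * 0.6);
    const uint64_t cap_local  = std::max<uint64_t>(
        (uint64_t)((10 * kBlockSize) * fc_scale_factor), kBlockSize);

    return std::max(std::min(cap_global, cap_local), kBlockSize);
}

// Pruning decision per the 90%/70% thresholds in the comment above; the
// matching targets (80%/60%) would bound how far pruning shrinks the cache.
enum class prune_action { none, periodic, inline_prune };

prune_action pruning_needed(uint64_t curr_bytes_total)
{
    static const uint64_t inline_threshold   = (uint64_t)(kCacheMaxBytes * 0.9);
    static const uint64_t periodic_threshold = (uint64_t)(kCacheMaxBytes * 0.7);

    if (curr_bytes_total > inline_threshold)
        return prune_action::inline_prune;   // writers must prune inline
    if (curr_bytes_total > periodic_threshold)
        return prune_action::periodic;       // background pruning recommended
    return prune_action::none;
}
```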
76 changes: 76 additions & 0 deletions turbonfs/inc/nfs_client.h
@@ -137,6 +137,38 @@ struct nfs_client
std::atomic<uint64_t> max_ino = 0;
#endif

/*
* Last 5 sec read and write throughput.
* rw_genid is updated every time these values are updated, so it can be
* used to detect when the throughput values change.
*/
std::atomic<uint64_t> r_MBps = 0;
std::atomic<uint64_t> w_MBps = 0;
std::atomic<uint64_t> rw_genid = 0;

/*
* Value returned by max_dirty_extent_bytes() is scaled down by this much
* before it's used by:
* - flush_required()
* - commit_required()
* - do_inline_write()
*
* fc_scale_factor is computed by periodic_updater() according to the global
* cache pressure. If global cache pressure is high we want the local
* flush/commit limits to be reduced so that each file flushes/commits
* sooner, thus easing the global cache pressure. This promotes fair sharing
* of the global cache space while still sending enough contiguous data to
* the server, which is needed for good write throughput. Stable and unstable
* writes may use this scale factor differently.
*/
static std::atomic<double> fc_scale_factor;

/*
* periodic_updater() will update this scaling factor to force all ra_state
* machines to slow down readahead in case of high memory pressure.
*/
static std::atomic<double> ra_scale_factor;

/*
* Set in shutdown() to let others know that nfs_client is shutting
* down. They can use this to quit what they are doing and plan for
@@ -185,6 +217,18 @@ struct nfs_client
return client;
}

static double get_fc_scale_factor()
{
assert(fc_scale_factor >= 1.0/10);
return fc_scale_factor;
}

static double get_ra_scale_factor()
{
assert(ra_scale_factor >= 0);
return ra_scale_factor;
}

/**
* Returns true if nfs_client is shutting down.
*/
@@ -214,6 +258,38 @@ struct nfs_client
return inode_map_lock_0;
}

/**
* Update the various values that need to be refreshed periodically:
* - Last 5 sec read and write throughput.
* - Readahead scale factor for controlling readahead amount, and
* - Flush/commit dirty data scale factor for controlling how long we keep
* dirty data before flushing/committing.
*
* Call this from a code path that executes frequently.
*/
void periodic_updater();

/**
* Get last 5 sec read throughput in MBps.
*/
uint64_t get_read_MBps() const
{
return r_MBps;
}

/**
* Get last 5 sec write throughput in MBps.
*/
uint64_t get_write_MBps() const
{
return w_MBps;
}

uint64_t get_rw_genid() const
{
return rw_genid;
}

/*
* The user should first init the client class before using it.
*/
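The diff declares periodic_updater(), the throughput counters and the two scale factors, but not their implementation. Below is a hedged, self-contained sketch of what such an updater could look like: a 5-second window for r_MBps/w_MBps, a generation bump via rw_genid, and cache-utilization-driven scale factors (the 0.1 floor mirrors the assert in get_fc_scale_factor()). The cumulative byte counters, the exact scaling formulas and the single-caller assumption are all illustrative.

```cpp
#include <algorithm>
#include <atomic>
#include <chrono>
#include <cstdint>

// Illustrative counters/knobs; the real ones live in nfs_client/rpc_stats_az.
static std::atomic<uint64_t> total_bytes_read{0};
static std::atomic<uint64_t> total_bytes_written{0};
static std::atomic<uint64_t> r_MBps{0}, w_MBps{0}, rw_genid{0};
static std::atomic<double> fc_scale_factor{1.0};
static std::atomic<double> ra_scale_factor{1.0};

// Assumed to be called from a hot path by a single thread; recomputes the
// stats at most once every 5 seconds.
void periodic_updater(uint64_t cache_bytes_used, uint64_t cache_bytes_max)
{
    using clock = std::chrono::steady_clock;
    static clock::time_point last = clock::now();
    static uint64_t last_read = 0, last_written = 0;

    const auto now = clock::now();
    const auto secs =
        std::chrono::duration_cast<std::chrono::seconds>(now - last).count();
    if (secs < 5)
        return;

    // Last 5 sec read/write throughput in MBps; bump rw_genid so callers
    // can detect that the throughput values changed.
    const uint64_t rd = total_bytes_read.load();
    const uint64_t wr = total_bytes_written.load();
    r_MBps = (rd - last_read) / (1024 * 1024) / secs;
    w_MBps = (wr - last_written) / (1024 * 1024) / secs;
    rw_genid++;

    last = now;
    last_read = rd;
    last_written = wr;

    // Higher cache utilization => smaller scale factors, so writers
    // flush/commit sooner and readers shrink their readahead window.
    // The 0.1 floor matches the assert in get_fc_scale_factor().
    const double util = (double)cache_bytes_used / cache_bytes_max;
    fc_scale_factor = std::clamp(1.0 - util, 0.1, 1.0);
    ra_scale_factor = std::clamp(1.5 - util, 0.0, 1.5);
}
```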
14 changes: 1 addition & 13 deletions turbonfs/inc/readahead.h
@@ -305,22 +305,10 @@ class ra_state
max_byte_read = UINT64_MAX;
}

/**
* Update ra_scale_factor according to the current cache pressure.
* When global cache utilization is high, it reduces ra_scale_factor so
* that all readers use less ra window, for easing global memory pressure.
* Likewise when global cache utilization is low it increases the
* ra_scale_factor to let readers use higher readahead.
*/
static void update_scale_factor();

/**
* Returns the scaled ra window that caller can safely use.
*/
uint64_t get_ra_bytes() const
{
return ra_bytes * ra_scale_factor;
}
uint64_t get_ra_bytes() const;

/**
* This will run self tests to test the correctness of this class.
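get_ra_bytes() is now declared out of line; a plausible body, assuming it simply applies the global readahead scale factor to the configured window, would be something like the following (names and values are stand-ins, not the PR's .cpp code):

```cpp
#include <cstdint>

// Stand-ins for ra_state::ra_bytes and nfs_client::get_ra_scale_factor().
static const uint64_t ra_bytes = 16384ULL * 1024;  // configured readahead_kb, in bytes
static double ra_scale_factor  = 1.0;              // set by periodic_updater()

// Plausible body of the now out-of-line ra_state::get_ra_bytes(): the
// configured window scaled by the global readahead scale factor. A scale
// factor of 0 effectively disables readahead under memory pressure.
uint64_t get_ra_bytes()
{
    return (uint64_t)(ra_bytes * ra_scale_factor);
}
```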
11 changes: 9 additions & 2 deletions turbonfs/inc/rpc_stats.h
@@ -337,7 +337,9 @@ class rpc_stats_az
* This will indicate our readahead effectiveness.
* bytes_zeroed_from_cache: How many bytes were read from unmapped parts
* of the cache and hence were zero filled.
* bytes_read_ahead: How many bytes were read ahead.
* num_readhead: Number of readahead calls made.
* bytes_read_ahead: How many bytes were read ahead using num_readhead
* calls.
* tot_getattr_reqs: How many getattr requests were received from fuse.
* getattr_served_from_cache: How many were served from inode->attr cache.
* tot_lookup_reqs: How many lookup requests were received from fuse.
@@ -363,6 +365,8 @@
* beyond configured limit.
* commit_gp: How many time commit was issued as global cache grew beyond
* configured limit.
* num_sync_membufs: How many times sync_membufs() was called.
* tot_bytes_sync_membufs: Total bytes flushed by sync_membufs().
*/
static std::atomic<uint64_t> tot_read_reqs;
static std::atomic<uint64_t> failed_read_reqs;
@@ -371,6 +375,7 @@
static std::atomic<uint64_t> bytes_read_from_cache;
static std::atomic<uint64_t> bytes_zeroed_from_cache;
static std::atomic<uint64_t> bytes_read_ahead;
static std::atomic<uint64_t> num_readhead;
static std::atomic<uint64_t> tot_getattr_reqs;
static std::atomic<uint64_t> getattr_served_from_cache;
static std::atomic<uint64_t> tot_lookup_reqs;
@@ -387,6 +392,8 @@
static std::atomic<uint64_t> flush_gp;
static std::atomic<uint64_t> commit_lp;
static std::atomic<uint64_t> commit_gp;
static std::atomic<uint64_t> num_sync_membufs;
static std::atomic<uint64_t> tot_bytes_sync_membufs;

static std::atomic<uint64_t> rpc_tasks_allocated;
static std::atomic<uint64_t> fuse_responses_awaited;
@@ -395,7 +402,7 @@

#define INC_GBL_STATS(var, inc) rpc_stats_az::var += (inc)
#define DEC_GBL_STATS(var, dec) {assert(rpc_stats_az::var >= dec); rpc_stats_az::var -= (dec);}
#define GET_GBL_STATS(var) rpc_stats_az::var
#define GET_GBL_STATS(var) rpc_stats_az::var.load()

struct fuse_req_stats
{
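A small self-contained illustration of the stats-macro pattern and why GET_GBL_STATS now calls .load(): reading through load() yields a plain uint64_t that can safely be passed through printf-style varargs or copied, whereas the std::atomic object itself cannot. The counter and macros below only mirror the real rpc_stats_az ones.

```cpp
#include <atomic>
#include <cstdint>
#include <cstdio>

// Illustrative counter mirroring the rpc_stats_az statics above.
struct stats {
    static std::atomic<uint64_t> bytes_read_ahead;
};
std::atomic<uint64_t> stats::bytes_read_ahead{0};

#define INC_GBL_STATS(var, inc) stats::var += (inc)
#define GET_GBL_STATS(var)      stats::var.load()

int main()
{
    INC_GBL_STATS(bytes_read_ahead, 4096);

    // .load() returns a plain uint64_t, which is safe to pass through
    // printf's varargs; passing the std::atomic object itself would not be.
    std::printf("bytes_read_ahead=%llu\n",
                (unsigned long long)GET_GBL_STATS(bytes_read_ahead));
    return 0;
}
```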
3 changes: 1 addition & 2 deletions turbonfs/inc/rpc_task.h
@@ -2225,7 +2225,6 @@ struct rpc_task
DEC_GBL_STATS(fuse_responses_awaited, 1);
}

INC_GBL_STATS(tot_bytes_written, count);
free_rpc_task();
}

@@ -2607,7 +2606,7 @@ class rpc_task_helper
* used. Later init_*() method can set it to a more appropriate value.
*/
task->csched = (task->client->mnt_options.nfs_port == 2047) ?
CONN_SCHED_RR : CONN_SCHED_FH_HASH;
CONN_SCHED_RR_W : CONN_SCHED_FH_HASH;

#ifdef ENABLE_PARANOID
task->issuing_tid = ::gettid();
11 changes: 9 additions & 2 deletions turbonfs/inc/rpc_transport.h
@@ -20,15 +20,22 @@ typedef enum

/*
* Round robin requests over all connections.
* Use CONN_SCHED_RR_R for read requests and CONN_SCHED_RR_W for write
* requests. This helps the scheduler keep read and write requests on
* separate connections, otherwise small write responses may get stuck
* behind large read responses and small read requests may get stuck behind
* large write requests. Note that this is not completely avoidable even
* though we prioritize smaller read requests over larger write requests.
*/
CONN_SCHED_RR = 2,
CONN_SCHED_RR_R = 2,
CONN_SCHED_RR_W = 3,

/*
* Every file is affined to one connection based on the FH hash, so all
* requests to one file go over the same connection while different files
* will use different connections.
*/
CONN_SCHED_FH_HASH = 3,
CONN_SCHED_FH_HASH = 4,
} conn_sched_t;

/*
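To illustrate the split scheduling modes, here is a hypothetical dispatcher that maps them to connection indices. The 50/50 read/write split of the connection pool and the helper name pick_connection are assumptions for illustration only; the PR's actual transport code is not part of this diff.

```cpp
#include <algorithm>
#include <atomic>
#include <cassert>
#include <cstddef>
#include <cstdint>

// Scheduling modes as declared above (enum abridged to the values shown).
typedef enum {
    CONN_SCHED_RR_R    = 2,  // round robin, read requests
    CONN_SCHED_RR_W    = 3,  // round robin, write requests
    CONN_SCHED_FH_HASH = 4,  // all requests for a file use one connection
} conn_sched_t;

// Hypothetical dispatcher: reads round-robin over the lower part of the
// connection pool, writes over the upper part, FH hash gives per-file
// affinity. The 50/50 split is an assumption, not the PR's actual policy.
size_t pick_connection(conn_sched_t sched, size_t nconn, uint64_t fh_hash)
{
    static std::atomic<uint64_t> rr_read{0}, rr_write{0};

    assert(nconn > 0);
    const size_t read_conns  = std::max<size_t>(nconn / 2, 1);
    const size_t write_conns = std::max<size_t>(nconn - read_conns, 1);

    switch (sched) {
    case CONN_SCHED_RR_R:
        return rr_read++ % read_conns;
    case CONN_SCHED_RR_W:
        return (read_conns + rr_write++ % write_conns) % nconn;
    case CONN_SCHED_FH_HASH:
    default:
        return fh_hash % nconn;
    }
}
```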
23 changes: 16 additions & 7 deletions turbonfs/sample-turbo-config.yaml
@@ -6,7 +6,7 @@
account: sjc22prdste06hnfsv3acc1
container: nfsv3test
cloud_suffix: blob.preprod.core.windows.net
port: 2047
port: 2048

#
# Auth Config
@@ -20,7 +20,7 @@ auth: false
# readdir_maxcount
# lookupcache: all|none|pos|positive
#
nconnect: 1
nconnect: 96
timeo: 600
retrans: 2
acregmin: 3
@@ -112,15 +112,24 @@ fuse_max_background: 4096
# Memory backed caches are controlled using cache.data.* configs, while
# file backed cache are controlled using filecache.* configs.
#
readahead_kb: 16384
# Readahead is automatically scaled (up and down) based on the available cache
# and whether there are ongoing writes competing for the cache. readahead_kb
# is the initial value, which is then scaled appropriately. It can be set to 0
# to disable readahead completely.
# In most cases you don't need to specify readahead_kb explicitly.
#
#readahead_kb: 16384
cache.attr.user.enable: true
cache.readdir.kernel.enable: true
cache.readdir.user.enable: true
cache.data.kernel.enable: true
cache.data.user.enable: true
cache.data.user.max_size_mb: 4096

filecache.enable: false
filecache.cachedir: /mnt
filecache.max_size_gb: 1000
cache_max_mb: 4096
#
# These are currently not supported.
#
#filecache.enable: false
#filecache.cachedir: /mnt
#filecache.max_size_gb: 1000
#cache_max_mb: 4096
2 changes: 1 addition & 1 deletion turbonfs/src/config.cpp
@@ -262,7 +262,7 @@ void aznfsc_cfg::set_defaults_and_sanitize()
if (readdir_maxcount == -1)
readdir_maxcount = 1048576;
if (readahead_kb == -1)
readahead_kb = 16384;
readahead_kb = AZNFSCFG_READAHEAD_KB_DEF;
if (cache.data.user.enable) {
if (cache.data.user.max_size_mb == -1)
cache.data.user.max_size_mb = AZNFSCFG_CACHE_MAX_MB_DEF;
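The config.cpp hunk only shows the new default being applied. Below is a sketch of the defaulting-and-clamping a sanitizer of this kind typically performs, reusing the AZNFSCFG_READAHEAD_KB_* constants from aznfsc.h; the clamping of explicit out-of-range values and the helper name are assumptions, since the diff only shows the default path.

```cpp
#include <cstdint>

// Constants from aznfsc.h (AZNFSCFG_READAHEAD_KB_DEF is added by this change).
#define AZNFSCFG_READAHEAD_KB_MIN 128
#define AZNFSCFG_READAHEAD_KB_MAX 1048576
#define AZNFSCFG_READAHEAD_KB_DEF 16384

// -1 means "not set by the user" after config parsing.
int64_t sanitize_readahead_kb(int64_t readahead_kb)
{
    if (readahead_kb == -1)
        return AZNFSCFG_READAHEAD_KB_DEF;

    // 0 disables readahead entirely (see sample-turbo-config.yaml).
    if (readahead_kb == 0)
        return 0;

    // Assumed behaviour: clamp explicit values into the supported range.
    if (readahead_kb < AZNFSCFG_READAHEAD_KB_MIN)
        return AZNFSCFG_READAHEAD_KB_MIN;
    if (readahead_kb > AZNFSCFG_READAHEAD_KB_MAX)
        return AZNFSCFG_READAHEAD_KB_MAX;
    return readahead_kb;
}
```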