diff --git a/turbonfs/extern/libnfs b/turbonfs/extern/libnfs index c9b72868..689a2018 160000 --- a/turbonfs/extern/libnfs +++ b/turbonfs/extern/libnfs @@ -1 +1 @@ -Subproject commit c9b7286878bc0388f033d9d6c56d0ddf7bb4e49c +Subproject commit 689a201851cf46e07dd7c45941ede87da5e096ff diff --git a/turbonfs/inc/aznfsc.h b/turbonfs/inc/aznfsc.h index 3734cb32..fa4a3eb6 100644 --- a/turbonfs/inc/aznfsc.h +++ b/turbonfs/inc/aznfsc.h @@ -48,6 +48,7 @@ static_assert(AZNFSCFG_WSIZE_MAX == AZNFSCFG_RSIZE_MAX); #define AZNFSCFG_READDIR_MAX 4194304 #define AZNFSCFG_READAHEAD_KB_MIN 128 #define AZNFSCFG_READAHEAD_KB_MAX 1048576 +#define AZNFSCFG_READAHEAD_KB_DEF 16384 #define AZNFSCFG_FUSE_MAX_BG_MIN 1 #define AZNFSCFG_FUSE_MAX_BG_MAX 65536 #define AZNFSCFG_FUSE_MAX_BG_DEF 4096 diff --git a/turbonfs/inc/fcsm.h b/turbonfs/inc/fcsm.h index 12ddbbaf..91dcd944 100644 --- a/turbonfs/inc/fcsm.h +++ b/turbonfs/inc/fcsm.h @@ -225,17 +225,6 @@ class fcsm void ctgtq_cleanup(); void ftgtq_cleanup(); - /** - * Update fc_scale_factor according to the current cache pressure. - * When global cache utilization is high, it reduces fc_scale_factor so - * that all writers flush/commit early, for easing global memory pressure. - */ - static void update_fc_scale_factor(); - static double get_fc_scale_factor() - { - return fc_scale_factor; - } - private: /* * The singleton nfs_client, for convenience. @@ -345,23 +334,6 @@ class fcsm * The state machine starts in an idle state. */ std::atomic running = false; - - /* - * Value returned by max_dirty_extent_bytes() is scaled down by this much - * before it's used by: - * - flush_required() - * - commit_required() - * - do_inline_write() - * - * fc_scale_factor is computed by update_fc_scale_factor() according to - * the global cache pressure. If global cache pressure is high we want the - * local flush/commit limits to be reduced so that each file flushes/commits - * faster thus easing the global cache pressure. This promotes fair sharing - * of global cache space while also maintaining enough contiguous data to - * the server, needed for better write throughput. Stable and unstable - * write may use this scale factor differently. - */ - static std::atomic fc_scale_factor; }; struct FC_CB_TRACKER diff --git a/turbonfs/inc/file_cache.h b/turbonfs/inc/file_cache.h index 9fb37c1a..12debd1f 100644 --- a/turbonfs/inc/file_cache.h +++ b/turbonfs/inc/file_cache.h @@ -1389,37 +1389,7 @@ class bytes_chunk_cache * scheduler an opportunity to merge better. * For unstable writes this allows us enough PB parallelism. */ - static uint64_t max_dirty_extent_bytes() - { - // Maximum cache size allowed in bytes. - static const uint64_t max_total = - (aznfsc_cfg.cache.data.user.max_size_mb * 1024 * 1024ULL); - assert(max_total != 0); - - /* - * Capped due to global cache size. One single file should not use - * more than 60% of the cache. - */ - static const uint64_t max_dirty_extent_g = (max_total * 0.6); - - /* - * Capped due to per-file cache discipline. - * Every file wants to keep 10 full sized blocks but that can be - * reduced as per the current cache pressure, but never less than - * one full size block. - */ - static const uint64_t max_dirty_extent_l = - (10 * AZNFSC_MAX_BLOCK_SIZE) * fcsm::get_fc_scale_factor(); - assert(max_dirty_extent_l >= AZNFSC_MAX_BLOCK_SIZE); - - const uint64_t max_dirty_extent = - std::min(max_dirty_extent_g, max_dirty_extent_l); - - // At least one full sized block. 
- assert(max_dirty_extent >= AZNFSC_MAX_BLOCK_SIZE); - - return max_dirty_extent; - } + static uint64_t max_dirty_extent_bytes(); /** * Get the amount of dirty data that needs to be flushed. @@ -1684,8 +1654,8 @@ class bytes_chunk_cache assert(max_total != 0); /* - * If cache usage grows to 80% of max, we enforce inline pruning for - * writers. When cache usage grows more than 60% we recommend periodic + * If cache usage grows to 90% of max, we enforce inline pruning for + * writers. When cache usage grows more than 70% we recommend periodic * pruning. If the cache size is sufficient, hopefully we will not need * inline pruning too often, as it hurts application write performance. * Once curr_bytes_total exceeds inline_threshold we need to perform @@ -1696,10 +1666,10 @@ class bytes_chunk_cache * Following also means that at any time, half of the cache_max_mb * can be safely present in the cache. */ - static const uint64_t inline_threshold = (max_total * 0.8); - static const uint64_t inline_target = (max_total * 0.7); - static const uint64_t periodic_threshold = (max_total * 0.6); - static const uint64_t periodic_target = (max_total * 0.5); + static const uint64_t inline_threshold = (max_total * 0.9); + static const uint64_t inline_target = (max_total * 0.8); + static const uint64_t periodic_threshold = (max_total * 0.7); + static const uint64_t periodic_target = (max_total * 0.6); /* * Current total cache size in bytes. Save it once to avoid issues diff --git a/turbonfs/inc/nfs_client.h b/turbonfs/inc/nfs_client.h index 84a78c09..489d8c55 100644 --- a/turbonfs/inc/nfs_client.h +++ b/turbonfs/inc/nfs_client.h @@ -137,6 +137,38 @@ struct nfs_client std::atomic max_ino = 0; #endif + /* + * Last 5 sec read and write throughput. + * rw_genid is updated everytime these values are updated, so can be used + * to check when throughput is updated. + */ + std::atomic r_MBps = 0; + std::atomic w_MBps = 0; + std::atomic rw_genid = 0; + + /* + * Value returned by max_dirty_extent_bytes() is scaled down by this much + * before it's used by: + * - flush_required() + * - commit_required() + * - do_inline_write() + * + * fc_scale_factor is computed by periodic_updater() according to the global + * cache pressure. If global cache pressure is high we want the local + * flush/commit limits to be reduced so that each file flushes/commits + * faster thus easing the global cache pressure. This promotes fair sharing + * of global cache space while also maintaining enough contiguous data to + * the server, needed for better write throughput. Stable and unstable + * write may use this scale factor differently. + */ + static std::atomic fc_scale_factor; + + /* + * periodic_updater() will update this scaling factor to force all ra_state + * machines to slow down readahead in case of high memory pressure. + */ + static std::atomic ra_scale_factor; + /* * Set in shutdown() to let others know that nfs_client is shutting * down. They can use this to quit what they are doing and plan for @@ -185,6 +217,18 @@ struct nfs_client return client; } + static double get_fc_scale_factor() + { + assert(fc_scale_factor >= 1.0/10); + return fc_scale_factor; + } + + static double get_ra_scale_factor() + { + assert(ra_scale_factor >= 0); + return ra_scale_factor; + } + /** * Returns true if nfs_client is shutting down. */ @@ -214,6 +258,38 @@ struct nfs_client return inode_map_lock_0; } + /** + * Update various stuff that needs to be periodically updated, like: + * - Last 5 sec read and write throughput. 
+     * - Readahead scale factor for controlling readahead amount, and
+     * - Flush/commit dirty data scale factor for controlling how long we keep
+     *   dirty data before flushing/committing.
+     *
+     * Call this from some place that's called very frequently.
+     */
+    void periodic_updater();
+
+    /**
+     * Get last 5 sec read throughput in MBps.
+     */
+    uint64_t get_read_MBps() const
+    {
+        return r_MBps;
+    }
+
+    /**
+     * Get last 5 sec write throughput in MBps.
+     */
+    uint64_t get_write_MBps() const
+    {
+        return w_MBps;
+    }
+
+    uint64_t get_rw_genid() const
+    {
+        return rw_genid;
+    }
+
     /*
      * The user should first init the client class before using it.
      */
diff --git a/turbonfs/inc/readahead.h b/turbonfs/inc/readahead.h
index 75a6d0f5..16f86715 100644
--- a/turbonfs/inc/readahead.h
+++ b/turbonfs/inc/readahead.h
@@ -305,22 +305,10 @@ class ra_state
         max_byte_read = UINT64_MAX;
     }
 
-    /**
-     * Update ra_scale_factor according to the current cache pressure.
-     * When global cache utilization is high, it reduces ra_scale_factor so
-     * that all readers use less ra window, for easing global memory pressure.
-     * Likewise when global cache utilization is low it increases the
-     * ra_scale_factor to let readers use higher readahead.
-     */
-    static void update_scale_factor();
-
     /**
      * Returns the scaled ra window that caller can safely use.
      */
-    uint64_t get_ra_bytes() const
-    {
-        return ra_bytes * ra_scale_factor;
-    }
+    uint64_t get_ra_bytes() const;
 
     /**
      * This will run self tests to test the correctness of this class.
diff --git a/turbonfs/inc/rpc_stats.h b/turbonfs/inc/rpc_stats.h
index 457bb737..8676bfc1 100644
--- a/turbonfs/inc/rpc_stats.h
+++ b/turbonfs/inc/rpc_stats.h
@@ -337,7 +337,9 @@ class rpc_stats_az
      * This will indicate our readahead effectiveness.
      * bytes_zeroed_from_cache: How many bytes were read from unmapped parts
      *                          of the cache and hence were zero filled.
-     * bytes_read_ahead: How many bytes were read ahead.
+     * num_readhead: Number of readahead calls made.
+     * bytes_read_ahead: How many bytes were read ahead using num_readhead
+     *                   calls.
      * tot_getattr_reqs: How many getattr requests were received from fuse.
      * getattr_served_from_cache: How many were served from inode->attr cache.
      * tot_lookup_reqs: How many lookup requests were received from fuse.
@@ -363,6 +365,8 @@ class rpc_stats_az
      *            beyond configured limit.
      * commit_gp: How many time commit was issued as global cache grew beyond
      *            configured limit.
+     * num_sync_membufs: How many times sync_membufs() was called.
+     * tot_bytes_sync_membufs: Total bytes flushed by sync_membufs().
      */
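A note on the rw_genid member added to nfs_client in the hunk above: periodic_updater() bumps the generation counter with a compare-exchange and only the winning thread publishes fresh r_MBps/w_MBps values, so a consumer can detect a new throughput sample just by watching the generation. Below is a minimal standalone sketch of that generation-counter pattern; throughput_sample, publish() and maybe_react() are illustrative stand-ins, not the project's actual API.

```cpp
#include <atomic>
#include <cstdint>
#include <cstdio>

// Illustrative stand-in for the throughput fields added to nfs_client.
struct throughput_sample {
    std::atomic<uint64_t> r_MBps{0};
    std::atomic<uint64_t> w_MBps{0};
    std::atomic<uint64_t> genid{0};

    // Producer side: mirrors the CAS-guarded update in periodic_updater(),
    // where only the thread that wins the generation bump publishes values.
    void publish(uint64_t r, uint64_t w, uint64_t expected_gen) {
        if (genid.compare_exchange_strong(expected_gen, expected_gen + 1)) {
            r_MBps = r;
            w_MBps = w;
        }
    }
};

// Consumer side: act only when the generation has moved since the last look.
void maybe_react(throughput_sample &ts, uint64_t &last_seen_gen) {
    const uint64_t gen = ts.genid.load();
    if (gen != last_seen_gen) {
        last_seen_gen = gen;
        std::printf("fresh sample: R=%lu MBps, W=%lu MBps\n",
                    (unsigned long) ts.r_MBps.load(),
                    (unsigned long) ts.w_MBps.load());
    }
}
```

This is the same idea get_rw_genid() exposes: callers never need to compare the throughput values themselves to know whether they were refreshed.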
     static std::atomic tot_read_reqs;
     static std::atomic failed_read_reqs;
@@ -371,6 +375,7 @@
     static std::atomic bytes_read_from_cache;
     static std::atomic bytes_zeroed_from_cache;
     static std::atomic bytes_read_ahead;
+    static std::atomic num_readhead;
     static std::atomic tot_getattr_reqs;
     static std::atomic getattr_served_from_cache;
     static std::atomic tot_lookup_reqs;
@@ -387,6 +392,8 @@
     static std::atomic flush_gp;
     static std::atomic commit_lp;
     static std::atomic commit_gp;
+    static std::atomic num_sync_membufs;
+    static std::atomic tot_bytes_sync_membufs;
     static std::atomic rpc_tasks_allocated;
     static std::atomic fuse_responses_awaited;
@@ -395,7 +402,7 @@
 #define INC_GBL_STATS(var, inc) rpc_stats_az::var += (inc)
 #define DEC_GBL_STATS(var, dec) {assert(rpc_stats_az::var >= dec); rpc_stats_az::var -= (dec);}
-#define GET_GBL_STATS(var) rpc_stats_az::var
+#define GET_GBL_STATS(var) rpc_stats_az::var.load()
 
 struct fuse_req_stats
 {
diff --git a/turbonfs/inc/rpc_task.h b/turbonfs/inc/rpc_task.h
index 29f15c8e..9b2fe01a 100644
--- a/turbonfs/inc/rpc_task.h
+++ b/turbonfs/inc/rpc_task.h
@@ -2225,7 +2225,6 @@ struct rpc_task
             DEC_GBL_STATS(fuse_responses_awaited, 1);
         }
 
-        INC_GBL_STATS(tot_bytes_written, count);
         free_rpc_task();
     }
 
@@ -2607,7 +2606,7 @@ class rpc_task_helper
          * used. Later init_*() method can set it to a more appropriate value.
          */
         task->csched = (task->client->mnt_options.nfs_port == 2047) ?
-                        CONN_SCHED_RR : CONN_SCHED_FH_HASH;
+                        CONN_SCHED_RR_W : CONN_SCHED_FH_HASH;
 
 #ifdef ENABLE_PARANOID
         task->issuing_tid = ::gettid();
diff --git a/turbonfs/inc/rpc_transport.h b/turbonfs/inc/rpc_transport.h
index a15cccbf..37acecf0 100644
--- a/turbonfs/inc/rpc_transport.h
+++ b/turbonfs/inc/rpc_transport.h
@@ -20,15 +20,22 @@ typedef enum
     /*
      * Round robin requests over all connections.
+     * Use CONN_SCHED_RR_R for read requests and CONN_SCHED_RR_W for write
+     * requests. This helps the scheduler ensure read and write requests use
+     * exclusive connections, else small write responses may get stuck behind
+     * large read responses and small read requests may get stuck behind
+     * large write requests. Note that this is not completely avoidable even
+     * though we prioritize smaller read requests over larger write requests.
      */
-    CONN_SCHED_RR = 2,
+    CONN_SCHED_RR_R = 2,
+    CONN_SCHED_RR_W = 3,
 
     /*
      * Every file is affined to one connection based on the FH hash, so all
      * requests to one file go over the same connection while different files
      * will use different connections.
      */
-    CONN_SCHED_FH_HASH = 3,
+    CONN_SCHED_FH_HASH = 4,
 } conn_sched_t;
 
 /*
diff --git a/turbonfs/sample-turbo-config.yaml b/turbonfs/sample-turbo-config.yaml
index 03063a6f..992c08d1 100644
--- a/turbonfs/sample-turbo-config.yaml
+++ b/turbonfs/sample-turbo-config.yaml
@@ -6,7 +6,7 @@
 account: sjc22prdste06hnfsv3acc1
 container: nfsv3test
 cloud_suffix: blob.preprod.core.windows.net
-port: 2047
+port: 2048
 
 #
 # Auth Config
@@ -20,7 +20,7 @@ auth: false
 # readdir_maxcount
 # lookupcache: all|none|pos|positive
 #
-nconnect: 1
+nconnect: 96
 timeo: 600
 retrans: 2
 acregmin: 3
@@ -112,7 +112,13 @@
 fuse_max_background: 4096
 
 # Memory backed caches are controlled using cache.data.* configs, while
 # file backed cache are controlled using filecache.* configs.
 #
-readahead_kb: 16384
+# Readahead is automatically scaled (up and down) based on the available cache
+# and whether there are ongoing writes competing for the cache. readahead_kb
+# is the initial value which is then scaled appropriately.
It can be set to 0 +# for disabling readaheads completely. +# For most cases you don't need to specify readahead_kb explicitly. +# +#readahead_kb: 16384 cache.attr.user.enable: true cache.readdir.kernel.enable: true cache.readdir.user.enable: true @@ -120,7 +126,10 @@ cache.data.kernel.enable: true cache.data.user.enable: true cache.data.user.max_size_mb: 4096 -filecache.enable: false -filecache.cachedir: /mnt -filecache.max_size_gb: 1000 -cache_max_mb: 4096 +# +# These are currently not supported. +# +#filecache.enable: false +#filecache.cachedir: /mnt +#filecache.max_size_gb: 1000 +#cache_max_mb: 4096 diff --git a/turbonfs/src/config.cpp b/turbonfs/src/config.cpp index f009479a..3ee1c26c 100644 --- a/turbonfs/src/config.cpp +++ b/turbonfs/src/config.cpp @@ -262,7 +262,7 @@ void aznfsc_cfg::set_defaults_and_sanitize() if (readdir_maxcount == -1) readdir_maxcount = 1048576; if (readahead_kb == -1) - readahead_kb = 16384; + readahead_kb = AZNFSCFG_READAHEAD_KB_DEF; if (cache.data.user.enable) { if (cache.data.user.max_size_mb == -1) cache.data.user.max_size_mb = AZNFSCFG_CACHE_MAX_MB_DEF; diff --git a/turbonfs/src/fcsm.cpp b/turbonfs/src/fcsm.cpp index 9918c5e3..08068ecd 100644 --- a/turbonfs/src/fcsm.cpp +++ b/turbonfs/src/fcsm.cpp @@ -3,7 +3,6 @@ #include "nfs_inode.h" namespace aznfsc { -/* static */ std::atomic fcsm::fc_scale_factor = 1.0; /** * This is called from alloc_fcsm() with exclusive lock on ilock_1. @@ -110,51 +109,6 @@ void fcsm::add_committing(uint64_t bytes) assert(committing_seq_num <= flushed_seq_num); } -/* static */ -void fcsm::update_fc_scale_factor() -{ - // Maximum cache size allowed in bytes. - static const uint64_t max_cache = - (aznfsc_cfg.cache.data.user.max_size_mb * 1024 * 1024ULL); - assert(max_cache != 0); - const uint64_t curr_cache = bytes_chunk_cache::bytes_allocated_g; - const double percent_cache = (curr_cache * 100.0) / max_cache; - double scale = 1.0; - - if (percent_cache > 95) { - /* - * Every file has fundamental right to 100MB of cache space. - * If we reduce it further we will end up in sub-optimal writes - * to the server. - */ - scale = 1.0/10; - } else if (percent_cache > 90) { - // 200MB - scale = 2.0/10; - } else if (percent_cache > 80) { - // 300MB - scale = 3.0/10; - } else if (percent_cache > 70) { - // 500MB - scale = 5.0/10; - } else if (percent_cache > 60) { - // 800MB - scale = 8.0/10; - } - - if (fc_scale_factor != scale) { - static uint64_t last_log_usec; - const uint64_t now = get_current_usecs(); - // Don't log more frequently than 5 secs. - if ((now - last_log_usec) > (5 * 1000 * 1000)) { - AZLogInfo("[FC] Scale factor updated ({} -> {})", - fc_scale_factor.load(), scale); - last_log_usec = now; - } - fc_scale_factor = scale; - } -} - void fcsm::run(struct rpc_task *task, uint64_t extent_left, uint64_t extent_right) diff --git a/turbonfs/src/file_cache.cpp b/turbonfs/src/file_cache.cpp index 6a837bdc..4d4d917d 100644 --- a/turbonfs/src/file_cache.cpp +++ b/turbonfs/src/file_cache.cpp @@ -2404,6 +2404,39 @@ int bytes_chunk_cache::truncate(uint64_t trunc_len, return mb_skipped; } +/* static */ +uint64_t bytes_chunk_cache::max_dirty_extent_bytes() +{ + // Maximum cache size allowed in bytes. + static const uint64_t max_total = + (aznfsc_cfg.cache.data.user.max_size_mb * 1024 * 1024ULL); + assert(max_total != 0); + + /* + * Capped due to global cache size. One single file should not use + * more than 60% of the cache. 
+ */ + static const uint64_t max_dirty_extent_g = (max_total * 0.6); + + /* + * Capped due to per-file cache discipline. + * Every file wants to keep 10 full sized blocks but that can be + * reduced as per the current cache pressure, but never less than + * one full size block. + */ + static const uint64_t max_dirty_extent_l = + (10 * AZNFSC_MAX_BLOCK_SIZE) * nfs_client::get_fc_scale_factor(); + assert(max_dirty_extent_l >= AZNFSC_MAX_BLOCK_SIZE); + + const uint64_t max_dirty_extent = + std::min(max_dirty_extent_g, max_dirty_extent_l); + + // At least one full sized block. + assert(max_dirty_extent >= AZNFSC_MAX_BLOCK_SIZE); + + return max_dirty_extent; +} + /* * TODO: Add pruning stats. */ @@ -2413,11 +2446,10 @@ void bytes_chunk_cache::inline_prune() uint64_t pruned_bytes = 0; /* - * See if we need to slowdown/speedup readahead and flush/commit more - * promptly, per the current memory pressure. + * Update various client level stuff that needs to be updated periodically, + * like various read/write scale factors, last 5 secs throughput, etc. */ - ra_state::update_scale_factor(); - fcsm::update_fc_scale_factor(); + nfs_client::get_instance().periodic_updater(); get_prune_goals(&inline_bytes, nullptr); diff --git a/turbonfs/src/nfs_client.cpp b/turbonfs/src/nfs_client.cpp index 924a2c77..add4b826 100644 --- a/turbonfs/src/nfs_client.cpp +++ b/turbonfs/src/nfs_client.cpp @@ -4,6 +4,11 @@ #include "rpc_task.h" #include "rpc_readdir.h" +/* static */ +std::atomic nfs_client::ra_scale_factor = 1.0; +/* static */ +std::atomic nfs_client::fc_scale_factor = 1.0; + // The user should first init the client class before using it. bool nfs_client::init() { @@ -209,6 +214,197 @@ void nfs_client::shutdown() jukebox_thread.join(); } +void nfs_client::periodic_updater() +{ + // Maximum cache size allowed in bytes. + static const uint64_t max_cache = + (aznfsc_cfg.cache.data.user.max_size_mb * 1024 * 1024ULL); + assert(max_cache != 0); + + /* + * #1 Calculate recent read/write throughput. + */ + static std::atomic last_sec; + static std::atomic last_bytes_written; + static std::atomic last_bytes_read; + static std::atomic last_genid; + const time_t now_sec = ::time(NULL); + const int sample_intvl = 5; + + assert(GET_GBL_STATS(tot_bytes_written) >= last_bytes_written); + assert(GET_GBL_STATS(tot_bytes_read) >= last_bytes_read); + assert(now_sec >= last_sec); + + /* + * Every sample_intvl, compute read/write throughput for the last + * interval. Only one thread should update the throughput. + */ + const int intvl = now_sec - last_sec; + if (intvl >= sample_intvl) { + uint64_t expected = last_genid.load(); + if (rw_genid.compare_exchange_strong(expected, expected + 1)) { + w_MBps = (GET_GBL_STATS(tot_bytes_written) - last_bytes_written) / + ((now_sec - last_sec) * 1000'000); + r_MBps = (GET_GBL_STATS(tot_bytes_read) - last_bytes_read) / + ((now_sec - last_sec) * 1000'000); + + last_sec = now_sec; + last_bytes_read = GET_GBL_STATS(tot_bytes_read); + last_bytes_written = GET_GBL_STATS(tot_bytes_written); + last_genid = rw_genid.load(); + } + } + + /* + * #2 Update scaling factor for readahead and flush/commit. + * The way we decide optimal values of these scaling factors is by + * letting each of these grow to take up more space as soon as its + * share of the cache grows and reduce the scaling to take up less + * space as its share grows. These competing forces result in both + * reaching equilibrium. + */ + + /* + * cache => total cache allocated, read + write. + * wcache => cache allocated by writers. 
+     * rcache => cache allocated by readers.
+     *
+     * Note: We use dirty and commit_pending to indicate cache used by writers.
+     *       This assumes that cache is released immediately after write or
+     *       commit completes, else we will account for write cache as read.
+     */
+    const uint64_t cache = bytes_chunk_cache::bytes_allocated_g;
+    const uint64_t wcache = bytes_chunk_cache::bytes_dirty_g +
+                            bytes_chunk_cache::bytes_commit_pending_g;
+    const uint64_t rcache = (uint64_t) std::max((int64_t)(cache - wcache), 0L);
+
+    /*
+     * Read cache usage as percent of max_cache.
+     * This tells how aggressively readers are filling up the cache by
+     * readahead. We increase readahead if this is low and reduce readahead
+     * as this grows.
+     */
+    const double pct_rcache = (rcache * 100.0) / max_cache;
+
+    /*
+     * Write cache usage as percent of max_cache.
+     * This tells how aggressively writers are filling up the cache by adding
+     * dirty data. We reduce max_dirty_extent_bytes() if this grows so that
+     * dirty data is flushed faster, and increase max_dirty_extent_bytes() if
+     * this is low and we want to accumulate more dirty data and flush in
+     * bigger chunks, for fast backend write throughput.
+     */
+    const double pct_wcache = (wcache * 100.0) / max_cache;
+
+    /*
+     * Total cache usage (read + write) as percent of max_cache.
+     * Reads and writes are controlled individually as per pct_rcache and
+     * pct_wcache, but in the end total cache usage is consulted and the
+     * read and write scale factors are reduced accordingly.
+     */
+    const double pct_cache = (cache * 100.0) / max_cache;
+
+    assert(pct_cache <= 100.0);
+    assert((pct_rcache + pct_wcache) <= pct_cache);
+
+    double rscale = 1.0;
+    double wscale = 1.0;
+
+    /*
+     * Stop readahead completely if we are beyond the max cache size, o/w scale
+     * it down proportionately to keep the cache size less than max cache limit.
+     * We also scale up the readahead to make better utilization of the allowed
+     * cache size, when there are fewer reads they are allowed to use more of
+     * the cache per user for readahead.
+     *
+     * Just like read, write also tries to mop up cache space. Those two apply
+     * opposing forces, finally reaching an equilibrium.
+     */
+    if (pct_rcache >= 100) {
+        /*
+         * Reads are taking up all the cache space, completely stop readaheads.
+         * This will cause read cache utilization to drop and then we will
+         * increase readaheads, finally it'll settle on an optimal value.
+         */
+        rscale = 0;
+    } else if (pct_rcache > 95) {
+        rscale = 0.5;
+    } else if (pct_rcache > 90) {
+        rscale = 0.7;
+    } else if (pct_rcache > 80) {
+        rscale = 0.8;
+    } else if (pct_rcache > 70) {
+        rscale = 0.9;
+    } else if (pct_rcache < 3) {
+        rscale = 32;
+    } else if (pct_rcache < 5) {
+        rscale = 24;
+    } else if (pct_rcache < 10) {
+        rscale = 12;
+    } else if (pct_rcache < 20) {
+        rscale = 6;
+    } else if (pct_rcache < 30) {
+        rscale = 4;
+    } else if (pct_rcache < 50) {
+        rscale = 2.5;
+    }
+
+    if (pct_wcache > 95) {
+        /*
+         * Every file has a fundamental right to 100MB of cache space.
+         * If we reduce it further we will end up in sub-optimal writes
+         * to the server.
+         */
+        wscale = 1.0/10;
+    } else if (pct_wcache > 90) {
+        // 200MB
+        wscale = 2.0/10;
+    } else if (pct_wcache > 80) {
+        // 300MB
+        wscale = 3.0/10;
+    } else if (pct_wcache > 70) {
+        // 400MB
+        wscale = 4.0/10;
+    } else if (pct_wcache > 60) {
+        // 600MB
+        wscale = 6.0/10;
+    } else if (pct_wcache > 50) {
+        // 700MB
+        wscale = 7.0/10;
+    }
+
+    static uint64_t last_log_sec;
+    bool log_now = false;
+
+    // Don't log more frequently than 5 secs.
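The rscale/wscale threshold tables above are the heart of periodic_updater(); the rate-limited logging that publishes these values continues in the hunk just below. For reference, here is a minimal standalone restatement of those two tables as pure functions, which makes the mapping easy to unit test in isolation. The thresholds are copied from the hunk above; ra_scale_for() and fc_scale_for() are illustrative names, not the project's code.

```cpp
#include <cassert>
#include <cstdio>

// Readahead scale as a function of read cache utilization (percent).
static double ra_scale_for(double pct_rcache) {
    if (pct_rcache >= 100) return 0;     // stop readahead entirely
    if (pct_rcache > 95)   return 0.5;
    if (pct_rcache > 90)   return 0.7;
    if (pct_rcache > 80)   return 0.8;
    if (pct_rcache > 70)   return 0.9;
    if (pct_rcache < 3)    return 32;
    if (pct_rcache < 5)    return 24;
    if (pct_rcache < 10)   return 12;
    if (pct_rcache < 20)   return 6;
    if (pct_rcache < 30)   return 4;
    if (pct_rcache < 50)   return 2.5;
    return 1.0;
}

// Flush/commit scale as a function of write cache utilization (percent).
static double fc_scale_for(double pct_wcache) {
    if (pct_wcache > 95) return 1.0/10;  // ~100MB of dirty data per file
    if (pct_wcache > 90) return 2.0/10;
    if (pct_wcache > 80) return 3.0/10;
    if (pct_wcache > 70) return 4.0/10;
    if (pct_wcache > 60) return 6.0/10;
    if (pct_wcache > 50) return 7.0/10;
    return 1.0;
}

int main() {
    // Spot checks against the tables above.
    assert(ra_scale_for(2) == 32);      // almost no read cache in use
    assert(ra_scale_for(60) == 1.0);    // mid range, no scaling
    assert(fc_scale_for(96) == 1.0/10); // heavy write cache pressure
    assert(fc_scale_for(40) == 1.0);
    std::printf("scale tables ok\n");
    return 0;
}
```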
+ if ((now_sec - last_log_sec) >= 5) { + log_now = true; + last_log_sec = now_sec; + } + + if (fc_scale_factor != wscale) { + if (log_now) { + AZLogInfo("[FC] Scale factor updated ({} -> {}), " + "cache util [R: {:0.2f}%, W: {:0.2f}%, T: {:0.2f}%]", + fc_scale_factor.load(), wscale, + pct_rcache, pct_wcache, pct_cache); + } + fc_scale_factor = wscale; + assert(fc_scale_factor >= 1.0/10); + } + + if (ra_scale_factor != rscale) { + if (log_now) { + AZLogInfo("[RA] Scale factor updated ({} -> {}), " + "cache util [R: {:0.2f}%, W: {:0.2f}%, T: {:0.2f}%]", + ra_scale_factor.load(), rscale, + pct_rcache, pct_wcache, pct_cache); + } + ra_scale_factor = rscale; + assert(ra_scale_factor >= 0); + } +} + void nfs_client::jukebox_runner() { AZLogDebug("Started jukebox_runner"); diff --git a/turbonfs/src/nfs_inode.cpp b/turbonfs/src/nfs_inode.cpp index 3087d856..98f30516 100644 --- a/turbonfs/src/nfs_inode.cpp +++ b/turbonfs/src/nfs_inode.cpp @@ -663,6 +663,8 @@ void nfs_inode::sync_membufs(std::vector &bc_vec, */ assert(!is_commit_in_progress()); + INC_GBL_STATS(num_sync_membufs, 1); + if (bc_vec.empty()) { return; } @@ -818,6 +820,8 @@ void nfs_inode::sync_membufs(std::vector &bc_vec, continue; } + INC_GBL_STATS(tot_bytes_sync_membufs, mb->length.load()); + if (write_task == nullptr) { write_task = get_client()->get_rpc_task_helper()->alloc_rpc_task(FUSE_WRITE); @@ -1422,7 +1426,7 @@ int nfs_inode::flush_cache_and_wait() " flush to complete still waiting, iter: {}", get_fuse_ino(), iter); } - ::usleep(1000); + ::usleep(10 * 1000); } return get_write_error(); diff --git a/turbonfs/src/readahead.cpp b/turbonfs/src/readahead.cpp index bcb32a3c..45ef796b 100644 --- a/turbonfs/src/readahead.cpp +++ b/turbonfs/src/readahead.cpp @@ -18,8 +18,6 @@ namespace aznfsc { -/* static */ std::atomic ra_state::ra_scale_factor = 1.0; - /** * This is called from alloc_rastate() with exclusive lock on ilock_1. */ @@ -64,62 +62,9 @@ ra_state::ra_state(struct nfs_client *_client, inode->get_fuse_ino(), ra_bytes, def_ra_size); } -/* static */ -void ra_state::update_scale_factor() +uint64_t ra_state::get_ra_bytes() const { - // Maximum cache size allowed in bytes. - static const uint64_t max_cache = - (aznfsc_cfg.cache.data.user.max_size_mb * 1024 * 1024ULL); - assert(max_cache != 0); - const uint64_t curr_cache = bytes_chunk_cache::bytes_allocated_g; - const double percent_cache = (curr_cache * 100.0) / max_cache; - double scale = 1.0; - - /* - * Stop readahead fully if we are beyond the max cache size, o/w scale it - * down proportionately to keep the cache size less than max cache limit. - * We also scale up the readahead to make better utilization of the allowed - * cache size, when there are fewer reads they are allowed to use more of - * the cache per user for readahead. We increase the scale factor just a - * little less than what would exhaust the entire cache, e.g., if percent - * cache utilization is ~5% it means if we increase the readahead by 20 - * times we should be able to get all of cache utilized by readaheads, - * we increase the scale factor to slightly less than 20, to 16, and so - * on. 
- */ - if (percent_cache >= 100) { - scale = 0; - } else if (percent_cache > 95) { - scale = 0.5; - } else if (percent_cache > 90) { - scale = 0.7; - } else if (percent_cache > 80) { - scale = 0.8; - } else if (percent_cache > 70) { - scale = 0.9; - } else if (percent_cache < 5) { - scale = 16; - } else if (percent_cache < 10) { - scale = 8; - } else if (percent_cache < 20) { - scale = 4; - } else if (percent_cache < 30) { - scale = 2; - } else if (percent_cache < 50) { - scale = 1.5; - } - - if (ra_scale_factor != scale) { - static uint64_t last_log_usec; - const uint64_t now = get_current_usecs(); - // Don't log more frequently than 5 secs. - if ((now - last_log_usec) > (5 * 1000 * 1000)) { - AZLogInfo("[Readahead] Scale factor updated ({} -> {})", - ra_scale_factor.load(), scale); - last_log_usec = now; - } - ra_scale_factor = scale; - } + return ra_bytes * nfs_client::get_ra_scale_factor(); } /** @@ -244,6 +189,7 @@ static void readahead_callback ( goto delete_ctx; } else { UPDATE_INODE_ATTR(inode, res->READ3res_u.resok.file_attributes); + INC_GBL_STATS(tot_bytes_read, res->READ3res_u.resok.count); /* * Only first read call would have bc->pvt == 0, for subsequent calls @@ -722,6 +668,8 @@ int ra_state::issue_readaheads() inode->get_fuse_ino(), num_no_readahead.load(), ra_offset); } + } else { + INC_GBL_STATS(num_readhead, 1); } return ra_issued; diff --git a/turbonfs/src/rpc_stats.cpp b/turbonfs/src/rpc_stats.cpp index 2e29eed1..b85ac5d8 100644 --- a/turbonfs/src/rpc_stats.cpp +++ b/turbonfs/src/rpc_stats.cpp @@ -18,6 +18,7 @@ namespace aznfsc { /* static */ std::atomic rpc_stats_az::bytes_read_from_cache = 0; /* static */ std::atomic rpc_stats_az::bytes_zeroed_from_cache = 0; /* static */ std::atomic rpc_stats_az::bytes_read_ahead = 0; +/* static */ std::atomic rpc_stats_az::num_readhead = 0; /* static */ std::atomic rpc_stats_az::tot_getattr_reqs = 0; /* static */ std::atomic rpc_stats_az::getattr_served_from_cache = 0; /* static */ std::atomic rpc_stats_az::tot_lookup_reqs = 0; @@ -34,6 +35,8 @@ namespace aznfsc { /* static */ std::atomic rpc_stats_az::commit_lp = 0; /* static */ std::atomic rpc_stats_az::commit_gp = 0; /* static */ std::atomic rpc_stats_az::writes_np = 0; +/* static */ std::atomic rpc_stats_az::num_sync_membufs = 0; +/* static */ std::atomic rpc_stats_az::tot_bytes_sync_membufs = 0; /* static */ std::atomic rpc_stats_az::rpc_tasks_allocated = 0; /* static */ std::atomic rpc_stats_az::fuse_responses_awaited = 0; /* static */ std::atomic rpc_stats_az::fuse_reply_failed = 0; @@ -176,6 +179,11 @@ void rpc_stats_az::dump_stats() str += " " + std::to_string(num_silly_renamed) + " inodes silly-renamed (waiting for last close)\n"; + // Maximum cache size allowed in bytes. 
+ static const uint64_t max_cache = + (aznfsc_cfg.cache.data.user.max_size_mb * 1024 * 1024ULL); + assert(max_cache != 0); + str += "File Cache statistics:\n"; if (aznfsc_cfg.cache.data.user.enable) { str += " " + std::to_string(aznfsc_cfg.cache.data.user.max_size_mb) + @@ -192,12 +200,22 @@ void rpc_stats_az::dump_stats() " file caches\n"; str += " " + std::to_string(bytes_chunk_cache::num_chunks_g) + " chunks in chunkmap\n"; + + const double allocate_pct = + ((bytes_chunk_cache::bytes_allocated_g * 100.0) / max_cache); str += " " + std::to_string(bytes_chunk_cache::bytes_allocated_g) + - " bytes allocated\n"; + " bytes allocated (" + + std::to_string(allocate_pct) + "%)\n"; + str += " " + std::to_string(bytes_chunk_cache::bytes_cached_g) + " bytes cached\n"; + + const double dirty_pct = + ((bytes_chunk_cache::bytes_dirty_g * 100.0) / max_cache); str += " " + std::to_string(bytes_chunk_cache::bytes_dirty_g) + - " bytes dirty\n"; + " bytes dirty (" + + std::to_string(dirty_pct) + "%)\n"; + str += " " + std::to_string(bytes_chunk_cache::bytes_flushing_g) + " bytes currently flushing\n"; str += " " + std::to_string(bytes_chunk_cache::bytes_commit_pending_g) + @@ -262,8 +280,13 @@ void rpc_stats_az::dump_stats() str += " " + std::to_string(GET_GBL_STATS(bytes_zeroed_from_cache)) + " bytes holes read from cache (" + std::to_string(hole_cache_pct) + "%)\n"; + + const uint64_t avg_ra_size = + num_readhead ? (bytes_read_ahead / num_readhead) : 0; str += " " + std::to_string(GET_GBL_STATS(bytes_read_ahead)) + - " bytes read by readahead\n"; + " bytes read by readahead with avg size " + + std::to_string(avg_ra_size) + " bytes and ra scale factor " + + std::to_string(nfs_client::get_ra_scale_factor()) + "\n"; const uint64_t avg_write_size = tot_write_reqs ? (tot_bytes_written / tot_write_reqs) : 0; @@ -275,6 +298,7 @@ void rpc_stats_az::dump_stats() str += " " + std::to_string(GET_GBL_STATS(failed_write_reqs)) + " application writes failed\n"; } + str += " " + std::to_string(GET_GBL_STATS(writes_np)) + " writes did not hit any memory pressure\n"; str += " " + std::to_string(GET_GBL_STATS(inline_writes)) + @@ -294,6 +318,13 @@ void rpc_stats_az::dump_stats() str += " " + std::to_string(GET_GBL_STATS(commit_gp)) + " commits triggered as global cache limit was reached\n"; + const uint64_t avg_sync_membufs_size = + num_sync_membufs ? (tot_bytes_sync_membufs / num_sync_membufs) : 0; + str += " " + std::to_string(GET_GBL_STATS(num_sync_membufs)) + + " sync_membufs calls with avg size " + + std::to_string(avg_sync_membufs_size) + " bytes and fc scale factor " + + std::to_string(nfs_client::get_fc_scale_factor()) + "\n"; + const double getattr_cache_pct = tot_getattr_reqs ? ((getattr_served_from_cache * 100.0) / tot_getattr_reqs) : 0; @@ -348,6 +379,13 @@ do { \ str += " Avg Total execute time: " + \ std::to_string(ops.total_usec / (ops.count * 1000.0)) + \ " msec\n"; \ + if (opcode == FUSE_READ) { \ + str += " Last 5 sec throughput: " + \ + std::to_string(client.get_read_MBps()) + " MBps\n"; \ + } else if (opcode == FUSE_WRITE) { \ + str += " Last 5 sec throughput: " + \ + std::to_string(client.get_write_MBps()) + " MBps\n"; \ + } \ if (!ops.error_map.empty()) { \ str += " Errors encountered: \n"; \ for (const auto& entry : ops.error_map) { \ diff --git a/turbonfs/src/rpc_task.cpp b/turbonfs/src/rpc_task.cpp index 488eb36f..f278c547 100644 --- a/turbonfs/src/rpc_task.cpp +++ b/turbonfs/src/rpc_task.cpp @@ -674,7 +674,7 @@ void rpc_task::init_read_be(fuse_ino_t ino, * * TODO: Control this with a config. 
*/ - set_csched(CONN_SCHED_RR); + set_csched(CONN_SCHED_RR_R); assert(!rpc_api->read_task.is_fe()); assert(rpc_api->read_task.is_be()); @@ -1563,7 +1563,7 @@ void rpc_task::issue_write_rpc() * issues as seen by stable writes. */ if (!inode->is_stable_write()) { - set_csched(CONN_SCHED_RR); + set_csched(CONN_SCHED_RR_W); } do { @@ -1590,6 +1590,12 @@ void rpc_task::issue_write_rpc() ::sleep(5); } } while (rpc_retry); + + /* + * Write bytes are counted when we send them to libnfs as that's when they + * appear on the wire. + */ + INC_GBL_STATS(tot_bytes_written, bciov->length); } static void statfs_callback( @@ -3750,7 +3756,6 @@ void rpc_task::send_read_response() AZLogDebug("[{}] Sending empty read response", ino); reply_iov(nullptr, 0); } else { - INC_GBL_STATS(tot_bytes_read, bytes_read); AZLogDebug("[{}] Sending success read response, iovec={}, " "bytes_read={}", ino, count, bytes_read); @@ -3866,6 +3871,12 @@ static void read_callback( */ UPDATE_INODE_ATTR(inode, res->READ3res_u.resok.file_attributes); + /* + * Reads are counted in the callback as that's when the read + * responses have just come on the wire. + */ + INC_GBL_STATS(tot_bytes_read, res->READ3res_u.resok.count); + #ifdef ENABLE_PRESSURE_POINTS /* * Short read pressure point, skip when eof received. diff --git a/turbonfs/src/rpc_transport.cpp b/turbonfs/src/rpc_transport.cpp index c8c58222..471867ca 100644 --- a/turbonfs/src/rpc_transport.cpp +++ b/turbonfs/src/rpc_transport.cpp @@ -106,27 +106,81 @@ void rpc_transport::close() /* * This function decides which connection should be chosen for sending * the current request. - * TODO: This is round-robined for now, should be modified later. */ struct nfs_context *rpc_transport::get_nfs_context(conn_sched_t csched, uint32_t fh_hash) const { - uint32_t idx = 0; + int idx = 0; + const int nconn = client->mnt_options.num_connections; + assert(nconn > 0); + const int rconn = nconn / 3; + const int wconn = nconn - rconn; + + static std::atomic last_sec; + static std::atomic rnw = false; + uint64_t now_sec = ::time(NULL); + + assert(now_sec >= last_sec); + + /* + * Take stock of things, no sooner than 5 secs. + */ + if (now_sec > (last_sec + 5)) { + const uint64_t r_MBps = client->get_read_MBps(); + const uint64_t w_MBps = client->get_write_MBps(); + + /* + * If both read and write are happening, assign them to separate + * connection pool, else writes may slow down reads as small write + * responses have to wait behind large read responses and v.v. + */ + if (w_MBps > 100 && r_MBps > 100) { + rnw = (nconn >= 4); + AZLogInfo("[RNW=true] Write: {} Gbps, Read: {} Gbps", + (w_MBps * 8.0) / 1000, (r_MBps * 8.0) / 1000); + } else { + rnw = false; + AZLogInfo("[RNW=false] Write: {} Gbps, Read: {} Gbps", + (w_MBps * 8.0) / 1000, (r_MBps * 8.0) / 1000); + } + + assert(!rnw || (rconn > 0 && wconn > 0)); + last_sec = now_sec; + } switch (csched) { case CONN_SCHED_FIRST: idx = 0; break; - case CONN_SCHED_RR: - idx = (last_context++ % client->mnt_options.num_connections); + case CONN_SCHED_RR_R: + /* + * Reads get top 1/3rd of connections when there are simultaneous + * readers and writers, else they get all the connections. + */ + idx = (rnw ? (wconn + (last_context++ % rconn)) + : (last_context++ % nconn)); + break; + case CONN_SCHED_RR_W: + /* + * Writes get low 2/3rd of connections when there are simultaneous + * readers and writers, else they get all the connections. 
+ * + * TODO: We should not change write connection ranges while there + * are old writes still pending else we will get slowed down + * by optimistic concurrency backoff. + */ + idx = (rnw ? (last_context++ % wconn) + : (last_context++ % nconn)); break; case CONN_SCHED_FH_HASH: assert(fh_hash != 0); - idx = fh_hash % client->mnt_options.num_connections; + idx = rnw ? (fh_hash % wconn) : (fh_hash % nconn); break; default: assert(0); } + assert(idx >= 0 && idx < client->mnt_options.num_connections); + return nfs_connections[idx]->get_nfs_context(); }
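The connection split in get_nfs_context() is easiest to see with concrete numbers: with the sample config's nconnect: 96, rconn = 96 / 3 = 32 and wconn = 96 - 32 = 64, so while both readers and writers are active (rnw) reads round-robin over indices [64, 95] and writes over [0, 63]; otherwise either class spreads over all 96 connections. A small standalone sketch of that index math follows, assuming the same integer split; rr_read_index() and rr_write_index() are illustrative helpers, not part of the project, and rr stands in for the last_context++ counter.

```cpp
#include <cassert>
#include <cstdint>

// Mirrors the index selection for the round-robin schedulers, under the same
// nconn/rconn/wconn split as in rpc_transport::get_nfs_context() above.
static int rr_read_index(uint64_t rr, int nconn, bool rnw) {
    const int rconn = nconn / 3;
    const int wconn = nconn - rconn;
    // Reads take the top third only when read and write traffic overlap,
    // else they may use the whole pool.
    return rnw ? (int)(wconn + (rr % rconn)) : (int)(rr % nconn);
}

static int rr_write_index(uint64_t rr, int nconn, bool rnw) {
    const int rconn = nconn / 3;
    const int wconn = nconn - rconn;
    // Writes take the bottom two thirds when sharing, else the whole pool.
    return rnw ? (int)(rr % wconn) : (int)(rr % nconn);
}

int main() {
    // nconnect: 96 as in the sample config => rconn = 32, wconn = 64.
    for (uint64_t rr = 0; rr < 1000; rr++) {
        const int r = rr_read_index(rr, 96, true);
        const int w = rr_write_index(rr, 96, true);
        assert(r >= 64 && r <= 95);   // reads stay in the top 1/3rd
        assert(w >= 0 && w <= 63);    // writes stay in the bottom 2/3rd
    }
    return 0;
}
```

Keeping the two ranges disjoint is what prevents small write responses from queuing behind large read responses (and vice versa) on the same TCP connection.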