From 30cb6b02c65439f6deff2743823520fc4b40ab7b Mon Sep 17 00:00:00 2001 From: Nagendra Tomar Date: Fri, 21 Feb 2025 02:58:18 +0000 Subject: [PATCH 1/7] Use separate connection pool for reads and writes Read throughput have been seen to be affected when write throughput is high. This is likely because of the small read requests having to wait behind large write requests. Though we use high prio for read requests, with lots of writes happening, read requests are likely to sit behind the huge write data. By keeping separate connections, read requests can be promptly sent out and thus we can get the responses back faster. --- turbonfs/inc/rpc_stats.h | 5 ++- turbonfs/inc/rpc_task.h | 2 +- turbonfs/inc/rpc_transport.h | 9 +++-- turbonfs/src/nfs_inode.cpp | 6 +++- turbonfs/src/readahead.cpp | 2 ++ turbonfs/src/rpc_stats.cpp | 16 ++++++++- turbonfs/src/rpc_task.cpp | 4 +-- turbonfs/src/rpc_transport.cpp | 63 +++++++++++++++++++++++++++++++--- 8 files changed, 95 insertions(+), 12 deletions(-) diff --git a/turbonfs/inc/rpc_stats.h b/turbonfs/inc/rpc_stats.h index 457bb737..f681e0c7 100644 --- a/turbonfs/inc/rpc_stats.h +++ b/turbonfs/inc/rpc_stats.h @@ -371,6 +371,7 @@ class rpc_stats_az static std::atomic bytes_read_from_cache; static std::atomic bytes_zeroed_from_cache; static std::atomic bytes_read_ahead; + static std::atomic num_readhead; static std::atomic tot_getattr_reqs; static std::atomic getattr_served_from_cache; static std::atomic tot_lookup_reqs; @@ -387,6 +388,8 @@ class rpc_stats_az static std::atomic flush_gp; static std::atomic commit_lp; static std::atomic commit_gp; + static std::atomic num_sync_membufs; + static std::atomic tot_bytes_sync_membufs; static std::atomic rpc_tasks_allocated; static std::atomic fuse_responses_awaited; @@ -395,7 +398,7 @@ class rpc_stats_az #define INC_GBL_STATS(var, inc) rpc_stats_az::var += (inc) #define DEC_GBL_STATS(var, dec) {assert(rpc_stats_az::var >= dec); rpc_stats_az::var -= (dec);} -#define GET_GBL_STATS(var) rpc_stats_az::var +#define GET_GBL_STATS(var) rpc_stats_az::var.load() struct fuse_req_stats { diff --git a/turbonfs/inc/rpc_task.h b/turbonfs/inc/rpc_task.h index 29f15c8e..4771ea2d 100644 --- a/turbonfs/inc/rpc_task.h +++ b/turbonfs/inc/rpc_task.h @@ -2607,7 +2607,7 @@ class rpc_task_helper * used. Later init_*() method can set it to a more appropriate value. */ task->csched = (task->client->mnt_options.nfs_port == 2047) ? - CONN_SCHED_RR : CONN_SCHED_FH_HASH; + CONN_SCHED_RR_W : CONN_SCHED_FH_HASH; #ifdef ENABLE_PARANOID task->issuing_tid = ::gettid(); diff --git a/turbonfs/inc/rpc_transport.h b/turbonfs/inc/rpc_transport.h index a15cccbf..d677eeba 100644 --- a/turbonfs/inc/rpc_transport.h +++ b/turbonfs/inc/rpc_transport.h @@ -20,15 +20,20 @@ typedef enum /* * Round robin requests over all connections. + * Use CONN_SCHED_RR_R for read requests and CONN_SCHED_RR_W for write + * requests. This helps scheduler ensure read and write requests use + * exclusive connections else small write responses may get stuck behind + * large read responses. */ - CONN_SCHED_RR = 2, + CONN_SCHED_RR_R = 2, + CONN_SCHED_RR_W = 3, /* * Every file is affined to one connection based on the FH hash, so all * requests to one file go over the same connection while different files * will use different connections. 
*/ - CONN_SCHED_FH_HASH = 3, + CONN_SCHED_FH_HASH = 4, } conn_sched_t; /* diff --git a/turbonfs/src/nfs_inode.cpp b/turbonfs/src/nfs_inode.cpp index 3087d856..98f30516 100644 --- a/turbonfs/src/nfs_inode.cpp +++ b/turbonfs/src/nfs_inode.cpp @@ -663,6 +663,8 @@ void nfs_inode::sync_membufs(std::vector &bc_vec, */ assert(!is_commit_in_progress()); + INC_GBL_STATS(num_sync_membufs, 1); + if (bc_vec.empty()) { return; } @@ -818,6 +820,8 @@ void nfs_inode::sync_membufs(std::vector &bc_vec, continue; } + INC_GBL_STATS(tot_bytes_sync_membufs, mb->length.load()); + if (write_task == nullptr) { write_task = get_client()->get_rpc_task_helper()->alloc_rpc_task(FUSE_WRITE); @@ -1422,7 +1426,7 @@ int nfs_inode::flush_cache_and_wait() " flush to complete still waiting, iter: {}", get_fuse_ino(), iter); } - ::usleep(1000); + ::usleep(10 * 1000); } return get_write_error(); diff --git a/turbonfs/src/readahead.cpp b/turbonfs/src/readahead.cpp index bcb32a3c..41f3e6de 100644 --- a/turbonfs/src/readahead.cpp +++ b/turbonfs/src/readahead.cpp @@ -722,6 +722,8 @@ int ra_state::issue_readaheads() inode->get_fuse_ino(), num_no_readahead.load(), ra_offset); } + } else { + INC_GBL_STATS(num_readhead, 1); } return ra_issued; diff --git a/turbonfs/src/rpc_stats.cpp b/turbonfs/src/rpc_stats.cpp index 2e29eed1..8b39df29 100644 --- a/turbonfs/src/rpc_stats.cpp +++ b/turbonfs/src/rpc_stats.cpp @@ -18,6 +18,7 @@ namespace aznfsc { /* static */ std::atomic rpc_stats_az::bytes_read_from_cache = 0; /* static */ std::atomic rpc_stats_az::bytes_zeroed_from_cache = 0; /* static */ std::atomic rpc_stats_az::bytes_read_ahead = 0; +/* static */ std::atomic rpc_stats_az::num_readhead = 0; /* static */ std::atomic rpc_stats_az::tot_getattr_reqs = 0; /* static */ std::atomic rpc_stats_az::getattr_served_from_cache = 0; /* static */ std::atomic rpc_stats_az::tot_lookup_reqs = 0; @@ -34,6 +35,8 @@ namespace aznfsc { /* static */ std::atomic rpc_stats_az::commit_lp = 0; /* static */ std::atomic rpc_stats_az::commit_gp = 0; /* static */ std::atomic rpc_stats_az::writes_np = 0; +/* static */ std::atomic rpc_stats_az::num_sync_membufs = 0; +/* static */ std::atomic rpc_stats_az::tot_bytes_sync_membufs = 0; /* static */ std::atomic rpc_stats_az::rpc_tasks_allocated = 0; /* static */ std::atomic rpc_stats_az::fuse_responses_awaited = 0; /* static */ std::atomic rpc_stats_az::fuse_reply_failed = 0; @@ -262,8 +265,12 @@ void rpc_stats_az::dump_stats() str += " " + std::to_string(GET_GBL_STATS(bytes_zeroed_from_cache)) + " bytes holes read from cache (" + std::to_string(hole_cache_pct) + "%)\n"; + + const uint64_t avg_ra_size = + num_readhead ? (bytes_read_ahead / num_readhead) : 0; str += " " + std::to_string(GET_GBL_STATS(bytes_read_ahead)) + - " bytes read by readahead\n"; + " bytes read by readahead with avg size " + + std::to_string(avg_ra_size) + " bytes\n"; const uint64_t avg_write_size = tot_write_reqs ? (tot_bytes_written / tot_write_reqs) : 0; @@ -275,6 +282,7 @@ void rpc_stats_az::dump_stats() str += " " + std::to_string(GET_GBL_STATS(failed_write_reqs)) + " application writes failed\n"; } + str += " " + std::to_string(GET_GBL_STATS(writes_np)) + " writes did not hit any memory pressure\n"; str += " " + std::to_string(GET_GBL_STATS(inline_writes)) + @@ -294,6 +302,12 @@ void rpc_stats_az::dump_stats() str += " " + std::to_string(GET_GBL_STATS(commit_gp)) + " commits triggered as global cache limit was reached\n"; + const uint64_t avg_sync_membufs_size = + num_sync_membufs ? 
(tot_bytes_sync_membufs / num_sync_membufs) : 0; + str += " " + std::to_string(GET_GBL_STATS(num_sync_membufs)) + + " sync_membufs calls with avg size " + + std::to_string(avg_sync_membufs_size) + " bytes\n"; + const double getattr_cache_pct = tot_getattr_reqs ? ((getattr_served_from_cache * 100.0) / tot_getattr_reqs) : 0; diff --git a/turbonfs/src/rpc_task.cpp b/turbonfs/src/rpc_task.cpp index 488eb36f..a491e6a3 100644 --- a/turbonfs/src/rpc_task.cpp +++ b/turbonfs/src/rpc_task.cpp @@ -674,7 +674,7 @@ void rpc_task::init_read_be(fuse_ino_t ino, * * TODO: Control this with a config. */ - set_csched(CONN_SCHED_RR); + set_csched(CONN_SCHED_RR_R); assert(!rpc_api->read_task.is_fe()); assert(rpc_api->read_task.is_be()); @@ -1563,7 +1563,7 @@ void rpc_task::issue_write_rpc() * issues as seen by stable writes. */ if (!inode->is_stable_write()) { - set_csched(CONN_SCHED_RR); + set_csched(CONN_SCHED_RR_W); } do { diff --git a/turbonfs/src/rpc_transport.cpp b/turbonfs/src/rpc_transport.cpp index c8c58222..e3bd3c2b 100644 --- a/turbonfs/src/rpc_transport.cpp +++ b/turbonfs/src/rpc_transport.cpp @@ -111,22 +111,77 @@ void rpc_transport::close() struct nfs_context *rpc_transport::get_nfs_context(conn_sched_t csched, uint32_t fh_hash) const { - uint32_t idx = 0; + int idx = 0; + const int nconn = client->mnt_options.num_connections; + assert(nconn > 0); + const int rconn = nconn / 3; + const int wconn = nconn - rconn; + + static std::atomic last_sec; + static std::atomic last_bytes_written; + static std::atomic last_bytes_read; + static std::atomic rnw = false; + uint64_t now_sec = ::time(NULL); + + assert(GET_GBL_STATS(tot_bytes_written) >= last_bytes_written); + assert(GET_GBL_STATS(tot_bytes_read) >= last_bytes_read); + assert(now_sec >= last_sec); + + /* + * Take stock of things, no sooner than 5 secs. + */ + if (now_sec > (last_sec + 5)) { + const uint64_t w_MBps = + (GET_GBL_STATS(tot_bytes_written) - last_bytes_written) / + ((now_sec - last_sec) * 1000'000); + const uint64_t r_MBps = + (GET_GBL_STATS(tot_bytes_read) - last_bytes_read) / + ((now_sec - last_sec) * 1000'000); + + /* + * If both read and write are happening fast, assign them to separate + * connection pool, else writes may slow down reads as small write + * responses have to wait behind large read responses. + */ + if (w_MBps > 500 && r_MBps > 500) { + rnw = true; + AZLogInfo("[RNW] Write: {} Gbps, Read: {} Gbps", + (w_MBps * 8.0) / 1000, (r_MBps * 8.0) / 1000); + } else { + rnw = false; + AZLogInfo("Write: {} Gbps, Read: {} Gbps", + (w_MBps * 8.0) / 1000, (r_MBps * 8.0) / 1000); + } + + last_bytes_written = GET_GBL_STATS(tot_bytes_written); + last_bytes_read = GET_GBL_STATS(tot_bytes_read); + last_sec = now_sec; + } switch (csched) { case CONN_SCHED_FIRST: idx = 0; break; - case CONN_SCHED_RR: - idx = (last_context++ % client->mnt_options.num_connections); + case CONN_SCHED_RR_R: + // Reads get 2nd half of connections. + idx = (rnw ? (wconn + (last_context++ % rconn)) + : (last_context++ % nconn)); + break; + case CONN_SCHED_RR_W: + // Writes get 1st half of connections. + idx = (rnw ? (last_context++ % wconn) + : (last_context++ % nconn)); break; case CONN_SCHED_FH_HASH: assert(fh_hash != 0); - idx = fh_hash % client->mnt_options.num_connections; + // Writes and other non-read requests get 1st half of connections. + idx = rnw ? 
(fh_hash % wconn) : (fh_hash % nconn); break; default: assert(0); } + assert(idx >= 0 && idx < client->mnt_options.num_connections); + return nfs_connections[idx]->get_nfs_context(); } From 4384dd2cbbbc06a419ab653be622838c58bf4052 Mon Sep 17 00:00:00 2001 From: Nagendra Tomar Date: Sat, 22 Feb 2025 15:05:09 +0000 Subject: [PATCH 2/7] Calculate read/write MBps inside nfs_client on_rw_complete() method. --- turbonfs/inc/nfs_client.h | 61 ++++++++++++++++++++++++++++++++++ turbonfs/src/readahead.cpp | 2 ++ turbonfs/src/rpc_stats.cpp | 5 +++ turbonfs/src/rpc_task.cpp | 4 +++ turbonfs/src/rpc_transport.cpp | 14 ++------ 5 files changed, 74 insertions(+), 12 deletions(-) diff --git a/turbonfs/inc/nfs_client.h b/turbonfs/inc/nfs_client.h index 84a78c09..02e86801 100644 --- a/turbonfs/inc/nfs_client.h +++ b/turbonfs/inc/nfs_client.h @@ -137,6 +137,15 @@ struct nfs_client std::atomic max_ino = 0; #endif + /* + * Latest read and write throughput. + * rw_genid is updated everytime these values are updated, so can be used + * to see throughput is changing. + */ + std::atomic r_MBps = 0; + std::atomic w_MBps = 0; + std::atomic rw_genid = 0; + /* * Set in shutdown() to let others know that nfs_client is shutting * down. They can use this to quit what they are doing and plan for @@ -214,6 +223,58 @@ struct nfs_client return inode_map_lock_0; } + /** + * Call this whenever a read/write completes at the server. + * This tracks the read/write speed provided by the server. + */ + void on_rw_complete(uint64_t r_bytes, uint64_t w_bytes) + { + static std::atomic last_usec; + static std::atomic last_read; + static std::atomic tot_read; + static std::atomic last_written; + static std::atomic tot_written; + static std::atomic last_genid; + /* + * Measure read/write speed no sooner than 10 msec interval. + * Anything smaller and we may not get accurate reading and anything + * larger and it will be less valuable for readers. + */ + const uint64_t sample_intvl = 10 * 1000; + const uint64_t now_usec = get_current_usecs(); + + tot_read += r_bytes; + tot_written += w_bytes; + + /* + * Every sample_intvl, compute read/write throughput for the last + * interval. + */ + const uint64_t intvl = now_usec - last_usec; + if (intvl >= sample_intvl) { + uint64_t expected = last_genid.load(); + if (rw_genid.compare_exchange_strong(expected, expected + 1)) { + w_MBps = (tot_written - last_written) / intvl; + r_MBps = (tot_read - last_read) / intvl; + + last_usec = now_usec; + last_read = tot_read.load(); + last_written = tot_written.load(); + last_genid = rw_genid.load(); + } + } + } + + uint64_t get_read_MBps() const + { + return r_MBps; + } + + uint64_t get_write_MBps() const + { + return w_MBps; + } + /* * The user should first init the client class before using it. */ diff --git a/turbonfs/src/readahead.cpp b/turbonfs/src/readahead.cpp index 41f3e6de..0d28f482 100644 --- a/turbonfs/src/readahead.cpp +++ b/turbonfs/src/readahead.cpp @@ -440,6 +440,8 @@ static void readahead_callback ( // Success or failure, report readahead completion. inode->get_rastate()->on_readahead_complete(bc->offset, bc->length); + nfs_client::get_instance().on_rw_complete(bc->length, 0); + // Free the readahead RPC task. 
task->free_rpc_task(); diff --git a/turbonfs/src/rpc_stats.cpp b/turbonfs/src/rpc_stats.cpp index 8b39df29..ae02c730 100644 --- a/turbonfs/src/rpc_stats.cpp +++ b/turbonfs/src/rpc_stats.cpp @@ -362,6 +362,11 @@ do { \ str += " Avg Total execute time: " + \ std::to_string(ops.total_usec / (ops.count * 1000.0)) + \ " msec\n"; \ + if (opcode == FUSE_READ) { \ + str += " " + std::to_string(client.get_read_MBps()) + " MBps\n"; \ + } else if (opcode == FUSE_WRITE) { \ + str += " " + std::to_string(client.get_write_MBps()) + " MBps\n"; \ + } \ if (!ops.error_map.empty()) { \ str += " Errors encountered: \n"; \ for (const auto& entry : ops.error_map) { \ diff --git a/turbonfs/src/rpc_task.cpp b/turbonfs/src/rpc_task.cpp index a491e6a3..fb2b0a60 100644 --- a/turbonfs/src/rpc_task.cpp +++ b/turbonfs/src/rpc_task.cpp @@ -1590,6 +1590,8 @@ void rpc_task::issue_write_rpc() ::sleep(5); } } while (rpc_retry); + + client->on_rw_complete(0, bciov->orig_length); } static void statfs_callback( @@ -4165,6 +4167,8 @@ static void read_callback( filecache_handle->release(bc->offset, bc->length); #endif + task->get_client()->on_rw_complete(bc->length, 0); + // For failed status we must never mark the buffer uptodate. assert(!status || !bc->get_membuf()->is_uptodate()); diff --git a/turbonfs/src/rpc_transport.cpp b/turbonfs/src/rpc_transport.cpp index e3bd3c2b..d5f20428 100644 --- a/turbonfs/src/rpc_transport.cpp +++ b/turbonfs/src/rpc_transport.cpp @@ -118,25 +118,17 @@ struct nfs_context *rpc_transport::get_nfs_context(conn_sched_t csched, const int wconn = nconn - rconn; static std::atomic last_sec; - static std::atomic last_bytes_written; - static std::atomic last_bytes_read; static std::atomic rnw = false; uint64_t now_sec = ::time(NULL); - assert(GET_GBL_STATS(tot_bytes_written) >= last_bytes_written); - assert(GET_GBL_STATS(tot_bytes_read) >= last_bytes_read); assert(now_sec >= last_sec); /* * Take stock of things, no sooner than 5 secs. 
*/ if (now_sec > (last_sec + 5)) { - const uint64_t w_MBps = - (GET_GBL_STATS(tot_bytes_written) - last_bytes_written) / - ((now_sec - last_sec) * 1000'000); - const uint64_t r_MBps = - (GET_GBL_STATS(tot_bytes_read) - last_bytes_read) / - ((now_sec - last_sec) * 1000'000); + const uint64_t r_MBps = client->get_read_MBps(); + const uint64_t w_MBps = client->get_write_MBps(); /* * If both read and write are happening fast, assign them to separate @@ -153,8 +145,6 @@ struct nfs_context *rpc_transport::get_nfs_context(conn_sched_t csched, (w_MBps * 8.0) / 1000, (r_MBps * 8.0) / 1000); } - last_bytes_written = GET_GBL_STATS(tot_bytes_written); - last_bytes_read = GET_GBL_STATS(tot_bytes_read); last_sec = now_sec; } From cd76e0d6d2c6ae39c212f4a3d7c3fc4360c6c53b Mon Sep 17 00:00:00 2001 From: Nagendra Tomar Date: Sat, 22 Feb 2025 22:44:49 +0000 Subject: [PATCH 3/7] Unified read and write scale factor updation in nfs_client::update_adaptive() --- turbonfs/inc/fcsm.h | 28 ------- turbonfs/inc/file_cache.h | 32 +------- turbonfs/inc/nfs_client.h | 46 ++++++++++- turbonfs/inc/readahead.h | 14 +--- turbonfs/src/fcsm.cpp | 46 ----------- turbonfs/src/file_cache.cpp | 36 ++++++++- turbonfs/src/nfs_client.cpp | 149 ++++++++++++++++++++++++++++++++++++ turbonfs/src/readahead.cpp | 59 +------------- turbonfs/src/rpc_stats.cpp | 25 +++++- 9 files changed, 253 insertions(+), 182 deletions(-) diff --git a/turbonfs/inc/fcsm.h b/turbonfs/inc/fcsm.h index 12ddbbaf..91dcd944 100644 --- a/turbonfs/inc/fcsm.h +++ b/turbonfs/inc/fcsm.h @@ -225,17 +225,6 @@ class fcsm void ctgtq_cleanup(); void ftgtq_cleanup(); - /** - * Update fc_scale_factor according to the current cache pressure. - * When global cache utilization is high, it reduces fc_scale_factor so - * that all writers flush/commit early, for easing global memory pressure. - */ - static void update_fc_scale_factor(); - static double get_fc_scale_factor() - { - return fc_scale_factor; - } - private: /* * The singleton nfs_client, for convenience. @@ -345,23 +334,6 @@ class fcsm * The state machine starts in an idle state. */ std::atomic running = false; - - /* - * Value returned by max_dirty_extent_bytes() is scaled down by this much - * before it's used by: - * - flush_required() - * - commit_required() - * - do_inline_write() - * - * fc_scale_factor is computed by update_fc_scale_factor() according to - * the global cache pressure. If global cache pressure is high we want the - * local flush/commit limits to be reduced so that each file flushes/commits - * faster thus easing the global cache pressure. This promotes fair sharing - * of global cache space while also maintaining enough contiguous data to - * the server, needed for better write throughput. Stable and unstable - * write may use this scale factor differently. - */ - static std::atomic fc_scale_factor; }; struct FC_CB_TRACKER diff --git a/turbonfs/inc/file_cache.h b/turbonfs/inc/file_cache.h index 9fb37c1a..fea61764 100644 --- a/turbonfs/inc/file_cache.h +++ b/turbonfs/inc/file_cache.h @@ -1389,37 +1389,7 @@ class bytes_chunk_cache * scheduler an opportunity to merge better. * For unstable writes this allows us enough PB parallelism. */ - static uint64_t max_dirty_extent_bytes() - { - // Maximum cache size allowed in bytes. - static const uint64_t max_total = - (aznfsc_cfg.cache.data.user.max_size_mb * 1024 * 1024ULL); - assert(max_total != 0); - - /* - * Capped due to global cache size. One single file should not use - * more than 60% of the cache. 
- */ - static const uint64_t max_dirty_extent_g = (max_total * 0.6); - - /* - * Capped due to per-file cache discipline. - * Every file wants to keep 10 full sized blocks but that can be - * reduced as per the current cache pressure, but never less than - * one full size block. - */ - static const uint64_t max_dirty_extent_l = - (10 * AZNFSC_MAX_BLOCK_SIZE) * fcsm::get_fc_scale_factor(); - assert(max_dirty_extent_l >= AZNFSC_MAX_BLOCK_SIZE); - - const uint64_t max_dirty_extent = - std::min(max_dirty_extent_g, max_dirty_extent_l); - - // At least one full sized block. - assert(max_dirty_extent >= AZNFSC_MAX_BLOCK_SIZE); - - return max_dirty_extent; - } + static uint64_t max_dirty_extent_bytes(); /** * Get the amount of dirty data that needs to be flushed. diff --git a/turbonfs/inc/nfs_client.h b/turbonfs/inc/nfs_client.h index 02e86801..4708117a 100644 --- a/turbonfs/inc/nfs_client.h +++ b/turbonfs/inc/nfs_client.h @@ -146,6 +146,29 @@ struct nfs_client std::atomic w_MBps = 0; std::atomic rw_genid = 0; + /* + * Value returned by max_dirty_extent_bytes() is scaled down by this much + * before it's used by: + * - flush_required() + * - commit_required() + * - do_inline_write() + * + * fc_scale_factor is computed by update_adaptive() according to the global + * cache pressure. If global cache pressure is high we want the local + * flush/commit limits to be reduced so that each file flushes/commits + * faster thus easing the global cache pressure. This promotes fair sharing + * of global cache space while also maintaining enough contiguous data to + * the server, needed for better write throughput. Stable and unstable + * write may use this scale factor differently. + */ + static std::atomic fc_scale_factor; + + /* + * update_adaptive() will update this scaling factor to force all ra_state + * machines to slow down readahead in case of high memory pressure. + */ + static std::atomic ra_scale_factor; + /* * Set in shutdown() to let others know that nfs_client is shutting * down. They can use this to quit what they are doing and plan for @@ -194,6 +217,16 @@ struct nfs_client return client; } + static double get_fc_scale_factor() + { + return fc_scale_factor; + } + + static double get_ra_scale_factor() + { + return ra_scale_factor; + } + /** * Returns true if nfs_client is shutting down. */ @@ -223,6 +256,17 @@ struct nfs_client return inode_map_lock_0; } + /** + * Update various adaptive scale factors that decide following things: + * - how much we readahead, and + * - how long we keep dirty data before flushing. + * + * It monitors various things like, how much of the cache is occupied + * by read or write data, whether read/write speed is increasing by + * chaning the various scale factors, etc. + */ + void update_adaptive(); + /** * Call this whenever a read/write completes at the server. * This tracks the read/write speed provided by the server. @@ -240,7 +284,7 @@ struct nfs_client * Anything smaller and we may not get accurate reading and anything * larger and it will be less valuable for readers. */ - const uint64_t sample_intvl = 10 * 1000; + const uint64_t sample_intvl = 5 * 1000 * 1000; const uint64_t now_usec = get_current_usecs(); tot_read += r_bytes; diff --git a/turbonfs/inc/readahead.h b/turbonfs/inc/readahead.h index 75a6d0f5..16f86715 100644 --- a/turbonfs/inc/readahead.h +++ b/turbonfs/inc/readahead.h @@ -305,22 +305,10 @@ class ra_state max_byte_read = UINT64_MAX; } - /** - * Update ra_scale_factor according to the current cache pressure. 
- * When global cache utilization is high, it reduces ra_scale_factor so - * that all readers use less ra window, for easing global memory pressure. - * Likewise when global cache utilization is low it increases the - * ra_scale_factor to let readers use higher readahead. - */ - static void update_scale_factor(); - /** * Returns the scaled ra window that caller can safely use. */ - uint64_t get_ra_bytes() const - { - return ra_bytes * ra_scale_factor; - } + uint64_t get_ra_bytes() const; /** * This will run self tests to test the correctness of this class. diff --git a/turbonfs/src/fcsm.cpp b/turbonfs/src/fcsm.cpp index 9918c5e3..08068ecd 100644 --- a/turbonfs/src/fcsm.cpp +++ b/turbonfs/src/fcsm.cpp @@ -3,7 +3,6 @@ #include "nfs_inode.h" namespace aznfsc { -/* static */ std::atomic fcsm::fc_scale_factor = 1.0; /** * This is called from alloc_fcsm() with exclusive lock on ilock_1. @@ -110,51 +109,6 @@ void fcsm::add_committing(uint64_t bytes) assert(committing_seq_num <= flushed_seq_num); } -/* static */ -void fcsm::update_fc_scale_factor() -{ - // Maximum cache size allowed in bytes. - static const uint64_t max_cache = - (aznfsc_cfg.cache.data.user.max_size_mb * 1024 * 1024ULL); - assert(max_cache != 0); - const uint64_t curr_cache = bytes_chunk_cache::bytes_allocated_g; - const double percent_cache = (curr_cache * 100.0) / max_cache; - double scale = 1.0; - - if (percent_cache > 95) { - /* - * Every file has fundamental right to 100MB of cache space. - * If we reduce it further we will end up in sub-optimal writes - * to the server. - */ - scale = 1.0/10; - } else if (percent_cache > 90) { - // 200MB - scale = 2.0/10; - } else if (percent_cache > 80) { - // 300MB - scale = 3.0/10; - } else if (percent_cache > 70) { - // 500MB - scale = 5.0/10; - } else if (percent_cache > 60) { - // 800MB - scale = 8.0/10; - } - - if (fc_scale_factor != scale) { - static uint64_t last_log_usec; - const uint64_t now = get_current_usecs(); - // Don't log more frequently than 5 secs. - if ((now - last_log_usec) > (5 * 1000 * 1000)) { - AZLogInfo("[FC] Scale factor updated ({} -> {})", - fc_scale_factor.load(), scale); - last_log_usec = now; - } - fc_scale_factor = scale; - } -} - void fcsm::run(struct rpc_task *task, uint64_t extent_left, uint64_t extent_right) diff --git a/turbonfs/src/file_cache.cpp b/turbonfs/src/file_cache.cpp index 6a837bdc..65c976b5 100644 --- a/turbonfs/src/file_cache.cpp +++ b/turbonfs/src/file_cache.cpp @@ -2404,6 +2404,39 @@ int bytes_chunk_cache::truncate(uint64_t trunc_len, return mb_skipped; } +/* static */ +uint64_t bytes_chunk_cache::max_dirty_extent_bytes() +{ + // Maximum cache size allowed in bytes. + static const uint64_t max_total = + (aznfsc_cfg.cache.data.user.max_size_mb * 1024 * 1024ULL); + assert(max_total != 0); + + /* + * Capped due to global cache size. One single file should not use + * more than 60% of the cache. + */ + static const uint64_t max_dirty_extent_g = (max_total * 0.6); + + /* + * Capped due to per-file cache discipline. + * Every file wants to keep 10 full sized blocks but that can be + * reduced as per the current cache pressure, but never less than + * one full size block. + */ + static const uint64_t max_dirty_extent_l = + (10 * AZNFSC_MAX_BLOCK_SIZE) * nfs_client::get_fc_scale_factor(); + assert(max_dirty_extent_l >= AZNFSC_MAX_BLOCK_SIZE); + + const uint64_t max_dirty_extent = + std::min(max_dirty_extent_g, max_dirty_extent_l); + + // At least one full sized block. 
+ assert(max_dirty_extent >= AZNFSC_MAX_BLOCK_SIZE); + + return max_dirty_extent; +} + /* * TODO: Add pruning stats. */ @@ -2416,8 +2449,7 @@ void bytes_chunk_cache::inline_prune() * See if we need to slowdown/speedup readahead and flush/commit more * promptly, per the current memory pressure. */ - ra_state::update_scale_factor(); - fcsm::update_fc_scale_factor(); + nfs_client::get_instance().update_adaptive(); get_prune_goals(&inline_bytes, nullptr); diff --git a/turbonfs/src/nfs_client.cpp b/turbonfs/src/nfs_client.cpp index 924a2c77..369e7bc9 100644 --- a/turbonfs/src/nfs_client.cpp +++ b/turbonfs/src/nfs_client.cpp @@ -4,6 +4,11 @@ #include "rpc_task.h" #include "rpc_readdir.h" +/* static */ +std::atomic nfs_client::ra_scale_factor = 1.0; +/* static */ +std::atomic nfs_client::fc_scale_factor = 1.0; + // The user should first init the client class before using it. bool nfs_client::init() { @@ -209,6 +214,150 @@ void nfs_client::shutdown() jukebox_thread.join(); } +void nfs_client::update_adaptive() +{ + // Maximum cache size allowed in bytes. + static const uint64_t max_cache = + (aznfsc_cfg.cache.data.user.max_size_mb * 1024 * 1024ULL); + assert(max_cache != 0); + + /* + * cache => total cache allocated, read + write. + * wcache => cache allocated by writers. + * rcache => cache allocated by reades. + */ + const uint64_t cache = bytes_chunk_cache::bytes_allocated_g; + const uint64_t wcache = bytes_chunk_cache::bytes_dirty_g; + const uint64_t rcache = cache - wcache; + + /* + * Read cache usage as percent of max_cache. + * This tells how aggressively readers are filling up the cache by + * readahead. We increase readahead if this is low and reduce readahead + * as this grows. + */ + const double pct_rcache = (rcache * 100.0) / max_cache; + + /* + * Write cache usage as percent of max_cache. + * This tells how aggressively writes are filling up the cache by adding + * dirty data. We reduce max_dirty_extent_bytes() if this grows so that + * dirty data is flushed faster, and increase max_dirty_extent_bytes() if + * this is low and we want to accumulate more dirty data and flush in bigger + * chunks, for fast backend write throughput. + */ + const double pct_wcache = (wcache * 100.0) / max_cache; + + /* + * Total cache usage (read + write) as percent of max_cache. + * Reads and writes are controlled individually as per pct_rcache and + * pct_wcache, but in the end total cache usage is consulted and the + * read and write scale factors are reduced accordingly. + */ + const double pct_cache = (cache * 100.0) / max_cache; + + double rscale = 1.0; + double wscale = 1.0; + + /* + * Stop readahead fully if we are beyond the max cache size, o/w scale it + * down proportionately to keep the cache size less than max cache limit. + * We also scale up the readahead to make better utilization of the allowed + * cache size, when there are fewer reads they are allowed to use more of + * the cache per user for readahead. We increase the scale factor just a + * little less than what would exhaust the entire cache, e.g., if percent + * cache utilization is ~5% it means if we increase the readahead by 20 + * times we should be able to get all of cache utilized by readaheads, + * we increase the scale factor to slightly less than 20, to 16, and so + * on. 
+ */ + if (pct_rcache >= 100) { + rscale = 0; + } else if (pct_rcache > 95) { + rscale = 0.5; + } else if (pct_rcache > 90) { + rscale = 0.7; + } else if (pct_rcache > 80) { + rscale = 0.8; + } else if (pct_rcache > 70) { + rscale = 0.9; + } else if (pct_rcache < 5) { + rscale = 16; + } else if (pct_rcache < 10) { + rscale = 8; + } else if (pct_rcache < 20) { + rscale = 4; + } else if (pct_rcache < 30) { + rscale = 2; + } else if (pct_rcache < 50) { + rscale = 1.5; + } + + if (pct_wcache > 95) { + /* + * Every file has fundamental right to 100MB of cache space. + * If we reduce it further we will end up in sub-optimal writes + * to the server. + */ + wscale = 1.0/10; + } else if (pct_wcache > 90) { + // 200MB + wscale = 2.0/10; + } else if (pct_wcache > 80) { + // 300MB + wscale = 3.0/10; + } else if (pct_wcache > 70) { + // 500MB + wscale = 5.0/10; + } else if (pct_wcache > 60) { + // 800MB + wscale = 8.0/10; + } + + /* + * In the end check total cache utilization and reduce wscale/rscale if + * total cache utilization is high. + */ + if (pct_cache >= 100) { + rscale = 0; + wscale = 1.0/10; + } else if (pct_cache >= 75) { + wscale -= 1.0/10; + if (wscale < 1.0/10) { + wscale = 1.0/10; + } + if (rscale > 1.0) { + rscale--; + } else if (rscale > 0.3) { + rscale -= 0.1; + } + } + + if (fc_scale_factor != wscale) { + static uint64_t last_log_usec; + const uint64_t now = get_current_usecs(); + // Don't log more frequently than 5 secs. + if ((now - last_log_usec) > (5 * 1000 * 1000)) { + AZLogInfo("[FC] Scale factor updated ({} -> {})", + fc_scale_factor.load(), wscale); + last_log_usec = now; + } + fc_scale_factor = wscale; + } + + if (ra_scale_factor != rscale) { + static uint64_t last_log_usec; + const uint64_t now = get_current_usecs(); + // Don't log more frequently than 5 secs. + if ((now - last_log_usec) > (5 * 1000 * 1000)) { + AZLogInfo("[Readahead] Scale factor updated ({} -> {})", + ra_scale_factor.load(), rscale); + last_log_usec = now; + } + ra_scale_factor = rscale; + } +} + void nfs_client::jukebox_runner() { AZLogDebug("Started jukebox_runner"); diff --git a/turbonfs/src/readahead.cpp b/turbonfs/src/readahead.cpp index 0d28f482..d4416fdf 100644 --- a/turbonfs/src/readahead.cpp +++ b/turbonfs/src/readahead.cpp @@ -18,8 +18,6 @@ namespace aznfsc { -/* static */ std::atomic ra_state::ra_scale_factor = 1.0; - /** * This is called from alloc_rastate() with exclusive lock on ilock_1. */ @@ -64,62 +62,9 @@ ra_state::ra_state(struct nfs_client *_client, inode->get_fuse_ino(), ra_bytes, def_ra_size); } -/* static */ -void ra_state::update_scale_factor() +uint64_t ra_state::get_ra_bytes() const { - // Maximum cache size allowed in bytes. - static const uint64_t max_cache = - (aznfsc_cfg.cache.data.user.max_size_mb * 1024 * 1024ULL); - assert(max_cache != 0); - const uint64_t curr_cache = bytes_chunk_cache::bytes_allocated_g; - const double percent_cache = (curr_cache * 100.0) / max_cache; - double scale = 1.0; - - /* - * Stop readahead fully if we are beyond the max cache size, o/w scale it - * down proportionately to keep the cache size less than max cache limit. - * We also scale up the readahead to make better utilization of the allowed - * cache size, when there are fewer reads they are allowed to use more of - * the cache per user for readahead. 
We increase the scale factor just a - * little less than what would exhaust the entire cache, e.g., if percent - * cache utilization is ~5% it means if we increase the readahead by 20 - * times we should be able to get all of cache utilized by readaheads, - * we increase the scale factor to slightly less than 20, to 16, and so - * on. - */ - if (percent_cache >= 100) { - scale = 0; - } else if (percent_cache > 95) { - scale = 0.5; - } else if (percent_cache > 90) { - scale = 0.7; - } else if (percent_cache > 80) { - scale = 0.8; - } else if (percent_cache > 70) { - scale = 0.9; - } else if (percent_cache < 5) { - scale = 16; - } else if (percent_cache < 10) { - scale = 8; - } else if (percent_cache < 20) { - scale = 4; - } else if (percent_cache < 30) { - scale = 2; - } else if (percent_cache < 50) { - scale = 1.5; - } - - if (ra_scale_factor != scale) { - static uint64_t last_log_usec; - const uint64_t now = get_current_usecs(); - // Don't log more frequently than 5 secs. - if ((now - last_log_usec) > (5 * 1000 * 1000)) { - AZLogInfo("[Readahead] Scale factor updated ({} -> {})", - ra_scale_factor.load(), scale); - last_log_usec = now; - } - ra_scale_factor = scale; - } + return ra_bytes * nfs_client::get_ra_scale_factor(); } /** diff --git a/turbonfs/src/rpc_stats.cpp b/turbonfs/src/rpc_stats.cpp index ae02c730..03c816e8 100644 --- a/turbonfs/src/rpc_stats.cpp +++ b/turbonfs/src/rpc_stats.cpp @@ -179,6 +179,11 @@ void rpc_stats_az::dump_stats() str += " " + std::to_string(num_silly_renamed) + " inodes silly-renamed (waiting for last close)\n"; + // Maximum cache size allowed in bytes. + static const uint64_t max_cache = + (aznfsc_cfg.cache.data.user.max_size_mb * 1024 * 1024ULL); + assert(max_cache != 0); + str += "File Cache statistics:\n"; if (aznfsc_cfg.cache.data.user.enable) { str += " " + std::to_string(aznfsc_cfg.cache.data.user.max_size_mb) + @@ -195,12 +200,22 @@ void rpc_stats_az::dump_stats() " file caches\n"; str += " " + std::to_string(bytes_chunk_cache::num_chunks_g) + " chunks in chunkmap\n"; + + const double allocate_pct = + ((bytes_chunk_cache::bytes_allocated_g * 100.0) / max_cache); str += " " + std::to_string(bytes_chunk_cache::bytes_allocated_g) + - " bytes allocated\n"; + " bytes allocated (" + + std::to_string(allocate_pct) + "%)\n"; + str += " " + std::to_string(bytes_chunk_cache::bytes_cached_g) + " bytes cached\n"; + + const double dirty_pct = + ((bytes_chunk_cache::bytes_dirty_g * 100.0) / max_cache); str += " " + std::to_string(bytes_chunk_cache::bytes_dirty_g) + - " bytes dirty\n"; + " bytes dirty (" + + std::to_string(dirty_pct) + "%)\n"; + str += " " + std::to_string(bytes_chunk_cache::bytes_flushing_g) + " bytes currently flushing\n"; str += " " + std::to_string(bytes_chunk_cache::bytes_commit_pending_g) + @@ -270,7 +285,8 @@ void rpc_stats_az::dump_stats() num_readhead ? (bytes_read_ahead / num_readhead) : 0; str += " " + std::to_string(GET_GBL_STATS(bytes_read_ahead)) + " bytes read by readahead with avg size " + - std::to_string(avg_ra_size) + " bytes\n"; + std::to_string(avg_ra_size) + " bytes and ra scale factor " + + std::to_string(nfs_client::get_ra_scale_factor()) + "\n"; const uint64_t avg_write_size = tot_write_reqs ? (tot_bytes_written / tot_write_reqs) : 0; @@ -306,7 +322,8 @@ void rpc_stats_az::dump_stats() num_sync_membufs ? 
(tot_bytes_sync_membufs / num_sync_membufs) : 0; str += " " + std::to_string(GET_GBL_STATS(num_sync_membufs)) + " sync_membufs calls with avg size " + - std::to_string(avg_sync_membufs_size) + " bytes\n"; + std::to_string(avg_sync_membufs_size) + " bytes and fc scale factor " + + std::to_string(nfs_client::get_fc_scale_factor()) + "\n"; const double getattr_cache_pct = tot_getattr_reqs ? From e36fbd6d91062d815753965c818a1ae45c2e7e9e Mon Sep 17 00:00:00 2001 From: Nagendra Tomar Date: Sun, 23 Feb 2025 01:31:59 +0000 Subject: [PATCH 4/7] scale based on read and write throughput and not on cache size --- turbonfs/inc/file_cache.h | 8 ++-- turbonfs/inc/nfs_client.h | 5 +++ turbonfs/src/nfs_client.cpp | 90 ++++++++++++++++++++++++++++++++----- 3 files changed, 89 insertions(+), 14 deletions(-) diff --git a/turbonfs/inc/file_cache.h b/turbonfs/inc/file_cache.h index fea61764..73d9cf24 100644 --- a/turbonfs/inc/file_cache.h +++ b/turbonfs/inc/file_cache.h @@ -1666,10 +1666,10 @@ class bytes_chunk_cache * Following also means that at any time, half of the cache_max_mb * can be safely present in the cache. */ - static const uint64_t inline_threshold = (max_total * 0.8); - static const uint64_t inline_target = (max_total * 0.7); - static const uint64_t periodic_threshold = (max_total * 0.6); - static const uint64_t periodic_target = (max_total * 0.5); + static const uint64_t inline_threshold = (max_total * 0.9); + static const uint64_t inline_target = (max_total * 0.8); + static const uint64_t periodic_threshold = (max_total * 0.7); + static const uint64_t periodic_target = (max_total * 0.6); /* * Current total cache size in bytes. Save it once to avoid issues diff --git a/turbonfs/inc/nfs_client.h b/turbonfs/inc/nfs_client.h index 4708117a..28a9fcb7 100644 --- a/turbonfs/inc/nfs_client.h +++ b/turbonfs/inc/nfs_client.h @@ -319,6 +319,11 @@ struct nfs_client return w_MBps; } + uint64_t get_rw_genid() const + { + return rw_genid; + } + /* * The user should first init the client class before using it. */ diff --git a/turbonfs/src/nfs_client.cpp b/turbonfs/src/nfs_client.cpp index 369e7bc9..8c5cfef6 100644 --- a/turbonfs/src/nfs_client.cpp +++ b/turbonfs/src/nfs_client.cpp @@ -216,11 +216,15 @@ void nfs_client::shutdown() void nfs_client::update_adaptive() { +#if 1 // Maximum cache size allowed in bytes. static const uint64_t max_cache = (aznfsc_cfg.cache.data.user.max_size_mb * 1024 * 1024ULL); assert(max_cache != 0); +#endif + +#if 1 /* * cache => total cache allocated, read + write. * wcache => cache allocated by writers. @@ -228,7 +232,7 @@ void nfs_client::update_adaptive() */ const uint64_t cache = bytes_chunk_cache::bytes_allocated_g; const uint64_t wcache = bytes_chunk_cache::bytes_dirty_g; - const uint64_t rcache = cache - wcache; + const uint64_t rcache = (uint64_t) std::max((int64_t)(cache - wcache), 0L); /* * Read cache usage as percent of max_cache. @@ -240,7 +244,7 @@ void nfs_client::update_adaptive() /* * Write cache usage as percent of max_cache. - * This tells how aggressively writes are filling up the cache by adding + * This tells how aggressively writees are filling up the cache by adding * dirty data. We reduce max_dirty_extent_bytes() if this grows so that * dirty data is flushed faster, and increase max_dirty_extent_bytes() if * this is low and we want to accumulate more dirty data and flush in bigger @@ -255,13 +259,63 @@ void nfs_client::update_adaptive() * read and write scale factors are reduced accordingly. 
*/ const double pct_cache = (cache * 100.0) / max_cache; +#endif double rscale = 1.0; double wscale = 1.0; +#if 0 + static std::atomic last_rMBps; + static std::atomic last_wMBps; + static std::atomic last_rw_genid; + + if (last_rw_genid == rw_genid) { + return; + } + + uint64_t expected = last_rw_genid; + if (!last_rw_genid.compare_exchange_strong(expected, rw_genid.load())) { + return; + } + + bool read_gt = (r_MBps > last_rMBps * 1.02); + bool read_lt = (r_MBps < last_rMBps * 0.98); + bool write_gt = (w_MBps > last_wMBps * 1.02); + bool write_lt = (w_MBps < last_wMBps * 0.98); + + if (read_gt) { + if (ra_scale_factor >= 1.0) { + rscale = ra_scale_factor + 1.0; + } else { + rscale = ra_scale_factor + 0.1; + } + } else if (read_lt) { + if (ra_scale_factor > 1.0) { + rscale = ra_scale_factor - 1.0; + } else { + rscale = ra_scale_factor - 0.1; + } + if (rscale < 0) + rscale = 0; + } + + if (write_gt) { + wscale = fc_scale_factor + 1.0/10; + } else if (write_lt) { + wscale = fc_scale_factor - 1.0/10; + if (wscale <= 0) { + wscale = 1.0/10; + } + } + + last_rMBps = r_MBps.load(); + last_wMBps = w_MBps.load(); +#endif + +#if 1 /* - * Stop readahead fully if we are beyond the max cache size, o/w scale it - * down proportionately to keep the cache size less than max cache limit. + * Stop readahead completely if we are beyond the max cache size, o/w scale + * it down proportionately to keep the cache size less than max cache limit. * We also scale up the readahead to make better utilization of the allowed * cache size, when there are fewer reads they are allowed to use more of * the cache per user for readahead. We increase the scale factor just a @@ -272,6 +326,11 @@ void nfs_client::update_adaptive() * on. */ if (pct_rcache >= 100) { + /* + * reads taking up all the cache space, completely stop readaheads. + * This will cause read cache utilization to drop and then we will + * increase readaheads, finally it'll settle on an optimal value. + */ rscale = 0; } else if (pct_rcache > 95) { rscale = 0.5; @@ -281,6 +340,8 @@ void nfs_client::update_adaptive() rscale = 0.8; } else if (pct_rcache > 70) { rscale = 0.9; + } else if (pct_rcache < 3) { + rscale = 32; } else if (pct_rcache < 5) { rscale = 16; } else if (pct_rcache < 10) { @@ -310,8 +371,11 @@ void nfs_client::update_adaptive() // 500MB wscale = 5.0/10; } else if (pct_wcache > 60) { - // 800MB - wscale = 8.0/10; + // 600MB + wscale = 6.0/10; + } else if (pct_wcache > 50) { + // 700MB + wscale = 7.0/10; } /* @@ -333,13 +397,17 @@ void nfs_client::update_adaptive() } } +#endif + if (fc_scale_factor != wscale) { static uint64_t last_log_usec; const uint64_t now = get_current_usecs(); // Don't log more frequently than 5 secs. if ((now - last_log_usec) > (5 * 1000 * 1000)) { - AZLogInfo("[FC] Scale factor updated ({} -> {})", - fc_scale_factor.load(), wscale); + AZLogInfo("[FC] Scale factor updated ({} -> {}), " + "cache util [R: {:0.2f}%, W: {:0.2f}%, T: {:0.2f}%]", + fc_scale_factor.load(), wscale, + pct_rcache, pct_wcache, pct_cache); last_log_usec = now; } fc_scale_factor = wscale; @@ -350,8 +418,10 @@ void nfs_client::update_adaptive() const uint64_t now = get_current_usecs(); // Don't log more frequently than 5 secs. 
if ((now - last_log_usec) > (5 * 1000 * 1000)) { - AZLogInfo("[Readahead] Scale factor updated ({} -> {})", - ra_scale_factor.load(), rscale); + AZLogInfo("[RA] Scale factor updated ({} -> {}), " + "cache util [R: {:0.2f}%, W: {:0.2f}%, T: {:0.2f}%]", + ra_scale_factor.load(), rscale, + pct_rcache, pct_wcache, pct_cache); last_log_usec = now; } ra_scale_factor = rscale; From 46cc30f966e908bcd50e794e95749a1f591285f1 Mon Sep 17 00:00:00 2001 From: Nagendra Tomar Date: Sun, 23 Feb 2025 05:56:15 +0000 Subject: [PATCH 5/7] updates --- turbonfs/inc/file_cache.h | 4 +- turbonfs/inc/nfs_client.h | 68 ++++--------- turbonfs/inc/rpc_stats.h | 6 +- turbonfs/inc/rpc_transport.h | 4 +- turbonfs/src/file_cache.cpp | 6 +- turbonfs/src/nfs_client.cpp | 171 ++++++++++++++------------------- turbonfs/src/readahead.cpp | 2 - turbonfs/src/rpc_stats.cpp | 6 +- turbonfs/src/rpc_task.cpp | 4 - turbonfs/src/rpc_transport.cpp | 26 +++-- 10 files changed, 125 insertions(+), 172 deletions(-) diff --git a/turbonfs/inc/file_cache.h b/turbonfs/inc/file_cache.h index 73d9cf24..12debd1f 100644 --- a/turbonfs/inc/file_cache.h +++ b/turbonfs/inc/file_cache.h @@ -1654,8 +1654,8 @@ class bytes_chunk_cache assert(max_total != 0); /* - * If cache usage grows to 80% of max, we enforce inline pruning for - * writers. When cache usage grows more than 60% we recommend periodic + * If cache usage grows to 90% of max, we enforce inline pruning for + * writers. When cache usage grows more than 70% we recommend periodic * pruning. If the cache size is sufficient, hopefully we will not need * inline pruning too often, as it hurts application write performance. * Once curr_bytes_total exceeds inline_threshold we need to perform diff --git a/turbonfs/inc/nfs_client.h b/turbonfs/inc/nfs_client.h index 28a9fcb7..489d8c55 100644 --- a/turbonfs/inc/nfs_client.h +++ b/turbonfs/inc/nfs_client.h @@ -138,9 +138,9 @@ struct nfs_client #endif /* - * Latest read and write throughput. + * Last 5 sec read and write throughput. * rw_genid is updated everytime these values are updated, so can be used - * to see throughput is changing. + * to check when throughput is updated. */ std::atomic r_MBps = 0; std::atomic w_MBps = 0; @@ -153,7 +153,7 @@ struct nfs_client * - commit_required() * - do_inline_write() * - * fc_scale_factor is computed by update_adaptive() according to the global + * fc_scale_factor is computed by periodic_updater() according to the global * cache pressure. If global cache pressure is high we want the local * flush/commit limits to be reduced so that each file flushes/commits * faster thus easing the global cache pressure. This promotes fair sharing @@ -164,7 +164,7 @@ struct nfs_client static std::atomic fc_scale_factor; /* - * update_adaptive() will update this scaling factor to force all ra_state + * periodic_updater() will update this scaling factor to force all ra_state * machines to slow down readahead in case of high memory pressure. */ static std::atomic ra_scale_factor; @@ -219,11 +219,13 @@ struct nfs_client static double get_fc_scale_factor() { + assert(fc_scale_factor >= 1.0/10); return fc_scale_factor; } static double get_ra_scale_factor() { + assert(ra_scale_factor >= 0); return ra_scale_factor; } @@ -257,63 +259,27 @@ struct nfs_client } /** - * Update various adaptive scale factors that decide following things: - * - how much we readahead, and - * - how long we keep dirty data before flushing. + * Update various stuff that needs to be periodically updated, like: + * - Last 5 sec read and write throughput. 
+ * - Readahead scale factor for controlling readahead amount, and + * - Flush/commit dirty data scale factor for controlling how long we keep + * dirty data before flushing/committing. * - * It monitors various things like, how much of the cache is occupied - * by read or write data, whether read/write speed is increasing by - * chaning the various scale factors, etc. + * Call this from some place that's called very frequently. */ - void update_adaptive(); + void periodic_updater(); /** - * Call this whenever a read/write completes at the server. - * This tracks the read/write speed provided by the server. + * Get last 5 sec read throughput in MBps. */ - void on_rw_complete(uint64_t r_bytes, uint64_t w_bytes) - { - static std::atomic last_usec; - static std::atomic last_read; - static std::atomic tot_read; - static std::atomic last_written; - static std::atomic tot_written; - static std::atomic last_genid; - /* - * Measure read/write speed no sooner than 10 msec interval. - * Anything smaller and we may not get accurate reading and anything - * larger and it will be less valuable for readers. - */ - const uint64_t sample_intvl = 5 * 1000 * 1000; - const uint64_t now_usec = get_current_usecs(); - - tot_read += r_bytes; - tot_written += w_bytes; - - /* - * Every sample_intvl, compute read/write throughput for the last - * interval. - */ - const uint64_t intvl = now_usec - last_usec; - if (intvl >= sample_intvl) { - uint64_t expected = last_genid.load(); - if (rw_genid.compare_exchange_strong(expected, expected + 1)) { - w_MBps = (tot_written - last_written) / intvl; - r_MBps = (tot_read - last_read) / intvl; - - last_usec = now_usec; - last_read = tot_read.load(); - last_written = tot_written.load(); - last_genid = rw_genid.load(); - } - } - } - uint64_t get_read_MBps() const { return r_MBps; } + /** + * Get last 5 sec read throughput in MBps. + */ uint64_t get_write_MBps() const { return w_MBps; diff --git a/turbonfs/inc/rpc_stats.h b/turbonfs/inc/rpc_stats.h index f681e0c7..8676bfc1 100644 --- a/turbonfs/inc/rpc_stats.h +++ b/turbonfs/inc/rpc_stats.h @@ -337,7 +337,9 @@ class rpc_stats_az * This will indicate our readahead effectiveness. * bytes_zeroed_from_cache: How many bytes were read from unmapped parts * of the cache and hence were zero filled. - * bytes_read_ahead: How many bytes were read ahead. + * num_readhead: Number of readahead calls made. + * bytes_read_ahead: How many bytes were read ahead using num_readhead + * calls. * tot_getattr_reqs: How many getattr requests were received from fuse. * getattr_served_from_cache: How many were served from inode->attr cache. * tot_lookup_reqs: How many lookup requests were received from fuse. @@ -363,6 +365,8 @@ class rpc_stats_az * beyond configured limit. * commit_gp: How many time commit was issued as global cache grew beyond * configured limit. + * num_sync_membufs: How many times sync_membufs() was called? + * tot_bytes_sync_membufs: Total bytes flushed by sync_membufs(). */ static std::atomic tot_read_reqs; static std::atomic failed_read_reqs; diff --git a/turbonfs/inc/rpc_transport.h b/turbonfs/inc/rpc_transport.h index d677eeba..37acecf0 100644 --- a/turbonfs/inc/rpc_transport.h +++ b/turbonfs/inc/rpc_transport.h @@ -23,7 +23,9 @@ typedef enum * Use CONN_SCHED_RR_R for read requests and CONN_SCHED_RR_W for write * requests. This helps scheduler ensure read and write requests use * exclusive connections else small write responses may get stuck behind - * large read responses. 
+ * large read responses and small read requests may get stuck behind + * large write requests. Note that this is not completely avoidable even + * though we prioritize smaller read requests over larger write requests. */ CONN_SCHED_RR_R = 2, CONN_SCHED_RR_W = 3, diff --git a/turbonfs/src/file_cache.cpp b/turbonfs/src/file_cache.cpp index 65c976b5..4d4d917d 100644 --- a/turbonfs/src/file_cache.cpp +++ b/turbonfs/src/file_cache.cpp @@ -2446,10 +2446,10 @@ void bytes_chunk_cache::inline_prune() uint64_t pruned_bytes = 0; /* - * See if we need to slowdown/speedup readahead and flush/commit more - * promptly, per the current memory pressure. + * Update various client level stuff that needs to be updated periodically, + * like various read/write scale factors, last 5 secs throughput, etc. */ - nfs_client::get_instance().update_adaptive(); + nfs_client::get_instance().periodic_updater(); get_prune_goals(&inline_bytes, nullptr); diff --git a/turbonfs/src/nfs_client.cpp b/turbonfs/src/nfs_client.cpp index 8c5cfef6..47a9e268 100644 --- a/turbonfs/src/nfs_client.cpp +++ b/turbonfs/src/nfs_client.cpp @@ -214,24 +214,68 @@ void nfs_client::shutdown() jukebox_thread.join(); } -void nfs_client::update_adaptive() +void nfs_client::periodic_updater() { -#if 1 // Maximum cache size allowed in bytes. static const uint64_t max_cache = (aznfsc_cfg.cache.data.user.max_size_mb * 1024 * 1024ULL); assert(max_cache != 0); -#endif + /* + * #1 Calculate recent read/write throughput. + */ + static std::atomic last_sec; + static std::atomic last_bytes_written; + static std::atomic last_bytes_read; + static std::atomic last_genid; + time_t now_sec = ::time(NULL); + const int sample_intvl = 5; + + assert(GET_GBL_STATS(tot_bytes_written) >= last_bytes_written); + assert(GET_GBL_STATS(tot_bytes_read) >= last_bytes_read); + assert(now_sec >= last_sec); + + /* + * Every sample_intvl, compute read/write throughput for the last + * interval. Only one thread should update the throughput. + */ + const int intvl = now_sec - last_sec; + if (intvl >= sample_intvl) { + uint64_t expected = last_genid.load(); + if (rw_genid.compare_exchange_strong(expected, expected + 1)) { + w_MBps = (GET_GBL_STATS(tot_bytes_written) - last_bytes_written) / + ((now_sec - last_sec) * 1000'000); + r_MBps = (GET_GBL_STATS(tot_bytes_read) - last_bytes_read) / + ((now_sec - last_sec) * 1000'000); + + last_sec = now_sec; + last_bytes_read = GET_GBL_STATS(tot_bytes_read); + last_bytes_written = GET_GBL_STATS(tot_bytes_written); + last_genid = rw_genid.load(); + } + } + + /* + * #2 Update scaling factor for readahead and flush/commit. + * The way we decide optimal values of these scaling factors is by + * letting each of these grow to take up more space as soon as its + * share of the cache grows and reduce the scaling to take up less + * space as its share grows. These competing forces result in both + * reaching equilibrium. + */ -#if 1 /* * cache => total cache allocated, read + write. * wcache => cache allocated by writers. * rcache => cache allocated by reades. + * + * Note: We use dirty and commit_pending to indicate cache used by writers. + * This assumes that cache is released immediately after write or + * commit completes, else we will account for write cache as read. 
*/ const uint64_t cache = bytes_chunk_cache::bytes_allocated_g; - const uint64_t wcache = bytes_chunk_cache::bytes_dirty_g; + const uint64_t wcache = bytes_chunk_cache::bytes_dirty_g + + bytes_chunk_cache::bytes_commit_pending_g; const uint64_t rcache = (uint64_t) std::max((int64_t)(cache - wcache), 0L); /* @@ -259,71 +303,22 @@ void nfs_client::update_adaptive() * read and write scale factors are reduced accordingly. */ const double pct_cache = (cache * 100.0) / max_cache; -#endif + + assert(pct_cache <= 100.0); + assert((pct_rcache + pct_wcache) <= pct_cache); double rscale = 1.0; double wscale = 1.0; -#if 0 - static std::atomic last_rMBps; - static std::atomic last_wMBps; - static std::atomic last_rw_genid; - - if (last_rw_genid == rw_genid) { - return; - } - - uint64_t expected = last_rw_genid; - if (!last_rw_genid.compare_exchange_strong(expected, rw_genid.load())) { - return; - } - - bool read_gt = (r_MBps > last_rMBps * 1.02); - bool read_lt = (r_MBps < last_rMBps * 0.98); - bool write_gt = (w_MBps > last_wMBps * 1.02); - bool write_lt = (w_MBps < last_wMBps * 0.98); - - if (read_gt) { - if (ra_scale_factor >= 1.0) { - rscale = ra_scale_factor + 1.0; - } else { - rscale = ra_scale_factor + 0.1; - } - } else if (read_lt) { - if (ra_scale_factor > 1.0) { - rscale = ra_scale_factor - 1.0; - } else { - rscale = ra_scale_factor - 0.1; - } - if (rscale < 0) - rscale = 0; - } - - if (write_gt) { - wscale = fc_scale_factor + 1.0/10; - } else if (write_lt) { - wscale = fc_scale_factor - 1.0/10; - if (wscale <= 0) { - wscale = 1.0/10; - } - } - - last_rMBps = r_MBps.load(); - last_wMBps = w_MBps.load(); -#endif - -#if 1 /* * Stop readahead completely if we are beyond the max cache size, o/w scale * it down proportionately to keep the cache size less than max cache limit. * We also scale up the readahead to make better utilization of the allowed * cache size, when there are fewer reads they are allowed to use more of - * the cache per user for readahead. We increase the scale factor just a - * little less than what would exhaust the entire cache, e.g., if percent - * cache utilization is ~5% it means if we increase the readahead by 20 - * times we should be able to get all of cache utilized by readaheads, - * we increase the scale factor to slightly less than 20, to 16, and so - * on. + * the cache per user for readahead. + * + * Just like read, write also tries to mop up cache space. Those two apply + * opposing forces finally reaching an equilibrium. */ if (pct_rcache >= 100) { /* @@ -343,15 +338,15 @@ void nfs_client::update_adaptive() } else if (pct_rcache < 3) { rscale = 32; } else if (pct_rcache < 5) { - rscale = 16; + rscale = 24; } else if (pct_rcache < 10) { - rscale = 8; + rscale = 12; } else if (pct_rcache < 20) { - rscale = 4; + rscale = 6; } else if (pct_rcache < 30) { - rscale = 2; + rscale = 4; } else if (pct_rcache < 50) { - rscale = 1.5; + rscale = 2.5; } if (pct_wcache > 95) { @@ -368,8 +363,8 @@ void nfs_client::update_adaptive() // 300MB wscale = 3.0/10; } else if (pct_wcache > 70) { - // 500MB - wscale = 5.0/10; + // 400MB + wscale = 4.0/10; } else if (pct_wcache > 60) { // 600MB wscale = 6.0/10; @@ -378,53 +373,35 @@ void nfs_client::update_adaptive() wscale = 7.0/10; } - /* - * In the end check total cache utilization and reduce wscale/rscale if - * total cache utilization is high. 
- */ - if (pct_cache >= 100) { - rscale = 0; - wscale = 1.0/10; - } else if (pct_cache >= 75) { - wscale -= 1.0/10; - if (wscale < 1.0/10) { - wscale = 1.0/10; - } - if (rscale > 1.0) { - rscale--; - } else if (rscale > 0.3) { - rscale -= 0.1; - } - } + static uint64_t last_log_sec; + bool log_now = false; -#endif + // Don't log more frequently than 5 secs. + if ((now_sec - last_log_sec) >= 5) { + log_now = true; + last_log_sec = now_sec; + } if (fc_scale_factor != wscale) { - static uint64_t last_log_usec; - const uint64_t now = get_current_usecs(); - // Don't log more frequently than 5 secs. - if ((now - last_log_usec) > (5 * 1000 * 1000)) { + if (log_now) { AZLogInfo("[FC] Scale factor updated ({} -> {}), " "cache util [R: {:0.2f}%, W: {:0.2f}%, T: {:0.2f}%]", fc_scale_factor.load(), wscale, pct_rcache, pct_wcache, pct_cache); - last_log_usec = now; } fc_scale_factor = wscale; + assert(fc_scale_factor >= 1.0/10); } if (ra_scale_factor != rscale) { - static uint64_t last_log_usec; - const uint64_t now = get_current_usecs(); - // Don't log more frequently than 5 secs. - if ((now - last_log_usec) > (5 * 1000 * 1000)) { + if (log_now) { AZLogInfo("[RA] Scale factor updated ({} -> {}), " "cache util [R: {:0.2f}%, W: {:0.2f}%, T: {:0.2f}%]", ra_scale_factor.load(), rscale, pct_rcache, pct_wcache, pct_cache); - last_log_usec = now; } ra_scale_factor = rscale; + assert(ra_scale_factor >= 0); } } diff --git a/turbonfs/src/readahead.cpp b/turbonfs/src/readahead.cpp index d4416fdf..f03bee01 100644 --- a/turbonfs/src/readahead.cpp +++ b/turbonfs/src/readahead.cpp @@ -385,8 +385,6 @@ static void readahead_callback ( // Success or failure, report readahead completion. inode->get_rastate()->on_readahead_complete(bc->offset, bc->length); - nfs_client::get_instance().on_rw_complete(bc->length, 0); - // Free the readahead RPC task. task->free_rpc_task(); diff --git a/turbonfs/src/rpc_stats.cpp b/turbonfs/src/rpc_stats.cpp index 03c816e8..b85ac5d8 100644 --- a/turbonfs/src/rpc_stats.cpp +++ b/turbonfs/src/rpc_stats.cpp @@ -380,9 +380,11 @@ do { \ std::to_string(ops.total_usec / (ops.count * 1000.0)) + \ " msec\n"; \ if (opcode == FUSE_READ) { \ - str += " " + std::to_string(client.get_read_MBps()) + " MBps\n"; \ + str += " Last 5 sec throughput: " + \ + std::to_string(client.get_read_MBps()) + " MBps\n"; \ } else if (opcode == FUSE_WRITE) { \ - str += " " + std::to_string(client.get_write_MBps()) + " MBps\n"; \ + str += " Last 5 sec throughput: " + \ + std::to_string(client.get_write_MBps()) + " MBps\n"; \ } \ if (!ops.error_map.empty()) { \ str += " Errors encountered: \n"; \ diff --git a/turbonfs/src/rpc_task.cpp b/turbonfs/src/rpc_task.cpp index fb2b0a60..a491e6a3 100644 --- a/turbonfs/src/rpc_task.cpp +++ b/turbonfs/src/rpc_task.cpp @@ -1590,8 +1590,6 @@ void rpc_task::issue_write_rpc() ::sleep(5); } } while (rpc_retry); - - client->on_rw_complete(0, bciov->orig_length); } static void statfs_callback( @@ -4167,8 +4165,6 @@ static void read_callback( filecache_handle->release(bc->offset, bc->length); #endif - task->get_client()->on_rw_complete(bc->length, 0); - // For failed status we must never mark the buffer uptodate. assert(!status || !bc->get_membuf()->is_uptodate()); diff --git a/turbonfs/src/rpc_transport.cpp b/turbonfs/src/rpc_transport.cpp index d5f20428..c3c7617a 100644 --- a/turbonfs/src/rpc_transport.cpp +++ b/turbonfs/src/rpc_transport.cpp @@ -106,7 +106,6 @@ void rpc_transport::close() /* * This function decides which connection should be chosen for sending * the current request. 
- * TODO: This is round-robined for now, should be modified later. */ struct nfs_context *rpc_transport::get_nfs_context(conn_sched_t csched, uint32_t fh_hash) const @@ -131,17 +130,17 @@ struct nfs_context *rpc_transport::get_nfs_context(conn_sched_t csched, const uint64_t w_MBps = client->get_write_MBps(); /* - * If both read and write are happening fast, assign them to separate + * If both read and write are happening, assign them to separate * connection pool, else writes may slow down reads as small write - * responses have to wait behind large read responses. + * responses have to wait behind large read responses and v.v. */ - if (w_MBps > 500 && r_MBps > 500) { + if (w_MBps > 100 && r_MBps > 100) { rnw = true; - AZLogInfo("[RNW] Write: {} Gbps, Read: {} Gbps", + AZLogInfo("[RNW=true] Write: {} Gbps, Read: {} Gbps", (w_MBps * 8.0) / 1000, (r_MBps * 8.0) / 1000); } else { rnw = false; - AZLogInfo("Write: {} Gbps, Read: {} Gbps", + AZLogInfo("[RNW=false] Write: {} Gbps, Read: {} Gbps", (w_MBps * 8.0) / 1000, (r_MBps * 8.0) / 1000); } @@ -153,18 +152,27 @@ struct nfs_context *rpc_transport::get_nfs_context(conn_sched_t csched, idx = 0; break; case CONN_SCHED_RR_R: - // Reads get 2nd half of connections. + /* + * Reads get top 1/3rd of connections when there are simultaneous + * readers and writers, else they get all the connections. + */ idx = (rnw ? (wconn + (last_context++ % rconn)) : (last_context++ % nconn)); break; case CONN_SCHED_RR_W: - // Writes get 1st half of connections. + /* + * Writes get low 2/3rd of connections when there are simultaneous + * readers and writers, else they get all the connections. + * + * TODO: We should not change write connection ranges while there + * are old writes still pending else we will get slowed down + * by optimistic concurrency backoff. + */ idx = (rnw ? (last_context++ % wconn) : (last_context++ % nconn)); break; case CONN_SCHED_FH_HASH: assert(fh_hash != 0); - // Writes and other non-read requests get 1st half of connections. idx = rnw ? 
(fh_hash % wconn) : (fh_hash % nconn); break; default: From b1680bb4ba269d82ee0270f942c1c3a8d8f37159 Mon Sep 17 00:00:00 2001 From: Nagendra Tomar Date: Sun, 23 Feb 2025 13:06:11 +0000 Subject: [PATCH 6/7] Update total bytes read/written at the correct place --- turbonfs/inc/aznfsc.h | 1 + turbonfs/inc/rpc_task.h | 1 - turbonfs/sample-turbo-config.yaml | 23 ++++++++++++++++------- turbonfs/src/config.cpp | 2 +- turbonfs/src/nfs_client.cpp | 2 +- turbonfs/src/readahead.cpp | 1 + turbonfs/src/rpc_task.cpp | 13 ++++++++++++- turbonfs/src/rpc_transport.cpp | 3 ++- 8 files changed, 34 insertions(+), 12 deletions(-) diff --git a/turbonfs/inc/aznfsc.h b/turbonfs/inc/aznfsc.h index 3734cb32..fa4a3eb6 100644 --- a/turbonfs/inc/aznfsc.h +++ b/turbonfs/inc/aznfsc.h @@ -48,6 +48,7 @@ static_assert(AZNFSCFG_WSIZE_MAX == AZNFSCFG_RSIZE_MAX); #define AZNFSCFG_READDIR_MAX 4194304 #define AZNFSCFG_READAHEAD_KB_MIN 128 #define AZNFSCFG_READAHEAD_KB_MAX 1048576 +#define AZNFSCFG_READAHEAD_KB_DEF 16384 #define AZNFSCFG_FUSE_MAX_BG_MIN 1 #define AZNFSCFG_FUSE_MAX_BG_MAX 65536 #define AZNFSCFG_FUSE_MAX_BG_DEF 4096 diff --git a/turbonfs/inc/rpc_task.h b/turbonfs/inc/rpc_task.h index 4771ea2d..9b2fe01a 100644 --- a/turbonfs/inc/rpc_task.h +++ b/turbonfs/inc/rpc_task.h @@ -2225,7 +2225,6 @@ struct rpc_task DEC_GBL_STATS(fuse_responses_awaited, 1); } - INC_GBL_STATS(tot_bytes_written, count); free_rpc_task(); } diff --git a/turbonfs/sample-turbo-config.yaml b/turbonfs/sample-turbo-config.yaml index 03063a6f..992c08d1 100644 --- a/turbonfs/sample-turbo-config.yaml +++ b/turbonfs/sample-turbo-config.yaml @@ -6,7 +6,7 @@ account: sjc22prdste06hnfsv3acc1 container: nfsv3test cloud_suffix: blob.preprod.core.windows.net -port: 2047 +port: 2048 # # Auth Config @@ -20,7 +20,7 @@ auth: false # readdir_maxcount # lookupcache: all|none|pos|positive # -nconnect: 1 +nconnect: 96 timeo: 600 retrans: 2 acregmin: 3 @@ -112,7 +112,13 @@ fuse_max_background: 4096 # Memory backed caches are controlled using cache.data.* configs, while # file backed cache are controlled using filecache.* configs. # -readahead_kb: 16384 +# Readahead is automatically scaled (up and down) based on the available cache +# and whether there are ongoing writes competing for the cache. readahead_kb +# is the initial value which is the scaled appropriately. It can be set to 0 +# for disabling readaheads completely. +# For most cases you don't need to specify readahead_kb explicitly. +# +#readahead_kb: 16384 cache.attr.user.enable: true cache.readdir.kernel.enable: true cache.readdir.user.enable: true @@ -120,7 +126,10 @@ cache.data.kernel.enable: true cache.data.user.enable: true cache.data.user.max_size_mb: 4096 -filecache.enable: false -filecache.cachedir: /mnt -filecache.max_size_gb: 1000 -cache_max_mb: 4096 +# +# These are currently not supported. 
+# +#filecache.enable: false +#filecache.cachedir: /mnt +#filecache.max_size_gb: 1000 +#cache_max_mb: 4096 diff --git a/turbonfs/src/config.cpp b/turbonfs/src/config.cpp index f009479a..3ee1c26c 100644 --- a/turbonfs/src/config.cpp +++ b/turbonfs/src/config.cpp @@ -262,7 +262,7 @@ void aznfsc_cfg::set_defaults_and_sanitize() if (readdir_maxcount == -1) readdir_maxcount = 1048576; if (readahead_kb == -1) - readahead_kb = 16384; + readahead_kb = AZNFSCFG_READAHEAD_KB_DEF; if (cache.data.user.enable) { if (cache.data.user.max_size_mb == -1) cache.data.user.max_size_mb = AZNFSCFG_CACHE_MAX_MB_DEF; diff --git a/turbonfs/src/nfs_client.cpp b/turbonfs/src/nfs_client.cpp index 47a9e268..add4b826 100644 --- a/turbonfs/src/nfs_client.cpp +++ b/turbonfs/src/nfs_client.cpp @@ -228,7 +228,7 @@ void nfs_client::periodic_updater() static std::atomic last_bytes_written; static std::atomic last_bytes_read; static std::atomic last_genid; - time_t now_sec = ::time(NULL); + const time_t now_sec = ::time(NULL); const int sample_intvl = 5; assert(GET_GBL_STATS(tot_bytes_written) >= last_bytes_written); diff --git a/turbonfs/src/readahead.cpp b/turbonfs/src/readahead.cpp index f03bee01..45ef796b 100644 --- a/turbonfs/src/readahead.cpp +++ b/turbonfs/src/readahead.cpp @@ -189,6 +189,7 @@ static void readahead_callback ( goto delete_ctx; } else { UPDATE_INODE_ATTR(inode, res->READ3res_u.resok.file_attributes); + INC_GBL_STATS(tot_bytes_read, res->READ3res_u.resok.count); /* * Only first read call would have bc->pvt == 0, for subsequent calls diff --git a/turbonfs/src/rpc_task.cpp b/turbonfs/src/rpc_task.cpp index a491e6a3..f278c547 100644 --- a/turbonfs/src/rpc_task.cpp +++ b/turbonfs/src/rpc_task.cpp @@ -1590,6 +1590,12 @@ void rpc_task::issue_write_rpc() ::sleep(5); } } while (rpc_retry); + + /* + * Write bytes are counted when we send them to libnfs as that's when they + * appear on the wire. + */ + INC_GBL_STATS(tot_bytes_written, bciov->length); } static void statfs_callback( @@ -3750,7 +3756,6 @@ void rpc_task::send_read_response() AZLogDebug("[{}] Sending empty read response", ino); reply_iov(nullptr, 0); } else { - INC_GBL_STATS(tot_bytes_read, bytes_read); AZLogDebug("[{}] Sending success read response, iovec={}, " "bytes_read={}", ino, count, bytes_read); @@ -3866,6 +3871,12 @@ static void read_callback( */ UPDATE_INODE_ATTR(inode, res->READ3res_u.resok.file_attributes); + /* + * Reads are counted in the callback as that's when the read + * responses have just come on the wire. + */ + INC_GBL_STATS(tot_bytes_read, res->READ3res_u.resok.count); + #ifdef ENABLE_PRESSURE_POINTS /* * Short read pressure point, skip when eof received. diff --git a/turbonfs/src/rpc_transport.cpp b/turbonfs/src/rpc_transport.cpp index c3c7617a..471867ca 100644 --- a/turbonfs/src/rpc_transport.cpp +++ b/turbonfs/src/rpc_transport.cpp @@ -135,7 +135,7 @@ struct nfs_context *rpc_transport::get_nfs_context(conn_sched_t csched, * responses have to wait behind large read responses and v.v. 
*/ if (w_MBps > 100 && r_MBps > 100) { - rnw = true; + rnw = (nconn >= 4); AZLogInfo("[RNW=true] Write: {} Gbps, Read: {} Gbps", (w_MBps * 8.0) / 1000, (r_MBps * 8.0) / 1000); } else { @@ -144,6 +144,7 @@ struct nfs_context *rpc_transport::get_nfs_context(conn_sched_t csched, (w_MBps * 8.0) / 1000, (r_MBps * 8.0) / 1000); } + assert(!rnw || (rconn > 0 && wconn > 0)); last_sec = now_sec; } From 1bedf9a6485996f2c1375a058f8e9e69b2fc6677 Mon Sep 17 00:00:00 2001 From: Nagendra Tomar Date: Mon, 24 Feb 2025 00:24:54 +0000 Subject: [PATCH 7/7] Use latest libnfs update, with fix for getaddrinfo temp failure --- turbonfs/extern/libnfs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/turbonfs/extern/libnfs b/turbonfs/extern/libnfs index c9b72868..689a2018 160000 --- a/turbonfs/extern/libnfs +++ b/turbonfs/extern/libnfs @@ -1 +1 @@ -Subproject commit c9b7286878bc0388f033d9d6c56d0ddf7bb4e49c +Subproject commit 689a201851cf46e07dd7c45941ede87da5e096ff
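The readahead side of update_adaptive() is easiest to read as a pure function of read-cache utilization. The sketch below restates only the rscale branches visible in the hunk above; the pct_rcache >= 100 branch is assumed to zero out readahead (its body is elided in the hunk, but the surrounding comment says readahead stops beyond the max cache size), and the wscale ladder is left out because its upper branches are not fully visible here. Names are illustrative, not the actual member layout.

// Condensed restatement of the rscale ladder from update_adaptive().
// pct_rcache is clean (read) cache utilization as a percentage of the
// configured max cache size.
static double readahead_scale(double pct_rcache)
{
    if (pct_rcache >= 100) return 0;    // assumed: stop readahead entirely
    if (pct_rcache < 3)    return 32;
    if (pct_rcache < 5)    return 24;
    if (pct_rcache < 10)   return 12;
    if (pct_rcache < 20)   return 6;
    if (pct_rcache < 30)   return 4;
    if (pct_rcache < 50)   return 2.5;
    return 1.0;                         // 50-100%: no boost
}

A reader using ~4% of the cache thus gets a 24x readahead boost, tapering off as the cache fills, while heavy dirty data pushes back through the wscale side of the same function until the two settle at an equilibrium.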
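The split round-robin scheduling in get_nfs_context() can be condensed into a small standalone picker. This is a sketch of the policy described in the hunks (reads on the top 1/3rd of connections, writes and FH-hashed requests on the low 2/3rd while both streams are active, everything over all connections otherwise); deriving wconn/rconn as 2/3 and 1/3 of nconn is an assumption, since that computation is not part of this diff, and the enum is trimmed to the cases used here.

// Sketch of the connection scheduling policy, not the actual rpc_transport
// member layout. wconn/rconn derivation from nconn is assumed.
#include <atomic>
#include <cassert>
#include <cstdint>

enum conn_sched_t { CONN_SCHED_RR_R, CONN_SCHED_RR_W, CONN_SCHED_FH_HASH };

static uint64_t pick_connection(conn_sched_t csched,
                                uint32_t fh_hash,
                                bool rnw,        // reads AND writes streaming?
                                uint32_t nconn,
                                std::atomic<uint64_t> &last_context)
{
    const uint32_t wconn = (nconn * 2) / 3;  // low 2/3rd: writes + FH-hashed
    const uint32_t rconn = nconn - wconn;    // top 1/3rd: reads
    assert(!rnw || (rconn > 0 && wconn > 0));

    switch (csched) {
    case CONN_SCHED_RR_R:
        // Reads use the read-only pool while writes are also streaming,
        // else they round-robin over all connections.
        return rnw ? (wconn + (last_context++ % rconn))
                   : (last_context++ % nconn);
    case CONN_SCHED_RR_W:
        // Writes stay on the low 2/3rd while reads are also streaming.
        return rnw ? (last_context++ % wconn)
                   : (last_context++ % nconn);
    case CONN_SCHED_FH_HASH:
        // File-affine requests keep off the read pool while rnw holds.
        assert(fh_hash != 0);
        return rnw ? (fh_hash % wconn) : (fh_hash % nconn);
    }
    return 0;
}

With nconnect: 96 from the sample config this would give 64 write connections and 32 read connections once both directions exceed the 100 MBps threshold (and nconn >= 4 per the later guard).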
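Patch 6 also moves the tot_bytes_read/tot_bytes_written accounting to where the bytes actually hit the wire (the read callback and issue_write_rpc()), which is what makes the "Last 5 sec throughput" lines in the stats dump meaningful. The sketch below shows one way the 5 second sampling could look; the counters and the sample interval come from the diff, but the MB/s arithmetic and the function shape are assumptions, since the real computation is not part of this patch.

// Illustrative 5-second throughput sample in the spirit of periodic_updater().
// The byte counters are bumped where data hits the wire; everything else here
// is an assumed sketch, not the real nfs_client code.
#include <atomic>
#include <cstdint>
#include <ctime>

static std::atomic<uint64_t> tot_bytes_read{0};     // bumped in read_callback()
static std::atomic<uint64_t> tot_bytes_written{0};  // bumped in issue_write_rpc()

static void sample_throughput(double &r_MBps, double &w_MBps)
{
    static uint64_t last_bytes_read, last_bytes_written;
    static time_t last_sec;
    const int sample_intvl = 5;

    const time_t now_sec = ::time(nullptr);
    if (last_sec != 0 && (now_sec - last_sec) < sample_intvl)
        return;                          // keep the previous sample

    const uint64_t br = tot_bytes_read.load();
    const uint64_t bw = tot_bytes_written.load();
    const double secs = last_sec ? double(now_sec - last_sec)
                                 : double(sample_intvl);

    r_MBps = (br - last_bytes_read) / (secs * 1024.0 * 1024.0);
    w_MBps = (bw - last_bytes_written) / (secs * 1024.0 * 1024.0);

    last_bytes_read = br;
    last_bytes_written = bw;
    last_sec = now_sec;
}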