diff --git a/turbonfs/extern/libnfs b/turbonfs/extern/libnfs index c9b72868..689a2018 160000 --- a/turbonfs/extern/libnfs +++ b/turbonfs/extern/libnfs @@ -1 +1 @@ -Subproject commit c9b7286878bc0388f033d9d6c56d0ddf7bb4e49c +Subproject commit 689a201851cf46e07dd7c45941ede87da5e096ff diff --git a/turbonfs/inc/aznfsc.h b/turbonfs/inc/aznfsc.h index 3734cb32..fa4a3eb6 100644 --- a/turbonfs/inc/aznfsc.h +++ b/turbonfs/inc/aznfsc.h @@ -48,6 +48,7 @@ static_assert(AZNFSCFG_WSIZE_MAX == AZNFSCFG_RSIZE_MAX); #define AZNFSCFG_READDIR_MAX 4194304 #define AZNFSCFG_READAHEAD_KB_MIN 128 #define AZNFSCFG_READAHEAD_KB_MAX 1048576 +#define AZNFSCFG_READAHEAD_KB_DEF 16384 #define AZNFSCFG_FUSE_MAX_BG_MIN 1 #define AZNFSCFG_FUSE_MAX_BG_MAX 65536 #define AZNFSCFG_FUSE_MAX_BG_DEF 4096 diff --git a/turbonfs/inc/fcsm.h b/turbonfs/inc/fcsm.h index 12ddbbaf..91dcd944 100644 --- a/turbonfs/inc/fcsm.h +++ b/turbonfs/inc/fcsm.h @@ -225,17 +225,6 @@ class fcsm void ctgtq_cleanup(); void ftgtq_cleanup(); - /** - * Update fc_scale_factor according to the current cache pressure. - * When global cache utilization is high, it reduces fc_scale_factor so - * that all writers flush/commit early, for easing global memory pressure. - */ - static void update_fc_scale_factor(); - static double get_fc_scale_factor() - { - return fc_scale_factor; - } - private: /* * The singleton nfs_client, for convenience. @@ -345,23 +334,6 @@ class fcsm * The state machine starts in an idle state. */ std::atomic running = false; - - /* - * Value returned by max_dirty_extent_bytes() is scaled down by this much - * before it's used by: - * - flush_required() - * - commit_required() - * - do_inline_write() - * - * fc_scale_factor is computed by update_fc_scale_factor() according to - * the global cache pressure. If global cache pressure is high we want the - * local flush/commit limits to be reduced so that each file flushes/commits - * faster thus easing the global cache pressure. This promotes fair sharing - * of global cache space while also maintaining enough contiguous data to - * the server, needed for better write throughput. Stable and unstable - * write may use this scale factor differently. - */ - static std::atomic fc_scale_factor; }; struct FC_CB_TRACKER diff --git a/turbonfs/inc/file_cache.h b/turbonfs/inc/file_cache.h index 9fb37c1a..12debd1f 100644 --- a/turbonfs/inc/file_cache.h +++ b/turbonfs/inc/file_cache.h @@ -1389,37 +1389,7 @@ class bytes_chunk_cache * scheduler an opportunity to merge better. * For unstable writes this allows us enough PB parallelism. */ - static uint64_t max_dirty_extent_bytes() - { - // Maximum cache size allowed in bytes. - static const uint64_t max_total = - (aznfsc_cfg.cache.data.user.max_size_mb * 1024 * 1024ULL); - assert(max_total != 0); - - /* - * Capped due to global cache size. One single file should not use - * more than 60% of the cache. - */ - static const uint64_t max_dirty_extent_g = (max_total * 0.6); - - /* - * Capped due to per-file cache discipline. - * Every file wants to keep 10 full sized blocks but that can be - * reduced as per the current cache pressure, but never less than - * one full size block. - */ - static const uint64_t max_dirty_extent_l = - (10 * AZNFSC_MAX_BLOCK_SIZE) * fcsm::get_fc_scale_factor(); - assert(max_dirty_extent_l >= AZNFSC_MAX_BLOCK_SIZE); - - const uint64_t max_dirty_extent = - std::min(max_dirty_extent_g, max_dirty_extent_l); - - // At least one full sized block. 
- assert(max_dirty_extent >= AZNFSC_MAX_BLOCK_SIZE); - - return max_dirty_extent; - } + static uint64_t max_dirty_extent_bytes(); /** * Get the amount of dirty data that needs to be flushed. @@ -1684,8 +1654,8 @@ class bytes_chunk_cache assert(max_total != 0); /* - * If cache usage grows to 80% of max, we enforce inline pruning for - * writers. When cache usage grows more than 60% we recommend periodic + * If cache usage grows to 90% of max, we enforce inline pruning for + * writers. When cache usage grows more than 70% we recommend periodic * pruning. If the cache size is sufficient, hopefully we will not need * inline pruning too often, as it hurts application write performance. * Once curr_bytes_total exceeds inline_threshold we need to perform @@ -1696,10 +1666,10 @@ class bytes_chunk_cache * Following also means that at any time, half of the cache_max_mb * can be safely present in the cache. */ - static const uint64_t inline_threshold = (max_total * 0.8); - static const uint64_t inline_target = (max_total * 0.7); - static const uint64_t periodic_threshold = (max_total * 0.6); - static const uint64_t periodic_target = (max_total * 0.5); + static const uint64_t inline_threshold = (max_total * 0.9); + static const uint64_t inline_target = (max_total * 0.8); + static const uint64_t periodic_threshold = (max_total * 0.7); + static const uint64_t periodic_target = (max_total * 0.6); /* * Current total cache size in bytes. Save it once to avoid issues diff --git a/turbonfs/inc/nfs_client.h b/turbonfs/inc/nfs_client.h index 84a78c09..489d8c55 100644 --- a/turbonfs/inc/nfs_client.h +++ b/turbonfs/inc/nfs_client.h @@ -137,6 +137,38 @@ struct nfs_client std::atomic max_ino = 0; #endif + /* + * Last 5 sec read and write throughput. + * rw_genid is updated everytime these values are updated, so can be used + * to check when throughput is updated. + */ + std::atomic r_MBps = 0; + std::atomic w_MBps = 0; + std::atomic rw_genid = 0; + + /* + * Value returned by max_dirty_extent_bytes() is scaled down by this much + * before it's used by: + * - flush_required() + * - commit_required() + * - do_inline_write() + * + * fc_scale_factor is computed by periodic_updater() according to the global + * cache pressure. If global cache pressure is high we want the local + * flush/commit limits to be reduced so that each file flushes/commits + * faster thus easing the global cache pressure. This promotes fair sharing + * of global cache space while also maintaining enough contiguous data to + * the server, needed for better write throughput. Stable and unstable + * write may use this scale factor differently. + */ + static std::atomic fc_scale_factor; + + /* + * periodic_updater() will update this scaling factor to force all ra_state + * machines to slow down readahead in case of high memory pressure. + */ + static std::atomic ra_scale_factor; + /* * Set in shutdown() to let others know that nfs_client is shutting * down. They can use this to quit what they are doing and plan for @@ -185,6 +217,18 @@ struct nfs_client return client; } + static double get_fc_scale_factor() + { + assert(fc_scale_factor >= 1.0/10); + return fc_scale_factor; + } + + static double get_ra_scale_factor() + { + assert(ra_scale_factor >= 0); + return ra_scale_factor; + } + /** * Returns true if nfs_client is shutting down. */ @@ -214,6 +258,38 @@ struct nfs_client return inode_map_lock_0; } + /** + * Update various stuff that needs to be periodically updated, like: + * - Last 5 sec read and write throughput. 
+     * - Readahead scale factor for controlling readahead amount, and
+     * - Flush/commit dirty data scale factor for controlling how long we keep
+     *   dirty data before flushing/committing.
+     *
+     * Call this from some place that's called very frequently.
+     */
+    void periodic_updater();
+
+    /**
+     * Get last 5 sec read throughput in MBps.
+     */
+    uint64_t get_read_MBps() const
+    {
+        return r_MBps;
+    }
+
+    /**
+     * Get last 5 sec write throughput in MBps.
+     */
+    uint64_t get_write_MBps() const
+    {
+        return w_MBps;
+    }
+
+    uint64_t get_rw_genid() const
+    {
+        return rw_genid;
+    }
+
     /*
      * The user should first init the client class before using it.
      */
diff --git a/turbonfs/inc/readahead.h b/turbonfs/inc/readahead.h
index 75a6d0f5..16f86715 100644
--- a/turbonfs/inc/readahead.h
+++ b/turbonfs/inc/readahead.h
@@ -305,22 +305,10 @@ class ra_state
         max_byte_read = UINT64_MAX;
     }
 
-    /**
-     * Update ra_scale_factor according to the current cache pressure.
-     * When global cache utilization is high, it reduces ra_scale_factor so
-     * that all readers use less ra window, for easing global memory pressure.
-     * Likewise when global cache utilization is low it increases the
-     * ra_scale_factor to let readers use higher readahead.
-     */
-    static void update_scale_factor();
-
     /**
      * Returns the scaled ra window that caller can safely use.
      */
-    uint64_t get_ra_bytes() const
-    {
-        return ra_bytes * ra_scale_factor;
-    }
+    uint64_t get_ra_bytes() const;
 
     /**
      * This will run self tests to test the correctness of this class.
diff --git a/turbonfs/inc/rpc_stats.h b/turbonfs/inc/rpc_stats.h
index 457bb737..8676bfc1 100644
--- a/turbonfs/inc/rpc_stats.h
+++ b/turbonfs/inc/rpc_stats.h
@@ -337,7 +337,9 @@ class rpc_stats_az
      * This will indicate our readahead effectiveness.
      * bytes_zeroed_from_cache: How many bytes were read from unmapped parts
      *                          of the cache and hence were zero filled.
-     * bytes_read_ahead: How many bytes were read ahead.
+     * num_readhead: Number of readahead calls made.
+     * bytes_read_ahead: How many bytes were read ahead using num_readhead
+     *                   calls.
      * tot_getattr_reqs: How many getattr requests were received from fuse.
      * getattr_served_from_cache: How many were served from inode->attr cache.
      * tot_lookup_reqs: How many lookup requests were received from fuse.
@@ -363,6 +365,8 @@ class rpc_stats_az
      *            beyond configured limit.
      * commit_gp: How many time commit was issued as global cache grew beyond
      *            configured limit.
+     * num_sync_membufs: How many times sync_membufs() was called.
+     * tot_bytes_sync_membufs: Total bytes flushed by sync_membufs().
      */
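A note on the rw_genid member added to nfs_client in the hunk above: periodic_updater() bumps the generation counter with a compare-exchange and only the winning thread publishes fresh r_MBps/w_MBps values, so a consumer can detect a new throughput sample just by watching the generation. Below is a minimal standalone sketch of that generation-counter pattern; throughput_sample, publish() and maybe_react() are illustrative stand-ins, not the project's actual API.

```cpp
#include <atomic>
#include <cstdint>
#include <cstdio>

// Illustrative stand-in for the throughput fields added to nfs_client.
struct throughput_sample {
    std::atomic<uint64_t> r_MBps{0};
    std::atomic<uint64_t> w_MBps{0};
    std::atomic<uint64_t> genid{0};

    // Producer side: mirrors the CAS-guarded update in periodic_updater(),
    // where only the thread that wins the generation bump publishes values.
    void publish(uint64_t r, uint64_t w, uint64_t expected_gen) {
        if (genid.compare_exchange_strong(expected_gen, expected_gen + 1)) {
            r_MBps = r;
            w_MBps = w;
        }
    }
};

// Consumer side: act only when the generation has moved since the last look.
void maybe_react(throughput_sample &ts, uint64_t &last_seen_gen) {
    const uint64_t gen = ts.genid.load();
    if (gen != last_seen_gen) {
        last_seen_gen = gen;
        std::printf("fresh sample: R=%lu MBps, W=%lu MBps\n",
                    (unsigned long) ts.r_MBps.load(),
                    (unsigned long) ts.w_MBps.load());
    }
}
```

This is the same idea get_rw_genid() exposes: callers never need to compare the throughput values themselves to know whether they were refreshed.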
     static std::atomic tot_read_reqs;
     static std::atomic failed_read_reqs;
@@ -371,6 +375,7 @@
     static std::atomic bytes_read_from_cache;
     static std::atomic bytes_zeroed_from_cache;
     static std::atomic bytes_read_ahead;
+    static std::atomic num_readhead;
     static std::atomic tot_getattr_reqs;
     static std::atomic getattr_served_from_cache;
     static std::atomic tot_lookup_reqs;
@@ -387,6 +392,8 @@
     static std::atomic flush_gp;
     static std::atomic commit_lp;
     static std::atomic commit_gp;
+    static std::atomic num_sync_membufs;
+    static std::atomic tot_bytes_sync_membufs;
     static std::atomic rpc_tasks_allocated;
     static std::atomic fuse_responses_awaited;
@@ -395,7 +402,7 @@
 #define INC_GBL_STATS(var, inc) rpc_stats_az::var += (inc)
 #define DEC_GBL_STATS(var, dec) {assert(rpc_stats_az::var >= dec); rpc_stats_az::var -= (dec);}
-#define GET_GBL_STATS(var) rpc_stats_az::var
+#define GET_GBL_STATS(var) rpc_stats_az::var.load()
 
 struct fuse_req_stats
 {
diff --git a/turbonfs/inc/rpc_task.h b/turbonfs/inc/rpc_task.h
index 29f15c8e..9b2fe01a 100644
--- a/turbonfs/inc/rpc_task.h
+++ b/turbonfs/inc/rpc_task.h
@@ -2225,7 +2225,6 @@ struct rpc_task
             DEC_GBL_STATS(fuse_responses_awaited, 1);
         }
 
-        INC_GBL_STATS(tot_bytes_written, count);
         free_rpc_task();
     }
 
@@ -2607,7 +2606,7 @@ class rpc_task_helper
          * used. Later init_*() method can set it to a more appropriate value.
          */
         task->csched = (task->client->mnt_options.nfs_port == 2047) ?
-                        CONN_SCHED_RR : CONN_SCHED_FH_HASH;
+                        CONN_SCHED_RR_W : CONN_SCHED_FH_HASH;
 
 #ifdef ENABLE_PARANOID
         task->issuing_tid = ::gettid();
diff --git a/turbonfs/inc/rpc_transport.h b/turbonfs/inc/rpc_transport.h
index a15cccbf..37acecf0 100644
--- a/turbonfs/inc/rpc_transport.h
+++ b/turbonfs/inc/rpc_transport.h
@@ -20,15 +20,22 @@ typedef enum
     /*
      * Round robin requests over all connections.
+     * Use CONN_SCHED_RR_R for read requests and CONN_SCHED_RR_W for write
+     * requests. This helps the scheduler ensure read and write requests use
+     * exclusive connections, else small write responses may get stuck behind
+     * large read responses and small read requests may get stuck behind
+     * large write requests. Note that this is not completely avoidable even
+     * though we prioritize smaller read requests over larger write requests.
      */
-    CONN_SCHED_RR = 2,
+    CONN_SCHED_RR_R = 2,
+    CONN_SCHED_RR_W = 3,
 
     /*
      * Every file is affined to one connection based on the FH hash, so all
      * requests to one file go over the same connection while different files
      * will use different connections.
      */
-    CONN_SCHED_FH_HASH = 3,
+    CONN_SCHED_FH_HASH = 4,
 } conn_sched_t;
 
 /*
diff --git a/turbonfs/sample-turbo-config.yaml b/turbonfs/sample-turbo-config.yaml
index 03063a6f..992c08d1 100644
--- a/turbonfs/sample-turbo-config.yaml
+++ b/turbonfs/sample-turbo-config.yaml
@@ -6,7 +6,7 @@
 account: sjc22prdste06hnfsv3acc1
 container: nfsv3test
 cloud_suffix: blob.preprod.core.windows.net
-port: 2047
+port: 2048
 
 #
 # Auth Config
@@ -20,7 +20,7 @@ auth: false
 # readdir_maxcount
 # lookupcache: all|none|pos|positive
 #
-nconnect: 1
+nconnect: 96
 timeo: 600
 retrans: 2
 acregmin: 3
@@ -112,7 +112,13 @@
 fuse_max_background: 4096
 
 # Memory backed caches are controlled using cache.data.* configs, while
 # file backed cache are controlled using filecache.* configs.
 #
-readahead_kb: 16384
+# Readahead is automatically scaled (up and down) based on the available cache
+# and whether there are ongoing writes competing for the cache. readahead_kb
+# is the initial value which is then scaled appropriately.
It can be set to 0 +# for disabling readaheads completely. +# For most cases you don't need to specify readahead_kb explicitly. +# +#readahead_kb: 16384 cache.attr.user.enable: true cache.readdir.kernel.enable: true cache.readdir.user.enable: true @@ -120,7 +126,10 @@ cache.data.kernel.enable: true cache.data.user.enable: true cache.data.user.max_size_mb: 4096 -filecache.enable: false -filecache.cachedir: /mnt -filecache.max_size_gb: 1000 -cache_max_mb: 4096 +# +# These are currently not supported. +# +#filecache.enable: false +#filecache.cachedir: /mnt +#filecache.max_size_gb: 1000 +#cache_max_mb: 4096 diff --git a/turbonfs/src/config.cpp b/turbonfs/src/config.cpp index f009479a..3ee1c26c 100644 --- a/turbonfs/src/config.cpp +++ b/turbonfs/src/config.cpp @@ -262,7 +262,7 @@ void aznfsc_cfg::set_defaults_and_sanitize() if (readdir_maxcount == -1) readdir_maxcount = 1048576; if (readahead_kb == -1) - readahead_kb = 16384; + readahead_kb = AZNFSCFG_READAHEAD_KB_DEF; if (cache.data.user.enable) { if (cache.data.user.max_size_mb == -1) cache.data.user.max_size_mb = AZNFSCFG_CACHE_MAX_MB_DEF; diff --git a/turbonfs/src/fcsm.cpp b/turbonfs/src/fcsm.cpp index 9918c5e3..08068ecd 100644 --- a/turbonfs/src/fcsm.cpp +++ b/turbonfs/src/fcsm.cpp @@ -3,7 +3,6 @@ #include "nfs_inode.h" namespace aznfsc { -/* static */ std::atomic fcsm::fc_scale_factor = 1.0; /** * This is called from alloc_fcsm() with exclusive lock on ilock_1. @@ -110,51 +109,6 @@ void fcsm::add_committing(uint64_t bytes) assert(committing_seq_num <= flushed_seq_num); } -/* static */ -void fcsm::update_fc_scale_factor() -{ - // Maximum cache size allowed in bytes. - static const uint64_t max_cache = - (aznfsc_cfg.cache.data.user.max_size_mb * 1024 * 1024ULL); - assert(max_cache != 0); - const uint64_t curr_cache = bytes_chunk_cache::bytes_allocated_g; - const double percent_cache = (curr_cache * 100.0) / max_cache; - double scale = 1.0; - - if (percent_cache > 95) { - /* - * Every file has fundamental right to 100MB of cache space. - * If we reduce it further we will end up in sub-optimal writes - * to the server. - */ - scale = 1.0/10; - } else if (percent_cache > 90) { - // 200MB - scale = 2.0/10; - } else if (percent_cache > 80) { - // 300MB - scale = 3.0/10; - } else if (percent_cache > 70) { - // 500MB - scale = 5.0/10; - } else if (percent_cache > 60) { - // 800MB - scale = 8.0/10; - } - - if (fc_scale_factor != scale) { - static uint64_t last_log_usec; - const uint64_t now = get_current_usecs(); - // Don't log more frequently than 5 secs. - if ((now - last_log_usec) > (5 * 1000 * 1000)) { - AZLogInfo("[FC] Scale factor updated ({} -> {})", - fc_scale_factor.load(), scale); - last_log_usec = now; - } - fc_scale_factor = scale; - } -} - void fcsm::run(struct rpc_task *task, uint64_t extent_left, uint64_t extent_right) diff --git a/turbonfs/src/file_cache.cpp b/turbonfs/src/file_cache.cpp index 6a837bdc..4d4d917d 100644 --- a/turbonfs/src/file_cache.cpp +++ b/turbonfs/src/file_cache.cpp @@ -2404,6 +2404,39 @@ int bytes_chunk_cache::truncate(uint64_t trunc_len, return mb_skipped; } +/* static */ +uint64_t bytes_chunk_cache::max_dirty_extent_bytes() +{ + // Maximum cache size allowed in bytes. + static const uint64_t max_total = + (aznfsc_cfg.cache.data.user.max_size_mb * 1024 * 1024ULL); + assert(max_total != 0); + + /* + * Capped due to global cache size. One single file should not use + * more than 60% of the cache. 
+ */ + static const uint64_t max_dirty_extent_g = (max_total * 0.6); + + /* + * Capped due to per-file cache discipline. + * Every file wants to keep 10 full sized blocks but that can be + * reduced as per the current cache pressure, but never less than + * one full size block. + */ + static const uint64_t max_dirty_extent_l = + (10 * AZNFSC_MAX_BLOCK_SIZE) * nfs_client::get_fc_scale_factor(); + assert(max_dirty_extent_l >= AZNFSC_MAX_BLOCK_SIZE); + + const uint64_t max_dirty_extent = + std::min(max_dirty_extent_g, max_dirty_extent_l); + + // At least one full sized block. + assert(max_dirty_extent >= AZNFSC_MAX_BLOCK_SIZE); + + return max_dirty_extent; +} + /* * TODO: Add pruning stats. */ @@ -2413,11 +2446,10 @@ void bytes_chunk_cache::inline_prune() uint64_t pruned_bytes = 0; /* - * See if we need to slowdown/speedup readahead and flush/commit more - * promptly, per the current memory pressure. + * Update various client level stuff that needs to be updated periodically, + * like various read/write scale factors, last 5 secs throughput, etc. */ - ra_state::update_scale_factor(); - fcsm::update_fc_scale_factor(); + nfs_client::get_instance().periodic_updater(); get_prune_goals(&inline_bytes, nullptr); diff --git a/turbonfs/src/nfs_client.cpp b/turbonfs/src/nfs_client.cpp index 924a2c77..add4b826 100644 --- a/turbonfs/src/nfs_client.cpp +++ b/turbonfs/src/nfs_client.cpp @@ -4,6 +4,11 @@ #include "rpc_task.h" #include "rpc_readdir.h" +/* static */ +std::atomic nfs_client::ra_scale_factor = 1.0; +/* static */ +std::atomic nfs_client::fc_scale_factor = 1.0; + // The user should first init the client class before using it. bool nfs_client::init() { @@ -209,6 +214,197 @@ void nfs_client::shutdown() jukebox_thread.join(); } +void nfs_client::periodic_updater() +{ + // Maximum cache size allowed in bytes. + static const uint64_t max_cache = + (aznfsc_cfg.cache.data.user.max_size_mb * 1024 * 1024ULL); + assert(max_cache != 0); + + /* + * #1 Calculate recent read/write throughput. + */ + static std::atomic last_sec; + static std::atomic last_bytes_written; + static std::atomic last_bytes_read; + static std::atomic last_genid; + const time_t now_sec = ::time(NULL); + const int sample_intvl = 5; + + assert(GET_GBL_STATS(tot_bytes_written) >= last_bytes_written); + assert(GET_GBL_STATS(tot_bytes_read) >= last_bytes_read); + assert(now_sec >= last_sec); + + /* + * Every sample_intvl, compute read/write throughput for the last + * interval. Only one thread should update the throughput. + */ + const int intvl = now_sec - last_sec; + if (intvl >= sample_intvl) { + uint64_t expected = last_genid.load(); + if (rw_genid.compare_exchange_strong(expected, expected + 1)) { + w_MBps = (GET_GBL_STATS(tot_bytes_written) - last_bytes_written) / + ((now_sec - last_sec) * 1000'000); + r_MBps = (GET_GBL_STATS(tot_bytes_read) - last_bytes_read) / + ((now_sec - last_sec) * 1000'000); + + last_sec = now_sec; + last_bytes_read = GET_GBL_STATS(tot_bytes_read); + last_bytes_written = GET_GBL_STATS(tot_bytes_written); + last_genid = rw_genid.load(); + } + } + + /* + * #2 Update scaling factor for readahead and flush/commit. + * The way we decide optimal values of these scaling factors is by + * letting each of these grow to take up more space as soon as its + * share of the cache grows and reduce the scaling to take up less + * space as its share grows. These competing forces result in both + * reaching equilibrium. + */ + + /* + * cache => total cache allocated, read + write. + * wcache => cache allocated by writers. 
+     * rcache => cache allocated by readers.
+     *
+     * Note: We use dirty and commit_pending to indicate cache used by writers.
+     *       This assumes that cache is released immediately after write or
+     *       commit completes, else we will account for write cache as read.
+     */
+    const uint64_t cache = bytes_chunk_cache::bytes_allocated_g;
+    const uint64_t wcache = bytes_chunk_cache::bytes_dirty_g +
+                            bytes_chunk_cache::bytes_commit_pending_g;
+    const uint64_t rcache = (uint64_t) std::max((int64_t)(cache - wcache), 0L);
+
+    /*
+     * Read cache usage as percent of max_cache.
+     * This tells how aggressively readers are filling up the cache by
+     * readahead. We increase readahead if this is low and reduce readahead
+     * as this grows.
+     */
+    const double pct_rcache = (rcache * 100.0) / max_cache;
+
+    /*
+     * Write cache usage as percent of max_cache.
+     * This tells how aggressively writers are filling up the cache by adding
+     * dirty data. We reduce max_dirty_extent_bytes() if this grows so that
+     * dirty data is flushed faster, and increase max_dirty_extent_bytes() if
+     * this is low and we want to accumulate more dirty data and flush in
+     * bigger chunks, for fast backend write throughput.
+     */
+    const double pct_wcache = (wcache * 100.0) / max_cache;
+
+    /*
+     * Total cache usage (read + write) as percent of max_cache.
+     * Reads and writes are controlled individually as per pct_rcache and
+     * pct_wcache, but in the end total cache usage is consulted and the
+     * read and write scale factors are reduced accordingly.
+     */
+    const double pct_cache = (cache * 100.0) / max_cache;
+
+    assert(pct_cache <= 100.0);
+    assert((pct_rcache + pct_wcache) <= pct_cache);
+
+    double rscale = 1.0;
+    double wscale = 1.0;
+
+    /*
+     * Stop readahead completely if we are beyond the max cache size, o/w scale
+     * it down proportionately to keep the cache size less than max cache limit.
+     * We also scale up the readahead to make better utilization of the allowed
+     * cache size, when there are fewer reads they are allowed to use more of
+     * the cache per user for readahead.
+     *
+     * Just like read, write also tries to mop up cache space. Those two apply
+     * opposing forces, finally reaching an equilibrium.
+     */
+    if (pct_rcache >= 100) {
+        /*
+         * Reads are taking up all the cache space, completely stop readaheads.
+         * This will cause read cache utilization to drop and then we will
+         * increase readaheads, finally it'll settle on an optimal value.
+         */
+        rscale = 0;
+    } else if (pct_rcache > 95) {
+        rscale = 0.5;
+    } else if (pct_rcache > 90) {
+        rscale = 0.7;
+    } else if (pct_rcache > 80) {
+        rscale = 0.8;
+    } else if (pct_rcache > 70) {
+        rscale = 0.9;
+    } else if (pct_rcache < 3) {
+        rscale = 32;
+    } else if (pct_rcache < 5) {
+        rscale = 24;
+    } else if (pct_rcache < 10) {
+        rscale = 12;
+    } else if (pct_rcache < 20) {
+        rscale = 6;
+    } else if (pct_rcache < 30) {
+        rscale = 4;
+    } else if (pct_rcache < 50) {
+        rscale = 2.5;
+    }
+
+    if (pct_wcache > 95) {
+        /*
+         * Every file has a fundamental right to 100MB of cache space.
+         * If we reduce it further we will end up in sub-optimal writes
+         * to the server.
+         */
+        wscale = 1.0/10;
+    } else if (pct_wcache > 90) {
+        // 200MB
+        wscale = 2.0/10;
+    } else if (pct_wcache > 80) {
+        // 300MB
+        wscale = 3.0/10;
+    } else if (pct_wcache > 70) {
+        // 400MB
+        wscale = 4.0/10;
+    } else if (pct_wcache > 60) {
+        // 600MB
+        wscale = 6.0/10;
+    } else if (pct_wcache > 50) {
+        // 700MB
+        wscale = 7.0/10;
+    }
+
+    static uint64_t last_log_sec;
+    bool log_now = false;
+
+    // Don't log more frequently than 5 secs.
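The rscale/wscale threshold tables above are the heart of periodic_updater(); the rate-limited logging that publishes these values continues in the hunk just below. For reference, here is a minimal standalone restatement of those two tables as pure functions, which makes the mapping easy to unit test in isolation. The thresholds are copied from the hunk above; ra_scale_for() and fc_scale_for() are illustrative names, not the project's code.

```cpp
#include <cassert>
#include <cstdio>

// Readahead scale as a function of read cache utilization (percent).
static double ra_scale_for(double pct_rcache) {
    if (pct_rcache >= 100) return 0;     // stop readahead entirely
    if (pct_rcache > 95)   return 0.5;
    if (pct_rcache > 90)   return 0.7;
    if (pct_rcache > 80)   return 0.8;
    if (pct_rcache > 70)   return 0.9;
    if (pct_rcache < 3)    return 32;
    if (pct_rcache < 5)    return 24;
    if (pct_rcache < 10)   return 12;
    if (pct_rcache < 20)   return 6;
    if (pct_rcache < 30)   return 4;
    if (pct_rcache < 50)   return 2.5;
    return 1.0;
}

// Flush/commit scale as a function of write cache utilization (percent).
static double fc_scale_for(double pct_wcache) {
    if (pct_wcache > 95) return 1.0/10;  // ~100MB of dirty data per file
    if (pct_wcache > 90) return 2.0/10;
    if (pct_wcache > 80) return 3.0/10;
    if (pct_wcache > 70) return 4.0/10;
    if (pct_wcache > 60) return 6.0/10;
    if (pct_wcache > 50) return 7.0/10;
    return 1.0;
}

int main() {
    // Spot checks against the tables above.
    assert(ra_scale_for(2) == 32);      // almost no read cache in use
    assert(ra_scale_for(60) == 1.0);    // mid range, no scaling
    assert(fc_scale_for(96) == 1.0/10); // heavy write cache pressure
    assert(fc_scale_for(40) == 1.0);
    std::printf("scale tables ok\n");
    return 0;
}
```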
+ if ((now_sec - last_log_sec) >= 5) { + log_now = true; + last_log_sec = now_sec; + } + + if (fc_scale_factor != wscale) { + if (log_now) { + AZLogInfo("[FC] Scale factor updated ({} -> {}), " + "cache util [R: {:0.2f}%, W: {:0.2f}%, T: {:0.2f}%]", + fc_scale_factor.load(), wscale, + pct_rcache, pct_wcache, pct_cache); + } + fc_scale_factor = wscale; + assert(fc_scale_factor >= 1.0/10); + } + + if (ra_scale_factor != rscale) { + if (log_now) { + AZLogInfo("[RA] Scale factor updated ({} -> {}), " + "cache util [R: {:0.2f}%, W: {:0.2f}%, T: {:0.2f}%]", + ra_scale_factor.load(), rscale, + pct_rcache, pct_wcache, pct_cache); + } + ra_scale_factor = rscale; + assert(ra_scale_factor >= 0); + } +} + void nfs_client::jukebox_runner() { AZLogDebug("Started jukebox_runner"); diff --git a/turbonfs/src/nfs_inode.cpp b/turbonfs/src/nfs_inode.cpp index 3087d856..98f30516 100644 --- a/turbonfs/src/nfs_inode.cpp +++ b/turbonfs/src/nfs_inode.cpp @@ -663,6 +663,8 @@ void nfs_inode::sync_membufs(std::vector &bc_vec, */ assert(!is_commit_in_progress()); + INC_GBL_STATS(num_sync_membufs, 1); + if (bc_vec.empty()) { return; } @@ -818,6 +820,8 @@ void nfs_inode::sync_membufs(std::vector &bc_vec, continue; } + INC_GBL_STATS(tot_bytes_sync_membufs, mb->length.load()); + if (write_task == nullptr) { write_task = get_client()->get_rpc_task_helper()->alloc_rpc_task(FUSE_WRITE); @@ -1422,7 +1426,7 @@ int nfs_inode::flush_cache_and_wait() " flush to complete still waiting, iter: {}", get_fuse_ino(), iter); } - ::usleep(1000); + ::usleep(10 * 1000); } return get_write_error(); diff --git a/turbonfs/src/readahead.cpp b/turbonfs/src/readahead.cpp index bcb32a3c..45ef796b 100644 --- a/turbonfs/src/readahead.cpp +++ b/turbonfs/src/readahead.cpp @@ -18,8 +18,6 @@ namespace aznfsc { -/* static */ std::atomic ra_state::ra_scale_factor = 1.0; - /** * This is called from alloc_rastate() with exclusive lock on ilock_1. */ @@ -64,62 +62,9 @@ ra_state::ra_state(struct nfs_client *_client, inode->get_fuse_ino(), ra_bytes, def_ra_size); } -/* static */ -void ra_state::update_scale_factor() +uint64_t ra_state::get_ra_bytes() const { - // Maximum cache size allowed in bytes. - static const uint64_t max_cache = - (aznfsc_cfg.cache.data.user.max_size_mb * 1024 * 1024ULL); - assert(max_cache != 0); - const uint64_t curr_cache = bytes_chunk_cache::bytes_allocated_g; - const double percent_cache = (curr_cache * 100.0) / max_cache; - double scale = 1.0; - - /* - * Stop readahead fully if we are beyond the max cache size, o/w scale it - * down proportionately to keep the cache size less than max cache limit. - * We also scale up the readahead to make better utilization of the allowed - * cache size, when there are fewer reads they are allowed to use more of - * the cache per user for readahead. We increase the scale factor just a - * little less than what would exhaust the entire cache, e.g., if percent - * cache utilization is ~5% it means if we increase the readahead by 20 - * times we should be able to get all of cache utilized by readaheads, - * we increase the scale factor to slightly less than 20, to 16, and so - * on. 
- */ - if (percent_cache >= 100) { - scale = 0; - } else if (percent_cache > 95) { - scale = 0.5; - } else if (percent_cache > 90) { - scale = 0.7; - } else if (percent_cache > 80) { - scale = 0.8; - } else if (percent_cache > 70) { - scale = 0.9; - } else if (percent_cache < 5) { - scale = 16; - } else if (percent_cache < 10) { - scale = 8; - } else if (percent_cache < 20) { - scale = 4; - } else if (percent_cache < 30) { - scale = 2; - } else if (percent_cache < 50) { - scale = 1.5; - } - - if (ra_scale_factor != scale) { - static uint64_t last_log_usec; - const uint64_t now = get_current_usecs(); - // Don't log more frequently than 5 secs. - if ((now - last_log_usec) > (5 * 1000 * 1000)) { - AZLogInfo("[Readahead] Scale factor updated ({} -> {})", - ra_scale_factor.load(), scale); - last_log_usec = now; - } - ra_scale_factor = scale; - } + return ra_bytes * nfs_client::get_ra_scale_factor(); } /** @@ -244,6 +189,7 @@ static void readahead_callback ( goto delete_ctx; } else { UPDATE_INODE_ATTR(inode, res->READ3res_u.resok.file_attributes); + INC_GBL_STATS(tot_bytes_read, res->READ3res_u.resok.count); /* * Only first read call would have bc->pvt == 0, for subsequent calls @@ -722,6 +668,8 @@ int ra_state::issue_readaheads() inode->get_fuse_ino(), num_no_readahead.load(), ra_offset); } + } else { + INC_GBL_STATS(num_readhead, 1); } return ra_issued; diff --git a/turbonfs/src/rpc_stats.cpp b/turbonfs/src/rpc_stats.cpp index 2e29eed1..b85ac5d8 100644 --- a/turbonfs/src/rpc_stats.cpp +++ b/turbonfs/src/rpc_stats.cpp @@ -18,6 +18,7 @@ namespace aznfsc { /* static */ std::atomic rpc_stats_az::bytes_read_from_cache = 0; /* static */ std::atomic rpc_stats_az::bytes_zeroed_from_cache = 0; /* static */ std::atomic rpc_stats_az::bytes_read_ahead = 0; +/* static */ std::atomic rpc_stats_az::num_readhead = 0; /* static */ std::atomic rpc_stats_az::tot_getattr_reqs = 0; /* static */ std::atomic rpc_stats_az::getattr_served_from_cache = 0; /* static */ std::atomic rpc_stats_az::tot_lookup_reqs = 0; @@ -34,6 +35,8 @@ namespace aznfsc { /* static */ std::atomic rpc_stats_az::commit_lp = 0; /* static */ std::atomic rpc_stats_az::commit_gp = 0; /* static */ std::atomic rpc_stats_az::writes_np = 0; +/* static */ std::atomic rpc_stats_az::num_sync_membufs = 0; +/* static */ std::atomic rpc_stats_az::tot_bytes_sync_membufs = 0; /* static */ std::atomic rpc_stats_az::rpc_tasks_allocated = 0; /* static */ std::atomic rpc_stats_az::fuse_responses_awaited = 0; /* static */ std::atomic rpc_stats_az::fuse_reply_failed = 0; @@ -176,6 +179,11 @@ void rpc_stats_az::dump_stats() str += " " + std::to_string(num_silly_renamed) + " inodes silly-renamed (waiting for last close)\n"; + // Maximum cache size allowed in bytes. 
+ static const uint64_t max_cache = + (aznfsc_cfg.cache.data.user.max_size_mb * 1024 * 1024ULL); + assert(max_cache != 0); + str += "File Cache statistics:\n"; if (aznfsc_cfg.cache.data.user.enable) { str += " " + std::to_string(aznfsc_cfg.cache.data.user.max_size_mb) + @@ -192,12 +200,22 @@ void rpc_stats_az::dump_stats() " file caches\n"; str += " " + std::to_string(bytes_chunk_cache::num_chunks_g) + " chunks in chunkmap\n"; + + const double allocate_pct = + ((bytes_chunk_cache::bytes_allocated_g * 100.0) / max_cache); str += " " + std::to_string(bytes_chunk_cache::bytes_allocated_g) + - " bytes allocated\n"; + " bytes allocated (" + + std::to_string(allocate_pct) + "%)\n"; + str += " " + std::to_string(bytes_chunk_cache::bytes_cached_g) + " bytes cached\n"; + + const double dirty_pct = + ((bytes_chunk_cache::bytes_dirty_g * 100.0) / max_cache); str += " " + std::to_string(bytes_chunk_cache::bytes_dirty_g) + - " bytes dirty\n"; + " bytes dirty (" + + std::to_string(dirty_pct) + "%)\n"; + str += " " + std::to_string(bytes_chunk_cache::bytes_flushing_g) + " bytes currently flushing\n"; str += " " + std::to_string(bytes_chunk_cache::bytes_commit_pending_g) + @@ -262,8 +280,13 @@ void rpc_stats_az::dump_stats() str += " " + std::to_string(GET_GBL_STATS(bytes_zeroed_from_cache)) + " bytes holes read from cache (" + std::to_string(hole_cache_pct) + "%)\n"; + + const uint64_t avg_ra_size = + num_readhead ? (bytes_read_ahead / num_readhead) : 0; str += " " + std::to_string(GET_GBL_STATS(bytes_read_ahead)) + - " bytes read by readahead\n"; + " bytes read by readahead with avg size " + + std::to_string(avg_ra_size) + " bytes and ra scale factor " + + std::to_string(nfs_client::get_ra_scale_factor()) + "\n"; const uint64_t avg_write_size = tot_write_reqs ? (tot_bytes_written / tot_write_reqs) : 0; @@ -275,6 +298,7 @@ void rpc_stats_az::dump_stats() str += " " + std::to_string(GET_GBL_STATS(failed_write_reqs)) + " application writes failed\n"; } + str += " " + std::to_string(GET_GBL_STATS(writes_np)) + " writes did not hit any memory pressure\n"; str += " " + std::to_string(GET_GBL_STATS(inline_writes)) + @@ -294,6 +318,13 @@ void rpc_stats_az::dump_stats() str += " " + std::to_string(GET_GBL_STATS(commit_gp)) + " commits triggered as global cache limit was reached\n"; + const uint64_t avg_sync_membufs_size = + num_sync_membufs ? (tot_bytes_sync_membufs / num_sync_membufs) : 0; + str += " " + std::to_string(GET_GBL_STATS(num_sync_membufs)) + + " sync_membufs calls with avg size " + + std::to_string(avg_sync_membufs_size) + " bytes and fc scale factor " + + std::to_string(nfs_client::get_fc_scale_factor()) + "\n"; + const double getattr_cache_pct = tot_getattr_reqs ? ((getattr_served_from_cache * 100.0) / tot_getattr_reqs) : 0; @@ -348,6 +379,13 @@ do { \ str += " Avg Total execute time: " + \ std::to_string(ops.total_usec / (ops.count * 1000.0)) + \ " msec\n"; \ + if (opcode == FUSE_READ) { \ + str += " Last 5 sec throughput: " + \ + std::to_string(client.get_read_MBps()) + " MBps\n"; \ + } else if (opcode == FUSE_WRITE) { \ + str += " Last 5 sec throughput: " + \ + std::to_string(client.get_write_MBps()) + " MBps\n"; \ + } \ if (!ops.error_map.empty()) { \ str += " Errors encountered: \n"; \ for (const auto& entry : ops.error_map) { \ diff --git a/turbonfs/src/rpc_task.cpp b/turbonfs/src/rpc_task.cpp index 488eb36f..f278c547 100644 --- a/turbonfs/src/rpc_task.cpp +++ b/turbonfs/src/rpc_task.cpp @@ -674,7 +674,7 @@ void rpc_task::init_read_be(fuse_ino_t ino, * * TODO: Control this with a config. 
*/ - set_csched(CONN_SCHED_RR); + set_csched(CONN_SCHED_RR_R); assert(!rpc_api->read_task.is_fe()); assert(rpc_api->read_task.is_be()); @@ -1563,7 +1563,7 @@ void rpc_task::issue_write_rpc() * issues as seen by stable writes. */ if (!inode->is_stable_write()) { - set_csched(CONN_SCHED_RR); + set_csched(CONN_SCHED_RR_W); } do { @@ -1590,6 +1590,12 @@ void rpc_task::issue_write_rpc() ::sleep(5); } } while (rpc_retry); + + /* + * Write bytes are counted when we send them to libnfs as that's when they + * appear on the wire. + */ + INC_GBL_STATS(tot_bytes_written, bciov->length); } static void statfs_callback( @@ -3750,7 +3756,6 @@ void rpc_task::send_read_response() AZLogDebug("[{}] Sending empty read response", ino); reply_iov(nullptr, 0); } else { - INC_GBL_STATS(tot_bytes_read, bytes_read); AZLogDebug("[{}] Sending success read response, iovec={}, " "bytes_read={}", ino, count, bytes_read); @@ -3866,6 +3871,12 @@ static void read_callback( */ UPDATE_INODE_ATTR(inode, res->READ3res_u.resok.file_attributes); + /* + * Reads are counted in the callback as that's when the read + * responses have just come on the wire. + */ + INC_GBL_STATS(tot_bytes_read, res->READ3res_u.resok.count); + #ifdef ENABLE_PRESSURE_POINTS /* * Short read pressure point, skip when eof received. diff --git a/turbonfs/src/rpc_transport.cpp b/turbonfs/src/rpc_transport.cpp index c8c58222..471867ca 100644 --- a/turbonfs/src/rpc_transport.cpp +++ b/turbonfs/src/rpc_transport.cpp @@ -106,27 +106,81 @@ void rpc_transport::close() /* * This function decides which connection should be chosen for sending * the current request. - * TODO: This is round-robined for now, should be modified later. */ struct nfs_context *rpc_transport::get_nfs_context(conn_sched_t csched, uint32_t fh_hash) const { - uint32_t idx = 0; + int idx = 0; + const int nconn = client->mnt_options.num_connections; + assert(nconn > 0); + const int rconn = nconn / 3; + const int wconn = nconn - rconn; + + static std::atomic last_sec; + static std::atomic rnw = false; + uint64_t now_sec = ::time(NULL); + + assert(now_sec >= last_sec); + + /* + * Take stock of things, no sooner than 5 secs. + */ + if (now_sec > (last_sec + 5)) { + const uint64_t r_MBps = client->get_read_MBps(); + const uint64_t w_MBps = client->get_write_MBps(); + + /* + * If both read and write are happening, assign them to separate + * connection pool, else writes may slow down reads as small write + * responses have to wait behind large read responses and v.v. + */ + if (w_MBps > 100 && r_MBps > 100) { + rnw = (nconn >= 4); + AZLogInfo("[RNW=true] Write: {} Gbps, Read: {} Gbps", + (w_MBps * 8.0) / 1000, (r_MBps * 8.0) / 1000); + } else { + rnw = false; + AZLogInfo("[RNW=false] Write: {} Gbps, Read: {} Gbps", + (w_MBps * 8.0) / 1000, (r_MBps * 8.0) / 1000); + } + + assert(!rnw || (rconn > 0 && wconn > 0)); + last_sec = now_sec; + } switch (csched) { case CONN_SCHED_FIRST: idx = 0; break; - case CONN_SCHED_RR: - idx = (last_context++ % client->mnt_options.num_connections); + case CONN_SCHED_RR_R: + /* + * Reads get top 1/3rd of connections when there are simultaneous + * readers and writers, else they get all the connections. + */ + idx = (rnw ? (wconn + (last_context++ % rconn)) + : (last_context++ % nconn)); + break; + case CONN_SCHED_RR_W: + /* + * Writes get low 2/3rd of connections when there are simultaneous + * readers and writers, else they get all the connections. 
+ * + * TODO: We should not change write connection ranges while there + * are old writes still pending else we will get slowed down + * by optimistic concurrency backoff. + */ + idx = (rnw ? (last_context++ % wconn) + : (last_context++ % nconn)); break; case CONN_SCHED_FH_HASH: assert(fh_hash != 0); - idx = fh_hash % client->mnt_options.num_connections; + idx = rnw ? (fh_hash % wconn) : (fh_hash % nconn); break; default: assert(0); } + assert(idx >= 0 && idx < client->mnt_options.num_connections); + return nfs_connections[idx]->get_nfs_context(); }
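The connection split in get_nfs_context() is easiest to see with concrete numbers: with the sample config's nconnect: 96, rconn = 96 / 3 = 32 and wconn = 96 - 32 = 64, so while both readers and writers are active (rnw) reads round-robin over indices [64, 95] and writes over [0, 63]; otherwise either class spreads over all 96 connections. A small standalone sketch of that index math follows, assuming the same integer split; rr_read_index() and rr_write_index() are illustrative helpers, not part of the project, and rr stands in for the last_context++ counter.

```cpp
#include <cassert>
#include <cstdint>

// Mirrors the index selection for the round-robin schedulers, under the same
// nconn/rconn/wconn split as in rpc_transport::get_nfs_context() above.
static int rr_read_index(uint64_t rr, int nconn, bool rnw) {
    const int rconn = nconn / 3;
    const int wconn = nconn - rconn;
    // Reads take the top third only when read and write traffic overlap,
    // else they may use the whole pool.
    return rnw ? (int)(wconn + (rr % rconn)) : (int)(rr % nconn);
}

static int rr_write_index(uint64_t rr, int nconn, bool rnw) {
    const int rconn = nconn / 3;
    const int wconn = nconn - rconn;
    // Writes take the bottom two thirds when sharing, else the whole pool.
    return rnw ? (int)(rr % wconn) : (int)(rr % nconn);
}

int main() {
    // nconnect: 96 as in the sample config => rconn = 32, wconn = 64.
    for (uint64_t rr = 0; rr < 1000; rr++) {
        const int r = rr_read_index(rr, 96, true);
        const int w = rr_write_index(rr, 96, true);
        assert(r >= 64 && r <= 95);   // reads stay in the top 1/3rd
        assert(w >= 0 && w <= 63);    // writes stay in the bottom 2/3rd
    }
    return 0;
}
```

Keeping the two ranges disjoint is what prevents small write responses from queuing behind large read responses (and vice versa) on the same TCP connection.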