2 changes: 1 addition & 1 deletion turbonfs/extern/libnfs
Submodule libnfs updated 1 file
+48 −24 lib/socket.c
1 change: 1 addition & 0 deletions turbonfs/inc/aznfsc.h
@@ -48,6 +48,7 @@ static_assert(AZNFSCFG_WSIZE_MAX == AZNFSCFG_RSIZE_MAX);
#define AZNFSCFG_READDIR_MAX 4194304
#define AZNFSCFG_READAHEAD_KB_MIN 128
#define AZNFSCFG_READAHEAD_KB_MAX 1048576
#define AZNFSCFG_READAHEAD_KB_DEF 16384
#define AZNFSCFG_FUSE_MAX_BG_MIN 1
#define AZNFSCFG_FUSE_MAX_BG_MAX 65536
#define AZNFSCFG_FUSE_MAX_BG_DEF 4096
28 changes: 0 additions & 28 deletions turbonfs/inc/fcsm.h
@@ -225,17 +225,6 @@ class fcsm
void ctgtq_cleanup();
void ftgtq_cleanup();

/**
* Update fc_scale_factor according to the current cache pressure.
* When global cache utilization is high, it reduces fc_scale_factor so
* that all writers flush/commit early, for easing global memory pressure.
*/
static void update_fc_scale_factor();
static double get_fc_scale_factor()
{
return fc_scale_factor;
}

private:
/*
* The singleton nfs_client, for convenience.
@@ -345,23 +334,6 @@ class fcsm
* The state machine starts in an idle state.
*/
std::atomic<bool> running = false;

/*
* Value returned by max_dirty_extent_bytes() is scaled down by this much
* before it's used by:
* - flush_required()
* - commit_required()
* - do_inline_write()
*
* fc_scale_factor is computed by update_fc_scale_factor() according to
* the global cache pressure. If global cache pressure is high we want the
* local flush/commit limits to be reduced so that each file flushes/commits
* faster thus easing the global cache pressure. This promotes fair sharing
* of global cache space while also maintaining enough contiguous data to
* the server, needed for better write throughput. Stable and unstable
* write may use this scale factor differently.
*/
static std::atomic<double> fc_scale_factor;
};

struct FC_CB_TRACKER
44 changes: 7 additions & 37 deletions turbonfs/inc/file_cache.h
@@ -1389,37 +1389,7 @@ class bytes_chunk_cache
* scheduler an opportunity to merge better.
* For unstable writes this allows us enough PB parallelism.
*/
static uint64_t max_dirty_extent_bytes()
{
// Maximum cache size allowed in bytes.
static const uint64_t max_total =
(aznfsc_cfg.cache.data.user.max_size_mb * 1024 * 1024ULL);
assert(max_total != 0);

/*
* Capped due to global cache size. One single file should not use
* more than 60% of the cache.
*/
static const uint64_t max_dirty_extent_g = (max_total * 0.6);

/*
* Capped due to per-file cache discipline.
* Every file wants to keep 10 full sized blocks but that can be
* reduced as per the current cache pressure, but never less than
* one full size block.
*/
static const uint64_t max_dirty_extent_l =
(10 * AZNFSC_MAX_BLOCK_SIZE) * fcsm::get_fc_scale_factor();
assert(max_dirty_extent_l >= AZNFSC_MAX_BLOCK_SIZE);

const uint64_t max_dirty_extent =
std::min(max_dirty_extent_g, max_dirty_extent_l);

// At least one full sized block.
assert(max_dirty_extent >= AZNFSC_MAX_BLOCK_SIZE);

return max_dirty_extent;
}
static uint64_t max_dirty_extent_bytes();

/**
* Get the amount of dirty data that needs to be flushed.
@@ -1684,8 +1654,8 @@ class bytes_chunk_cache
assert(max_total != 0);

/*
* If cache usage grows to 80% of max, we enforce inline pruning for
* writers. When cache usage grows more than 60% we recommend periodic
* If cache usage grows to 90% of max, we enforce inline pruning for
* writers. When cache usage grows more than 70% we recommend periodic
* pruning. If the cache size is sufficient, hopefully we will not need
* inline pruning too often, as it hurts application write performance.
* Once curr_bytes_total exceeds inline_threshold we need to perform
@@ -1696,10 +1666,10 @@
* Following also means that at any time, half of the cache_max_mb
* can be safely present in the cache.
*/
static const uint64_t inline_threshold = (max_total * 0.8);
static const uint64_t inline_target = (max_total * 0.7);
static const uint64_t periodic_threshold = (max_total * 0.6);
static const uint64_t periodic_target = (max_total * 0.5);
static const uint64_t inline_threshold = (max_total * 0.9);
static const uint64_t inline_target = (max_total * 0.8);
static const uint64_t periodic_threshold = (max_total * 0.7);
static const uint64_t periodic_target = (max_total * 0.6);

/*
* Current total cache size in bytes. Save it once to avoid issues
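To make the cache-pressure logic above easier to follow, here is a minimal self-contained sketch of how max_dirty_extent_bytes() (now declared out of line) and the 90%/70% pruning thresholds could fit together. The block size, cache size, the clamping (in place of the original asserts) and the prune_action/pruning_needed names are illustrative assumptions, not the code this PR adds to the .cpp files.

```cpp
#include <algorithm>
#include <cstdint>

// Illustrative stand-ins; real values come from aznfsc_cfg and
// AZNFSC_MAX_BLOCK_SIZE (both assumed here).
static constexpr uint64_t kBlockSize     = 16ULL << 20;   // one full-sized block
static constexpr uint64_t kCacheMaxBytes = 4096ULL << 20; // cache.data.user.max_size_mb

// Per-file dirty-data cap: the smaller of 60% of the global cache and
// 10 full blocks scaled by fc_scale_factor, never below one full block.
uint64_t max_dirty_extent_bytes(double fc_scale_factor)
{
    const uint64_t cap_global = (uint64_t)(kCacheMaxBytes * 0.6);
    const uint64_t cap_local  = std::max<uint64_t>(
        (uint64_t)((10 * kBlockSize) * fc_scale_factor), kBlockSize);

    return std::max(std::min(cap_global, cap_local), kBlockSize);
}

// Pruning decision per the 90%/70% thresholds in the comment above; the
// matching targets (80%/60%) would bound how far pruning shrinks the cache.
enum class prune_action { none, periodic, inline_prune };

prune_action pruning_needed(uint64_t curr_bytes_total)
{
    static const uint64_t inline_threshold   = (uint64_t)(kCacheMaxBytes * 0.9);
    static const uint64_t periodic_threshold = (uint64_t)(kCacheMaxBytes * 0.7);

    if (curr_bytes_total > inline_threshold)
        return prune_action::inline_prune;   // writers must prune inline
    if (curr_bytes_total > periodic_threshold)
        return prune_action::periodic;       // background pruning recommended
    return prune_action::none;
}
```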
76 changes: 76 additions & 0 deletions turbonfs/inc/nfs_client.h
@@ -137,6 +137,38 @@ struct nfs_client
std::atomic<uint64_t> max_ino = 0;
#endif

/*
* Last 5 sec read and write throughput.
* rw_genid is updated every time these values are updated, so it can be
* used to detect when the throughput values change.
*/
std::atomic<uint64_t> r_MBps = 0;
std::atomic<uint64_t> w_MBps = 0;
std::atomic<uint64_t> rw_genid = 0;

/*
* Value returned by max_dirty_extent_bytes() is scaled down by this much
* before it's used by:
* - flush_required()
* - commit_required()
* - do_inline_write()
*
* fc_scale_factor is computed by periodic_updater() according to the global
* cache pressure. If global cache pressure is high we want the local
* flush/commit limits to be reduced so that each file flushes/commits
* sooner, thus easing the global cache pressure. This promotes fair sharing
* of the global cache space while still sending enough contiguous data to
* the server, which is needed for good write throughput. Stable and unstable
* writes may use this scale factor differently.
*/
static std::atomic<double> fc_scale_factor;

/*
* periodic_updater() will update this scaling factor to force all ra_state
* machines to slow down readahead in case of high memory pressure.
*/
static std::atomic<double> ra_scale_factor;

/*
* Set in shutdown() to let others know that nfs_client is shutting
* down. They can use this to quit what they are doing and plan for
@@ -185,6 +217,18 @@ struct nfs_client
return client;
}

static double get_fc_scale_factor()
{
assert(fc_scale_factor >= 1.0/10);
return fc_scale_factor;
}

static double get_ra_scale_factor()
{
assert(ra_scale_factor >= 0);
return ra_scale_factor;
}

/**
* Returns true if nfs_client is shutting down.
*/
@@ -214,6 +258,38 @@ struct nfs_client
return inode_map_lock_0;
}

/**
* Update the various values that need to be refreshed periodically:
* - Last 5 sec read and write throughput.
* - Readahead scale factor for controlling readahead amount, and
* - Flush/commit dirty data scale factor for controlling how long we keep
* dirty data before flushing/committing.
*
* Call this from a code path that executes frequently.
*/
void periodic_updater();

/**
* Get last 5 sec read throughput in MBps.
*/
uint64_t get_read_MBps() const
{
return r_MBps;
}

/**
* Get last 5 sec write throughput in MBps.
*/
uint64_t get_write_MBps() const
{
return w_MBps;
}

uint64_t get_rw_genid() const
{
return rw_genid;
}

/*
* The user should first init the client class before using it.
*/
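The diff declares periodic_updater(), the throughput counters and the two scale factors, but not their implementation. Below is a hedged, self-contained sketch of what such an updater could look like: a 5-second window for r_MBps/w_MBps, a generation bump via rw_genid, and cache-utilization-driven scale factors (the 0.1 floor mirrors the assert in get_fc_scale_factor()). The cumulative byte counters, the exact scaling formulas and the single-caller assumption are all illustrative.

```cpp
#include <algorithm>
#include <atomic>
#include <chrono>
#include <cstdint>

// Illustrative counters/knobs; the real ones live in nfs_client/rpc_stats_az.
static std::atomic<uint64_t> total_bytes_read{0};
static std::atomic<uint64_t> total_bytes_written{0};
static std::atomic<uint64_t> r_MBps{0}, w_MBps{0}, rw_genid{0};
static std::atomic<double> fc_scale_factor{1.0};
static std::atomic<double> ra_scale_factor{1.0};

// Assumed to be called from a hot path by a single thread; recomputes the
// stats at most once every 5 seconds.
void periodic_updater(uint64_t cache_bytes_used, uint64_t cache_bytes_max)
{
    using clock = std::chrono::steady_clock;
    static clock::time_point last = clock::now();
    static uint64_t last_read = 0, last_written = 0;

    const auto now = clock::now();
    const auto secs =
        std::chrono::duration_cast<std::chrono::seconds>(now - last).count();
    if (secs < 5)
        return;

    // Last 5 sec read/write throughput in MBps; bump rw_genid so callers
    // can detect that the throughput values changed.
    const uint64_t rd = total_bytes_read.load();
    const uint64_t wr = total_bytes_written.load();
    r_MBps = (rd - last_read) / (1024 * 1024) / secs;
    w_MBps = (wr - last_written) / (1024 * 1024) / secs;
    rw_genid++;

    last = now;
    last_read = rd;
    last_written = wr;

    // Higher cache utilization => smaller scale factors, so writers
    // flush/commit sooner and readers shrink their readahead window.
    // The 0.1 floor matches the assert in get_fc_scale_factor().
    const double util = (double)cache_bytes_used / cache_bytes_max;
    fc_scale_factor = std::clamp(1.0 - util, 0.1, 1.0);
    ra_scale_factor = std::clamp(1.5 - util, 0.0, 1.5);
}
```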
14 changes: 1 addition & 13 deletions turbonfs/inc/readahead.h
@@ -305,22 +305,10 @@ class ra_state
max_byte_read = UINT64_MAX;
}

/**
* Update ra_scale_factor according to the current cache pressure.
* When global cache utilization is high, it reduces ra_scale_factor so
* that all readers use less ra window, for easing global memory pressure.
* Likewise when global cache utilization is low it increases the
* ra_scale_factor to let readers use higher readahead.
*/
static void update_scale_factor();

/**
* Returns the scaled ra window that caller can safely use.
*/
uint64_t get_ra_bytes() const
{
return ra_bytes * ra_scale_factor;
}
uint64_t get_ra_bytes() const;

/**
* This will run self tests to test the correctness of this class.
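get_ra_bytes() is now declared out of line; a plausible body, assuming it simply applies the global readahead scale factor to the configured window, would be something like the following (names and values are stand-ins, not the PR's .cpp code):

```cpp
#include <cstdint>

// Stand-ins for ra_state::ra_bytes and nfs_client::get_ra_scale_factor().
static const uint64_t ra_bytes = 16384ULL * 1024;  // configured readahead_kb, in bytes
static double ra_scale_factor  = 1.0;              // set by periodic_updater()

// Plausible body of the now out-of-line ra_state::get_ra_bytes(): the
// configured window scaled by the global readahead scale factor. A scale
// factor of 0 effectively disables readahead under memory pressure.
uint64_t get_ra_bytes()
{
    return (uint64_t)(ra_bytes * ra_scale_factor);
}
```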
11 changes: 9 additions & 2 deletions turbonfs/inc/rpc_stats.h
@@ -337,7 +337,9 @@ class rpc_stats_az
* This will indicate our readahead effectiveness.
* bytes_zeroed_from_cache: How many bytes were read from unmapped parts
* of the cache and hence were zero filled.
* bytes_read_ahead: How many bytes were read ahead.
* num_readhead: Number of readahead calls made.
* bytes_read_ahead: How many bytes were read ahead using num_readhead
* calls.
* tot_getattr_reqs: How many getattr requests were received from fuse.
* getattr_served_from_cache: How many were served from inode->attr cache.
* tot_lookup_reqs: How many lookup requests were received from fuse.
@@ -363,6 +365,8 @@
* beyond configured limit.
* commit_gp: How many time commit was issued as global cache grew beyond
* configured limit.
* num_sync_membufs: How many times sync_membufs() was called.
* tot_bytes_sync_membufs: Total bytes flushed by sync_membufs().
*/
static std::atomic<uint64_t> tot_read_reqs;
static std::atomic<uint64_t> failed_read_reqs;
@@ -371,6 +375,7 @@
static std::atomic<uint64_t> bytes_read_from_cache;
static std::atomic<uint64_t> bytes_zeroed_from_cache;
static std::atomic<uint64_t> bytes_read_ahead;
static std::atomic<uint64_t> num_readhead;
static std::atomic<uint64_t> tot_getattr_reqs;
static std::atomic<uint64_t> getattr_served_from_cache;
static std::atomic<uint64_t> tot_lookup_reqs;
@@ -387,6 +392,8 @@
static std::atomic<uint64_t> flush_gp;
static std::atomic<uint64_t> commit_lp;
static std::atomic<uint64_t> commit_gp;
static std::atomic<uint64_t> num_sync_membufs;
static std::atomic<uint64_t> tot_bytes_sync_membufs;

static std::atomic<uint64_t> rpc_tasks_allocated;
static std::atomic<uint64_t> fuse_responses_awaited;
@@ -395,7 +402,7 @@

#define INC_GBL_STATS(var, inc) rpc_stats_az::var += (inc)
#define DEC_GBL_STATS(var, dec) {assert(rpc_stats_az::var >= dec); rpc_stats_az::var -= (dec);}
#define GET_GBL_STATS(var) rpc_stats_az::var
#define GET_GBL_STATS(var) rpc_stats_az::var.load()

struct fuse_req_stats
{
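A small self-contained illustration of the stats-macro pattern and why GET_GBL_STATS now calls .load(): reading through load() yields a plain uint64_t that can safely be passed through printf-style varargs or copied, whereas the std::atomic object itself cannot. The counter and macros below only mirror the real rpc_stats_az ones.

```cpp
#include <atomic>
#include <cstdint>
#include <cstdio>

// Illustrative counter mirroring the rpc_stats_az statics above.
struct stats {
    static std::atomic<uint64_t> bytes_read_ahead;
};
std::atomic<uint64_t> stats::bytes_read_ahead{0};

#define INC_GBL_STATS(var, inc) stats::var += (inc)
#define GET_GBL_STATS(var)      stats::var.load()

int main()
{
    INC_GBL_STATS(bytes_read_ahead, 4096);

    // .load() returns a plain uint64_t, which is safe to pass through
    // printf's varargs; passing the std::atomic object itself would not be.
    std::printf("bytes_read_ahead=%llu\n",
                (unsigned long long)GET_GBL_STATS(bytes_read_ahead));
    return 0;
}
```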
3 changes: 1 addition & 2 deletions turbonfs/inc/rpc_task.h
@@ -2225,7 +2225,6 @@ struct rpc_task
DEC_GBL_STATS(fuse_responses_awaited, 1);
}

INC_GBL_STATS(tot_bytes_written, count);
free_rpc_task();
}

@@ -2607,7 +2606,7 @@ class rpc_task_helper
* used. Later init_*() method can set it to a more appropriate value.
*/
task->csched = (task->client->mnt_options.nfs_port == 2047) ?
CONN_SCHED_RR : CONN_SCHED_FH_HASH;
CONN_SCHED_RR_W : CONN_SCHED_FH_HASH;

#ifdef ENABLE_PARANOID
task->issuing_tid = ::gettid();
11 changes: 9 additions & 2 deletions turbonfs/inc/rpc_transport.h
@@ -20,15 +20,22 @@ typedef enum

/*
* Round robin requests over all connections.
* Use CONN_SCHED_RR_R for read requests and CONN_SCHED_RR_W for write
* requests. This helps the scheduler keep read and write requests on
* separate connections, otherwise small write responses may get stuck
* behind large read responses and small read requests may get stuck behind
* large write requests. Note that this is not completely avoidable even
* though we prioritize smaller read requests over larger write requests.
*/
CONN_SCHED_RR = 2,
CONN_SCHED_RR_R = 2,
CONN_SCHED_RR_W = 3,

/*
* Every file is affined to one connection based on the FH hash, so all
* requests to one file go over the same connection while different files
* will use different connections.
*/
CONN_SCHED_FH_HASH = 3,
CONN_SCHED_FH_HASH = 4,
} conn_sched_t;

/*
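To illustrate the split scheduling modes, here is a hypothetical dispatcher that maps them to connection indices. The 50/50 read/write split of the connection pool and the helper name pick_connection are assumptions for illustration only; the PR's actual transport code is not part of this diff.

```cpp
#include <algorithm>
#include <atomic>
#include <cassert>
#include <cstddef>
#include <cstdint>

// Scheduling modes as declared above (enum abridged to the values shown).
typedef enum {
    CONN_SCHED_RR_R    = 2,  // round robin, read requests
    CONN_SCHED_RR_W    = 3,  // round robin, write requests
    CONN_SCHED_FH_HASH = 4,  // all requests for a file use one connection
} conn_sched_t;

// Hypothetical dispatcher: reads round-robin over the lower part of the
// connection pool, writes over the upper part, FH hash gives per-file
// affinity. The 50/50 split is an assumption, not the PR's actual policy.
size_t pick_connection(conn_sched_t sched, size_t nconn, uint64_t fh_hash)
{
    static std::atomic<uint64_t> rr_read{0}, rr_write{0};

    assert(nconn > 0);
    const size_t read_conns  = std::max<size_t>(nconn / 2, 1);
    const size_t write_conns = std::max<size_t>(nconn - read_conns, 1);

    switch (sched) {
    case CONN_SCHED_RR_R:
        return rr_read++ % read_conns;
    case CONN_SCHED_RR_W:
        return (read_conns + rr_write++ % write_conns) % nconn;
    case CONN_SCHED_FH_HASH:
    default:
        return fh_hash % nconn;
    }
}
```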
23 changes: 16 additions & 7 deletions turbonfs/sample-turbo-config.yaml
@@ -6,7 +6,7 @@
account: sjc22prdste06hnfsv3acc1
container: nfsv3test
cloud_suffix: blob.preprod.core.windows.net
port: 2047
port: 2048

#
# Auth Config
@@ -20,7 +20,7 @@ auth: false
# readdir_maxcount
# lookupcache: all|none|pos|positive
#
nconnect: 1
nconnect: 96
timeo: 600
retrans: 2
acregmin: 3
@@ -112,15 +112,24 @@ fuse_max_background: 4096
# Memory backed caches are controlled using cache.data.* configs, while
# file backed cache are controlled using filecache.* configs.
#
readahead_kb: 16384
# Readahead is automatically scaled (up and down) based on the available cache
# and whether there are ongoing writes competing for the cache. readahead_kb
# is the initial value, which is then scaled appropriately. It can be set to 0
# to disable readahead completely.
# In most cases you don't need to specify readahead_kb explicitly.
#
#readahead_kb: 16384
cache.attr.user.enable: true
cache.readdir.kernel.enable: true
cache.readdir.user.enable: true
cache.data.kernel.enable: true
cache.data.user.enable: true
cache.data.user.max_size_mb: 4096

filecache.enable: false
filecache.cachedir: /mnt
filecache.max_size_gb: 1000
cache_max_mb: 4096
#
# These are currently not supported.
#
#filecache.enable: false
#filecache.cachedir: /mnt
#filecache.max_size_gb: 1000
#cache_max_mb: 4096
2 changes: 1 addition & 1 deletion turbonfs/src/config.cpp
@@ -262,7 +262,7 @@ void aznfsc_cfg::set_defaults_and_sanitize()
if (readdir_maxcount == -1)
readdir_maxcount = 1048576;
if (readahead_kb == -1)
readahead_kb = 16384;
readahead_kb = AZNFSCFG_READAHEAD_KB_DEF;
if (cache.data.user.enable) {
if (cache.data.user.max_size_mb == -1)
cache.data.user.max_size_mb = AZNFSCFG_CACHE_MAX_MB_DEF;
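The config.cpp hunk only shows the new default being applied. Below is a sketch of the defaulting-and-clamping a sanitizer of this kind typically performs, reusing the AZNFSCFG_READAHEAD_KB_* constants from aznfsc.h; the clamping of explicit out-of-range values and the helper name are assumptions, since the diff only shows the default path.

```cpp
#include <cstdint>

// Constants from aznfsc.h (AZNFSCFG_READAHEAD_KB_DEF is added by this change).
#define AZNFSCFG_READAHEAD_KB_MIN 128
#define AZNFSCFG_READAHEAD_KB_MAX 1048576
#define AZNFSCFG_READAHEAD_KB_DEF 16384

// -1 means "not set by the user" after config parsing.
int64_t sanitize_readahead_kb(int64_t readahead_kb)
{
    if (readahead_kb == -1)
        return AZNFSCFG_READAHEAD_KB_DEF;

    // 0 disables readahead entirely (see sample-turbo-config.yaml).
    if (readahead_kb == 0)
        return 0;

    // Assumed behaviour: clamp explicit values into the supported range.
    if (readahead_kb < AZNFSCFG_READAHEAD_KB_MIN)
        return AZNFSCFG_READAHEAD_KB_MIN;
    if (readahead_kb > AZNFSCFG_READAHEAD_KB_MAX)
        return AZNFSCFG_READAHEAD_KB_MAX;
    return readahead_kb;
}
```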