27 changes: 22 additions & 5 deletions turbonfs/inc/fcsm.h
@@ -95,8 +95,7 @@ class fcsm
*
* LOCKS: flush_lock.
*/
void ensure_flush(uint64_t flush_bytes,
uint64_t write_off,
void ensure_flush(uint64_t write_off,
uint64_t write_len,
struct rpc_task *task = nullptr);

@@ -107,16 +106,15 @@
* it'll add a blocking commit target for completing the task when the given
* commit goal is met.
*/
void ensure_commit(uint64_t commit_bytes,
struct rpc_task *task = nullptr);
void ensure_commit(struct rpc_task *task = nullptr);

/**
* Callbacks to be called when flush/commit successfully complete.
* These will update flushed_seq_num/committed_seq_num and run flush/commit
* targets from ftgtq/ctgtq as appropriate.
*/
void on_flush_complete(uint64_t flush_bytes);
void on_commit_complete();
void on_commit_complete(uint64_t commit_bytes);
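A minimal sketch of the bookkeeping described above, assuming ctgtq holds commit targets keyed by a goal sequence number; the element layout and the complete_target() helper are assumptions, not part of this diff:

// Illustrative only: advance the committed sequence number and run any
// commit targets whose goal has now been reached.
void fcsm::on_commit_complete(uint64_t commit_bytes)
{
    committed_seq_num += commit_bytes;
    // add_committing() must have accounted for these bytes already.
    assert(committed_seq_num <= committing_seq_num);

    // Complete commit targets (and their blocked rpc_tasks) whose commit
    // goal has been met.
    while (!ctgtq.empty() && ctgtq.front().commit_seq <= committed_seq_num) {
        complete_target(ctgtq.front());   // hypothetical helper
        ctgtq.pop();
    }
}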

/**
* Is the state machine currently running, i.e. it has sent (one or more)
@@ -148,6 +146,10 @@ class fcsm
void mark_running();
void clear_running();

void run(struct rpc_task *task,
uint64_t extent_left,
uint64_t extent_right);

/**
* Call when more writes are dispatched, or prepared to be dispatched.
* This MUST be called before the write callback can be called.
Expand Down Expand Up @@ -180,6 +182,21 @@ class fcsm
return (fc_cb_count() > 0);
}

/**
* Call when more commits are dispatched, or prepared to be dispatched.
* This MUST be called before the commit_callback can be called.
*/
void add_committing(uint64_t bytes)
{
assert(committed_seq_num <= committing_seq_num);
committing_seq_num += bytes;
}

/*
* ctgtq_cleanup() is called when we switch to stable writes.
*/
void ctgtq_cleanup();

private:
/*
* The singleton nfs_client, for convenience.
166 changes: 160 additions & 6 deletions turbonfs/inc/file_cache.h
@@ -1239,6 +1239,16 @@ class bytes_chunk_cache
std::vector<bytes_chunk> get_flushing_bc_range(uint64_t st_off,
uint64_t end_off) const;

/**
* Returns contiguous dirty (and not flushing) chunks from chunkmap, starting
* with the lowest dirty offset, and sets *bytes to the total number of dirty
* bytes contained in the returned chunks.
* Before returning it increases the inuse count of underlying membuf(s).
* Caller will typically flush these to the backing Blob as UNSTABLE
* writes.
*/
std::vector<bytes_chunk> get_contiguous_dirty_bcs(uint64_t *bytes) const;
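A hedged usage sketch of the contract described above; issue_unstable_writes() stands in for the actual flush path and the membuf accessor names are assumptions:

uint64_t dirty_bytes = 0;
std::vector<bytes_chunk> bcs = cache.get_contiguous_dirty_bcs(&dirty_bytes);

// Flush the contiguous dirty range to the backing Blob as UNSTABLE writes.
if (!bcs.empty()) {
    issue_unstable_writes(bcs);               // placeholder for the flush path
}

// The caller owns the inuse count taken by get_contiguous_dirty_bcs() and
// must drop it once done with the chunks (accessor names assumed).
for (auto& bc : bcs) {
    bc.get_membuf()->clear_inuse();
}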

/*
* Returns *all* commit pending chunks in chunkmap.
* Before returning it increases the inuse count of underlying membuf(s)
@@ -1257,7 +1267,7 @@
* MUST check for that after holding the membuf lock, before it tries
* to commit those membuf(s).
*/
std::vector<bytes_chunk> get_commit_pending_bcs() const;
std::vector<bytes_chunk> get_commit_pending_bcs(uint64_t *bytes) const;

/**
* Drop cached data in the given range.
@@ -1341,7 +1351,7 @@
* We want to send as promptly as possible to utilize the n/w b/w but slowly
* enough to give the write scheduler an opportunity to merge better.
*/
uint64_t max_dirty_extent_bytes() const
static uint64_t max_dirty_extent_bytes()
{
// Maximum cache size allowed in bytes.
static const uint64_t max_total =
@@ -1407,24 +1417,156 @@ class bytes_chunk_cache
return bytes_flushing > 0;
}

/**
* Maximum size of commit_pending data that can be in cache, before we
* must commit it to Blob.
* It should be greater than or equal to the flush threshold (as returned
* by max_dirty_extent_bytes()) and smaller than the inline write threshold
* (as suggested by do_inline_write()), to minimize inline flush waits as
* much as possible, in steady state.
*/
static uint64_t max_commit_bytes()
{
// Maximum cache size allowed in bytes.
static const uint64_t max_total =
(aznfsc_cfg.cache.data.user.max_size_mb * 1024 * 1024ULL);
assert(max_total != 0);

/*
* Minimum of 60% of max cache and 2 times the flush limit.
* We want to commit as soon as possible w/o affecting performance.
* If we commit too often, since commit is a serializing operation,
* it'll affect the write throughput; otoh, if we commit too late
* then we might hit the inline write threshold, which again would
* serialize writes, bringing down throughput.
*/
static const uint64_t max_commit_bytes =
std::min((uint64_t)(max_total * 0.6),
2 * max_dirty_extent_bytes());
assert(max_commit_bytes > 0);

return max_commit_bytes;
}
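For concreteness, a purely hypothetical configuration: with max_size_mb = 4096 (so max_total is 4 GiB) and max_dirty_extent_bytes() returning 1 GiB, the threshold works out as:

// Hypothetical numbers, for illustration only.
// max_total          = 4 GiB
// 60% of max_total   = 2.4 GiB
// 2 * flush limit    = 2 * 1 GiB = 2 GiB
// max_commit_bytes() = min(2.4 GiB, 2 GiB) = 2 GiB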

/**
* Check if we must initiate a COMMIT RPC now. Note that the caller would
* just send the COMMIT RPC and not necessarily block the user write
* request till the COMMIT RPC completes, i.e., it's not an inline commit.
*
* We must start commit if:
* 1. We have enough commit_pending data for this file/cache, or,
* 2. Global memory pressure dictates that we commit now to free up
* memory. In this case we might be committing more frequently which
* won't necessarily be optimal, but we have no choice due to the
* memory pressure.
*/
bool commit_required() const
{
const bool local_pressure =
(bytes_commit_pending >= max_commit_bytes());

if (local_pressure) {
return true;
}

/*
* TODO: Take cue from global memory pressure.
*/
return false;
}

/**
* Check if we must initiate flush of some cached data. Note that the caller
* would just send the corresponding WRITE RPC and not necessarily block the
* user write request till the WRITE RPC completes, i.e., it's not an inline
* write.
*
* We must start flush/write if:
* 1. We have enough bytes to flush so that we can write a full sized
* block, or for the case of stable write, we have enough data to fill
* the scheduler queue.
* 2. Global memory pressure dictates that we flush now to free up memory.
* In this case we might be flushing more frequently which won't
* necessarily be optimal, but we have no choice due to the memory
* pressure.
*/
bool flush_required() const
{
const bool local_pressure =
(get_bytes_to_flush() >= max_dirty_extent_bytes());

if (local_pressure) {
return true;
}

/*
* TODO: Take cue from global memory pressure.
*/
return false;
}

/**
* This should be called by writer threads to find out if they must wait
* for the write to complete. This will check both the cache specific and
* global memory pressure.
*/
bool do_inline_write() const
{
/*
* Allow four dirty extents before we force inline write.
* This way 2 extents can be commit_pending and 1 dirty. We can issue commit
* for the commit_pending extents and accumulate new writes in the dirty one.
* Before we hit the inline limit, 2GB worth of space is freed up. This cycle
* should be good enough to keep the cache size in check.
*/
static const uint64_t max_dirty_allowed_per_cache =
max_dirty_extent_bytes() * 4;
const bool local_pressure = (bytes_dirty + bytes_commit_pending) > max_dirty_allowed_per_cache;

if (local_pressure) {
return true;
}

/*
* Global pressure is when get_prune_goals() returns non-zero bytes
* to be pruned inline.
*/
uint64_t inline_bytes;

get_prune_goals(&inline_bytes, nullptr);
return (inline_bytes > 0);
}
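A sketch of how a writer thread might combine the three predicates above; dispatch_commit(), dispatch_flush() and wait_for_inline_flush() are hypothetical helpers standing in for the actual flush/commit machinery:

// Illustrative write path, after the application data has been copied into
// the cache with copy_to_cache().
if (cache.commit_required()) {
    dispatch_commit();        // kick off a COMMIT RPC, don't block the write
}
if (cache.flush_required()) {
    dispatch_flush();         // kick off UNSTABLE WRITE RPCs, don't block
}
if (cache.do_inline_write()) {
    // Under memory pressure: complete the application write only after
    // the flush completes.
    wait_for_inline_flush();
}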

/**
* Inline write/flush means that we are under sufficient memory pressure
* that we want to slow down the application writes by not completing them
* after copying the data to cache (using copy_to_cache()) but instead
* complete application writes only after the flush completes.
*
* This function returns the number of bytes that we need to write inline,
* or zero if we are not under memory pressure.
*/
uint64_t get_inline_flush_bytes() const
Owner: we have do_inline_write() and get_inline_flush_bytes(), both of which answer "should we do inline writes" separately and may return different results, which is not good. We should have just one function.

Owner: I see that we don't use it. I'll remove it.

{
/*
* Allow two dirty extents before we force inline write.
* This way one of the extent can be getting flushed and we can populate
* the second one.
*/
static const uint64_t max_dirty_allowed_per_cache =
max_dirty_extent_bytes() * 2;
const bool local_pressure = bytes_dirty > max_dirty_allowed_per_cache;
(max_dirty_extent_bytes() * 2);
const bool local_pressure =
((int64_t) bytes_dirty > (int64_t) max_dirty_allowed_per_cache);

if (local_pressure) {
return true;
/*
* Leave one max_dirty_extent_bytes worth of dirty bytes, and
* flush the rest.
*/
const int64_t flush_now =
(bytes_dirty - max_dirty_extent_bytes());
return flush_now;
}

/*
@@ -1433,8 +1575,20 @@
*/
uint64_t inline_bytes;

/*
* TODO: Noisy neighbor syndrome: when one file is hogging the cache,
* inline pruning will be triggered for other files.
*/
get_prune_goals(&inline_bytes, nullptr);
return (inline_bytes > 0);

/*
* (bytes_flushing + bytes_commit_pending) represents the data which
* is either already flushed or being flushed. Exclude that from the
* needs-to-be-flushed data.
*/
const int64_t flush_now =
(inline_bytes - (bytes_flushing + bytes_commit_pending));
return std::max((int64_t) 0, flush_now);
}

/**
2 changes: 2 additions & 0 deletions turbonfs/inc/nfs_client.h
@@ -547,6 +547,8 @@ struct nfs_client

void jukebox_write(struct api_task_info *rpc_api);

void jukebox_flush(struct api_task_info *rpc_api);

/**
* Convert between NFS fattr3 and POSIX struct stat.
*/
58 changes: 54 additions & 4 deletions turbonfs/inc/nfs_inode.h
@@ -289,6 +289,29 @@ struct nfs_inode
*/
struct stat attr;

/**
* We maintain the following multiple views of the file and thus multiple file
* sizes for those views.
* - Cached.
* This is the view of the file that comprises the data that has been
* written by the application and saved in file cache. It may or may not
* have been flushed and/or committed. This is the most up-to-date view of
* the file and applications must use this view.
* cached_filesize tracks the file size for this view.
* - Uncommitted.
* This is the view of the file that tracks data that has been flushed
* using UNSTABLE writes but not yet COMMITted to the Blob. This view of
* the file is only used to see if the next PB call will write after the
* last PB'ed byte and thus can be appended.
* putblock_filesize tracks the file size for this view.
* - Committed.
* This is the view of the file that tracks data committed to the Blob.
* Other clients will see this view.
* attr.st_size tracks the file size for this view.
*/
off_t cached_filesize = 0;
mutable off_t putblock_filesize = 0;
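From the description above, for append-style writes in steady state the three views are expected to be ordered as sketched below; this is an assumption drawn from the text, not an assert present in this PR:

// committed <= flushed-but-uncommitted <= cached
assert(attr.st_size <= putblock_filesize);      // committed vs. PB'ed data
assert(putblock_filesize <= cached_filesize);   // PB'ed vs. written to cache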

/*
* Has this inode seen any non-append write?
* This starts as false and remains false as long as copy_to_cache() only
Expand All @@ -310,7 +333,7 @@ struct nfs_inode
* Note: As of now, we are not using this flag as commit changes not yet
* integrated, so we are setting this flag to true.
*/
bool stable_write = true;
bool stable_write = false;

public:
/*
@@ -1400,11 +1423,10 @@
* initiate any new flush operations while some truncate call is in
* progress (which must have held the flush_lock).
*/
int flush_cache_and_wait(uint64_t start_off = 0,
uint64_t end_off = UINT64_MAX);
int flush_cache_and_wait();

/**
* Wait for currently flushing membufs to complete.
* Wait for currently flushing/committing membufs to complete.
* Returns 0 on success and a positive errno value on error.
*
* Note : Caller must hold the inode flush_lock to ensure that
Expand All @@ -1416,6 +1438,34 @@ struct nfs_inode
int wait_for_ongoing_flush(uint64_t start_off = 0,
uint64_t end_off = UINT64_MAX);

/**
* commit_membufs() is called to commit uncommitted membufs to the Blob.
* It creates a COMMIT RPC and sends it to the NFS server.
*/
void commit_membufs(std::vector<bytes_chunk> &bcs);

/**
* switch_to_stable_write() is called to switch the inode to stable write
* mode. It waits for all ongoing flush and subsequent commit to complete.
* If not already scheduled, it'll perform an explicit commit after the
* flush completes.
* After that it'll mark the inode for stable writes and return. From then on
* any writes to this inode will be sent as stable writes.
*/
void switch_to_stable_write();
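A hedged sketch of the sequence described above; the helpers called are the ones declared in this header, but the body is illustrative and 'filecache_handle' is an assumed accessor for the inode's bytes_chunk_cache:

// Illustrative flow for switch_to_stable_write().
void nfs_inode::switch_to_stable_write()
{
    // 1. Wait for all in-flight (unstable) flushes to complete.
    wait_for_ongoing_flush();

    // 2. Commit whatever was flushed unstable but not yet committed.
    uint64_t commit_bytes = 0;
    std::vector<bytes_chunk> bcs =
        filecache_handle->get_commit_pending_bcs(&commit_bytes);
    if (!bcs.empty()) {
        commit_membufs(bcs);
        wait_for_ongoing_commit();
    }

    // 3. From now on all writes for this inode go out as stable writes.
    stable_write = true;
}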

/**
* Check if stable write is required for the given offset.
* Given offset is the start of contiguous dirty membufs that need to be
* flushed to the Blob.
*/
bool check_stable_write_required(off_t offset);
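Given that putblock_filesize tracks the last PB'ed byte and unstable (PB-style) writes can only append after it, a plausible sketch of this check is simply the following; the real implementation may differ:

bool nfs_inode::check_stable_write_required(off_t offset)
{
    // Flushing at any offset other than the current PB'ed end of file cannot
    // be sent as an append, so it must go out as a stable write.
    return (offset != putblock_filesize);
}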

/**
* Wait for ongoing commit operation to complete.
*/
void wait_for_ongoing_commit();

/**
* Sync the dirty membufs in the file cache to the NFS server.
* All contiguous dirty membufs are clubbed together and sent to the