From 95b74ae64f7094766994dd174fdffe81f9d7c943 Mon Sep 17 00:00:00 2001 From: Matt Benjamin Date: Tue, 2 Dec 2014 18:33:58 -0500 Subject: [PATCH 1/4] Include common/likely.h in buffer.h Signed-off-by: Matt Benjamin --- src/common/buffer.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/common/buffer.cc b/src/common/buffer.cc index 9a1ac5fee6..4ab786a3f5 100644 --- a/src/common/buffer.cc +++ b/src/common/buffer.cc @@ -19,6 +19,7 @@ #include "common/safe_io.h" #include "common/simple_spin.h" #include "common/strtol.h" +#include "common/likely.h" #include "include/atomic.h" #include "common/Mutex.h" #include "include/types.h" From bf0b88850d2d5dae608b89f306da8259b4234c11 Mon Sep 17 00:00:00 2001 From: Matt Benjamin Date: Fri, 21 Nov 2014 09:41:38 -0800 Subject: [PATCH 2/4] Add safe-sharing to buffer::list and buffer::ptr. Formerly known as "strong claim," this change introduces a new sharable() property to ceph::buffer::raw objects, which can be used to prevent unplanned sharing of buffers. The default value of sharable() is true, but derived classes may override this. For example, Accelio message buffers may point to registered or pool-allocated memory which should not be held for long periods. Cloning of non-sharable buffers is now the unmarked case, so is performed automatically in the buffer::list copy constructor. Low-level operations should allow explicit volatile sharing, so copying a buffer::list:iterator or a buffer::ptr works as normal. For explicit sharing of "non-sharable" buffers, a new boolean is added to the public claim_* methods, and a buffer::list::share method is also added, to share an entire sequence. Signed-off-by: Vu Pham Signed-off-by: Matt Benjamin Signed-off-by: Matt Benjamin Conflicts: src/include/buffer.h Signed-off-by: Matt Benjamin Conflicts: src/common/buffer.cc Signed-off-by: Matt Benjamin new naming for "strong claim": clone_nonsharable Signed-off-by: Matt Benjamin --- src/common/buffer.cc | 32 ++++++++++++++++++++++++++------ src/include/buffer.h | 38 ++++++++++++++++++++++++++++++++------ src/msg/Message.h | 4 ++-- 3 files changed, 60 insertions(+), 14 deletions(-) diff --git a/src/common/buffer.cc b/src/common/buffer.cc index 4ab786a3f5..94311767e1 100644 --- a/src/common/buffer.cc +++ b/src/common/buffer.cc @@ -162,6 +162,10 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER; bool is_n_page_sized() { return (len & ~CEPH_PAGE_MASK) == 0; } + virtual bool sharable() { + // true if safe to claim-hold due to, e.g., special registration + return true; + } bool get_crc(const pair &fromto, pair *crc) const { Mutex::Locker l(crc_lock); @@ -603,6 +607,18 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER; return _raw->clone(); } + buffer::ptr& buffer::ptr::clone_nonsharable() { + if (_raw && !_raw->sharable()) { + buffer::raw *tr = _raw; + _raw = tr->clone(); + _raw->nref.set(1); + if (unlikely(tr->nref.dec() == 0)) { + delete tr; + } + } + return *this; + } + void buffer::ptr::swap(ptr& other) { raw *r = _raw; @@ -1171,27 +1187,31 @@ void buffer::list::rebuild_page_aligned() } // sort-of-like-assignment-op - void buffer::list::claim(list& bl) + void buffer::list::claim(list& bl, bool clone_nonsharable) { // free my buffers clear(); - claim_append(bl); + claim_append(bl, clone_nonsharable); } - void buffer::list::claim_append(list& bl) + void buffer::list::claim_append(list& bl, bool clone_nonsharable) { // steal the other guy's buffers _len += bl._len; - _buffers.splice( _buffers.end(), bl._buffers ); + if (clone_nonsharable) + bl.clone_nonsharable(); + _buffers.splice(_buffers.end(), bl._buffers ); bl._len = 0; bl.last_p = bl.begin(); } - void buffer::list::claim_prepend(list& bl) + void buffer::list::claim_prepend(list& bl, bool clone_nonsharable) { // steal the other guy's buffers _len += bl._len; - _buffers.splice( _buffers.begin(), bl._buffers ); + if (clone_nonsharable) + bl.clone_nonsharable(); + _buffers.splice(_buffers.begin(), bl._buffers ); bl._len = 0; bl.last_p = bl.begin(); } diff --git a/src/include/buffer.h b/src/include/buffer.h index aac2f80a65..5027686e18 100644 --- a/src/include/buffer.h +++ b/src/include/buffer.h @@ -179,6 +179,7 @@ class CEPH_BUFFER_API buffer { raw *clone(); void swap(ptr& other); + ptr& clone_nonsharable(); // misc bool at_buffer_head() const { return _off == 0; } @@ -329,12 +330,17 @@ class CEPH_BUFFER_API buffer { append_buffer.set_length(0); // unused, so far. } ~list() {} - - list(const list& other) : _buffers(other._buffers), _len(other._len), _memcopy_count(other._memcopy_count),last_p(this) { } + list(const list& other) : _buffers(other._buffers), _len(other._len), + _memcopy_count(other._memcopy_count), last_p(this) { + // make deep copy semantics the default + clone_nonsharable(); + } list& operator= (const list& other) { if (this != &other) { _buffers = other._buffers; _len = other._len; + // make deep copy semantics the default + clone_nonsharable(); } return *this; } @@ -404,10 +410,30 @@ class CEPH_BUFFER_API buffer { unsigned align_memory); void rebuild_page_aligned(); - // sort-of-like-assignment-op - void claim(list& bl); - void claim_append(list& bl); - void claim_prepend(list& bl); + // assignment-op with move semantics + void claim(list& bl, bool clone_nonsharable=true); + void claim_append(list& bl, bool clone_nonsharable=true); + void claim_prepend(list& bl, bool clone_nonsharable=true); + + // clone non-sharable buffers (make sharable) + void clone_nonsharable() { + std::list::iterator pb; + for (pb = _buffers.begin(); pb != _buffers.end(); ++pb) { + (void) pb->clone_nonsharable(); + } + } + + // copy with explicit volatile-sharing semantics + void share(list& bl) + { + if (this != &bl) { + clear(); + std::list::iterator pb; + for (pb = bl._buffers.begin(); pb != bl._buffers.end(); ++pb) { + push_back(*pb); + } + } + } iterator begin() { return iterator(this, 0); diff --git a/src/msg/Message.h b/src/msg/Message.h index cb47058d0c..66f225688c 100644 --- a/src/msg/Message.h +++ b/src/msg/Message.h @@ -337,10 +337,10 @@ class Message : public RefCountedObject { } bufferlist& get_data() { return data; } - void claim_data(bufferlist& bl) { + void claim_data(bufferlist& bl, bool strong=true) { if (byte_throttler) byte_throttler->put(data.length()); - bl.claim(data); + bl.claim(data, strong); } off_t get_data_len() { return data.length(); } From dde2b0708d798d11114a74118f2eebb50660e160 Mon Sep 17 00:00:00 2001 From: Matt Benjamin Date: Wed, 24 Sep 2014 11:06:05 -0500 Subject: [PATCH 3/4] Restore zero-copy buffers in OSD fast path. This change restores volatile sharing semantics in the Message decode path, and also in the OSD write path for FileStore/FileJournal. This can be verified with a breakpoint set at the clone/COW case in buffer::ptr::clone_nonsharable(currently buffer.cc:690). Signed-off-by: Matt Benjamin --- src/msg/Message.h | 12 ++++++------ src/os/FileJournal.cc | 2 +- src/os/FileJournal.h | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/msg/Message.h b/src/msg/Message.h index 66f225688c..cadf1af4e6 100644 --- a/src/msg/Message.h +++ b/src/msg/Message.h @@ -314,7 +314,7 @@ class Message : public RefCountedObject { void set_payload(bufferlist& bl) { if (byte_throttler) byte_throttler->put(payload.length()); - payload.claim(bl); + payload.claim(bl, false /* !clone_nonsharable */); if (byte_throttler) byte_throttler->take(payload.length()); } @@ -322,25 +322,25 @@ class Message : public RefCountedObject { void set_middle(bufferlist& bl) { if (byte_throttler) byte_throttler->put(payload.length()); - middle.claim(bl); + middle.claim(bl, false /* !clone_nonsharable */); if (byte_throttler) byte_throttler->take(payload.length()); } bufferlist& get_middle() { return middle; } - void set_data(const bufferlist &d) { + void set_data(bufferlist &bl) { if (byte_throttler) byte_throttler->put(data.length()); - data = d; + data.claim(bl, false /* !clone_nonsharable */); if (byte_throttler) byte_throttler->take(data.length()); } bufferlist& get_data() { return data; } - void claim_data(bufferlist& bl, bool strong=true) { + void claim_data(bufferlist& bl, bool clone_nonsharable=true) { if (byte_throttler) byte_throttler->put(data.length()); - bl.claim(data, strong); + bl.claim(data, clone_nonsharable); } off_t get_data_len() { return data.length(); } diff --git a/src/os/FileJournal.cc b/src/os/FileJournal.cc index 388190f188..2a868a5dc5 100644 --- a/src/os/FileJournal.cc +++ b/src/os/FileJournal.cc @@ -913,7 +913,7 @@ int FileJournal::prepare_single_write(bufferlist& bl, off64_t& queue_pos, uint64 bufferptr bp = buffer::create_static(pre_pad, zero_buf); bl.push_back(bp); } - bl.claim_append(ebl); + bl.claim_append(ebl, false /* !clone_nonsharable */); // potential zero-copy if (h.post_pad) { bufferptr bp = buffer::create_static(post_pad, zero_buf); diff --git a/src/os/FileJournal.h b/src/os/FileJournal.h index 878b9094b6..0a2311d105 100644 --- a/src/os/FileJournal.h +++ b/src/os/FileJournal.h @@ -54,7 +54,7 @@ class FileJournal : public Journal { TrackedOpRef tracked_op; write_item(uint64_t s, bufferlist& b, int al, TrackedOpRef opref) : seq(s), alignment(al), tracked_op(opref) { - bl.claim(b); + bl.claim(b, false /* !clone_nonsharable */); // potential zero-copy } write_item() : seq(0), alignment(0) {} }; From c25fc7bda496602cccd384e9a8120eb0da70f49a Mon Sep 17 00:00:00 2001 From: Matt Benjamin Date: Mon, 1 Dec 2014 14:31:22 -0500 Subject: [PATCH 4/4] Message::set_data cannot preserve constness. Signed-off-by: Matt Benjamin --- src/client/Client.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/client/Client.cc b/src/client/Client.cc index 5cd6e861c3..ac197a539c 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -4296,7 +4296,7 @@ int Client::mds_command( // Construct and send MCommand MCommand *m = new MCommand(monclient->get_fsid()); m->cmd = cmd; - m->set_data(inbl); + m->set_data(const_cast(inbl)); m->set_tid(tid); conn->send_message(m); }