Commit 2c730d6f authored by Stepan Palamarchuk's avatar Stepan Palamarchuk Committed by Facebook Github Bot

Improve fast path of Cursor

Summary:
This change simplifies the fastpath by reducing it to bare minimum (i.e. check length, load data) and removes indirection to IOBuf.
Additionally it adds `skipNoAdvance` method to have 1-instruction skip.

Disassembly of `read<signed char>` is over 35 instructions (just hot path). With this change it's doesn to 8.
Disassembly after:
  Dump of assembler code for function folly::io::detail::CursorBase<folly::io::Cursor, folly::IOBuf const>::read<unsigned char>():
     0x000000000041f0f0 <+0>:     mov    0x18(%rdi),%rax
     0x000000000041f0f4 <+4>:     lea    0x1(%rax),%rcx
     0x000000000041f0f8 <+8>:     cmp    0x10(%rdi),%rcx
     0x000000000041f0fc <+12>:    ja     0x41f105 <folly::io::detail::CursorBase<folly::io::Cursor, folly::IOBuf const>::read<unsigned char>()+21>
     0x000000000041f0fe <+14>:    mov    (%rax),%al
     0x000000000041f100 <+16>:    mov    %rcx,0x18(%rdi)
     0x000000000041f104 <+20>:    retq
     0x000000000041f105 <+21>:    jmpq   0x41f110 <folly::io::detail::CursorBase<folly::io::Cursor, folly::IOBuf const>::readSlow<unsigned char>()>

With this diff Thrift deserialization becomes ~20% faster (with prod workloads).

Thrift benchmark:
Before:
  ============================================================================
  thrift/lib/cpp2/test/ProtocolBench.cpp          relative  time/iter  iters/s
  ============================================================================
  BinaryProtocol_read_Empty                                   12.98ns   77.03M
  BinaryProtocol_read_SmallInt                                20.94ns   47.76M
  BinaryProtocol_read_BigInt                                  20.86ns   47.93M
  BinaryProtocol_read_SmallString                             34.64ns   28.86M
  BinaryProtocol_read_BigString                              185.53ns    5.39M
  BinaryProtocol_read_BigBinary                               67.34ns   14.85M
  BinaryProtocol_read_LargeBinary                             62.23ns   16.07M
  BinaryProtocol_read_Mixed                                   58.74ns   17.03M
  BinaryProtocol_read_SmallListInt                            89.99ns   11.11M
  BinaryProtocol_read_BigListInt                              39.92us   25.05K
  BinaryProtocol_read_BigListMixed                           616.20us    1.62K
  BinaryProtocol_read_LargeListMixed                          83.49ms    11.98
  CompactProtocol_read_Empty                                  11.28ns   88.67M
  CompactProtocol_read_SmallInt                               19.15ns   52.22M
  CompactProtocol_read_BigInt                                 26.14ns   38.25M
  CompactProtocol_read_SmallString                            31.04ns   32.22M
  CompactProtocol_read_BigString                             184.55ns    5.42M
  CompactProtocol_read_BigBinary                              69.73ns   14.34M
  CompactProtocol_read_LargeBinary                            64.39ns   15.53M
  CompactProtocol_read_Mixed                                  58.73ns   17.03M
  CompactProtocol_read_SmallListInt                           76.50ns   13.07M
  CompactProtocol_read_BigListInt                             25.93us   38.56K
  CompactProtocol_read_BigListMixed                          623.15us    1.60K
  CompactProtocol_read_LargeListMixed                         80.57ms    12.41
  ============================================================================

After:
  ============================================================================
  thrift/lib/cpp2/test/ProtocolBench.cpp          relative  time/iter  iters/s
  ============================================================================
  BinaryProtocol_read_Empty                                   10.40ns   96.17M
  BinaryProtocol_read_SmallInt                                15.14ns   66.03M
  BinaryProtocol_read_BigInt                                  15.19ns   65.84M
  BinaryProtocol_read_SmallString                             25.19ns   39.70M
  BinaryProtocol_read_BigString                              172.85ns    5.79M
  BinaryProtocol_read_BigBinary                               56.88ns   17.58M
  BinaryProtocol_read_LargeBinary                             56.77ns   17.61M
  BinaryProtocol_read_Mixed                                   43.98ns   22.74M
  BinaryProtocol_read_SmallListInt                            58.19ns   17.19M
  BinaryProtocol_read_BigListInt                              19.75us   50.63K
  BinaryProtocol_read_BigListMixed                           440.20us    2.27K
  BinaryProtocol_read_LargeListMixed                          56.94ms    17.56
  CompactProtocol_read_Empty                                   9.35ns  106.93M
  CompactProtocol_read_SmallInt                               13.07ns   76.49M
  CompactProtocol_read_BigInt                                 18.23ns   54.87M
  CompactProtocol_read_SmallString                            25.61ns   39.05M
  CompactProtocol_read_BigString                             174.46ns    5.73M
  CompactProtocol_read_BigBinary                              59.77ns   16.73M
  CompactProtocol_read_LargeBinary                            60.81ns   16.44M
  CompactProtocol_read_Mixed                                  42.70ns   23.42M
  CompactProtocol_read_SmallListInt                           66.89ns   14.95M
  CompactProtocol_read_BigListInt                             25.08us   39.87K
  CompactProtocol_read_BigListMixed                          427.93us    2.34K
  CompactProtocol_read_LargeListMixed                         56.11ms    17.82
  ============================================================================

Reviewed By: yfeldblum

Differential Revision: D6635325

fbshipit-source-id: 393fc1005689042977c03f37f5a898ebe7814d44
parent d7b6ad49
......@@ -13,7 +13,6 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <cassert>
......@@ -58,7 +57,12 @@ class CursorBase {
// Make all the templated classes friends for copy constructor.
template <class D, typename B> friend class CursorBase;
public:
explicit CursorBase(BufType* buf) : crtBuf_(buf), buffer_(buf) { }
explicit CursorBase(BufType* buf) : crtBuf_(buf), buffer_(buf) {
if (crtBuf_) {
crtPos_ = crtBegin_ = crtBuf_->data();
crtEnd_ = crtBuf_->tail();
}
}
/**
* Copy constructor.
......@@ -68,9 +72,11 @@ class CursorBase {
*/
template <class OtherDerived, class OtherBuf>
explicit CursorBase(const CursorBase<OtherDerived, OtherBuf>& cursor)
: crtBuf_(cursor.crtBuf_),
offset_(cursor.offset_),
buffer_(cursor.buffer_) { }
: crtBuf_(cursor.crtBuf_),
crtBegin_(cursor.crtBegin_),
crtEnd_(cursor.crtEnd_),
crtPos_(cursor.crtPos_),
buffer_(cursor.buffer_) {}
/**
* Reset cursor to point to a new buffer.
......@@ -78,11 +84,14 @@ class CursorBase {
void reset(BufType* buf) {
crtBuf_ = buf;
buffer_ = buf;
offset_ = 0;
if (crtBuf_) {
crtPos_ = crtBegin_ = crtBuf_->data();
crtEnd_ = crtBuf_->tail();
}
}
const uint8_t* data() const {
return crtBuf_->data() + offset_;
return crtPos_;
}
/**
......@@ -94,7 +103,7 @@ class CursorBase {
* pointing at the end of a buffer.
*/
size_t length() const {
return crtBuf_->length() - offset_;
return crtEnd_ - crtPos_;
}
/**
......@@ -102,10 +111,10 @@ class CursorBase {
*/
size_t totalLength() const {
if (crtBuf_ == buffer_) {
return crtBuf_->computeChainDataLength() - offset_;
return crtBuf_->computeChainDataLength() - (crtPos_ - crtBegin_);
}
CursorBase end(buffer_->prev());
end.offset_ = end.buffer_->length();
end.crtPos_ = end.crtEnd_;
return end - *this;
}
......@@ -135,7 +144,7 @@ class CursorBase {
*/
bool isAtEnd() const {
// Check for the simple cases first.
if (offset_ != crtBuf_->length()) {
if (crtPos_ != crtEnd_) {
return false;
}
if (crtBuf_ == buffer_->prev()) {
......@@ -158,7 +167,8 @@ class CursorBase {
* Advances the cursor to the end of the entire IOBuf chain.
*/
void advanceToEnd() {
offset_ = buffer_->prev()->length();
crtBegin_ = buffer_->prev()->data();
crtPos_ = crtEnd_ = buffer_->prev()->tail();
if (crtBuf_ != buffer_->prev()) {
crtBuf_ = buffer_->prev();
static_cast<Derived*>(this)->advanceDone();
......@@ -194,7 +204,23 @@ class CursorBase {
* same IOBuf chain.
*/
bool operator==(const Derived& other) const {
return (offset_ == other.offset_) && (crtBuf_ == other.crtBuf_);
const IOBuf* crtBuf = crtBuf_;
auto crtPos = crtPos_;
// We can be pointing to the end of a buffer chunk, find first non-empty.
while (crtPos == crtBuf->tail() && crtBuf != buffer_->prev()) {
crtBuf = crtBuf->next();
crtPos = crtBuf->data();
}
const IOBuf* crtBufOther = other.crtBuf_;
auto crtPosOther = other.crtPos_;
// We can be pointing to the end of a buffer chunk, find first non-empty.
while (crtPosOther == crtBufOther->tail() &&
crtBufOther != other.buffer_->prev()) {
crtBufOther = crtBufOther->next();
crtPosOther = crtBufOther->data();
}
return (crtPos == crtPosOther) && (crtBuf == crtBufOther);
}
bool operator!=(const Derived& other) const {
return !operator==(other);
......@@ -203,10 +229,9 @@ class CursorBase {
template <class T>
typename std::enable_if<std::is_arithmetic<T>::value, bool>::type tryRead(
T& val) {
if (LIKELY(length() >= sizeof(T))) {
if (LIKELY(crtPos_ + sizeof(T) <= crtEnd_)) {
val = loadUnaligned<T>(data());
offset_ += sizeof(T);
advanceBufferIfEmpty();
crtPos_ += sizeof(T);
return true;
}
return pullAtMostSlow(&val, sizeof(T)) == sizeof(T);
......@@ -228,11 +253,13 @@ class CursorBase {
template <class T>
T read() {
T val{};
if (!tryRead(val)) {
std::__throw_out_of_range("underflow");
if (LIKELY(crtPos_ + sizeof(T) <= crtEnd_)) {
T val = loadUnaligned<T>(data());
crtPos_ += sizeof(T);
return val;
} else {
return readSlow<T>();
}
return val;
}
template <class T>
......@@ -257,8 +284,7 @@ class CursorBase {
str.reserve(len);
if (LIKELY(length() >= len)) {
str.append(reinterpret_cast<const char*>(data()), len);
offset_ += len;
advanceBufferIfEmpty();
crtPos_ += len;
} else {
readFixedStringSlow(&str, len);
}
......@@ -307,34 +333,41 @@ class CursorBase {
void skipWhile(const Predicate& predicate);
size_t skipAtMost(size_t len) {
if (LIKELY(length() >= len)) {
offset_ += len;
advanceBufferIfEmpty();
if (LIKELY(crtPos_ + len < crtEnd_)) {
crtPos_ += len;
return len;
}
return skipAtMostSlow(len);
}
void skip(size_t len) {
if (LIKELY(length() >= len)) {
offset_ += len;
advanceBufferIfEmpty();
if (LIKELY(crtPos_ + len < crtEnd_)) {
crtPos_ += len;
} else {
skipSlow(len);
}
}
/**
* Skip bytes in the current IOBuf without advancing to the next one.
* Precondition: length() >= len
*/
void skipNoAdvance(size_t len) {
DCHECK_LE(len, length());
crtPos_ += len;
}
size_t retreatAtMost(size_t len) {
if (len <= offset_) {
offset_ -= len;
if (len <= static_cast<size_t>(crtPos_ - crtBegin_)) {
crtPos_ -= len;
return len;
}
return retreatAtMostSlow(len);
}
void retreat(size_t len) {
if (len <= offset_) {
offset_ -= len;
if (len <= static_cast<size_t>(crtPos_ - crtBegin_)) {
crtPos_ -= len;
} else {
retreatSlow(len);
}
......@@ -342,20 +375,18 @@ class CursorBase {
size_t pullAtMost(void* buf, size_t len) {
// Fast path: it all fits in one buffer.
if (LIKELY(length() >= len)) {
if (LIKELY(crtPos_ + len <= crtEnd_)) {
memcpy(buf, data(), len);
offset_ += len;
advanceBufferIfEmpty();
crtPos_ += len;
return len;
}
return pullAtMostSlow(buf, len);
}
void pull(void* buf, size_t len) {
if (LIKELY(length() >= len)) {
if (LIKELY(crtPos_ + len <= crtEnd_)) {
memcpy(buf, data(), len);
offset_ += len;
advanceBufferIfEmpty();
crtPos_ += len;
} else {
pullSlow(buf, len);
}
......@@ -399,6 +430,9 @@ class CursorBase {
}
size_t cloneAtMost(folly::IOBuf& buf, size_t len) {
// We might be at the end of buffer.
advanceBufferIfEmpty();
std::unique_ptr<folly::IOBuf> tmp;
size_t copied = 0;
for (int loopCount = 0; true; ++loopCount) {
......@@ -407,26 +441,26 @@ class CursorBase {
if (LIKELY(available >= len)) {
if (loopCount == 0) {
crtBuf_->cloneOneInto(buf);
buf.trimStart(offset_);
buf.trimStart(crtPos_ - crtBegin_);
buf.trimEnd(buf.length() - len);
} else {
tmp = crtBuf_->cloneOne();
tmp->trimStart(offset_);
tmp->trimStart(crtPos_ - crtBegin_);
tmp->trimEnd(tmp->length() - len);
buf.prependChain(std::move(tmp));
}
offset_ += len;
crtPos_ += len;
advanceBufferIfEmpty();
return copied + len;
}
if (loopCount == 0) {
crtBuf_->cloneOneInto(buf);
buf.trimStart(offset_);
buf.trimStart(crtPos_ - crtBegin_);
} else {
tmp = crtBuf_->cloneOne();
tmp->trimStart(offset_);
tmp->trimStart(crtPos_ - crtBegin_);
buf.prependChain(std::move(tmp));
}
......@@ -453,7 +487,7 @@ class CursorBase {
size_t len = 0;
if (otherBuf != crtBuf_) {
len += otherBuf->length() - other.offset_;
len += other.crtEnd_ - other.crtPos_;
for (otherBuf = otherBuf->next();
otherBuf != crtBuf_ && otherBuf != other.buffer_;
......@@ -465,13 +499,13 @@ class CursorBase {
std::__throw_out_of_range("wrap-around");
}
len += offset_;
len += crtPos_ - crtBegin_;
} else {
if (offset_ < other.offset_) {
if (crtPos_ < other.crtPos_) {
std::__throw_out_of_range("underflow");
}
len += offset_ - other.offset_;
len += crtPos_ - other.crtPos_;
}
return len;
......@@ -492,7 +526,7 @@ class CursorBase {
}
}
len += offset_;
len += crtPos_ - crtBegin_;
return len;
}
......@@ -506,37 +540,48 @@ class CursorBase {
bool tryAdvanceBuffer() {
BufType* nextBuf = crtBuf_->next();
if (UNLIKELY(nextBuf == buffer_)) {
offset_ = crtBuf_->length();
crtPos_ = crtEnd_;
return false;
}
offset_ = 0;
crtBuf_ = nextBuf;
crtPos_ = crtBegin_ = crtBuf_->data();
crtEnd_ = crtBuf_->tail();
static_cast<Derived*>(this)->advanceDone();
return true;
}
bool tryRetreatBuffer() {
if (UNLIKELY(crtBuf_ == buffer_)) {
offset_ = 0;
crtPos_ = crtBegin_;
return false;
}
crtBuf_ = crtBuf_->prev();
offset_ = crtBuf_->length();
crtBegin_ = crtBuf_->data();
crtPos_ = crtEnd_ = crtBuf_->tail();
static_cast<Derived*>(this)->advanceDone();
return true;
}
void advanceBufferIfEmpty() {
if (length() == 0) {
if (crtPos_ == crtEnd_) {
tryAdvanceBuffer();
}
}
BufType* crtBuf_;
size_t offset_ = 0;
const uint8_t* crtBegin_{nullptr};
const uint8_t* crtEnd_{nullptr};
const uint8_t* crtPos_{nullptr};
private:
template <class T>
FOLLY_NOINLINE T readSlow() {
T val;
pullSlow(&val, sizeof(T));
return val;
}
void readFixedStringSlow(std::string* str, size_t len) {
for (size_t available; (available = length()) < len; ) {
str->append(reinterpret_cast<const char*>(data()), available);
......@@ -546,7 +591,7 @@ class CursorBase {
len -= available;
}
str->append(reinterpret_cast<const char*>(data()), len);
offset_ += len;
crtPos_ += len;
advanceBufferIfEmpty();
}
......@@ -563,7 +608,7 @@ class CursorBase {
len -= available;
}
memcpy(p, data(), len);
offset_ += len;
crtPos_ += len;
advanceBufferIfEmpty();
return copied + len;
}
......@@ -583,7 +628,7 @@ class CursorBase {
}
len -= available;
}
offset_ += len;
crtPos_ += len;
advanceBufferIfEmpty();
return skipped + len;
}
......@@ -596,14 +641,14 @@ class CursorBase {
size_t retreatAtMostSlow(size_t len) {
size_t retreated = 0;
for (size_t available; (available = offset_) < len;) {
for (size_t available; (available = crtPos_ - crtBegin_) < len;) {
retreated += available;
if (UNLIKELY(!tryRetreatBuffer())) {
return retreated;
}
len -= available;
}
offset_ -= len;
crtPos_ -= len;
return retreated + len;
}
......@@ -748,11 +793,19 @@ class RWCursor
if (this->crtBuf_ != this->head() && this->totalLength() < n) {
throw std::overflow_error("cannot gather() past the end of the chain");
}
this->crtBuf_->gather(this->offset_ + n);
size_t offset = this->crtPos_ - this->crtBegin_;
this->crtBuf_->gather(offset + n);
this->crtBegin_ = this->crtBuf_->data();
this->crtEnd_ = this->crtBuf_->tail();
this->crtPos_ = this->crtBegin_ + offset;
}
void gatherAtMost(size_t n) {
size_t size = std::min(n, this->totalLength());
return this->crtBuf_->gather(this->offset_ + size);
size_t offset = this->crtPos_ - this->crtBegin_;
this->crtBuf_->gather(offset + size);
this->crtBegin_ = this->crtBuf_->data();
this->crtEnd_ = this->crtBuf_->tail();
this->crtPos_ = this->crtBegin_ + offset;
}
using detail::Writable<RWCursor<access>>::pushAtMost;
......@@ -774,7 +827,7 @@ class RWCursor
maybeUnshare();
}
memcpy(writableData(), buf, len);
this->offset_ += len;
this->crtPos_ += len;
return copied + len;
}
......@@ -793,16 +846,16 @@ class RWCursor
void insert(std::unique_ptr<folly::IOBuf> buf) {
folly::IOBuf* nextBuf;
if (this->offset_ == 0) {
if (this->crtPos_ == this->crtBegin_) {
// Can just prepend
nextBuf = this->crtBuf_;
this->crtBuf_->prependChain(std::move(buf));
} else {
std::unique_ptr<folly::IOBuf> remaining;
if (this->crtBuf_->length() - this->offset_ > 0) {
if (this->crtPos_ != this->crtEnd_) {
// Need to split current IOBuf in two.
remaining = this->crtBuf_->cloneOne();
remaining->trimStart(this->offset_);
remaining->trimStart(this->crtPos_ - this->crtBegin_);
nextBuf = remaining.get();
buf->prependChain(std::move(remaining));
} else {
......@@ -811,20 +864,26 @@ class RWCursor
}
this->crtBuf_->trimEnd(this->length());
this->crtBuf_->appendChain(std::move(buf));
// Jump past the new links
this->crtBuf_ = nextBuf;
this->crtPos_ = this->crtBegin_ = this->crtBuf_->data();
this->crtEnd_ = this->crtBuf_->tail();
}
// Jump past the new links
this->offset_ = 0;
this->crtBuf_ = nextBuf;
}
uint8_t* writableData() {
return this->crtBuf_->writableData() + this->offset_;
return this->crtBuf_->writableData() + (this->crtPos_ - this->crtBegin_);
}
private:
void maybeUnshare() {
if (UNLIKELY(maybeShared_)) {
size_t offset = this->crtPos_ - this->crtBegin_;
this->crtBuf_->unshareOne();
this->crtBegin_ = this->crtBuf_->data();
this->crtEnd_ = this->crtBuf_->tail();
this->crtPos_ = this->crtBegin_ + offset;
maybeShared_ = false;
}
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment