Commit 173356a3 authored by Philip Pronin's avatar Philip Pronin Committed by Dave Watson

improve io::Cursor read() performance for small sizeof(T)

Summary:
I just found that gcc (4.8.2) failed to unroll the loop in
`pullAtMost()`, so it didn't replace `memcpy` with a simple load
for small `len`.

Test Plan:
fbconfig -r folly/io/test thrift/lib/cpp2/test && fbmake runtests_opt -j32

Ran unicorn-specific thrift deserialization benchmark from
D1724070, verified 50% improvement in `SearchRequest` deserialization
performance.

`thrift/lib/cpp2/test/ProtocolBench` results:

```
|---- before -----| |---- after  -----|
================================================================================================
thrift/lib/cpp2/test/ProtocolBench.cpp          relative  time/iter  iters/s  time/iter  iters/s
================================================================================================
BinaryProtocol_read_Empty                                   21.72ns   46.04M    17.58ns   56.89M
BinaryProtocol_read_SmallInt                                43.03ns   23.24M    23.64ns   42.30M
BinaryProtocol_read_BigInt                                  43.72ns   22.87M    22.03ns   45.38M
BinaryProtocol_read_SmallString                             88.57ns   11.29M    47.01ns   21.27M
BinaryProtocol_read_BigString                              365.76ns    2.73M   323.58ns    3.09M
BinaryProtocol_read_BigBinary                              207.78ns    4.81M   169.09ns    5.91M
BinaryProtocol_read_LargeBinary                            187.81ns    5.32M   172.09ns    5.81M
BinaryProtocol_read_Mixed                                  161.18ns    6.20M    68.41ns   14.62M
BinaryProtocol_read_SmallListInt                           177.32ns    5.64M    96.91ns   10.32M
BinaryProtocol_read_BigListInt                              77.03us   12.98K    15.88us   62.97K
BinaryProtocol_read_BigListMixed                             1.79ms   557.79   923.99us    1.08K
BinaryProtocol_read_LargeListMixed                         195.01ms     5.13   103.78ms     9.64
================================================================================================
```

Reviewed By: soren@fb.com

Subscribers: alandau, bmatheny, mshneer, trunkagent, njormrod, folly-diffs@

FB internal diff: D1724111

Tasks: 5770136

Signature: t1:1724111:1417977810:b7d643d0c819a0bbac77fa0048206153929e50a8
parent 2bda6641
......@@ -159,7 +159,7 @@ inline uint64_t decodeVarintFromCursor(folly::io::Cursor& cursor) {
uint64_t val = 0;
int8_t b = 0;
for (int shift = 0; shift <= 63; shift += 7) {
b = cursor.pullByte();
b = cursor.read<int8_t>();
val |= static_cast<uint64_t>(b & 0x7f) << shift;
if (b >= 0) {
break;
......
......@@ -49,16 +49,42 @@
* access to the buffer (you need to call unshare() yourself if necessary).
**/
namespace folly { namespace io {
namespace detail {
template <class Derived, typename BufType>
template <class Derived, class BufType>
class CursorBase {
// Make all the templated classes friends for copy constructor.
template <class D, typename B> friend class CursorBase;
public:
explicit CursorBase(BufType* buf) : crtBuf_(buf), buffer_(buf) { }
/**
* Copy constructor.
*
* This also allows constructing a CursorBase from other derived types.
* For instance, this allows constructing a Cursor from an RWPrivateCursor.
*/
template <class OtherDerived, class OtherBuf>
explicit CursorBase(const CursorBase<OtherDerived, OtherBuf>& cursor)
: crtBuf_(cursor.crtBuf_),
offset_(cursor.offset_),
buffer_(cursor.buffer_) { }
/**
* Reset cursor to point to a new buffer.
*/
void reset(BufType* buf) {
crtBuf_ = buf;
buffer_ = buf;
offset_ = 0;
}
const uint8_t* data() const {
return crtBuf_->data() + offset_;
}
/*
/**
* Return the remaining space available in the current IOBuf.
*
* May return 0 if the cursor is at the end of an IOBuf. Use peek() instead
......@@ -70,7 +96,7 @@ class CursorBase {
return crtBuf_->length() - offset_;
}
/*
/**
* Return the space available until the end of the entire IOBuf chain.
*/
size_t totalLength() const {
......@@ -107,10 +133,14 @@ class CursorBase {
}
template <class T>
typename std::enable_if<std::is_arithmetic<T>::value, T>::type
read() {
typename std::enable_if<std::is_arithmetic<T>::value, T>::type read() {
T val;
pull(&val, sizeof(T));
if (LIKELY(length() >= sizeof(T))) {
val = loadUnaligned<T>(data());
offset_ += sizeof(T);
} else {
pullSlow(&val, sizeof(T));
}
return val;
}
......@@ -133,23 +163,14 @@ class CursorBase {
*/
std::string readFixedString(size_t len) {
std::string str;
str.reserve(len);
for (;;) {
// Fast path: it all fits in one buffer.
size_t available = length();
if (LIKELY(available >= len)) {
if (LIKELY(length() >= len)) {
str.append(reinterpret_cast<const char*>(data()), len);
offset_ += len;
return str;
}
str.append(reinterpret_cast<const char*>(data()), available);
if (UNLIKELY(!tryAdvanceBuffer())) {
throw std::out_of_range("string underflow");
}
len -= available;
} else {
readFixedStringSlow(&str, len);
}
return str;
}
/**
......@@ -194,31 +215,39 @@ class CursorBase {
}
}
explicit CursorBase(BufType* buf)
: crtBuf_(buf)
, offset_(0)
, buffer_(buf) {}
size_t skipAtMost(size_t len) {
if (LIKELY(length() >= len)) {
offset_ += len;
return len;
}
return skipAtMostSlow(len);
}
// Make all the templated classes friends for copy constructor.
template <class D, typename B> friend class CursorBase;
void skip(size_t len) {
if (LIKELY(length() >= len)) {
offset_ += len;
} else {
skipSlow(len);
}
}
/*
* Copy constructor.
*
* This also allows constructing a CursorBase from other derived types.
* For instance, this allows constructing a Cursor from an RWPrivateCursor.
*/
template <class OtherDerived, class OtherBuf>
explicit CursorBase(const CursorBase<OtherDerived, OtherBuf>& cursor)
: crtBuf_(cursor.crtBuf_),
offset_(cursor.offset_),
buffer_(cursor.buffer_) {}
size_t pullAtMost(void* buf, size_t len) {
// Fast path: it all fits in one buffer.
if (LIKELY(length() >= len)) {
memcpy(buf, data(), len);
offset_ += len;
return len;
}
return pullAtMostSlow(buf, len);
}
// reset cursor to point to a new buffer.
void reset(BufType* buf) {
crtBuf_ = buf;
buffer_ = buf;
offset_ = 0;
void pull(void* buf, size_t len) {
if (LIKELY(length() >= len)) {
memcpy(buf, data(), len);
offset_ += len;
} else {
pullSlow(buf, len);
}
}
/**
......@@ -232,35 +261,9 @@ class CursorBase {
while (UNLIKELY(available == 0 && tryAdvanceBuffer())) {
available = length();
}
return std::make_pair(data(), available);
}
/**
* Read one byte. Equivalent to pull(&byte, 1) but faster.
*/
uint8_t pullByte() {
// fast path
if (LIKELY(length() >= 1)) {
uint8_t byte = *data();
offset_++;
return byte;
}
// slow path
uint8_t byte;
if (UNLIKELY(pullAtMost(&byte, 1) != 1)) {
throw std::out_of_range("underflow");
}
return byte;
}
void pull(void* buf, size_t len) {
if (UNLIKELY(pullAtMost(buf, len) != len)) {
throw std::out_of_range("underflow");
}
}
void clone(std::unique_ptr<folly::IOBuf>& buf, size_t len) {
if (UNLIKELY(cloneAtMost(buf, len) != len)) {
throw std::out_of_range("underflow");
......@@ -273,36 +276,6 @@ class CursorBase {
}
}
void skip(size_t len) {
if (LIKELY(length() >= len)) {
offset_ += len;
return;
}
skipSlow(len);
}
size_t pullAtMost(void* buf, size_t len) {
uint8_t* p = reinterpret_cast<uint8_t*>(buf);
size_t copied = 0;
for (;;) {
// Fast path: it all fits in one buffer.
size_t available = length();
if (LIKELY(available >= len)) {
memcpy(p, data(), len);
offset_ += len;
return copied + len;
}
memcpy(p, data(), available);
copied += available;
if (UNLIKELY(!tryAdvanceBuffer())) {
return copied;
}
p += available;
len -= available;
}
}
size_t cloneAtMost(folly::IOBuf& buf, size_t len) {
buf = folly::IOBuf();
......@@ -327,7 +300,6 @@ class CursorBase {
return copied + len;
}
if (loopCount == 0) {
crtBuf_->cloneOneInto(buf);
buf.trimStart(offset_);
......@@ -349,28 +321,9 @@ class CursorBase {
if (!buf) {
buf = make_unique<folly::IOBuf>();
}
return cloneAtMost(*buf, len);
}
size_t skipAtMost(size_t len) {
size_t skipped = 0;
for (;;) {
// Fast path: it all fits in one buffer.
size_t available = length();
if (LIKELY(available >= len)) {
offset_ += len;
return skipped + len;
}
skipped += available;
if (UNLIKELY(!tryAdvanceBuffer())) {
return skipped;
}
len -= available;
}
}
/**
* Return the distance between two cursors.
*/
......@@ -423,10 +376,7 @@ class CursorBase {
}
protected:
BufType* crtBuf_;
size_t offset_;
~CursorBase(){}
~CursorBase() { }
BufType* head() {
return buffer_;
......@@ -445,20 +395,71 @@ class CursorBase {
return true;
}
BufType* crtBuf_;
size_t offset_ = 0;
private:
void advanceDone() {
void readFixedStringSlow(std::string* str, size_t len) {
for (size_t available; (available = length()) < len; ) {
str->append(reinterpret_cast<const char*>(data()), available);
if (UNLIKELY(!tryAdvanceBuffer())) {
throw std::out_of_range("string underflow");
}
len -= available;
}
str->append(reinterpret_cast<const char*>(data()), len);
offset_ += len;
}
size_t pullAtMostSlow(void* buf, size_t len) {
uint8_t* p = reinterpret_cast<uint8_t*>(buf);
size_t copied = 0;
for (size_t available; (available = length()) < len; ) {
memcpy(p, data(), available);
copied += available;
if (UNLIKELY(!tryAdvanceBuffer())) {
return copied;
}
p += available;
len -= available;
}
memcpy(p, data(), len);
offset_ += len;
return copied + len;
}
void pullSlow(void* buf, size_t len) {
if (UNLIKELY(pullAtMostSlow(buf, len) != len)) {
throw std::out_of_range("underflow");
}
}
size_t skipAtMostSlow(size_t len) {
size_t skipped = 0;
for (size_t available; (available = length()) < len; ) {
skipped += available;
if (UNLIKELY(!tryAdvanceBuffer())) {
return skipped;
}
len -= skipped;
}
offset_ += len;
return skipped + len;
}
void skipSlow(size_t len) {
if (UNLIKELY(skipAtMost(len) != len)) {
if (UNLIKELY(skipAtMostSlow(len) != len)) {
throw std::out_of_range("underflow");
}
}
void advanceDone() {
}
BufType* buffer_;
};
} //namespace detail
} // namespace detail
class Cursor : public detail::CursorBase<Cursor, const IOBuf> {
public:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment