Commit ca1e87ed authored by James Sedgwick, committed by Sara Golemon

various perf improvements

Summary: Three strategies
1. Optimistic locking
2. Acquire-release memory ordering instead of full sequential consistency
3. Some low-hanging branch miss optimizations

Please review carefully; the dogscience is strong with this one

```
Before:

============================================================================
folly/futures/test/Benchmark.cpp                relative  time/iter  iters/s
============================================================================
constantFuture                                             127.99ns    7.81M
promiseAndFuture                                  94.89%   134.89ns    7.41M
withThen                                          28.40%   450.63ns    2.22M
----------------------------------------------------------------------------
oneThen                                                    446.68ns    2.24M
twoThens                                          58.35%   765.55ns    1.31M
fourThens                                         31.87%     1.40us  713.41K
hundredThens                                       1.61%    27.78us   35.99K
----------------------------------------------------------------------------
no_contention                                                4.63ms   216.00
contention                                        80.79%     5.73ms   174.52
----------------------------------------------------------------------------
throwAndCatch                                               10.91us   91.64K
throwAndCatchWrapped                             127.14%     8.58us  116.51K
throwWrappedAndCatch                             178.22%     6.12us  163.32K
throwWrappedAndCatchWrapped                      793.75%     1.37us  727.38K
----------------------------------------------------------------------------
throwAndCatchContended                                        1.35s  741.33m
throwAndCatchWrappedContended                    139.18%   969.23ms     1.03
throwWrappedAndCatchContended                    169.51%   795.76ms     1.26
throwWrappedAndCatchWrappedContended            17742.23%     7.60ms   131.53
----------------------------------------------------------------------------
complexUnit                                                127.50us    7.84K
complexBlob4                                     100.14%   127.32us    7.85K
complexBlob8                                     100.16%   127.30us    7.86K
complexBlob64                                     96.45%   132.19us    7.57K
complexBlob128                                    92.83%   137.35us    7.28K
complexBlob256                                    87.79%   145.23us    6.89K
complexBlob512                                    81.64%   156.18us    6.40K
complexBlob1024                                   72.54%   175.76us    5.69K
complexBlob2048                                   58.52%   217.89us    4.59K
complexBlob4096                                   32.54%   391.78us    2.55K
============================================================================

After:
============================================================================
folly/futures/test/Benchmark.cpp                relative  time/iter  iters/s
============================================================================
constantFuture                                              85.28ns   11.73M
promiseAndFuture                                  88.63%    96.22ns   10.39M
withThen                                          30.46%   279.99ns    3.57M
----------------------------------------------------------------------------
oneThen                                                    231.18ns    4.33M
twoThens                                          60.57%   381.70ns    2.62M
fourThens                                         33.52%   689.71ns    1.45M
hundredThens                                       1.49%    15.48us   64.58K
----------------------------------------------------------------------------
no_contention                                                3.84ms   260.19
contention                                        88.29%     4.35ms   229.73
----------------------------------------------------------------------------
throwAndCatch                                               10.63us   94.06K
throwAndCatchWrapped                             127.17%     8.36us  119.61K
throwWrappedAndCatch                             179.83%     5.91us  169.15K
throwWrappedAndCatchWrapped                     1014.48%     1.05us  954.19K
----------------------------------------------------------------------------
throwAndCatchContended                                        1.34s  749.03m
throwAndCatchWrappedContended                    140.66%   949.16ms     1.05
throwWrappedAndCatchContended                    164.87%   809.77ms     1.23
throwWrappedAndCatchWrappedContended            49406.39%     2.70ms   370.07
----------------------------------------------------------------------------
complexUnit                                                 86.83us   11.52K
complexBlob4                                      97.42%    89.12us   11.22K
complexBlob8                                      96.63%    89.85us   11.13K
complexBlob64                                     92.53%    93.84us   10.66K
complexBlob128                                    90.85%    95.57us   10.46K
complexBlob256                                    82.56%   105.17us    9.51K
complexBlob512                                    74.13%   117.12us    8.54K
complexBlob1024                                   63.67%   136.37us    7.33K
complexBlob2048                                   50.25%   172.79us    5.79K
complexBlob4096                                   26.63%   326.05us    3.07K
============================================================================
```

Reviewed By: @djwatson

Differential Revision: D2139822
parent 16b7f862
......@@ -106,14 +106,12 @@ Future<T>::thenImplementation(F func, detail::argResult<isTry, F, Args...>) {
// wrap these so we can move them into the lambda
folly::MoveWrapper<Promise<B>> p;
p->setInterruptHandler(core_->getInterruptHandler());
p->core_->setInterruptHandlerNoLock(core_->getInterruptHandler());
folly::MoveWrapper<F> funcm(std::forward<F>(func));
// grab the Future now before we lose our handle on the Promise
auto f = p->getFuture();
if (getExecutor()) {
f.setExecutor(getExecutor());
}
f.core_->setExecutorNoLock(getExecutor());
/* This is a bit tricky.
......@@ -174,13 +172,12 @@ Future<T>::thenImplementation(F func, detail::argResult<isTry, F, Args...>) {
// wrap these so we can move them into the lambda
folly::MoveWrapper<Promise<B>> p;
p->core_->setInterruptHandlerNoLock(core_->getInterruptHandler());
folly::MoveWrapper<F> funcm(std::forward<F>(func));
// grab the Future now before we lose our handle on the Promise
auto f = p->getFuture();
if (getExecutor()) {
f.setExecutor(getExecutor());
}
f.core_->setExecutorNoLock(getExecutor());
setCallback_(
[p, funcm](Try<T>&& t) mutable {
......
......@@ -44,16 +44,19 @@ Promise<T>& Promise<T>::operator=(Promise<T>&& other) noexcept {
// Throws NoState when the promise has no core_, and PromiseAlreadySatisfied
// when core_->ready() already reports true; otherwise returns normally.
// NOTE(review): this span is diff residue — each unbraced `if` appears to be
// the pre-change form of the braced UNLIKELY(...) check that follows it.
template <class T>
void Promise<T>::throwIfFulfilled() {
if (!core_)
if (UNLIKELY(!core_)) {
throw NoState();
if (core_->ready())
}
if (UNLIKELY(core_->ready())) {
throw PromiseAlreadySatisfied();
}
}
// Throws FutureAlreadyRetrieved when retrieved_ is already set.
// NOTE(review): diff residue — the bare `if (retrieved_)` looks like the
// pre-change form of the UNLIKELY(...) check on the next line.
template <class T>
void Promise<T>::throwIfRetrieved() {
if (retrieved_)
if (UNLIKELY(retrieved_)) {
throw FutureAlreadyRetrieved();
}
}
template <class T>
......
......@@ -103,6 +103,7 @@ public:
private:
typedef typename Future<T>::corePtr corePtr;
template <class> friend class Future;
// Whether the Future has been retrieved (a one-time operation).
bool retrieved_;
......
......@@ -79,9 +79,9 @@ Try<T>& Try<T>::operator=(const Try<T>& t) {
// Explicitly destroys whichever member is live: value_ when contains_ is
// Contains::VALUE, e_ when it is Contains::EXCEPTION. Any other state destroys
// nothing.
// NOTE(review): diff residue — each plain condition appears to be the
// pre-change form of the LIKELY/UNLIKELY-annotated condition that follows it.
template <class T>
Try<T>::~Try() {
if (contains_ == Contains::VALUE) {
// The value case is hinted as the expected one.
if (LIKELY(contains_ == Contains::VALUE)) {
value_.~T();
} else if (contains_ == Contains::EXCEPTION) {
} else if (UNLIKELY(contains_ == Contains::EXCEPTION)) {
// e_ is a unique_ptr<exception_wrapper>; its destructor is invoked by name.
e_.~unique_ptr<exception_wrapper>();
}
}
......
......@@ -81,12 +81,12 @@ class Core {
// Default construction: no members are set here explicitly.
Core() {}
// Constructs a Core that already holds a result: result_ takes ownership of
// `t`, the state machine starts in State::OnlyResult, and attached_ starts
// at 1.
// NOTE(review): diff residue — two initializer lists are shown. The first
// (fsm_, attached_, result_) appears to be the pre-change order; the second
// lists result_ first.
explicit Core(Try<T>&& t)
: fsm_(State::OnlyResult),
attached_(1),
result_(std::move(t)) {}
: result_(std::move(t)),
fsm_(State::OnlyResult),
attached_(1) {}
// Verifies that attached_ has dropped to 0 by the time the Core is destroyed.
// NOTE(review): diff residue — the assert line appears to be the pre-change
// form of the DCHECK line.
~Core() {
assert(attached_ == 0);
DCHECK(attached_ == 0);
}
// not copyable
......@@ -212,7 +212,7 @@ class Core {
void detachPromise() {
// detachPromise() and setResult() should never be called in parallel
// so we don't need to protect this.
if (!result_) {
if (UNLIKELY(!result_)) {
setResult(Try<T>(exception_wrapper(BrokenPromise())));
}
detachOne();
......@@ -220,63 +220,89 @@ class Core {
/// May call from any thread
/// Clears active_. Unlike activate(), this does not call maybeCallback().
// NOTE(review): diff residue — the plain assignment appears to be the
// pre-change form of the release store below it.
void deactivate() {
active_ = false;
active_.store(false, std::memory_order_release);
}
/// May call from any thread
/// Sets active_ and then calls maybeCallback(), which reads active_ when the
/// state machine is in State::Armed.
// NOTE(review): diff residue — the plain assignment appears to be the
// pre-change form of the release store below it.
void activate() {
active_ = true;
active_.store(true, std::memory_order_release);
maybeCallback();
}
/// May call from any thread
/// Returns the current value of active_.
// NOTE(review): diff residue — two definitions are shown; the second reads
// active_ with an explicit acquire load and appears to replace the first.
bool isActive() { return active_; }
bool isActive() { return active_.load(std::memory_order_acquire); }
/// Call only from Future thread
/// Stores the executor `x` and its `priority` while holding executorLock_.
// NOTE(review): diff residue — the first signature and the MSLGuard line
// appear to be the pre-change version. The second signature adds a default
// priority of Executor::MID_PRI and locks by hand instead.
void setExecutor(Executor* x, int8_t priority) {
folly::MSLGuard g(executorLock_);
void setExecutor(Executor* x, int8_t priority = Executor::MID_PRI) {
// Try the lock once without waiting; fall back to lock() if that fails.
if (!executorLock_.try_lock()) {
executorLock_.lock();
}
executor_ = x;
priority_ = priority;
// Manual release, paired with the try_lock()/lock() above.
executorLock_.unlock();
}
/// Same stores as setExecutor(), but executorLock_ is never touched.
/// @param x         executor to record in executor_
/// @param priority  priority to record in priority_ (Executor::MID_PRI when
///                  omitted)
// NOTE(review): presumably only safe while no other thread can reach this
// Core — confirm against callers.
void setExecutorNoLock(Executor* x, int8_t priority = Executor::MID_PRI) {
  this->executor_ = x;
  this->priority_ = priority;
}
/// Reads executor_ while holding executorLock_ and returns the pointer.
Executor* getExecutor() {
  folly::MSLGuard executorGuard(executorLock_);
  Executor* const current = executor_;
  return current;
}
/// Call only from Future thread
///
/// Records `e` as the pending interrupt and, when an interrupt handler is
/// installed, invokes it with the stored exception. Does nothing if an
/// interrupt has already been recorded or hasResult() is true.
///
/// @param e  exception to deliver; it is moved into interrupt_.
void raise(exception_wrapper e) {
  // Optimistic acquire: one try_lock() first, falling back to lock().
  if (!interruptLock_.try_lock()) {
    interruptLock_.lock();
  }
  // Hand the already-held lock to a guard. Both make_unique and the
  // user-supplied handler can throw; with a hand-written unlock() at the end
  // of the function, such a throw would leave interruptLock_ held for good.
  folly::MSLGuard guard(interruptLock_, std::adopt_lock);
  if (!interrupt_ && !hasResult()) {
    interrupt_ = folly::make_unique<exception_wrapper>(std::move(e));
    if (interruptHandler_) {
      interruptHandler_(*interrupt_);
    }
  }
}
/// Returns a copy of the installed interrupt handler.
///
/// Returns an empty function without touching interruptLock_ when
/// interruptHandlerSet_ has never been set; otherwise the copy is taken while
/// holding interruptLock_.
std::function<void(exception_wrapper const&)> getInterruptHandler() {
  // Fast path: no handler was ever stored, so there is nothing to lock for.
  if (!interruptHandlerSet_.load(std::memory_order_acquire)) {
    return nullptr;
  }
  // Optimistic acquire: one try_lock() first, falling back to lock().
  if (!interruptLock_.try_lock()) {
    interruptLock_.lock();
  }
  // Copying a std::function may throw; the guard releases the lock on every
  // exit path, which a trailing manual unlock() would not.
  folly::MSLGuard guard(interruptLock_, std::adopt_lock);
  return interruptHandler_;
}
/// Call only from Promise thread
///
/// Installs `fn` as the interrupt handler, unless hasResult() is already true
/// (then nothing happens). If an interrupt was recorded before the handler
/// arrived, `fn` is invoked with it immediately instead of being stored.
///
/// @param fn  handler to install or, for an already-pending interrupt, to run.
void setInterruptHandler(std::function<void(exception_wrapper const&)> fn) {
  // Optimistic acquire: one try_lock() first, falling back to lock().
  if (!interruptLock_.try_lock()) {
    interruptLock_.lock();
  }
  // `fn` is caller-supplied and may throw when invoked below; the guard makes
  // sure interruptLock_ is released in that case too.
  folly::MSLGuard guard(interruptLock_, std::adopt_lock);
  if (!hasResult()) {
    if (interrupt_) {
      fn(*interrupt_);
    } else {
      setInterruptHandlerNoLock(std::move(fn));
    }
  }
}
void setInterruptHandlerNoLock(
std::function<void(exception_wrapper const&)> fn) {
interruptHandlerSet_.store(true, std::memory_order_relaxed);
interruptHandler_ = std::move(fn);
}
protected:
void maybeCallback() {
FSM_START(fsm_)
case State::Armed:
if (active_) {
if (active_.load(std::memory_order_acquire)) {
FSM_UPDATE2(fsm_, State::Done, []{}, [this]{ this->doCallback(); });
}
FSM_BREAK
......@@ -289,17 +315,20 @@ class Core {
void doCallback() {
RequestContext::setContext(context_);
// TODO(6115514) semantic race on reading executor_ and setExecutor()
Executor* x;
Executor* x = executor_;
int8_t priority;
{
folly::MSLGuard g(executorLock_);
if (x) {
if (!executorLock_.try_lock()) {
executorLock_.lock();
}
x = executor_;
priority = priority_;
executorLock_.unlock();
}
if (x) {
++attached_; // keep Core alive until executor did its thing
// keep Core alive until executor did its thing
++attached_;
try {
if (LIKELY(x->getNumPriorities() == 1)) {
x->add([this]() mutable {
......@@ -330,17 +359,21 @@ class Core {
}
}
// NOTE(review): diff residue — result_, callback_, lambdaBufSize and
// lambdaBuf_ each appear twice below. The later occurrences (after executor_)
// appear to be their pre-change positions; the earlier ones the new layout.
// lambdaBuf occupies exactly one cache line
// (8 * sizeof(void*) bytes, i.e. 64 bytes where pointers are 8 bytes wide)
static constexpr size_t lambdaBufSize = 8 * sizeof(void*);
char lambdaBuf_[lambdaBufSize];
// place result_ next to increase the likelihood that the value will be
// contained entirely in one cache line
folly::Optional<Try<T>> result_ {};
std::function<void(Try<T>&&)> callback_ {nullptr};
FSM<State> fsm_ {State::Start};
// Checked against 0 in ~Core(); starts at 2 here and at 1 in the Try<T>&&
// constructor.
std::atomic<unsigned char> attached_ {2};
// Written by activate()/deactivate(), read by isActive() and maybeCallback().
std::atomic<bool> active_ {true};
// Set by setInterruptHandlerNoLock(); while false, getInterruptHandler()
// returns nullptr without taking interruptLock_.
std::atomic<bool> interruptHandlerSet_ {false};
// Taken by raise(), getInterruptHandler() and setInterruptHandler().
folly::MicroSpinLock interruptLock_ {0};
// Taken by setExecutor(), getExecutor() and doCallback().
folly::MicroSpinLock executorLock_ {0};
int8_t priority_ {-1};
Executor* executor_ {nullptr};
folly::Optional<Try<T>> result_ {};
std::function<void(Try<T>&&)> callback_ {nullptr};
static constexpr size_t lambdaBufSize = 8 * sizeof(void*);
char lambdaBuf_[lambdaBufSize];
std::shared_ptr<RequestContext> context_ {nullptr};
// Pending interrupt recorded by raise(); empty until one is raised.
std::unique_ptr<exception_wrapper> interrupt_ {};
std::function<void(exception_wrapper const&)> interruptHandler_ {nullptr};
......
......@@ -44,7 +44,7 @@ public:
// Starts the state machine in `startState`.
explicit FSM(Enum startState) : state_(startState) {}
// Returns the current state; mutex_ is not taken.
// NOTE(review): diff residue — two return statements are shown; the relaxed
// load appears to be the pre-change form of the acquire load below it.
Enum getState() const {
return state_.load(std::memory_order_relaxed);
return state_.load(std::memory_order_acquire);
}
/// Atomically do a state transition with accompanying action.
......@@ -52,10 +52,16 @@ public:
/// @returns true on success, false and action unexecuted otherwise
///
/// While holding mutex_: if the current state is `A`, runs `action` and then
/// moves the state to `B`. If the current state is not `A`, nothing is run
/// and nothing changes.
///
/// @param A       state the machine must currently be in
/// @param B       state to move to once `action` has run
/// @param action  callable invoked under the lock, before the state changes
template <class F>
bool updateState(Enum A, Enum B, F const& action) {
  // Optimistic acquire: one try_lock() first, falling back to lock().
  if (!mutex_.try_lock()) {
    mutex_.lock();
  }
  // Hand the already-held mutex to a guard so it is released on every exit
  // path. `action` is caller-supplied; if it threw past a hand-written
  // unlock(), mutex_ would stay locked forever.
  std::lock_guard<Mutex> lock(mutex_, std::adopt_lock);
  // Relaxed is enough here: every writer of state_ holds mutex_.
  if (state_.load(std::memory_order_relaxed) != A) {
    return false;
  }
  action();
  // Release store: getState() reads state_ with an acquire load and takes no
  // lock, so this store is what makes the effects of action() visible to a
  // thread that observes B. A relaxed store gives that acquire load nothing
  // to pair with.
  state_.store(B, std::memory_order_release);
  return true;
}
......
......@@ -20,6 +20,7 @@
#include <folly/Benchmark.h>
#include <folly/futures/Future.h>
#include <folly/futures/Promise.h>
#include <folly/futures/InlineExecutor.h>
#include <semaphore.h>
#include <vector>
......@@ -290,6 +291,98 @@ BENCHMARK_RELATIVE(throwWrappedAndCatchWrappedContended) {
contend(throwWrappedAndCatchWrappedImpl);
}
// File-scope executor that fGen() routes its chain through via .via(&exe).
InlineExecutor exe;
// Builds one Future<T> for the "complex" benchmarks: four continuations, the
// 1st and 3rd returning a plain T and the 2nd and 4th returning a Future<T>,
// with a .via(&exe) between the 2nd and 3rd. The promise is fulfilled with a
// value-initialized T before the future is returned.
template <class T>
Future<T> fGen() {
  Promise<T> promise;
  auto chained = promise.getFuture()
    .then([] (T&& value) { return std::move(value); })
    .then([] (T&& value) { return makeFuture(std::move(value)); })
    .via(&exe)
    .then([] (T&& value) { return std::move(value); })
    .then([] (T&& value) { return makeFuture(std::move(value)); });
  promise.setValue(T());
  return chained;
}
// Returns a vector holding ten futures, each produced by fGen<T>().
template <class T>
std::vector<Future<T>> fsGen() {
  std::vector<Future<T>> futures;
  int produced = 0;
  while (produced < 10) {
    futures.push_back(fGen<T>());
    produced++;
  }
  return futures;
}
// One benchmark iteration: feeds a fresh fsGen<T>() batch to each of collect,
// collectAll, collectAny and two futures::map calls (one mapping to a plain
// T, one to a Future<T>). All results are discarded.
template <class T>
void complexBenchmark() {
  collect(fsGen<T>());
  collectAll(fsGen<T>());
  collectAny(fsGen<T>());
  // map to a value
  futures::map(fsGen<T>(), [] (const T& item) { return item; });
  // map to a future
  futures::map(fsGen<T>(), [] (const T& item) { return makeFuture(T(item)); });
}
// Presumably emits the dashed separator row that precedes the complex* group
// in the benchmark report — confirm against folly/Benchmark.h.
BENCHMARK_DRAW_LINE();
// Payload type for the complexBlob* benchmarks: a plain aggregate whose only
// member is a char array of the requested length.
template <size_t Bytes>
struct Blob {
  char buf[Bytes];
};
// Registrations for the "complex" benchmark group. Every entry runs the same
// complexBenchmark<T>() workload and differs only in the payload type T:
// Unit for the first entry, then Blob<N> with N growing from 4 to 4096.
// NOTE(review): the BENCHMARK_RELATIVE entries are presumably reported
// relative to complexUnit, as in the `relative` column of the results table
// — confirm against folly/Benchmark.h.
BENCHMARK(complexUnit) {
complexBenchmark<Unit>();
}
BENCHMARK_RELATIVE(complexBlob4) {
complexBenchmark<Blob<4>>();
}
BENCHMARK_RELATIVE(complexBlob8) {
complexBenchmark<Blob<8>>();
}
BENCHMARK_RELATIVE(complexBlob64) {
complexBenchmark<Blob<64>>();
}
BENCHMARK_RELATIVE(complexBlob128) {
complexBenchmark<Blob<128>>();
}
BENCHMARK_RELATIVE(complexBlob256) {
complexBenchmark<Blob<256>>();
}
BENCHMARK_RELATIVE(complexBlob512) {
complexBenchmark<Blob<512>>();
}
BENCHMARK_RELATIVE(complexBlob1024) {
complexBenchmark<Blob<1024>>();
}
BENCHMARK_RELATIVE(complexBlob2048) {
complexBenchmark<Blob<2048>>();
}
BENCHMARK_RELATIVE(complexBlob4096) {
complexBenchmark<Blob<4096>>();
}
int main(int argc, char** argv) {
gflags::ParseCommandLineFlags(&argc, &argv, true);
folly::runBenchmarks();
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment