Commit 8961aec7 authored by Nathan Bronson, committed by Andrii Grynenko

add per node mutex for emulated futex

Summary:
The emulated futex (not used on Linux) has an optimization that defers
notification until after the bucket mutex has been unlocked, to avoid
lock collisions between the waiter and the waker.  That code has a
use-after-free problem if the waiter's condition_variable gets a spurious
wakeup, which the spec allows.  That code also does nothing about
contention between multiple waiters.

This diff adds a mutex to each wait node, relieving waiters of the need
to acquire the bucket lock on wakeup.  Rather than attempting a racy
late notification, we just make sure to release the node lock
immediately after calling notify_one, which preserves the
collision-avoidance benefit without the use-after-free risk.
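
To make the handshake concrete, here is a minimal standalone sketch of the
pattern this diff adopts (invented names, not the actual folly types): the
waker sets the flag and calls notify_one while still holding the node's own
mutex, then releases it immediately; the waiter must reacquire that mutex
before wait() returns, so it cannot observe the flag and destroy the node
while notify_one is still executing.

#include <condition_variable>
#include <mutex>

// Hypothetical minimal node, mirroring the shape of the wait node in the diff.
struct Node {
  std::mutex mutex_;
  std::condition_variable cond_;
  bool signaled_ = false;
};

// Waker side: assumes the node was already unlinked from any shared list,
// so only this waker and the owning waiter can still reach it.
void wake(Node& node) {
  std::unique_lock<std::mutex> lock(node.mutex_);
  node.signaled_ = true;
  node.cond_.notify_one();
}  // lock released here, immediately after notify_one

// Waiter side: spurious wakeups just re-check signaled_ under the mutex.
void wait(Node& node) {
  std::unique_lock<std::mutex> lock(node.mutex_);
  while (!node.signaled_) {
    node.cond_.wait(lock);
  }
}  // only after this returns may the caller destroy the node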

Test Plan:
1) existing unit tests
2) new unit tests

Reviewed By: delong.j@fb.com

Subscribers: boseant, njormrod

FB internal diff: D1602360
parent bdde233c
@@ -123,17 +123,24 @@ FutexResult nativeFutexWaitImpl(void* addr,
 ///////////////////////////////////////////////////////
 // compatibility implementation using standard C++ API

+// Our emulated futex uses 4096 lists of wait nodes. There are two levels
+// of locking: a per-list mutex that controls access to the list and a
+// per-node mutex, condvar, and bool that are used for the actual wakeups.
+// The per-node mutex allows us to do precise wakeups without thundering
+// herds.
 struct EmulatedFutexWaitNode : public boost::intrusive::list_base_hook<> {
   void* const addr_;
   const uint32_t waitMask_;
-  bool hasTimeout_;
+
+  // tricky: hold both bucket and node mutex to write, either to read
   bool signaled_;
+  std::mutex mutex_;
   std::condition_variable cond_;

-  EmulatedFutexWaitNode(void* addr, uint32_t waitMask, bool hasTimeout)
+  EmulatedFutexWaitNode(void* addr, uint32_t waitMask)
     : addr_(addr)
     , waitMask_(waitMask)
-    , hasTimeout_(hasTimeout)
     , signaled_(false)
   {
   }
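
The bucket table itself lies outside this hunk. As a hedged sketch only: the
comment above says there are 4096 lists, so bucketFor() presumably hashes the
futex address into a fixed array of buckets, each carrying the per-list mutex
and the intrusive waiter list. The hash constant and the eager static array
below are assumptions, not folly's actual choices (the gBucketInit once_flag
visible in the next hunk header suggests folly initializes the table lazily).

#include <cstdint>
#include <mutex>
#include <boost/intrusive/list.hpp>

// Hypothetical bucket shape, reusing EmulatedFutexWaitNode from above;
// folly's real EmulatedFutexBucket and bucketFor() are not shown here.
struct BucketSketch {
  std::mutex mutex_;                                       // per-list lock
  boost::intrusive::list<EmulatedFutexWaitNode> waiters_;  // wait nodes

  static BucketSketch& bucketFor(void* addr) {
    static BucketSketch buckets[4096];
    // multiplicative hash (Knuth's constant) so that neighboring
    // futex addresses spread across different buckets
    std::uintptr_t mixed = reinterpret_cast<std::uintptr_t>(addr) * 2654435761u;
    return buckets[(mixed >> 8) & 4095];
  }
};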
@@ -162,42 +169,24 @@ std::once_flag EmulatedFutexBucket::gBucketInit;

 int emulatedFutexWake(void* addr, int count, uint32_t waitMask) {
   auto& bucket = EmulatedFutexBucket::bucketFor(addr);
+  std::unique_lock<std::mutex> bucketLock(bucket.mutex_);

   int numAwoken = 0;
-  boost::intrusive::list<EmulatedFutexWaitNode> deferredWakeups;
-
-  {
-    std::unique_lock<std::mutex> lock(bucket.mutex_);
-
-    for (auto iter = bucket.waiters_.begin();
-         numAwoken < count && iter != bucket.waiters_.end(); ) {
-      auto current = iter;
-      auto& node = *iter++;
-      if (node.addr_ == addr && (node.waitMask_ & waitMask) != 0) {
-        // We unlink, but waiter destroys the node. We must signal timed
-        // waiters under the lock, to avoid a race where we release the lock,
-        // the waiter times out and deletes the node, and then we try to
-        // signal it. This problem doesn't exist for unbounded waiters,
-        // so for them we optimize their wakeup by releasing the lock first.
-        bucket.waiters_.erase(current);
-        if (node.hasTimeout_) {
-          node.signaled_ = true;
-          node.cond_.notify_one();
-        } else {
-          deferredWakeups.push_back(node);
-        }
-        ++numAwoken;
-      }
-    }
-  }
-
-  while (!deferredWakeups.empty()) {
-    auto& node = deferredWakeups.front();
-    deferredWakeups.pop_front();
-    node.signaled_ = true;
-    node.cond_.notify_one();
-  }
-
+  for (auto iter = bucket.waiters_.begin();
+       numAwoken < count && iter != bucket.waiters_.end(); ) {
+    auto current = iter;
+    auto& node = *iter++;
+    if (node.addr_ == addr && (node.waitMask_ & waitMask) != 0) {
+      ++numAwoken;
+
+      // we unlink, but waiter destroys the node
+      bucket.waiters_.erase(current);
+
+      std::unique_lock<std::mutex> nodeLock(node.mutex_);
+      node.signaled_ = true;
+      node.cond_.notify_one();
+    }
+  }
   return numAwoken;
 }
@@ -207,30 +196,39 @@ FutexResult emulatedFutexWaitImpl(
     time_point<system_clock>* absSystemTime,
     time_point<steady_clock>* absSteadyTime,
     uint32_t waitMask) {
-  bool hasTimeout = absSystemTime != nullptr || absSteadyTime != nullptr;
-  EmulatedFutexWaitNode node(addr, waitMask, hasTimeout);
-
   auto& bucket = EmulatedFutexBucket::bucketFor(addr);
-  std::unique_lock<std::mutex> lock(bucket.mutex_);
+  EmulatedFutexWaitNode node(addr, waitMask);
+
+  {
+    std::unique_lock<std::mutex> bucketLock(bucket.mutex_);
+
+    uint32_t actual;
+    memcpy(&actual, addr, sizeof(uint32_t));
+    if (actual != expected) {
+      return FutexResult::VALUE_CHANGED;
+    }
+
+    bucket.waiters_.push_back(node);
+  } // bucketLock scope

-  uint32_t actual;
-  memcpy(&actual, addr, sizeof(uint32_t));
-  if (actual != expected) {
-    return FutexResult::VALUE_CHANGED;
-  }
-
-  bucket.waiters_.push_back(node);
-  while (!node.signaled_) {
-    std::cv_status status = std::cv_status::no_timeout;
-    if (absSystemTime != nullptr) {
-      status = node.cond_.wait_until(lock, *absSystemTime);
-    } else if (absSteadyTime != nullptr) {
-      status = node.cond_.wait_until(lock, *absSteadyTime);
-    } else {
-      node.cond_.wait(lock);
-    }
-
-    if (status == std::cv_status::timeout) {
+  std::cv_status status = std::cv_status::no_timeout;
+  {
+    std::unique_lock<std::mutex> nodeLock(node.mutex_);
+    while (!node.signaled_ && status != std::cv_status::timeout) {
+      if (absSystemTime != nullptr) {
+        status = node.cond_.wait_until(nodeLock, *absSystemTime);
+      } else if (absSteadyTime != nullptr) {
+        status = node.cond_.wait_until(nodeLock, *absSteadyTime);
+      } else {
+        node.cond_.wait(nodeLock);
+      }
+    }
+  } // nodeLock scope
+
+  if (status == std::cv_status::timeout) {
+    // it's not really a timeout until we unlink the unsignaled node
+    std::unique_lock<std::mutex> bucketLock(bucket.mutex_);
+    if (!node.signaled_) {
       bucket.waiters_.erase(bucket.waiters_.iterator_to(node));
       return FutexResult::TIMEDOUT;
     }
   }
...
@@ -54,36 +54,36 @@ void run_basic_tests() {
   DSched::join(thr);
 }

-template<template<typename> class Atom>
-void run_wait_until_tests();
-
-template <typename Clock>
-void stdAtomicWaitUntilTests() {
-  Futex<std::atomic> f(0);
+template <template<typename> class Atom, typename Clock>
+void liveClockWaitUntilTests() {
+  Futex<Atom> f(0);

-  auto thrA = DSched::thread([&]{
-    while (true) {
-      typename Clock::time_point nowPlus2s = Clock::now() + seconds(2);
-      auto res = f.futexWaitUntil(0, nowPlus2s);
-      EXPECT_TRUE(res == FutexResult::TIMEDOUT || res == FutexResult::AWOKEN);
-      if (res == FutexResult::AWOKEN) {
-        break;
+  for (int stress = 0; stress < 1000; ++stress) {
+    auto fp = &f; // workaround for t5336595
+    auto thrA = DSched::thread([fp,stress]{
+      while (true) {
+        auto deadline = Clock::now() + microseconds(1 << (stress % 20));
+        auto res = fp->futexWaitUntil(0, deadline);
+        EXPECT_TRUE(res == FutexResult::TIMEDOUT || res == FutexResult::AWOKEN);
+        if (res == FutexResult::AWOKEN) {
+          break;
+        }
       }
-    }
-  });
+    });

-  while (f.futexWake() != 1) {
-    std::this_thread::yield();
-  }
+    while (f.futexWake() != 1) {
+      std::this_thread::yield();
+    }

-  DSched::join(thrA);
+    DSched::join(thrA);
+  }

   auto start = Clock::now();
   EXPECT_EQ(f.futexWaitUntil(0, start + milliseconds(100)),
             FutexResult::TIMEDOUT);
   LOG(INFO) << "Futex wait timed out after waiting for "
             << duration_cast<milliseconds>(Clock::now() - start).count()
-            << "ms";
+            << "ms, should be ~100ms";
 }

 template <typename Clock>
@@ -96,10 +96,10 @@ void deterministicAtomicWaitUntilTests() {
   EXPECT_TRUE(res == FutexResult::TIMEDOUT || res == FutexResult::INTERRUPTED);
 }

-template <>
-void run_wait_until_tests<std::atomic>() {
-  stdAtomicWaitUntilTests<system_clock>();
-  stdAtomicWaitUntilTests<steady_clock>();
+template<template<typename> class Atom>
+void run_wait_until_tests() {
+  liveClockWaitUntilTests<Atom, system_clock>();
+  liveClockWaitUntilTests<Atom, steady_clock>();
 }

 template <>
@@ -177,6 +177,11 @@ TEST(Futex, basic_live) {
   run_wait_until_tests<std::atomic>();
 }

+TEST(Futex, basic_emulated) {
+  run_basic_tests<EmulatedFutexAtomic>();
+  run_wait_until_tests<EmulatedFutexAtomic>();
+}
+
 TEST(Futex, basic_deterministic) {
   DSched sched(DSched::uniform(0));
   run_basic_tests<DeterministicAtomic>();
...