Commit 1ad7426b authored by Lu Pan, committed by Facebook GitHub Bot

set max deferred readers for folly::SharedMutex dynamically

Summary: Instead of hard-coding the maximum number of deferred readers at 64, statically allocate a slot array large enough for any platform and pick the maximum number of deferred readers dynamically, based on the platform running the service. Specifically, we set `maxDeferredReaders = 2 * nextPowTwo(numCPU)`, which is about four times the number of physical cores (numCPU counts hardware threads, typically two per core), to allow faster reads. We are effectively giving each HW thread two slots.
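
For illustration, a minimal standalone sketch of this sizing rule (hypothetical helper; folly's real computation is getMaxDeferredReadersSlow() in the diff below, and numCpus stands in for CacheLocality::system().numCpus):

    #include <algorithm>
    #include <cstdint>

    // Sketch only: 2 * nextPowTwo(numCpus), capped by the statically
    // allocated slot count. numCpus counts hardware threads.
    uint32_t maxDeferredReadersFor(uint32_t numCpus) {
      const uint32_t kAllocated = 256 * 2; // static upper bound on slots
      uint32_t pow2 = 1;
      while (pow2 < numCpus) {
        pow2 <<= 1; // same result as folly::nextPowTwo(numCpus)
      }
      return std::min(pow2 << 1, kAllocated);
    }
    // e.g. numCpus == 48 -> nextPowTwo == 64 -> 128 deferred reader slots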

Reviewed By: yfeldblum

Differential Revision: D22407478

fbshipit-source-id: 4001cf96dc502e00f00a27d57c63ba0028a52671
parent ca9225f8
--- a/folly/SharedMutex.cpp
+++ b/folly/SharedMutex.cpp
@@ -21,8 +21,8 @@ namespace folly {
 template class SharedMutexImpl<true>;
 template class SharedMutexImpl<false>;
 
-namespace detail {
-std::unique_lock<std::mutex> sharedMutexAnnotationGuard(void* ptr) {
+namespace shared_mutex_detail {
+std::unique_lock<std::mutex> annotationGuard(void* ptr) {
   if (folly::kIsSanitizeThread) {
     // On TSAN builds, we have an array of mutexes and index into them based on
     // the address. If the array is of prime size things will work out okay
@@ -35,6 +35,16 @@ std::unique_lock<std::mutex> sharedMutexAnnotationGuard(void* ptr) {
     return std::unique_lock<std::mutex>();
   }
 }
-} // namespace detail
+
+uint32_t getMaxDeferredReadersSlow(std::atomic<uint32_t>& cache) {
+  uint32_t maxDeferredReaders = std::min(
+      static_cast<uint32_t>(
+          folly::nextPowTwo(CacheLocality::system().numCpus) << 1),
+      shared_mutex_detail::kMaxDeferredReadersAllocated);
+  // maxDeferredReaders must be a power of 2
+  assert(!(maxDeferredReaders & (maxDeferredReaders - 1)));
+  cache.store(maxDeferredReaders, std::memory_order_release);
+  return maxDeferredReaders;
+}
+} // namespace shared_mutex_detail
 } // namespace folly
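
The runtime assert above replaces the old compile-time static_assert on kMaxDeferredReaders (removed later in this diff). It relies on the usual bit trick; a tiny self-contained check of the same predicate:

    #include <cassert>
    #include <cstdint>

    // x & (x - 1) clears the lowest set bit, so the expression is zero
    // exactly when x has at most one bit set.
    bool isPowTwo(uint32_t x) {
      return x != 0 && (x & (x - 1)) == 0;
    }

    int main() {
      assert(isPowTwo(64));  // the old hard-coded kMaxDeferredReaders
      assert(isPowTwo(128)); // 2 * nextPowTwo(48)
      assert(!isPowTwo(96)); // two bits set -> rejected
      return 0;
    }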
--- a/folly/SharedMutex.h
+++ b/folly/SharedMutex.h
@@ -259,11 +259,22 @@ struct SharedMutexToken {
   uint16_t slot_;
 };
 
-namespace detail {
+namespace shared_mutex_detail {
 // Returns a guard that gives permission for the current thread to
 // annotate, and adjust the annotation bits in, the SharedMutex at ptr.
-std::unique_lock<std::mutex> sharedMutexAnnotationGuard(void* ptr);
-} // namespace detail
+std::unique_lock<std::mutex> annotationGuard(void* ptr);
+
+constexpr uint32_t kMaxDeferredReadersAllocated = 256 * 2;
+
+FOLLY_NOINLINE uint32_t getMaxDeferredReadersSlow(std::atomic<uint32_t>& cache);
+
+// kMaxDeferredReaders
+FOLLY_EXPORT FOLLY_ALWAYS_INLINE uint32_t getMaxDeferredReaders() {
+  static std::atomic<uint32_t> cache{0};
+  auto const value = cache.load(std::memory_order_acquire);
+  return FOLLY_LIKELY(!!value) ? value : getMaxDeferredReadersSlow(cache);
+}
+} // namespace shared_mutex_detail
 
 template <
     bool ReaderPriority,
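
getMaxDeferredReaders() above is a compute-once cache: an acquire load on the fast path, with 0 as the "not yet computed" sentinel, and a release store in the out-of-line slow path. A minimal sketch of the same protocol (not folly's code):

    #include <atomic>
    #include <cstdint>
    #include <cstdio>

    static std::atomic<uint32_t> cache{0}; // 0 means "not yet computed"

    uint32_t computeSlow() {
      uint32_t value = 128; // stand-in for the CPU-count computation
      cache.store(value, std::memory_order_release); // publish the result
      return value;
    }

    uint32_t get() {
      uint32_t value = cache.load(std::memory_order_acquire);
      return value != 0 ? value : computeSlow();
    }

    int main() {
      std::printf("%u\n", get()); // slow path on first call
      std::printf("%u\n", get()); // served from the cache afterwards
    }

The race here is benign: if two threads miss the cache simultaneously, both compute the same value and store it twice, which is why the slow path needs no mutex.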
@@ -314,7 +325,9 @@ class SharedMutexImpl {
     // possible they will be set here in a correct system
     assert((state & ~(kWaitingAny | kMayDefer | kAnnotationCreated)) == 0);
     if ((state & kMayDefer) != 0) {
-      for (uint32_t slot = 0; slot < kMaxDeferredReaders; ++slot) {
+      const uint32_t maxDeferredReaders =
+          shared_mutex_detail::getMaxDeferredReaders();
+      for (uint32_t slot = 0; slot < maxDeferredReaders; ++slot) {
         auto slotValue =
             deferredReader(slot)->load(std::memory_order_relaxed);
         assert(!slotValueIsThis(slotValue));
@@ -703,7 +716,7 @@ class SharedMutexImpl {
   void annotateLazyCreate() {
     if (AnnotateForThreadSanitizer &&
         (state_.load() & kAnnotationCreated) == 0) {
-      auto guard = detail::sharedMutexAnnotationGuard(this);
+      auto guard = shared_mutex_detail::annotationGuard(this);
       // check again
       if ((state_.load() & kAnnotationCreated) == 0) {
         state_.fetch_or(kAnnotationCreated);
@@ -868,25 +881,26 @@ class SharedMutexImpl {
   // without managing our own spreader if kMaxDeferredReaders <=
   // AccessSpreader::kMaxCpus, which is currently 128.
   //
-  // Our 2-socket E5-2660 machines have 8 L1 caches on each chip,
-  // with 64 byte cache lines. That means we need 64*16 bytes of
-  // deferredReaders[] to give each L1 its own playground. On x86_64
-  // each DeferredReaderSlot is 8 bytes, so we need kMaxDeferredReaders
-  // * kDeferredSeparationFactor >= 64 * 16 / 8 == 128. If
+  // In order to give each L1 cache its own playground, we need
+  // kMaxDeferredReaders >= #L1 caches. We double it, making it
+  // essentially the number of cores, so it doesn't easily run
+  // out of deferred reader slots and start inlining the readers.
+  // We do not know the number of cores at compile time, as the code
+  // can be compiled from different server types than the one running
+  // the service. So we allocate the static storage large enough to
+  // hold all the slots (256).
+  //
+  // On x86_64 each DeferredReaderSlot is 8 bytes, so we need
+  // kMaxDeferredReaders
+  // * kDeferredSeparationFactor >= 64 * #L1 caches / 8 == 128. If
   // kDeferredSearchDistance * kDeferredSeparationFactor <=
   // 64 / 8 then we will search only within a single cache line, which
-  // guarantees we won't have inter-L1 contention. We give ourselves
-  // a factor of 2 on the core count, which should hold us for a couple
-  // processor generations. deferredReaders[] is 2048 bytes currently.
+  // guarantees we won't have inter-L1 contention.
 
  public:
-  static constexpr uint32_t kMaxDeferredReaders = 64;
   static constexpr uint32_t kDeferredSearchDistance = 2;
   static constexpr uint32_t kDeferredSeparationFactor = 4;
 
  private:
-  static_assert(
-      !(kMaxDeferredReaders & (kMaxDeferredReaders - 1)),
-      "kMaxDeferredReaders must be a power of 2");
   static_assert(
       !(kDeferredSearchDistance & (kDeferredSearchDistance - 1)),
       "kDeferredSearchDistance must be a power of 2");
@@ -924,7 +938,9 @@ class SharedMutexImpl {
  private:
   alignas(hardware_destructive_interference_size) static DeferredReaderSlot
-      deferredReaders[kMaxDeferredReaders * kDeferredSeparationFactor];
+      deferredReaders
+          [shared_mutex_detail::kMaxDeferredReadersAllocated *
+           kDeferredSeparationFactor];
 
   // Performs an exclusive lock, waiting for state_ & waitMask to be
   // zero first
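
A quick worked check of the static footprint this declaration implies, assuming the 8-byte DeferredReaderSlot mentioned in the comment above (sketch, not folly's code):

    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint32_t kMaxDeferredReadersAllocated = 256 * 2; // 512
      const uint32_t kDeferredSeparationFactor = 4;
      const uint32_t slotBytes = 8; // DeferredReaderSlot on x86_64
      // 512 * 4 slots * 8 bytes = 16384 bytes of static storage, up from
      // the 2048 bytes the old kMaxDeferredReaders = 64 produced.
      std::printf(
          "%u bytes\n",
          kMaxDeferredReadersAllocated * kDeferredSeparationFactor * slotBytes);
    }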
@@ -1179,10 +1195,12 @@ class SharedMutexImpl {
     uint32_t slot = 0;
 
     uint32_t spinCount = 0;
+    const uint32_t maxDeferredReaders =
+        shared_mutex_detail::getMaxDeferredReaders();
     while (true) {
       while (!slotValueIsThis(
           deferredReader(slot)->load(std::memory_order_acquire))) {
-        if (++slot == kMaxDeferredReaders) {
+        if (++slot == maxDeferredReaders) {
           return;
         }
       }
@@ -1201,6 +1219,8 @@ class SharedMutexImpl {
     std::memset(&usage, 0, sizeof(usage));
     long before = -1;
 #endif
+    const uint32_t maxDeferredReaders =
+        shared_mutex_detail::getMaxDeferredReaders();
     for (uint32_t yieldCount = 0; yieldCount < kMaxSoftYieldCount;
          ++yieldCount) {
       for (int softState = 0; softState < 3; ++softState) {
@@ -1213,7 +1233,7 @@ class SharedMutexImpl {
         }
         while (!slotValueIsThis(
             deferredReader(slot)->load(std::memory_order_acquire))) {
-          if (++slot == kMaxDeferredReaders) {
+          if (++slot == maxDeferredReaders) {
             return;
           }
         }
@@ -1232,7 +1252,7 @@ class SharedMutexImpl {
     }
 
     uint32_t movedSlotCount = 0;
-    for (; slot < kMaxDeferredReaders; ++slot) {
+    for (; slot < maxDeferredReaders; ++slot) {
       auto slotPtr = deferredReader(slot);
       auto slotValue = slotPtr->load(std::memory_order_acquire);
       if (slotValueIsThis(slotValue) &&
@@ -1284,7 +1304,9 @@ class SharedMutexImpl {
   // Updates the state in/out argument as if the locks were made inline,
   // but does not update state_
   void cleanupTokenlessSharedDeferred(uint32_t& state) {
-    for (uint32_t i = 0; i < kMaxDeferredReaders; ++i) {
+    const uint32_t maxDeferredReaders =
+        shared_mutex_detail::getMaxDeferredReaders();
+    for (uint32_t i = 0; i < maxDeferredReaders; ++i) {
       auto slotPtr = deferredReader(i);
       auto slotValue = slotPtr->load(std::memory_order_relaxed);
       if (slotValue == tokenlessSlotValue()) {
@@ -1300,7 +1322,7 @@ class SharedMutexImpl {
   bool tryUnlockTokenlessSharedDeferred();
 
   bool tryUnlockSharedDeferred(uint32_t slot) {
-    assert(slot < kMaxDeferredReaders);
+    assert(slot < shared_mutex_detail::getMaxDeferredReaders());
     auto slotValue = tokenfulSlotValue();
     return deferredReader(slot)->compare_exchange_strong(slotValue, 0);
   }
@@ -1566,7 +1588,8 @@ alignas(hardware_destructive_interference_size) typename SharedMutexImpl<
     Atom,
     BlockImmediately,
     AnnotateForThreadSanitizer>::deferredReaders
-        [kMaxDeferredReaders * kDeferredSeparationFactor] = {};
+        [shared_mutex_detail::kMaxDeferredReadersAllocated *
+         kDeferredSeparationFactor] = {};
 
 template <
     bool ReaderPriority,
@@ -1608,7 +1631,10 @@ bool SharedMutexImpl<
     AnnotateForThreadSanitizer>::tryUnlockTokenlessSharedDeferred() {
   auto bestSlot =
       make_atomic_ref(tls_lastTokenlessSlot).load(std::memory_order_relaxed);
-  for (uint32_t i = 0; i < kMaxDeferredReaders; ++i) {
+  // use do ... while to avoid calling
+  // shared_mutex_detail::getMaxDeferredReaders() unless necessary
+  uint32_t i = 0;
+  do {
     auto slotPtr = deferredReader(bestSlot ^ i);
     auto slotValue = slotPtr->load(std::memory_order_relaxed);
     if (slotValue == tokenlessSlotValue() &&
@@ -1617,7 +1643,8 @@ bool SharedMutexImpl<
           .store(bestSlot ^ i, std::memory_order_relaxed);
       return true;
     }
-  }
+    ++i;
+  } while (i < shared_mutex_detail::getMaxDeferredReaders());
   return false;
 }
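
The comment in the hunk above explains the do ... while: the loop bound is evaluated only after the body, so when the slot remembered in tls_lastTokenlessSlot hits on the first probe (the common case), getMaxDeferredReaders() is never called. A toy demonstration of that property (hypothetical names, not folly's code):

    #include <cstdint>
    #include <cstdio>

    static int boundCalls = 0;

    uint32_t maxSlots() {
      ++boundCalls; // count bound evaluations
      return 128;
    }

    bool probe(uint32_t slot) {
      return slot == 0; // pretend the first probed slot matches
    }

    int main() {
      uint32_t i = 0;
      do {
        if (probe(i)) {
          break; // common case: exit before the bound is ever evaluated
        }
        ++i;
      } while (i < maxSlots());
      std::printf("bound evaluated %d times\n", boundCalls); // prints 0
    }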
@@ -1635,6 +1662,8 @@ bool SharedMutexImpl<
     BlockImmediately,
     AnnotateForThreadSanitizer>::
     lockSharedImpl(uint32_t& state, Token* token, WaitContext& ctx) {
+  const uint32_t maxDeferredReaders =
+      shared_mutex_detail::getMaxDeferredReaders();
   while (true) {
     if (UNLIKELY((state & kHasE) != 0) &&
         !waitForZeroBits(state, kHasE, kWaitingS, ctx) && ctx.canTimeOut()) {
@@ -1656,13 +1685,13 @@ bool SharedMutexImpl<
       // starting point for our empty-slot search, can change after
       // calling waitForZeroBits
       uint32_t bestSlot =
-          (uint32_t)folly::AccessSpreader<Atom>::current(kMaxDeferredReaders);
+          (uint32_t)folly::AccessSpreader<Atom>::current(maxDeferredReaders);
 
       // deferred readers are already enabled, or it is time to
       // enable them if we can find a slot
       for (uint32_t i = 0; i < kDeferredSearchDistance; ++i) {
         slot = bestSlot ^ i;
-        assert(slot < kMaxDeferredReaders);
+        assert(slot < maxDeferredReaders);
         slotValue = deferredReader(slot)->load(std::memory_order_relaxed);
         if (slotValue == 0) {
           // found empty slot
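
For context on the bestSlot computation in the last hunk: AccessSpreader maps the calling thread's current CPU onto one of N stripes, so readers on different cores tend to begin their empty-slot search in different cache lines. A small usage sketch (requires linking against folly; exact output varies by machine):

    #include <atomic>
    #include <cstdio>

    #include <folly/concurrency/CacheLocality.h>

    int main() {
      const size_t stripes = 128; // e.g. a runtime maxDeferredReaders value
      // Picks a stripe based on the CPU this thread is currently running on.
      size_t stripe = folly::AccessSpreader<std::atomic>::current(stripes);
      std::printf("this thread starts at stripe %zu of %zu\n", stripe, stripes);
    }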