Commit c9d712c3 authored by Nathan Bronson's avatar Nathan Bronson Committed by Facebook Github Bot

switch to RDTSC spin loop timing

Summary:
The PAUSE instruction's latency varies widely across x64
architectures, resulting in too much spinning on Skylake in some
scenarios.  This diff switches to using cycle counts to limit spinning
on x64, so that the minimum and maximum spin durations are independent
of asm_volatile_pause's latency.

Reviewed By: yfeldblum

Differential Revision: D17961663

fbshipit-source-id: 6170bfa48d007ca21b73b1a5c7e68da0043cda2c
parent eddca91c
......@@ -19,6 +19,8 @@
#include <algorithm>
#include <limits>
#include <folly/Portability.h>
#include <folly/chrono/Hardware.h>
#include <folly/detail/Futex.h>
#include <folly/portability/Asm.h>
#include <folly/portability/Unistd.h>
......@@ -68,6 +70,12 @@ namespace detail {
/// cutoffs for different operations (read versus write, for example).
/// To avoid contention, the spin cutoff is only updated when requested
/// by the caller.
///
/// On x86 the latency of a spin loop varies dramatically across
/// architectures due to changes in the PAUSE instruction. Skylake
/// increases the latency by about a factor of 15 compared to previous
/// architectures. To work around this, on x86 we measure spins using
/// RDTSC rather than a loop counter.
template <template <typename> class Atom>
struct TurnSequencer {
explicit TurnSequencer(const uint32_t firstTurn = 0) noexcept
......@@ -96,13 +104,13 @@ struct TurnSequencer {
// the bottom to store the number of waiters. We call shifted turns
// "sturns" inside this class.
/// Blocks the current thread until turn has arrived. If
/// updateSpinCutoff is true then this will spin for up to kMaxSpins tries
/// before blocking and will adjust spinCutoff based on the results,
/// otherwise it will spin for at most spinCutoff spins.
/// Returns SUCCESS if the wait succeeded, PAST if the turn is in the past
/// or TIMEDOUT if the absTime time value is not nullptr and is reached before
/// the turn arrives
/// Blocks the current thread until turn has arrived.
/// If updateSpinCutoff is true then this will spin for up to
/// kMaxSpinLimit before blocking and will adjust spinCutoff based
/// on the results, otherwise it will spin for at most spinCutoff.
/// Returns SUCCESS if the wait succeeded, PAST if the turn is in the
/// past or TIMEDOUT if the absTime time value is not nullptr and is
/// reached before the turn arrives
template <
class Clock = std::chrono::steady_clock,
class Duration = typename Clock::duration>
......@@ -114,8 +122,9 @@ struct TurnSequencer {
nullptr) noexcept {
uint32_t prevThresh = spinCutoff.load(std::memory_order_relaxed);
const uint32_t effectiveSpinCutoff =
updateSpinCutoff || prevThresh == 0 ? kMaxSpins : prevThresh;
updateSpinCutoff || prevThresh == 0 ? kMaxSpinLimit : prevThresh;
uint64_t begin;
uint32_t tries;
const uint32_t sturn = turn << kTurnShift;
for (tries = 0;; ++tries) {
......@@ -133,9 +142,20 @@ struct TurnSequencer {
// the first effectSpinCutoff tries are spins, after that we will
// record ourself as a waiter and block with futexWait
if (tries < effectiveSpinCutoff) {
asm_volatile_pause();
continue;
if (kSpinUsingHardwareClock) {
auto now = hardware_timestamp();
if (tries == 0) {
begin = now;
}
if (tries == 0 || now < begin + effectiveSpinCutoff) {
asm_volatile_pause();
continue;
}
} else {
if (tries < effectiveSpinCutoff) {
asm_volatile_pause();
continue;
}
}
uint32_t current_max_waiter_delta = decodeMaxWaitersDelta(state);
......@@ -164,16 +184,21 @@ struct TurnSequencer {
}
if (updateSpinCutoff || prevThresh == 0) {
// if we hit kMaxSpins then spinning was pointless, so the right
// spinCutoff is kMinSpins
// if we hit kMaxSpinLimit then spinning was pointless, so the right
// spinCutoff is kMinSpinLimit
uint32_t target;
if (tries >= kMaxSpins) {
target = kMinSpins;
uint64_t elapsed = !kSpinUsingHardwareClock || tries == 0
? tries
: hardware_timestamp() - begin;
if (tries >= kMaxSpinLimit) {
target = kMinSpinLimit;
} else {
// to account for variations, we allow ourself to spin 2*N when
// we think that N is actually required in order to succeed
target = std::min<uint32_t>(
kMaxSpins, std::max<uint32_t>(kMinSpins, tries * 2));
target = std::min(
uint32_t{kMaxSpinLimit},
std::max(
uint32_t{kMinSpinLimit}, static_cast<uint32_t>(elapsed) * 2));
}
if (prevThresh == 0) {
......@@ -219,25 +244,29 @@ struct TurnSequencer {
}
private:
enum : uint32_t {
/// kTurnShift counts the bits that are stolen to record the delta
/// between the current turn and the maximum waiter. It needs to be big
/// enough to record wait deltas of 0 to 32 inclusive. Waiters more
/// than 32 in the future will be woken up 32*n turns early (since
/// their BITSET will hit) and will adjust the waiter count again.
/// We go a bit beyond and let the waiter count go up to 63, which
/// is free and might save us a few CAS
kTurnShift = 6,
kWaitersMask = (1 << kTurnShift) - 1,
static constexpr bool kSpinUsingHardwareClock = kIsArchAmd64;
static constexpr uint32_t kCyclesPerSpinLimit =
kSpinUsingHardwareClock ? 1 : 10;
/// kTurnShift counts the bits that are stolen to record the delta
/// between the current turn and the maximum waiter. It needs to be big
/// enough to record wait deltas of 0 to 32 inclusive. Waiters more
/// than 32 in the future will be woken up 32*n turns early (since
/// their BITSET will hit) and will adjust the waiter count again.
/// We go a bit beyond and let the waiter count go up to 63, which is
/// free and might save us a few CAS
static constexpr uint32_t kTurnShift = 6;
static constexpr uint32_t kWaitersMask = (1 << kTurnShift) - 1;
/// The minimum spin count that we will adaptively select
kMinSpins = 20,
/// The minimum spin duration that we will adaptively select. The value
/// here is cycles, adjusted to the way in which the limit will actually
/// be applied.
static constexpr uint32_t kMinSpinLimit = 200 / kCyclesPerSpinLimit;
/// The maximum spin count that we will adaptively select, and the
/// spin count that will be used when probing to get a new data point
/// for the adaptation
kMaxSpins = 2000,
};
/// The maximum spin duration that we will adaptively select, and the
/// spin duration that will be used when probing to get a new data
/// point for the adaptation
static constexpr uint32_t kMaxSpinLimit = 20000 / kCyclesPerSpinLimit;
/// This holds both the current turn, and the highest waiting turn,
/// stored as (current_turn << 6) | min(63, max(waited_turn - current_turn))
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment