Commit 5569e6fa authored by Nathan Bronson, committed by Sara Golemon

TLS cache for AccessSpreader

Summary:
Under Linux the process gtod doesn't contain the current cpu, so
__vdso_getcpu uses RDTSCP, which makes it cost about 20 nanos.  The gtod
_does_, however, contain a nanosecond time (for CLOCK_REALTIME_COARSE)
updated during context switches.  This diff adds a TLS cache that uses
__vdso_clock_gettime_ns(CLOCK_REALTIME_COARSE) to detect context switches.
The end result is that AccessSpreader goes from ~20 nanos to ~6.
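
The caching idea, as a minimal standalone sketch using the portable glibc
calls (sched_getcpu and clock_gettime) instead of the dlopen'd vdso symbols
used in the diff below; names here are illustrative, not part of the diff:

#include <sched.h>   // sched_getcpu
#include <time.h>    // clock_gettime, CLOCK_REALTIME_COARSE
#include <cstdint>

// Re-query the current cpu only when the coarse clock has changed.
// CLOCK_REALTIME_COARSE is updated during context switches, so two
// identical readings mean the thread was not switched out in between
// and the cached cpu is still valid.
static int cachedGetcpu() {
  static __thread int tls_cpu = -1;
  static __thread int64_t tls_lastNanos = -1;

  timespec ts;
  clock_gettime(CLOCK_REALTIME_COARSE, &ts);
  int64_t nowNanos = int64_t(ts.tv_sec) * 1000000000 + ts.tv_nsec;

  if (nowNanos != tls_lastNanos) {
    tls_cpu = sched_getcpu();  // slow path: the real getcpu (~20 nanos)
    tls_lastNanos = nowNanos;
  }
  return tls_cpu;              // fast path: cached value
}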

Test Plan: unit tests

Reviewed By: davejwatson@fb.com

Subscribers: yfeldblum, trunkagent, folly-diffs@

FB internal diff: D1798922

Signature: t1:1798922:1423264298:32312a5e9bddb3b8aa630c146ef708164a6a4651
parent b2f445a2
@@ -19,6 +19,7 @@
 #define _GNU_SOURCE 1 // for RTLD_NOLOAD
 #include <dlfcn.h>
 #include <fstream>
+#include <mutex>
 #include <folly/Conv.h>
 #include <folly/Exception.h>
@@ -36,7 +37,7 @@ static CacheLocality getSystemLocalityInfo() {
   try {
     return CacheLocality::readFromSysfs();
   } catch (...) {
-    // keep trying
+    // fall through to below if something goes wrong
   }
 #endif
@@ -201,29 +202,87 @@ CacheLocality CacheLocality::uniform(size_t numCpus) {
 /////////////// Getcpu

-/// Resolves the dynamically loaded symbol __vdso_getcpu, returning null
-/// on failure
-static Getcpu::Func loadVdsoGetcpu() {
-  void* h = dlopen("linux-vdso.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
-  if (h == nullptr) {
-    return nullptr;
-  }
-
-  auto func = Getcpu::Func(dlsym(h, "__vdso_getcpu"));
-  if (func == nullptr) {
-    // technically a null result could either be a failure or a successful
-    // lookup of a symbol with the null value, but the second can't actually
-    // happen for this symbol.  No point holding the handle forever if
-    // we don't need the code
-    dlclose(h);
-  }
-
-  return func;
-}
-
-Getcpu::Func Getcpu::vdsoFunc() {
-  static Func func = loadVdsoGetcpu();
-  return func;
-}
+#ifdef CLOCK_REALTIME_COARSE
+
+static std::once_flag gVdsoInitOnce;
+static Getcpu::Func gVdsoGetcpuFunc;
+static int64_t (*gVdsoGettimeNsFunc)(clockid_t);
+
+static int cachingVdsoGetcpu(unsigned* cpu, unsigned* unused_node,
+                             void* unused_tcache) {
+  static __thread unsigned tls_cpu;
+  static __thread int64_t tls_lastContextSwitchNanos;
+
+  auto lastContextSwitchNanos = gVdsoGettimeNsFunc(CLOCK_REALTIME_COARSE);
+  if (tls_lastContextSwitchNanos != lastContextSwitchNanos) {
+    int rv = gVdsoGetcpuFunc(&tls_cpu, nullptr, nullptr);
+    if (rv != 0) {
+      return rv;
+    }
+    tls_lastContextSwitchNanos = lastContextSwitchNanos;
+  }
+  *cpu = tls_cpu;
+  return 0;
+}
+#endif
+
+/// Resolves the dynamically loaded symbols __vdso_getcpu and
+/// __vdso_clock_gettime_ns, returning null on failure.  Does a
+/// little bit of probing to make sure that the __vdso_clock_gettime_ns
+/// function isn't using the slow fallback path.
+Getcpu::Func Getcpu::vdsoFunc() {
+#ifdef CLOCK_REALTIME_COARSE
+  std::call_once(gVdsoInitOnce, []{
+    void* h = dlopen("linux-vdso.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
+
+    typedef int64_t (*GettimeNsFunc)(clockid_t);
+
+    auto getcpuFunc = Getcpu::Func(!h ? nullptr : dlsym(h, "__vdso_getcpu"));
+    auto gettimeNsFunc =
+        GettimeNsFunc(!h ? nullptr : dlsym(h, "__vdso_clock_gettime_ns"));
+
+    bool coarseGettimeDetected = false;
+    if (gettimeNsFunc != nullptr) {
+      // The TLS cache of getcpu results is only an optimization if the
+      // __vdso_clock_gettime_ns implementation is fast and actually
+      // coarse.  The slow fallback implementation is not coarse, so if
+      // we detect a coarse clock we are set.  If CLOCK_REALTIME_COARSE
+      // has the right properties, then so long as there is no context
+      // switch between two calls the returned time will be identical.
+      // Dynamically verify this.  An unlikely context switch while we're
+      // testing can lead to a false negative, but not a false positive,
+      // so we just run the test multiple times.  This ensures that we
+      // will get two calls to gettimeNsFunc in a row with no intervening
+      // context switch.
+      auto prev = gettimeNsFunc(CLOCK_REALTIME_COARSE);
+      for (int i = 0; i < 10 && !coarseGettimeDetected; ++i) {
+        auto next = gettimeNsFunc(CLOCK_REALTIME_COARSE);
+        coarseGettimeDetected = next == prev;
+        prev = next;
+      }
+    }
+
+    if (getcpuFunc == nullptr || !coarseGettimeDetected) {
+      // technically a null getcpuFunc could either be a failure or
+      // a successful lookup of a symbol with the null value, but the
+      // second can't actually happen for this symbol.  No point holding
+      // the handle forever if we don't need the code
+      if (h) {
+        dlclose(h);
+      }
+    } else {
+      gVdsoGetcpuFunc = getcpuFunc;
+      gVdsoGettimeNsFunc = gettimeNsFunc;
+    }
+  });
+
+  if (gVdsoGetcpuFunc != nullptr) {
+    return cachingVdsoGetcpu;
+  }
+#endif
+  return nullptr;
+}

 /////////////// SequentialThreadId
......
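
For context, a hypothetical call site (a sketch, not part of this diff):
vdsoFunc() now returns either the caching wrapper above or null, and a
caller falls back to another indexing strategy on null.

// Sketch: prefer the vdso-backed getcpu when available, otherwise fall
// back (e.g. to SequentialThreadId-based striping, as AccessSpreader does).
unsigned cpu = 0;
folly::detail::Getcpu::Func getcpu = folly::detail::Getcpu::vdsoFunc();
if (getcpu == nullptr || getcpu(&cpu, nullptr, nullptr) != 0) {
  cpu = 0;  // fallback strategy goes here
}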
@@ -306,8 +306,7 @@ struct AccessSpreader {
   /// Points to the getcpu-like function we are using to obtain the
   /// current cpu.  It should not be assumed that the returned cpu value
-  /// is in range.  We use a member for this instead of a static so that
-  /// this fetch preloads a prefix the stripeByCpu array
+  /// is in range.
   Getcpu::Func getcpuFunc_;

   /// A precomputed map from cpu to stripe.  Rather than add a layer of
......
@@ -447,67 +447,71 @@ enum class SpreaderType { GETCPU, SHARED, TLS_RR };
 // _getcpu refers to the vdso getcpu implementation with a locally
 // constructed AccessSpreader.  _tls_rr refers to execution using
 // SequentialThreadId, the fallback if the vdso getcpu isn't available.
-// _shared refers to calling AccessSpreader<>::current(numStripes)
-// inside the hot loop.
+// _shared refers to calling AccessSpreader<>::current(numStripes) inside
+// the hot loop.
 //
-// At 16_stripe_0_work and 32_stripe_0_work there is only L1 traffic,
-// so since the stripe selection is 21 nanos the atomic increments in
-// the L1 is ~15 nanos.  At width 8_stripe_0_work the line is expected
-// to ping-pong almost every operation, since the loops have the same
-// duration.  Widths 4 and 2 have the same behavior, but each tour of the
-// cache line is 4 and 8 cores long, respectively.  These all suggest a
-// lower bound of 60 nanos for intra-chip handoff and increment between
-// the L1s.
+// At 16_stripe_0_work and 32_stripe_0_work there is only L1 traffic, so
+// since the stripe selection is 6 nanos the atomic increments in the L1
+// are ~15 nanos.  At width 8_stripe_0_work the line is expected to
+// ping-pong almost every operation, since the loops have the same
+// duration.  Widths 4 and 2 have the same behavior, but each tour of
+// the cache line is 4 and 8 cores long, respectively.  These all
+// suggest a lower bound of ~60 nanos for intra-chip handoff and
+// increment between the L1s.
 //
-// With 455 nanos (1K cycles) of busywork per contended increment, the
-// system can hide all of the latency of a tour of length 4, but not
-// quite one of length 8.  I was a bit surprised at how much worse the
-// non-striped version got.  It seems that the inter-chip traffic also
-// interferes with the L1-only localWork.load().  When the local work is
-// doubled to about 1 microsecond we see that the inter-chip contention
-// is still very important, but subdivisions on the same chip don't matter.
+// With 396 nanos (500 std::memory_order_seq_cst loads) of busywork per
+// contended increment, the system can hide all of the latency of a tour
+// of length 4, but not quite one of length 8.  I was a bit surprised
+// at how much worse the non-striped version got.  It seems that the
+// inter-chip traffic also interferes with the L1-only localWork.load().
+// When the local work is doubled to 776 nanoseconds we see that the
+// inter-chip contention is still very important, but subdivisions on
+// the same chip don't matter.
 //
 // sudo nice -n -20
 //   _bin/folly/test/cache_locality_test --benchmark --bm_min_iters=1000000
 // ============================================================================
 // folly/test/CacheLocalityTest.cpp               relative  time/iter  iters/s
 // ============================================================================
-// contentionAtWidth(1_stripe_0_work_stub)                    1.14us  873.64K
-// contentionAtWidth(2_stripe_0_work_getcpu)                495.58ns    2.02M
-// contentionAtWidth(4_stripe_0_work_getcpu)                232.99ns    4.29M
-// contentionAtWidth(8_stripe_0_work_getcpu)                101.16ns    9.88M
-// contentionAtWidth(16_stripe_0_work_getcpu)                41.93ns   23.85M
-// contentionAtWidth(32_stripe_0_work_getcpu)                42.04ns   23.79M
-// contentionAtWidth(64_stripe_0_work_getcpu)                41.94ns   23.84M
-// contentionAtWidth(2_stripe_0_work_tls_rr)                  1.00us  997.41K
-// contentionAtWidth(4_stripe_0_work_tls_rr)                694.41ns    1.44M
-// contentionAtWidth(8_stripe_0_work_tls_rr)                590.27ns    1.69M
-// contentionAtWidth(16_stripe_0_work_tls_rr)               222.13ns    4.50M
-// contentionAtWidth(32_stripe_0_work_tls_rr)               169.49ns    5.90M
-// contentionAtWidth(64_stripe_0_work_tls_rr)               162.20ns    6.17M
-// contentionAtWidth(2_stripe_0_work_shared)                495.54ns    2.02M
-// contentionAtWidth(4_stripe_0_work_shared)                236.27ns    4.23M
-// contentionAtWidth(8_stripe_0_work_shared)                114.81ns    8.71M
-// contentionAtWidth(16_stripe_0_work_shared)                44.65ns   22.40M
-// contentionAtWidth(32_stripe_0_work_shared)                41.76ns   23.94M
-// contentionAtWidth(64_stripe_0_work_shared)                43.47ns   23.00M
-// atomicIncrBaseline(local_incr_0_work)                     20.39ns   49.06M
-// ----------------------------------------------------------------------------
-// contentionAtWidth(1_stripe_500_work_stub)                  2.04us  491.13K
-// contentionAtWidth(2_stripe_500_work_getcpu)              610.98ns    1.64M
-// contentionAtWidth(4_stripe_500_work_getcpu)              507.72ns    1.97M
-// contentionAtWidth(8_stripe_500_work_getcpu)              542.53ns    1.84M
-// contentionAtWidth(16_stripe_500_work_getcpu)             496.55ns    2.01M
-// contentionAtWidth(32_stripe_500_work_getcpu)             500.67ns    2.00M
-// atomicIncrBaseline(local_incr_500_work)                  484.69ns    2.06M
-// ----------------------------------------------------------------------------
-// contentionAtWidth(1_stripe_1000_work_stub)                 2.11us  473.78K
-// contentionAtWidth(2_stripe_1000_work_getcpu)             970.64ns    1.03M
-// contentionAtWidth(4_stripe_1000_work_getcpu)             987.31ns    1.01M
-// contentionAtWidth(8_stripe_1000_work_getcpu)               1.01us  985.52K
-// contentionAtWidth(16_stripe_1000_work_getcpu)            986.09ns    1.01M
-// contentionAtWidth(32_stripe_1000_work_getcpu)            960.23ns    1.04M
-// atomicIncrBaseline(local_incr_1000_work)                 950.63ns    1.05M
+// contentionAtWidth(1_stripe_0_work_stub)                  909.99ns    1.10M
+// contentionAtWidth(2_stripe_0_work_getcpu)                527.54ns    1.90M
+// contentionAtWidth(4_stripe_0_work_getcpu)                260.28ns    3.84M
+// contentionAtWidth(8_stripe_0_work_getcpu)                131.82ns    7.59M
+// contentionAtWidth(16_stripe_0_work_getcpu)                25.92ns   38.58M
+// contentionAtWidth(32_stripe_0_work_getcpu)                21.80ns   45.88M
+// contentionAtWidth(64_stripe_0_work_getcpu)                20.06ns   49.85M
+// contentionAtWidth(2_stripe_0_work_tls_rr)                759.21ns    1.32M
+// contentionAtWidth(4_stripe_0_work_tls_rr)                607.46ns    1.65M
+// contentionAtWidth(8_stripe_0_work_tls_rr)                403.79ns    2.48M
+// contentionAtWidth(16_stripe_0_work_tls_rr)               188.14ns    5.32M
+// contentionAtWidth(32_stripe_0_work_tls_rr)               131.59ns    7.60M
+// contentionAtWidth(64_stripe_0_work_tls_rr)               103.56ns    9.66M
+// contentionAtWidth(2_stripe_0_work_shared)                553.07ns    1.81M
+// contentionAtWidth(4_stripe_0_work_shared)                274.23ns    3.65M
+// contentionAtWidth(8_stripe_0_work_shared)                137.43ns    7.28M
+// contentionAtWidth(16_stripe_0_work_shared)                24.52ns   40.78M
+// contentionAtWidth(32_stripe_0_work_shared)                21.80ns   45.86M
+// contentionAtWidth(64_stripe_0_work_shared)                21.66ns   46.17M
+// atomicIncrBaseline(local_incr_0_work)                     16.73ns   59.78M
+// LocalAccessSpreaderUse                                     6.34ns  157.75M
+// SharedAccessSpreaderUse                                    6.34ns  157.75M
+// AccessSpreaderConstruction                               328.19ns    3.05M
+// ----------------------------------------------------------------------------
+// contentionAtWidth(1_stripe_500_work_stub)                  1.75us  571.14K
+// contentionAtWidth(2_stripe_500_work_getcpu)              500.79ns    2.00M
+// contentionAtWidth(4_stripe_500_work_getcpu)              410.45ns    2.44M
+// contentionAtWidth(8_stripe_500_work_getcpu)              411.41ns    2.43M
+// contentionAtWidth(16_stripe_500_work_getcpu)             400.12ns    2.50M
+// contentionAtWidth(32_stripe_500_work_getcpu)             397.37ns    2.52M
+// atomicIncrBaseline(local_incr_500_work)                  396.53ns    2.52M
+// ----------------------------------------------------------------------------
+// contentionAtWidth(1_stripe_1000_work_stub)                 1.88us  530.59K
+// contentionAtWidth(2_stripe_1000_work_getcpu)             778.77ns    1.28M
+// contentionAtWidth(4_stripe_1000_work_getcpu)             779.56ns    1.28M
+// contentionAtWidth(8_stripe_1000_work_getcpu)             795.62ns    1.26M
+// contentionAtWidth(16_stripe_1000_work_getcpu)            778.81ns    1.28M
+// contentionAtWidth(32_stripe_1000_work_getcpu)            780.26ns    1.28M
+// atomicIncrBaseline(local_incr_1000_work)                 776.39ns    1.29M
 // ============================================================================
 static void contentionAtWidth(size_t iters, size_t stripes, size_t work,
                               SpreaderType spreaderType,
......
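
The striping pattern that these benchmarks exercise, as a small sketch;
kStripes and the cpu-to-stripe mapping here are illustrative stand-ins for
AccessSpreader<>::current(numStripes), which also groups nearby cpus onto
the same stripe:

#include <sched.h>
#include <atomic>
#include <cstdint>

constexpr size_t kStripes = 16;

// One cache line per stripe, so increments from different cores do not
// ping-pong a single contended line.
struct alignas(64) PaddedCounter {
  std::atomic<uint64_t> value{0};
};

static PaddedCounter counters[kStripes];

void stripedIncrement() {
  int cpu = sched_getcpu();
  size_t stripe = cpu < 0 ? 0 : static_cast<size_t>(cpu) % kStripes;
  counters[stripe].value.fetch_add(1, std::memory_order_relaxed);
}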