Commit 6a9fa4cf authored by Nathan Bronson, committed by Sara Golemon

Revert "[folly] TLS cache for AccessSpreader"

Summary:
This reverts commit 4ebb2303bbcf343d7c2bcc95d55557c0a3b444f3.
The caching mechanism was based on a misunderstanding of the
implementation of CLOCK_MONOTONIC_COARSE, and is not correct.
On the plus side, the upstream kernel patch

e76b027e6408 x86,vdso: Use LSL unconditionally for vgetcpu

gets the vdso getcpu from 16 nanos down to 10 (half of which is the
inherent indirect call).
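
For context, here is a minimal demo (hypothetical illustration, not code from this commit) of why keying a per-thread getcpu cache on a coarse vdso clock is unsound: the coarse clocks advance only on the kernel timer tick (typically every 1-4 ms), so two identical readings mean "same tick", not "no context switch in between", and a thread can migrate to another cpu within a single tick.

    // Hypothetical demo, not part of folly: CLOCK_MONOTONIC_COARSE stays
    // constant for many consecutive reads (a full timer tick), during which
    // the scheduler is free to migrate the thread to another cpu.
    #include <ctime>
    #include <cstdio>

    int main() {
      timespec a;
      timespec b;
      clock_gettime(CLOCK_MONOTONIC_COARSE, &a);
      long identicalReads = 0;
      do {
        clock_gettime(CLOCK_MONOTONIC_COARSE, &b);
        ++identicalReads;
      } while (a.tv_sec == b.tv_sec && a.tv_nsec == b.tv_nsec);
      // typically prints a large number: the coarse clock is far too coarse
      // to witness individual context switches
      std::printf("%ld identical coarse readings before a tick\n",
                  identicalReads);
      return 0;
    }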

Test Plan: unit tests

Reviewed By: delong.j@fb.com

Subscribers: trunkagent, bmaurer, tudorb, folly-diffs@, yfeldblum, jdelong

FB internal diff: D1840690

Signature: t1:1840690:1423693026:33801341ec0b83bf47e050be6528c5dd05021ed5
parent 01734a1c
@@ -19,7 +19,6 @@
 #define _GNU_SOURCE 1 // for RTLD_NOLOAD
 #include <dlfcn.h>
 #include <fstream>
-#include <mutex>

 #include <folly/Conv.h>
 #include <folly/Exception.h>

@@ -37,7 +36,7 @@ static CacheLocality getSystemLocalityInfo() {
   try {
     return CacheLocality::readFromSysfs();
   } catch (...) {
-    // fall through to below if something goes wrong
+    // keep trying
   }
 #endif
@@ -202,87 +201,29 @@ CacheLocality CacheLocality::uniform(size_t numCpus) {

 ////////////// Getcpu

-#ifdef CLOCK_REALTIME_COARSE
-
-static std::once_flag gVdsoInitOnce;
-static Getcpu::Func gVdsoGetcpuFunc;
-static int64_t (*gVdsoGettimeNsFunc)(clockid_t);
-
-static int cachingVdsoGetcpu(unsigned* cpu, unsigned* unused_node,
-                             void* unused_tcache) {
-  static __thread unsigned tls_cpu;
-  static __thread int64_t tls_lastContextSwitchNanos;
-
-  auto lastContextSwitchNanos = gVdsoGettimeNsFunc(CLOCK_REALTIME_COARSE);
-  if (tls_lastContextSwitchNanos != lastContextSwitchNanos) {
-    int rv = gVdsoGetcpuFunc(&tls_cpu, nullptr, nullptr);
-    if (rv != 0) {
-      return rv;
-    }
-    tls_lastContextSwitchNanos = lastContextSwitchNanos;
-  }
-  *cpu = tls_cpu;
-  return 0;
-}
-#endif
-
-/// Resolves the dynamically loaded symbols __vdso_getcpu and
-/// __vdso_clock_gettime_ns, returning a pair of nulls on failure.  Does a
-/// little bit of probing to make sure that the __vdso_clock_gettime_ns
-/// function isn't using the slow fallback path.
+/// Resolves the dynamically loaded symbol __vdso_getcpu, returning null
+/// on failure
+static Getcpu::Func loadVdsoGetcpu() {
+  void* h = dlopen("linux-vdso.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
+  if (h == nullptr) {
+    return nullptr;
+  }
+
+  auto func = Getcpu::Func(dlsym(h, "__vdso_getcpu"));
+  if (func == nullptr) {
+    // technically a null result could either be a failure or a successful
+    // lookup of a symbol with the null value, but the second can't actually
+    // happen for this symbol.  No point holding the handle forever if
+    // we don't need the code
+    dlclose(h);
+  }
+
+  return func;
+}
+
 Getcpu::Func Getcpu::vdsoFunc() {
-#ifdef CLOCK_REALTIME_COARSE
-  std::call_once(gVdsoInitOnce, []{
-    void* h = dlopen("linux-vdso.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
-    typedef int64_t (*GettimeNsFunc)(clockid_t);
-
-    auto getcpuFunc = Getcpu::Func(
-        !h ? nullptr : dlsym(h, "__vdso_getcpu"));
-    auto gettimeNsFunc = GettimeNsFunc(
-        !h ? nullptr : dlsym(h, "__vdso_clock_gettime_ns"));
-
-    bool coarseGettimeDetected = false;
-    if (gettimeNsFunc != nullptr) {
-      // The TLS cache of getcpu results is only an optimization if the
-      // __vdso_clock_gettime_ns implementation is fast and actually
-      // coarse.  The slow fallback implementation is not coarse, so if
-      // we detect a coarse clock we are set.  If CLOCK_REALTIME_COARSE
-      // has the right properties, then so long as there is no context
-      // switch between two calls the returned time will be identical.
-      // Dynamically verify this.  An unlikely context switch while we're
-      // testing can lead to a false negative, but not a false positive,
-      // so we just run the test multiple times.  This ensures that we
-      // will get two calls to gettimeNsFunc in a row with no intervening
-      // context switch.
-      auto prev = gettimeNsFunc(CLOCK_REALTIME_COARSE);
-      for (int i = 0; i < 10 && !coarseGettimeDetected; ++i) {
-        auto next = gettimeNsFunc(CLOCK_REALTIME_COARSE);
-        coarseGettimeDetected = next == prev;
-        prev = next;
-      }
-    }
-
-    if (getcpuFunc == nullptr || !coarseGettimeDetected) {
-      // technically a null getcpuFunc could either be a failure or
-      // a successful lookup of a symbol with the null value, but the
-      // second can't actually happen for this symbol.  No point holding
-      // the handle forever if we don't need the code
-      if (h) {
-        dlclose(h);
-      }
-    } else {
-      gVdsoGetcpuFunc = getcpuFunc;
-      gVdsoGettimeNsFunc = gettimeNsFunc;
-    }
-  });
-  if (gVdsoGetcpuFunc != nullptr) {
-    return cachingVdsoGetcpu;
-  }
-#endif
-  return nullptr;
+  static Func func = loadVdsoGetcpu();
+  return func;
 }

 /////////////// SequentialThreadId
 ...
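As a usage illustration (a hedged sketch, not code from this commit; the fallback wiring is hypothetical), a caller would resolve the vdso function once and fall back to the sched_getcpu syscall wrapper when the symbol is unavailable:

    #include <folly/detail/CacheLocality.h>
    #include <sched.h>

    // Hypothetical helper: prefer the vdso getcpu resolved by vdsoFunc(),
    // falling back to sched_getcpu when the vdso symbol is absent.
    unsigned currentCpu() {
      static folly::detail::Getcpu::Func vdso =
          folly::detail::Getcpu::vdsoFunc();
      unsigned cpu = 0;
      if (vdso == nullptr || vdso(&cpu, nullptr, nullptr) != 0) {
        int rv = sched_getcpu();  // slower: a real syscall on older kernels
        cpu = rv >= 0 ? static_cast<unsigned>(rv) : 0;
      }
      return cpu;
    }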
@@ -306,7 +306,8 @@ struct AccessSpreader {

   /// Points to the getcpu-like function we are using to obtain the
   /// current cpu.  It should not be assumed that the returned cpu value
-  /// is in range.
+  /// is in range.  We use a member for this instead of a static so that
+  /// this fetch preloads a prefix of the stripeByCpu array
   Getcpu::Func getcpuFunc_;

   /// A precomputed map from cpu to stripe.  Rather than add a layer of
 ...
@@ -447,71 +447,67 @@ enum class SpreaderType { GETCPU, SHARED, TLS_RR };
 // _getcpu refers to the vdso getcpu implementation with a locally
 // constructed AccessSpreader.  _tls_rr refers to execution using
 // SequentialThreadId, the fallback if the vdso getcpu isn't available.
-// _shared refers to calling AccessSpreader<>::current(numStripes) inside
-// the hot loop.
+// _shared refers to calling AccessSpreader<>::current(numStripes)
+// inside the hot loop.
 //
-// At 16_stripe_0_work and 32_stripe_0_work there is only L1 traffic, so
-// since the stripe selection is 6 nanos the atomic increments in the L1 is
-// ~15 nanos.  At width 8_stripe_0_work the line is expected to ping-pong
-// almost every operation, since the loops have the same duration.
-// Widths 4 and 2 have the same behavior, but each tour of the cache line
-// is 4 and 8 cores long, respectively.  These all suggest a lower bound
-// of ~60 nanos for intra-chip handoff and increment between the L1s.
+// At 16_stripe_0_work and 32_stripe_0_work there is only L1 traffic,
+// so since the stripe selection is 21 nanos the atomic increments in
+// the L1 is ~15 nanos.  At width 8_stripe_0_work the line is expected
+// to ping-pong almost every operation, since the loops have the same
+// duration.  Widths 4 and 2 have the same behavior, but each tour of the
+// cache line is 4 and 8 cores long, respectively.  These all suggest a
+// lower bound of 60 nanos for intra-chip handoff and increment between
+// the L1s.
 //
-// With 396 nanos (500 std::memory_order_seq_cst loads) of busywork per
-// contended increment, the system can hide all of the latency of a tour
-// of length 4, but not quite one of length 8.  I was a bit surprised
-// at how much worse the non-striped version got.  It seems that the
-// inter-chip traffic also interferes with the L1-only localWork.load().
-// When the local work is doubled to 776 nanoseconds we see that the
-// inter-chip contention is still very important, but subdivisions on
-// the same chip don't matter.
+// With 455 nanos (1K cycles) of busywork per contended increment, the
+// system can hide all of the latency of a tour of length 4, but not
+// quite one of length 8.  I was a bit surprised at how much worse the
+// non-striped version got.  It seems that the inter-chip traffic also
+// interferes with the L1-only localWork.load().  When the local work is
+// doubled to about 1 microsecond we see that the inter-chip contention
+// is still very important, but subdivisions on the same chip don't matter.
 //
 // sudo nice -n -20
 //   _bin/folly/test/cache_locality_test --benchmark --bm_min_iters=1000000
 // ============================================================================
 // folly/test/CacheLocalityTest.cpp           relative  time/iter  iters/s
 // ============================================================================
-// LocalAccessSpreaderUse                                  6.34ns  157.75M
-// SharedAccessSpreaderUse                                 6.34ns  157.75M
-// AccessSpreaderConstruction                            328.19ns    3.05M
-// ----------------------------------------------------------------------------
-// contentionAtWidth(1_stripe_0_work_stub)               909.99ns    1.10M
-// contentionAtWidth(2_stripe_0_work_getcpu)             527.54ns    1.90M
-// contentionAtWidth(4_stripe_0_work_getcpu)             260.28ns    3.84M
-// contentionAtWidth(8_stripe_0_work_getcpu)             131.82ns    7.59M
-// contentionAtWidth(16_stripe_0_work_getcpu)             25.92ns   38.58M
-// contentionAtWidth(32_stripe_0_work_getcpu)             21.80ns   45.88M
-// contentionAtWidth(64_stripe_0_work_getcpu)             20.06ns   49.85M
-// contentionAtWidth(2_stripe_0_work_tls_rr)             759.21ns    1.32M
-// contentionAtWidth(4_stripe_0_work_tls_rr)             607.46ns    1.65M
-// contentionAtWidth(8_stripe_0_work_tls_rr)             403.79ns    2.48M
-// contentionAtWidth(16_stripe_0_work_tls_rr)            188.14ns    5.32M
-// contentionAtWidth(32_stripe_0_work_tls_rr)            131.59ns    7.60M
-// contentionAtWidth(64_stripe_0_work_tls_rr)            103.56ns    9.66M
-// contentionAtWidth(2_stripe_0_work_shared)             553.07ns    1.81M
-// contentionAtWidth(4_stripe_0_work_shared)             274.23ns    3.65M
-// contentionAtWidth(8_stripe_0_work_shared)             137.43ns    7.28M
-// contentionAtWidth(16_stripe_0_work_shared)             24.52ns   40.78M
-// contentionAtWidth(32_stripe_0_work_shared)             21.80ns   45.86M
-// contentionAtWidth(64_stripe_0_work_shared)             21.66ns   46.17M
-// atomicIncrBaseline(local_incr_0_work)                  16.73ns   59.78M
-// ----------------------------------------------------------------------------
-// contentionAtWidth(1_stripe_500_work_stub)               1.75us  571.14K
-// contentionAtWidth(2_stripe_500_work_getcpu)           500.79ns    2.00M
-// contentionAtWidth(4_stripe_500_work_getcpu)           410.45ns    2.44M
-// contentionAtWidth(8_stripe_500_work_getcpu)           411.41ns    2.43M
-// contentionAtWidth(16_stripe_500_work_getcpu)          400.12ns    2.50M
-// contentionAtWidth(32_stripe_500_work_getcpu)          397.37ns    2.52M
-// atomicIncrBaseline(local_incr_500_work)               396.53ns    2.52M
-// ----------------------------------------------------------------------------
-// contentionAtWidth(1_stripe_1000_work_stub)              1.88us  530.59K
-// contentionAtWidth(2_stripe_1000_work_getcpu)          778.77ns    1.28M
-// contentionAtWidth(4_stripe_1000_work_getcpu)          779.56ns    1.28M
-// contentionAtWidth(8_stripe_1000_work_getcpu)          795.62ns    1.26M
-// contentionAtWidth(16_stripe_1000_work_getcpu)         778.81ns    1.28M
-// contentionAtWidth(32_stripe_1000_work_getcpu)         780.26ns    1.28M
-// atomicIncrBaseline(local_incr_1000_work)              776.39ns    1.29M
+// contentionAtWidth(1_stripe_0_work_stub)                 1.14us  873.64K
+// contentionAtWidth(2_stripe_0_work_getcpu)             495.58ns    2.02M
+// contentionAtWidth(4_stripe_0_work_getcpu)             232.99ns    4.29M
+// contentionAtWidth(8_stripe_0_work_getcpu)             101.16ns    9.88M
+// contentionAtWidth(16_stripe_0_work_getcpu)             41.93ns   23.85M
+// contentionAtWidth(32_stripe_0_work_getcpu)             42.04ns   23.79M
+// contentionAtWidth(64_stripe_0_work_getcpu)             41.94ns   23.84M
+// contentionAtWidth(2_stripe_0_work_tls_rr)               1.00us  997.41K
+// contentionAtWidth(4_stripe_0_work_tls_rr)             694.41ns    1.44M
+// contentionAtWidth(8_stripe_0_work_tls_rr)             590.27ns    1.69M
+// contentionAtWidth(16_stripe_0_work_tls_rr)            222.13ns    4.50M
+// contentionAtWidth(32_stripe_0_work_tls_rr)            169.49ns    5.90M
+// contentionAtWidth(64_stripe_0_work_tls_rr)            162.20ns    6.17M
+// contentionAtWidth(2_stripe_0_work_shared)             495.54ns    2.02M
+// contentionAtWidth(4_stripe_0_work_shared)             236.27ns    4.23M
+// contentionAtWidth(8_stripe_0_work_shared)             114.81ns    8.71M
+// contentionAtWidth(16_stripe_0_work_shared)             44.65ns   22.40M
+// contentionAtWidth(32_stripe_0_work_shared)             41.76ns   23.94M
+// contentionAtWidth(64_stripe_0_work_shared)             43.47ns   23.00M
+// atomicIncrBaseline(local_incr_0_work)                  20.39ns   49.06M
+// ----------------------------------------------------------------------------
+// contentionAtWidth(1_stripe_500_work_stub)               2.04us  491.13K
+// contentionAtWidth(2_stripe_500_work_getcpu)           610.98ns    1.64M
+// contentionAtWidth(4_stripe_500_work_getcpu)           507.72ns    1.97M
+// contentionAtWidth(8_stripe_500_work_getcpu)           542.53ns    1.84M
+// contentionAtWidth(16_stripe_500_work_getcpu)          496.55ns    2.01M
+// contentionAtWidth(32_stripe_500_work_getcpu)          500.67ns    2.00M
+// atomicIncrBaseline(local_incr_500_work)               484.69ns    2.06M
+// ----------------------------------------------------------------------------
+// contentionAtWidth(1_stripe_1000_work_stub)              2.11us  473.78K
+// contentionAtWidth(2_stripe_1000_work_getcpu)          970.64ns    1.03M
+// contentionAtWidth(4_stripe_1000_work_getcpu)          987.31ns    1.01M
+// contentionAtWidth(8_stripe_1000_work_getcpu)            1.01us  985.52K
+// contentionAtWidth(16_stripe_1000_work_getcpu)         986.09ns    1.01M
+// contentionAtWidth(32_stripe_1000_work_getcpu)         960.23ns    1.04M
+// atomicIncrBaseline(local_incr_1000_work)              950.63ns    1.05M
 // ============================================================================
 static void contentionAtWidth(size_t iters, size_t stripes, size_t work,
                               SpreaderType spreaderType,
 ...
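To make the benchmark's subject concrete, here is a minimal sketch of the striping pattern being measured (assumed usage based on the AccessSpreader<>::current(numStripes) call named in the comments above; the padded-counter scaffolding is illustrative, not from this commit). Each thread increments the counter for its cpu's stripe, so cache-line ping-ponging falls as the stripe count grows:

    #include <folly/detail/CacheLocality.h>
    #include <array>
    #include <atomic>
    #include <cstddef>
    #include <cstdint>

    constexpr size_t kStripes = 16;

    // one cache line per counter so stripes never share a line
    struct alignas(64) PaddedCounter {
      std::atomic<uint64_t> value{0};
    };

    std::array<PaddedCounter, kStripes> counters;

    void increment() {
      // current(n) maps the calling thread's cpu to a stripe in [0, n)
      size_t stripe = folly::detail::AccessSpreader<>::current(kStripes);
      counters[stripe].value.fetch_add(1, std::memory_order_relaxed);
    }

    uint64_t total() {
      // reads sum across all stripes; only writers benefit from striping
      uint64_t sum = 0;
      for (const auto& c : counters) {
        sum += c.value.load(std::memory_order_relaxed);
      }
      return sum;
    }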