Commit cd58205e authored by Yedidya Feldblum, committed by Facebook GitHub Bot

remove AccessSpreader extern-template-struct

Summary:
Using `extern template struct` can be tricky; it is best avoided.

Keep static storage, but no longer depend on static initialization, avoiding all the issues surrounding ODR, global initializers, etc. Microbenchmarks show no significant hit.
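
As a rough, hypothetical sketch of that pattern (simplified names, not folly's actual code): the static object is constant-initialized to all zeros, so no dynamic initializer or destructor is ever registered, and the real setup happens lazily and is published through an atomic member.

```cpp
#include <atomic>
#include <cstdio>

// Hypothetical stand-ins; folly's real state holds a getcpu function
// pointer and a stripe table rather than a single function pointer.
using Func = int (*)();

struct State {
  // All-zeros is a valid "not yet initialized" state, and there is no
  // destructor to run at exit.
  std::atomic<Func> fn;
};

int realImpl() { return 42; }

State& state() {
  static State s;  // constant-initialized; no guard, no dynamic initializer
  if (s.fn.load(std::memory_order_acquire) == nullptr) {
    // Racy but idempotent: concurrent callers all publish the same value.
    s.fn.store(&realImpl, std::memory_order_release);
  }
  return s;
}

int main() {
  std::printf("%d\n", state().fn.load(std::memory_order_relaxed)());
  return 0;
}
```

The `GlobalState` in the diff below has the same shape: a trivial struct whose atomic `getcpu` member doubles as the "not yet initialized" flag.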

Add a global constructor so that initialization continues to be triggered at static-initialization time.
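
A minimal, self-contained sketch of that trick (made-up names; the diff's `AccessSpreaderStaticInit` is the real counterpart): a helper struct with a static instance whose constructor runs during static initialization and touches the lazily initialized state, while the lazy path ODR-uses the instance so the linker cannot discard it.

```cpp
#include <atomic>
#include <cstdio>

std::atomic<bool> ready;  // zero-initialized; no dynamic initializer

struct StaticInit {
  static StaticInit instance;
  StaticInit();  // defined after ensureInitialized()
};

void ensureInitialized() {     // stand-in for the lazy setup path
  (void)StaticInit::instance;  // ODR-use it so it is not dropped
  if (!ready.load(std::memory_order_acquire)) {
    std::puts("initializing");
    ready.store(true, std::memory_order_release);
  }
}

StaticInit::StaticInit() { ensureInitialized(); }
StaticInit StaticInit::instance;  // global constructor: runs before main()

int main() {
  ensureInitialized();  // by now the global constructor has already run
  return ready.load(std::memory_order_relaxed) ? 0 : 1;
}
```

In the diff, `AccessSpreaderBase::initialize` is the path that ODR-uses `AccessSpreaderStaticInit::instance`, and the instance's constructor calls `AccessSpreader<>::current(~size_t(0))` to force the work before main().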

Differential Revision: D27746718

fbshipit-source-id: c5288c33412a80bb813c8d615794b9773cef323c
parent c5651a00
@@ -62,8 +62,17 @@ static CacheLocality getSystemLocalityInfo() {
template <>
const CacheLocality& CacheLocality::system<std::atomic>() {
static auto* cache = new CacheLocality(getSystemLocalityInfo());
return *cache;
static std::atomic<const CacheLocality*> cache;
auto value = cache.load(std::memory_order_acquire);
if (value != nullptr) {
return *value;
}
auto next = new CacheLocality(getSystemLocalityInfo());
if (cache.compare_exchange_strong(value, next, std::memory_order_acq_rel)) {
return *next;
}
delete next;
return *value;
}
// Each level of cache has sharing sets, which are the set of cpus
@@ -346,44 +355,51 @@ int AccessSpreaderBase::degenerateGetcpu(unsigned* cpu, unsigned* node, void*) {
return 0;
}
struct AccessSpreaderStaticInit {
static AccessSpreaderStaticInit instance;
AccessSpreaderStaticInit() { (void)AccessSpreader<>::current(~size_t(0)); }
};
AccessSpreaderStaticInit AccessSpreaderStaticInit::instance;
bool AccessSpreaderBase::initialize(
Getcpu::Func& getcpuFunc,
GlobalState& state,
Getcpu::Func (&pickGetcpuFunc)(),
const CacheLocality& (&system)(),
CompactStripeTable& widthAndCpuToStripe) {
getcpuFunc = pickGetcpuFunc();
const CacheLocality& (&system)()) {
(void)AccessSpreaderStaticInit::instance; // ODR-use it so it is not dropped
auto& cacheLocality = system();
auto n = cacheLocality.numCpus;
for (size_t width = 0; width <= kMaxCpus; ++width) {
auto& row = widthAndCpuToStripe[width];
auto& row = state.table[width];
auto numStripes = std::max(size_t{1}, width);
for (size_t cpu = 0; cpu < kMaxCpus && cpu < n; ++cpu) {
auto index = cacheLocality.localityIndexByCpu[cpu];
assert(index < n);
// as index goes from 0..n, post-transform value goes from
// 0..numStripes
row[cpu] = static_cast<CompactStripe>((index * numStripes) / n);
row[cpu].store(
static_cast<CompactStripe>((index * numStripes) / n),
std::memory_order_relaxed);
assert(row[cpu] < numStripes);
}
size_t filled = n;
while (filled < kMaxCpus) {
size_t len = std::min(filled, kMaxCpus - filled);
std::memcpy(&row[filled], &row[0], len);
for (size_t i = 0; i < len; ++i) {
row[filled + i].store(
row[i].load(std::memory_order_relaxed), std::memory_order_relaxed);
}
filled += len;
}
for (size_t cpu = n; cpu < kMaxCpus; ++cpu) {
assert(row[cpu] == row[cpu - n]);
}
}
state.getcpu.exchange(pickGetcpuFunc(), std::memory_order_acq_rel);
return true;
}
} // namespace detail
/////////////// AccessSpreader
template struct AccessSpreader<std::atomic>;
SimpleAllocator::SimpleAllocator(size_t allocSize, size_t sz)
: allocSize_{allocSize}, sz_(sz) {}
@@ -201,16 +201,29 @@ class AccessSpreaderBase {
kMaxCpus - 1 <= std::numeric_limits<CompactStripe>::max(),
"stripeByCpu element type isn't wide enough");
using CompactStripeTable = CompactStripe[kMaxCpus + 1][kMaxCpus];
using CompactStripeTable = std::atomic<CompactStripe>[kMaxCpus + 1][kMaxCpus];
struct GlobalState {
/// For each level of splitting up to kMaxCpus, maps the cpu (mod
/// kMaxCpus) to the stripe. Rather than performing any inequalities
/// or modulo on the actual number of cpus, we just fill in the entire
/// array.
/// Keep as the first field to avoid extra + in the fastest path.
CompactStripeTable table;
/// Points to the getcpu-like function we are using to obtain the
/// current cpu. It should not be assumed that the returned cpu value
/// is in range.
std::atomic<Getcpu::Func> getcpu; // nullptr -> not initialized
};
static_assert(
std::is_trivial<GlobalState>::value || kCpplibVer, "not trivial");
/// Always claims to be on CPU zero, node zero
static int degenerateGetcpu(unsigned* cpu, unsigned* node, void*);
static bool initialize(
Getcpu::Func&,
Getcpu::Func (&)(),
const CacheLocality& (&)(),
CompactStripeTable&);
GlobalState& out, Getcpu::Func (&)(), const CacheLocality& (&)());
};
} // namespace detail
@@ -251,17 +264,31 @@ class AccessSpreaderBase {
/// all of the time.
template <template <typename> class Atom = std::atomic>
struct AccessSpreader : private detail::AccessSpreaderBase {
private:
struct GlobalState : detail::AccessSpreaderBase::GlobalState {};
static_assert(
std::is_trivial<GlobalState>::value || kCpplibVer, "not trivial");
public:
FOLLY_EXPORT static GlobalState& state() {
static GlobalState state; // trivial for zero ctor and zero dtor
if (FOLLY_UNLIKELY(!state.getcpu.load(std::memory_order_acquire))) {
initialize(state);
}
return state;
}
/// Returns the stripe associated with the current CPU. The returned
/// value will be < numStripes.
static size_t current(size_t numStripes) {
// widthAndCpuToStripe[0] will actually work okay (all zeros), but
static size_t current(size_t numStripes, const GlobalState& s = state()) {
// s.table[0] will actually work okay (all zeros), but
// something's wrong with the caller
assert(numStripes > 0);
unsigned cpu;
getcpuFunc(&cpu, nullptr, nullptr);
return widthAndCpuToStripe[std::min(size_t(kMaxCpus), numStripes)]
[cpu % kMaxCpus];
s.getcpu.load(std::memory_order_relaxed)(&cpu, nullptr, nullptr);
return s.table[std::min(size_t(kMaxCpus), numStripes)][cpu % kMaxCpus].load(
std::memory_order_relaxed);
}
#ifdef FOLLY_CL_USE_FOLLY_TLS
@@ -271,13 +298,17 @@ struct AccessSpreader : private detail::AccessSpreaderBase {
/// certain small number of calls, which can make the result imprecise, but
/// it is more efficient (amortized 2 ns on my dev box, compared to 12 ns for
/// current()).
static size_t cachedCurrent(size_t numStripes) {
return widthAndCpuToStripe[std::min(size_t(kMaxCpus), numStripes)]
[cpuCache().cpu()];
static size_t cachedCurrent(
size_t numStripes, const GlobalState& s = state()) {
return s.table[std::min(size_t(kMaxCpus), numStripes)][cpuCache().cpu(s)]
.load(std::memory_order_relaxed);
}
#else
/// Fallback implementation when thread-local storage isn't available.
static size_t cachedCurrent(size_t numStripes) { return current(numStripes); }
static size_t cachedCurrent(
size_t numStripes, const GlobalState& s = state()) {
return current(numStripes, s);
}
#endif
/// Returns the maximum stripe value that can be returned under any
@@ -285,27 +316,13 @@ struct AccessSpreader : private detail::AccessSpreaderBase {
static constexpr size_t maxStripeValue() { return kMaxCpus; }
private:
/// Points to the getcpu-like function we are using to obtain the
/// current cpu. It should not be assumed that the returned cpu value
/// is in range. We use a static for this so that we can prearrange a
/// valid value in the pre-constructed state and avoid the need for a
/// conditional on every subsequent invocation (not normally a big win,
/// but 20% on some inner loops here).
static Getcpu::Func getcpuFunc;
/// For each level of splitting up to kMaxCpus, maps the cpu (mod
/// kMaxCpus) to the stripe. Rather than performing any inequalities
/// or modulo on the actual number of cpus, we just fill in the entire
/// array.
static CompactStripeTable widthAndCpuToStripe;
/// Caches the current CPU and refreshes the cache every so often.
class CpuCache {
public:
unsigned cpu() {
unsigned cpu(GlobalState const& s) {
if (UNLIKELY(cachedCpuUses_-- == 0)) {
unsigned cpu;
AccessSpreader::getcpuFunc(&cpu, nullptr, nullptr);
s.getcpu.load(std::memory_order_relaxed)(&cpu, nullptr, nullptr);
cachedCpu_ = cpu % kMaxCpus;
cachedCpuUses_ = kMaxCachedCpuUses - 1;
}
@@ -326,8 +343,6 @@ struct AccessSpreader : private detail::AccessSpreaderBase {
}
#endif
static bool initialized;
/// Returns the best getcpu implementation for Atom
static Getcpu::Func pickGetcpuFunc() {
auto best = Getcpu::resolveVdsoFunc();
@@ -347,30 +362,12 @@ struct AccessSpreader : private detail::AccessSpreaderBase {
// zero stripe. Once a sanitizer gets smart enough to detect this as
// a race or undefined behavior, we can annotate it.
static bool initialize() {
static bool initialize(GlobalState& state) {
return detail::AccessSpreaderBase::initialize(
getcpuFunc,
pickGetcpuFunc,
CacheLocality::system<Atom>,
widthAndCpuToStripe);
state, pickGetcpuFunc, CacheLocality::system<Atom>);
}
};
template <template <typename> class Atom>
Getcpu::Func AccessSpreader<Atom>::getcpuFunc =
AccessSpreader<Atom>::degenerateGetcpu;
template <template <typename> class Atom>
typename AccessSpreader<Atom>::CompactStripe
AccessSpreader<Atom>::widthAndCpuToStripe[kMaxCpus + 1][kMaxCpus] = {};
template <template <typename> class Atom>
bool AccessSpreader<Atom>::initialized = AccessSpreader<Atom>::initialize();
// Suppress this instantiation in other translation units. It is
// instantiated in CacheLocality.cpp
extern template struct AccessSpreader<std::atomic>;
/**
* A simple freelist allocator. Allocates things of size sz, from
* slabs of size allocSize. Takes a lock on each
@@ -71,8 +71,14 @@ struct CachedCurrentTag {};
} // namespace
namespace folly {
template <>
size_t AccessSpreader<CachedCurrentTag>::current(size_t numStripes) {
return AccessSpreader<std::atomic>::cachedCurrent(numStripes);
const CacheLocality& CacheLocality::system<CachedCurrentTag>() {
return CacheLocality::system<>();
}
template <>
size_t AccessSpreader<CachedCurrentTag>::current(
size_t numStripes, const GlobalState& state) {
auto& alter = reinterpret_cast<const AccessSpreader::GlobalState&>(state);
return AccessSpreader::cachedCurrent(numStripes, alter);
}
} // namespace folly
@@ -83,6 +89,14 @@ BENCHMARK(AccessSpreaderUse, iters) {
}
}
BENCHMARK(StateAccessSpreaderUse, iters) {
auto& state = AccessSpreader<>::state();
for (unsigned long i = 0; i < iters; ++i) {
auto x = AccessSpreader<>::current(16, state);
folly::doNotOptimizeAway(x);
}
}
BENCHMARK(CachedAccessSpreaderUse, iters) {
for (unsigned long i = 0; i < iters; ++i) {
auto x = AccessSpreader<>::cachedCurrent(16);
@@ -90,6 +104,14 @@ BENCHMARK(CachedAccessSpreaderUse, iters) {
}
}
BENCHMARK(StateCachedAccessSpreaderUse, iters) {
auto& state = AccessSpreader<>::state();
for (unsigned long i = 0; i < iters; ++i) {
auto x = AccessSpreader<>::cachedCurrent(16, state);
folly::doNotOptimizeAway(x);
}
}
BENCHMARK(BaselineAtomicIncrement, iters) {
std::atomic<int> value;
for (unsigned long i = 0; i < iters; ++i) {
@@ -107,6 +129,16 @@ BENCHMARK(CachedAccessSpreaderAtomicIncrement, iters) {
}
}
BENCHMARK(StateCachedAccessSpreaderAtomicIncrement, iters) {
auto& state = AccessSpreader<>::state();
std::array<std::atomic<int>, 64> values;
for (unsigned long i = 0; i < iters; ++i) {
auto x = AccessSpreader<>::cachedCurrent(64, state);
++values[x];
folly::doNotOptimizeAway(values[x]);
}
}
// Benchmark scores here reflect the time for 32 threads to perform an
// atomic increment on a dual-socket E5-2660 @ 2.2Ghz. Surprisingly,
// if we don't separate the counters onto unique 128 byte stripes the