Add AccessSpreader<>::cachedCurrent()

Summary: `AccessSpreader::cachedCurrent()` caches the result of `AccessSpreader::getcpuFunc()` for 32 calls in a thread-local. The cached function takes 2 ns, whereas the uncached `current()` call takes 12 ns. This comes at the cost of being imprecise when threads migrate to a new cpu. I chose 32 as the number of calls because it only has a 10% overhead over never refreshing (2.05 ns vs 1.83 ns), whereas 16 has a 30% overhead, and 32 performs just as well as 64. Reviewed By: ot Differential Revision: D10151009 fbshipit-source-id: 07ed292dfdcdcedcb74c24279f7773a80ad09348

Add AccessSpreader<>::cachedCurrent()
Summary: `AccessSpreader::cachedCurrent()` caches the result of `AccessSpreader::getcpuFunc()` for 32 calls in a thread-local. The cached function takes 2 ns, whereas the uncached `current()` call takes 12 ns. This comes at the cost of being imprecise when threads migrate to a new cpu. I chose 32 as the number of calls because it only has a 10% overhead over never refreshing (2.05 ns vs 1.83 ns), whereas 16 has a 30% overhead, and 32 performs just as well as 64. Reviewed By: ot Differential Revision: D10151009 fbshipit-source-id: 07ed292dfdcdcedcb74c24279f7773a80ad09348
03fe7209 · Nick Terrell · Facebook Github Bot · e0ec3bcd · 03fe7209 · 03fe7209
Commit 03fe7209 authored Oct 04, 2018 by Nick Terrell Committed by Facebook Github Bot Oct 04, 2018
3 changed files
--- a/folly/concurrency/CacheLocality.h
+++ b/folly/concurrency/CacheLocality.h
@@ -239,6 +239,24 @@ struct AccessSpreader {
                              [cpu % kMaxCpus];
  }

+#ifdef FOLLY_TLS
+  /// Returns the stripe associated with the current CPU.  The returned
+  /// value will be < numStripes.
+  /// This function caches the current cpu in a thread-local variable for a
+  /// small number of calls (CpuCache::kMaxCachedCpuUses), which can make the
+  /// result imprecise, but it is more efficient (amortized 2 ns on the
+  /// author's dev box, compared to 12 ns for current()).
+  static size_t cachedCurrent(size_t numStripes) {
+    return widthAndCpuToStripe[std::min(size_t(kMaxCpus), numStripes)]  // width index capped at kMaxCpus
+                              [cpuCache.cpu()];  // cached cpu is already < kMaxCpus
+  }
+#else
+  /// Fallback implementation when thread-local storage isn't available.
+  static size_t cachedCurrent(size_t numStripes) {
+    return current(numStripes);  // no caching here; simply forwards to current()
+  }
+#endif  // FOLLY_TLS
+
 private:
  /// If there are more cpus than this nothing will crash, but there
  /// might be unnecessary sharing
@@ -267,6 +285,30 @@ struct AccessSpreader {
  /// array.
  static CompactStripe widthAndCpuToStripe[kMaxCpus + 1][kMaxCpus];

+  /// Caches the current CPU, re-querying it once per kMaxCachedCpuUses calls.
+  class CpuCache {
+   public:
+    unsigned cpu() {  // returns the cached cpu id, which is always < kMaxCpus
+      if (UNLIKELY(cachedCpuUses_-- == 0)) {  // budget used up; also true on the very first call
+        unsigned cpu;
+        AccessSpreader::getcpuFunc(&cpu, nullptr, nullptr);  // only the cpu out-param is requested; return value unused
+        cachedCpu_ = cpu % kMaxCpus;  // reduce so it can index a kMaxCpus-wide table row
+        cachedCpuUses_ = kMaxCachedCpuUses - 1;  // replaces the counter that just wrapped below zero
+      }
+      return cachedCpu_;
+    }
+
+   private:
+    static constexpr unsigned kMaxCachedCpuUses = 32;  // cpu() calls served per getcpuFunc() query
+
+    unsigned cachedCpu_{0};  // last cpu id observed, modulo kMaxCpus
+    unsigned cachedCpuUses_{0};  // remaining uses of cachedCpu_; 0 forces a refresh
+  };
+
+#ifdef FOLLY_TLS
+  static FOLLY_TLS CpuCache cpuCache;  // cache consulted by cachedCurrent(); exists only when FOLLY_TLS is defined
+#endif  // FOLLY_TLS
+
  static bool initialized;

  /// Returns the best getcpu implementation for Atom
@@ -331,6 +373,12 @@ template <template <typename> class Atom>
 typename AccessSpreader<Atom>::CompactStripe
    AccessSpreader<Atom>::widthAndCpuToStripe[kMaxCpus + 1][kMaxCpus] = {};

+#ifdef FOLLY_TLS
+template <template <typename> class Atom>
+FOLLY_TLS
+    typename AccessSpreader<Atom>::CpuCache AccessSpreader<Atom>::cpuCache;  // out-of-class definition of the static member
+#endif  // FOLLY_TLS
+
 template <template <typename> class Atom>
 bool AccessSpreader<Atom>::initialized = AccessSpreader<Atom>::initialize();


--- a/folly/concurrency/test/CacheLocalityBenchmark.cpp
+++ b/folly/concurrency/test/CacheLocalityBenchmark.cpp
--- a/folly/concurrency/test/CacheLocalityTest.cpp
+++ b/folly/concurrency/test/CacheLocalityTest.cpp
@@ -375,6 +375,30 @@ TEST(AccessSpreader, Simple) {
  }
 }

+TEST(AccessSpreader, SimpleCached) {  // single-threaded bound check for cachedCurrent()
+  for (size_t s = 1; s < 200; ++s) {  // stripe counts 1..199
+    EXPECT_LT(AccessSpreader<>::cachedCurrent(s), s);  // result must be a valid stripe index
+  }
+}
+
+TEST(AccessSpreader, ConcurrentAccessCached) {  // same bound check, issued from 4 threads at once
+  std::vector<std::thread> threads;
+  for (size_t i = 0; i < 4; ++i) {
+    threads.emplace_back([]() {
+      for (size_t s : {16, 32, 64}) {  // stripe counts exercised by each thread
+        for (size_t j = 1; j < 200; ++j) {  // j is only a repeat counter (199 rounds, 2 calls each)
+          EXPECT_LT(AccessSpreader<>::cachedCurrent(s), s);
+          EXPECT_LT(AccessSpreader<>::cachedCurrent(s), s);
+        }
+        std::this_thread::yield();  // NOTE(review): presumably to invite rescheduling between widths - confirm
+      }
+    });
+  }
+  for (auto& thread : threads) {  // wait for every worker before the test returns
+    thread.join();
+  }
+}
+
 #ifdef FOLLY_TLS
 #define DECLARE_SPREADER_TAG(tag, locality, func)      \
  namespace {                                          \