Commit dbf7c3d2 authored by Francis Ma, committed by facebook-github-bot-0

Make folly::detail::CacheLocality portable on apple

Summary:
This is one of a series of steps to port folly::future to iOS. Apple doesn't support __thread, so a HashingThreadId (based on hashing pthread_self) is added as a fallback
on Apple.
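
For illustration, here is a minimal, self-contained sketch (not the folly code itself) of the fallback idea: when compiler TLS is unavailable, derive a stable per-thread pseudo-cpu id by hashing pthread_self(). The mix function follows Thomas Wang's 64-to-32-bit hash, which is what folly::hash::twang_32from64 computes; it is inlined here only so the sketch stands alone.

// Standalone sketch of the fallback: hash pthread_self() into a stable
// per-thread pseudo-cpu id when __thread / FOLLY_TLS is unavailable.
#include <pthread.h>
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Thomas Wang's 64-bit to 32-bit mix (what folly::hash::twang_32from64
// computes), inlined so this example has no folly dependency.
static uint32_t twang_32from64(uint64_t key) {
  key = (~key) + (key << 18);
  key = key ^ (key >> 31);
  key = key * 21;
  key = key ^ (key >> 11);
  key = key + (key << 6);
  key = key ^ (key >> 22);
  return static_cast<uint32_t>(key);
}

// Mirrors the new HashingThreadId: copy the opaque pthread_t into an
// integer and mix it so ids spread evenly across stripes.
static uint32_t hashingThreadId() {
  pthread_t self = pthread_self();
  uint64_t id = 0;
  std::memcpy(&id, &self, std::min(sizeof(self), sizeof(id)));
  return twang_32from64(id);
}

int main() {
  std::printf("pseudo-cpu for this thread: %u\n",
              static_cast<unsigned>(hashingThreadId()));
  return 0;
}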

Reviewed By: nbronson

Differential Revision: D2832068

fb-gh-sync-id: c3389245f3c0bbd36de6260680f7ac6110b3206c
parent 94174b55
......@@ -223,7 +223,9 @@ namespace std { typedef ::max_align_t max_align_t; }
* the semantics are the same
* (but remember __thread has different semantics when using emutls (ex. apple))
*/
#if defined(_MSC_VER)
#if defined(__APPLE__)
#undef FOLLY_TLS
#elif defined(_MSC_VER)
# define FOLLY_TLS __declspec(thread)
#elif defined(__GNUC__) || defined(__clang__)
# define FOLLY_TLS __thread
......
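
Because FOLLY_TLS can now be left undefined (on Apple), callers can no longer assume the macro always expands to a TLS keyword and instead guard on it, as the hunks below do with #ifdef FOLLY_TLS. A minimal sketch of that usage pattern, assuming a hypothetical nextId helper (not folly API):

#include <atomic>
#include <cstddef>

#ifdef FOLLY_TLS
// Compiler TLS is available: cache a per-thread id lazily.
static std::atomic<size_t> prevId{0};
static FOLLY_TLS size_t currentId = 0;

size_t nextId() {
  if (currentId == 0) {
    currentId = ++prevId; // hand out a unique id per thread, once
  }
  return currentId;
}
#else
// No compiler TLS (e.g. Apple): the id must come from elsewhere,
// such as hashing pthread_self() as HashingThreadId does below.
size_t nextId(); // declaration only in this sketch
#endif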
......@@ -232,6 +232,7 @@ Getcpu::Func Getcpu::vdsoFunc() {
return func;
}
#ifdef FOLLY_TLS
/////////////// SequentialThreadId
template<>
......@@ -239,6 +240,7 @@ std::atomic<size_t> SequentialThreadId<std::atomic>::prevId(0);
template<>
FOLLY_TLS size_t SequentialThreadId<std::atomic>::currentId(0);
#endif
/////////////// AccessSpreader
......@@ -277,7 +279,7 @@ Getcpu::Func AccessSpreader<std::atomic>::pickGetcpuFunc(size_t numStripes) {
return &degenerateGetcpu;
} else {
auto best = Getcpu::vdsoFunc();
return best ? best : &SequentialThreadId<std::atomic>::getcpu;
return best ? best : &FallbackGetcpuType::getcpu;
}
}
......
......@@ -26,6 +26,7 @@
#include <string>
#include <type_traits>
#include <vector>
#include <folly/Hash.h>
#include <folly/Likely.h>
#include <folly/Portability.h>
......@@ -141,10 +142,7 @@ struct Getcpu {
static Func vdsoFunc();
};
/// A class that lazily binds a unique (for each implementation of Atom)
/// identifier to a thread. This is a fallback mechanism for the access
/// spreader if we are in testing (using DeterministicAtomic) or if
/// __vdso_getcpu can't be dynamically loaded
#ifdef FOLLY_TLS
template <template<typename> class Atom>
struct SequentialThreadId {
......@@ -157,11 +155,32 @@ struct SequentialThreadId {
return rv;
}
private:
static Atom<size_t> prevId;
static FOLLY_TLS size_t currentId;
};
#endif
struct HashingThreadId {
static size_t get() {
pthread_t pid = pthread_self();
uint64_t id = 0;
memcpy(&id, &pid, std::min(sizeof(pid), sizeof(id)));
return hash::twang_32from64(id);
}
};
/// A class that lazily binds a unique (for each implementation of Atom)
/// identifier to a thread. This is a fallback mechanism for the access
/// spreader if __vdso_getcpu can't be loaded
template <typename ThreadId>
struct FallbackGetcpu {
/// Fills the thread id into the cpu and node out params (if they
/// are non-null). This method is intended to act like getcpu when a
/// fast-enough form of getcpu isn't available or isn't desired
static int getcpu(unsigned* cpu, unsigned* node, void* unused) {
auto id = get();
auto id = ThreadId::get();
if (cpu) {
*cpu = id;
}
......@@ -170,13 +189,14 @@ struct SequentialThreadId {
}
return 0;
}
private:
static Atom<size_t> prevId;
static FOLLY_TLS size_t currentId;
};
#ifdef FOLLY_TLS
typedef FallbackGetcpu<SequentialThreadId<std::atomic>> FallbackGetcpuType;
#else
typedef FallbackGetcpu<HashingThreadId> FallbackGetcpuType;
#endif
template <template<typename> class Atom, size_t kMaxCpus>
struct AccessSpreaderArray;
......
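
To make the getcpu contract above concrete: any function with the int(unsigned* cpu, unsigned* node, void* unused) shape fills whichever out params are non-null and returns 0 on success. A standalone sketch of that contract (stubGetcpu is a hypothetical stand-in for FallbackGetcpuType::getcpu, not folly code):

#include <cstdio>

using GetcpuFunc = int (*)(unsigned* cpu, unsigned* node, void* unused);

// Trivial stand-in with the same shape as FallbackGetcpu<...>::getcpu:
// report a fixed pseudo-cpu/node and indicate success.
static int stubGetcpu(unsigned* cpu, unsigned* node, void* /*unused*/) {
  if (cpu) {
    *cpu = 0;
  }
  if (node) {
    *node = 0;
  }
  return 0;
}

int main() {
  GetcpuFunc func = stubGetcpu; // in folly: FallbackGetcpuType::getcpu
  unsigned cpu = 0;
  unsigned node = 0;
  if (func(&cpu, &node, nullptr) == 0) {
    std::printf("cpu=%u node=%u\n", cpu, node);
  }
  return 0;
}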
......@@ -317,13 +317,30 @@ TEST(Getcpu, VdsoGetcpu) {
EXPECT_TRUE(cpu < CPU_SETSIZE);
}
TEST(SequentialThreadId, Simple) {
#ifdef FOLLY_TLS
TEST(ThreadId, SimpleTls) {
unsigned cpu = 0;
auto rv = SequentialThreadId<std::atomic>::getcpu(&cpu, nullptr, nullptr);
auto rv =
folly::detail::FallbackGetcpu<SequentialThreadId<std::atomic>>::getcpu(
&cpu, nullptr, nullptr);
EXPECT_EQ(rv, 0);
EXPECT_TRUE(cpu > 0);
unsigned again;
SequentialThreadId<std::atomic>::getcpu(&again, nullptr, nullptr);
folly::detail::FallbackGetcpu<SequentialThreadId<std::atomic>>::getcpu(
&again, nullptr, nullptr);
EXPECT_EQ(cpu, again);
}
#endif
TEST(ThreadId, SimplePthread) {
unsigned cpu = 0;
auto rv = folly::detail::FallbackGetcpu<HashingThreadId>::getcpu(
&cpu, nullptr, nullptr);
EXPECT_EQ(rv, 0);
EXPECT_TRUE(cpu > 0);
unsigned again;
folly::detail::FallbackGetcpu<HashingThreadId>::getcpu(
&again, nullptr, nullptr);
EXPECT_EQ(cpu, again);
}
......@@ -434,7 +451,7 @@ BENCHMARK(AccessSpreaderConstruction, iters) {
}
}
enum class SpreaderType { GETCPU, SHARED, TLS_RR };
enum class SpreaderType { GETCPU, SHARED, TLS_RR, PTHREAD_SELF };
// Benchmark scores here reflect the time for 32 threads to perform an
// atomic increment on a dual-socket E5-2660 @ 2.2Ghz. Surprisingly,
......@@ -472,42 +489,52 @@ enum class SpreaderType { GETCPU, SHARED, TLS_RR };
// ============================================================================
// folly/test/CacheLocalityTest.cpp relative time/iter iters/s
// ============================================================================
// contentionAtWidth(1_stripe_0_work_stub) 1.14us 873.64K
// contentionAtWidth(2_stripe_0_work_getcpu) 495.58ns 2.02M
// contentionAtWidth(4_stripe_0_work_getcpu) 232.99ns 4.29M
// contentionAtWidth(8_stripe_0_work_getcpu) 101.16ns 9.88M
// contentionAtWidth(16_stripe_0_work_getcpu) 41.93ns 23.85M
// contentionAtWidth(32_stripe_0_work_getcpu) 42.04ns 23.79M
// contentionAtWidth(64_stripe_0_work_getcpu) 41.94ns 23.84M
// contentionAtWidth(2_stripe_0_work_tls_rr) 1.00us 997.41K
// contentionAtWidth(4_stripe_0_work_tls_rr) 694.41ns 1.44M
// contentionAtWidth(8_stripe_0_work_tls_rr) 590.27ns 1.69M
// contentionAtWidth(16_stripe_0_work_tls_rr) 222.13ns 4.50M
// contentionAtWidth(32_stripe_0_work_tls_rr) 169.49ns 5.90M
// contentionAtWidth(64_stripe_0_work_tls_rr) 162.20ns 6.17M
// contentionAtWidth(2_stripe_0_work_shared) 495.54ns 2.02M
// contentionAtWidth(4_stripe_0_work_shared) 236.27ns 4.23M
// contentionAtWidth(8_stripe_0_work_shared) 114.81ns 8.71M
// contentionAtWidth(16_stripe_0_work_shared) 44.65ns 22.40M
// contentionAtWidth(32_stripe_0_work_shared) 41.76ns 23.94M
// contentionAtWidth(64_stripe_0_work_shared) 43.47ns 23.00M
// atomicIncrBaseline(local_incr_0_work) 20.39ns 49.06M
// LocalAccessSpreaderUse 13.00ns 76.94M
// SharedAccessSpreaderUse 13.04ns 76.66M
// AccessSpreaderConstruction 366.00ns 2.73M
// ----------------------------------------------------------------------------
// contentionAtWidth(1_stripe_0_work_stub) 891.04ns 1.12M
// contentionAtWidth(2_stripe_0_work_getcpu) 403.45ns 2.48M
// contentionAtWidth(4_stripe_0_work_getcpu) 198.02ns 5.05M
// contentionAtWidth(8_stripe_0_work_getcpu) 90.54ns 11.04M
// contentionAtWidth(16_stripe_0_work_getcpu) 31.21ns 32.04M
// contentionAtWidth(32_stripe_0_work_getcpu) 29.15ns 34.31M
// contentionAtWidth(64_stripe_0_work_getcpu) 32.41ns 30.86M
// contentionAtWidth(2_stripe_0_work_tls_rr) 958.06ns 1.04M
// contentionAtWidth(4_stripe_0_work_tls_rr) 494.31ns 2.02M
// contentionAtWidth(8_stripe_0_work_tls_rr) 362.34ns 2.76M
// contentionAtWidth(16_stripe_0_work_tls_rr) 231.37ns 4.32M
// contentionAtWidth(32_stripe_0_work_tls_rr) 128.26ns 7.80M
// contentionAtWidth(64_stripe_0_work_tls_rr) 115.08ns 8.69M
// contentionAtWidth(2_stripe_0_work_pthread_self) 856.63ns 1.17M
// contentionAtWidth(4_stripe_0_work_pthread_self) 623.43ns 1.60M
// contentionAtWidth(8_stripe_0_work_pthread_self) 419.69ns 2.38M
// contentionAtWidth(16_stripe_0_work_pthread_self 217.32ns 4.60M
// contentionAtWidth(32_stripe_0_work_pthread_self 157.69ns 6.34M
// contentionAtWidth(64_stripe_0_work_pthread_self 140.94ns 7.10M
// contentionAtWidth(2_stripe_0_work_shared) 406.55ns 2.46M
// contentionAtWidth(4_stripe_0_work_shared) 198.28ns 5.04M
// contentionAtWidth(8_stripe_0_work_shared) 90.11ns 11.10M
// contentionAtWidth(16_stripe_0_work_shared) 34.53ns 28.96M
// contentionAtWidth(32_stripe_0_work_shared) 30.08ns 33.25M
// contentionAtWidth(64_stripe_0_work_shared) 34.60ns 28.90M
// atomicIncrBaseline(local_incr_0_work) 17.51ns 57.12M
// ----------------------------------------------------------------------------
// contentionAtWidth(1_stripe_500_work_stub) 2.04us 491.13K
// contentionAtWidth(2_stripe_500_work_getcpu) 610.98ns 1.64M
// contentionAtWidth(4_stripe_500_work_getcpu) 507.72ns 1.97M
// contentionAtWidth(8_stripe_500_work_getcpu) 542.53ns 1.84M
// contentionAtWidth(16_stripe_500_work_getcpu) 496.55ns 2.01M
// contentionAtWidth(32_stripe_500_work_getcpu) 500.67ns 2.00M
// atomicIncrBaseline(local_incr_500_work) 484.69ns 2.06M
// contentionAtWidth(1_stripe_500_work_stub) 1.87us 534.36K
// contentionAtWidth(2_stripe_500_work_getcpu) 542.31ns 1.84M
// contentionAtWidth(4_stripe_500_work_getcpu) 409.18ns 2.44M
// contentionAtWidth(8_stripe_500_work_getcpu) 511.05ns 1.96M
// contentionAtWidth(16_stripe_500_work_getcpu) 399.14ns 2.51M
// contentionAtWidth(32_stripe_500_work_getcpu) 399.05ns 2.51M
// atomicIncrBaseline(local_incr_500_work) 399.41ns 2.50M
// ----------------------------------------------------------------------------
// contentionAtWidth(1_stripe_1000_work_stub) 2.11us 473.78K
// contentionAtWidth(2_stripe_1000_work_getcpu) 970.64ns 1.03M
// contentionAtWidth(4_stripe_1000_work_getcpu) 987.31ns 1.01M
// contentionAtWidth(8_stripe_1000_work_getcpu) 1.01us 985.52K
// contentionAtWidth(16_stripe_1000_work_getcpu) 986.09ns 1.01M
// contentionAtWidth(32_stripe_1000_work_getcpu) 960.23ns 1.04M
// atomicIncrBaseline(local_incr_1000_work) 950.63ns 1.05M
// contentionAtWidth(1_stripe_1000_work_stub) 1.90us 525.73K
// contentionAtWidth(2_stripe_1000_work_getcpu) 792.91ns 1.26M
// contentionAtWidth(4_stripe_1000_work_getcpu) 788.14ns 1.27M
// contentionAtWidth(8_stripe_1000_work_getcpu) 794.16ns 1.26M
// contentionAtWidth(16_stripe_1000_work_getcpu) 785.33ns 1.27M
// contentionAtWidth(32_stripe_1000_work_getcpu) 786.56ns 1.27M
// atomicIncrBaseline(local_incr_1000_work) 784.69ns 1.27M
// ============================================================================
static void contentionAtWidth(size_t iters, size_t stripes, size_t work,
SpreaderType spreaderType,
......@@ -515,11 +542,18 @@ static void contentionAtWidth(size_t iters, size_t stripes, size_t work,
size_t numThreads = 32) {
folly::BenchmarkSuspender braces;
folly::detail::Getcpu::Func getcpuFunc = nullptr;
if (spreaderType == SpreaderType::TLS_RR) {
getcpuFunc =
folly::detail::FallbackGetcpu<SequentialThreadId<std::atomic>>::getcpu;
}
if (spreaderType == SpreaderType::PTHREAD_SELF) {
getcpuFunc = folly::detail::FallbackGetcpu<HashingThreadId>::getcpu;
}
AccessSpreader<> spreader(
stripes,
CacheLocality::system<std::atomic>(),
spreaderType == SpreaderType::TLS_RR
? SequentialThreadId<std::atomic>::getcpu : nullptr);
stripes, CacheLocality::system<std::atomic>(), getcpuFunc);
std::atomic<size_t> ready(0);
std::atomic<bool> go(false);
......@@ -651,6 +685,36 @@ BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_0_work_tls_rr,
32, 0, SpreaderType::TLS_RR)
BENCHMARK_NAMED_PARAM(contentionAtWidth, 64_stripe_0_work_tls_rr,
64, 0, SpreaderType::TLS_RR)
BENCHMARK_NAMED_PARAM(contentionAtWidth,
2_stripe_0_work_pthread_self,
2,
0,
SpreaderType::PTHREAD_SELF)
BENCHMARK_NAMED_PARAM(contentionAtWidth,
4_stripe_0_work_pthread_self,
4,
0,
SpreaderType::PTHREAD_SELF)
BENCHMARK_NAMED_PARAM(contentionAtWidth,
8_stripe_0_work_pthread_self,
8,
0,
SpreaderType::PTHREAD_SELF)
BENCHMARK_NAMED_PARAM(contentionAtWidth,
16_stripe_0_work_pthread_self,
16,
0,
SpreaderType::PTHREAD_SELF)
BENCHMARK_NAMED_PARAM(contentionAtWidth,
32_stripe_0_work_pthread_self,
32,
0,
SpreaderType::PTHREAD_SELF)
BENCHMARK_NAMED_PARAM(contentionAtWidth,
64_stripe_0_work_pthread_self,
64,
0,
SpreaderType::PTHREAD_SELF)
BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_0_work_shared,
2, 0, SpreaderType::SHARED)
BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_0_work_shared,
......