Commit dbf7c3d2 authored by Francis Ma's avatar Francis Ma Committed by facebook-github-bot-0

Make folly::detail::CacheLocality portable on apple

Summary:
This is one of a series of steps to port folly::futures to iOS. Apple doesn't support __thread. Add a HashingThreadId as a fallback
on Apple.

Reviewed By: nbronson

Differential Revision: D2832068

fb-gh-sync-id: c3389245f3c0bbd36de6260680f7ac6110b3206c
parent 94174b55
...@@ -223,7 +223,9 @@ namespace std { typedef ::max_align_t max_align_t; } ...@@ -223,7 +223,9 @@ namespace std { typedef ::max_align_t max_align_t; }
* the semantics are the same * the semantics are the same
* (but remember __thread has different semantics when using emutls (ex. apple)) * (but remember __thread has different semantics when using emutls (ex. apple))
*/ */
#if defined(_MSC_VER) #if defined(__APPLE__)
#undef FOLLY_TLS
#elif defined(_MSC_VER)
# define FOLLY_TLS __declspec(thread) # define FOLLY_TLS __declspec(thread)
#elif defined(__GNUC__) || defined(__clang__) #elif defined(__GNUC__) || defined(__clang__)
# define FOLLY_TLS __thread # define FOLLY_TLS __thread
......
...@@ -232,6 +232,7 @@ Getcpu::Func Getcpu::vdsoFunc() { ...@@ -232,6 +232,7 @@ Getcpu::Func Getcpu::vdsoFunc() {
return func; return func;
} }
#ifdef FOLLY_TLS
/////////////// SequentialThreadId /////////////// SequentialThreadId
template<> template<>
...@@ -239,6 +240,7 @@ std::atomic<size_t> SequentialThreadId<std::atomic>::prevId(0); ...@@ -239,6 +240,7 @@ std::atomic<size_t> SequentialThreadId<std::atomic>::prevId(0);
template<> template<>
FOLLY_TLS size_t SequentialThreadId<std::atomic>::currentId(0); FOLLY_TLS size_t SequentialThreadId<std::atomic>::currentId(0);
#endif
/////////////// AccessSpreader /////////////// AccessSpreader
...@@ -277,7 +279,7 @@ Getcpu::Func AccessSpreader<std::atomic>::pickGetcpuFunc(size_t numStripes) { ...@@ -277,7 +279,7 @@ Getcpu::Func AccessSpreader<std::atomic>::pickGetcpuFunc(size_t numStripes) {
return &degenerateGetcpu; return &degenerateGetcpu;
} else { } else {
auto best = Getcpu::vdsoFunc(); auto best = Getcpu::vdsoFunc();
return best ? best : &SequentialThreadId<std::atomic>::getcpu; return best ? best : &FallbackGetcpuType::getcpu;
} }
} }
......
...@@ -26,6 +26,7 @@ ...@@ -26,6 +26,7 @@
#include <string> #include <string>
#include <type_traits> #include <type_traits>
#include <vector> #include <vector>
#include <folly/Hash.h>
#include <folly/Likely.h> #include <folly/Likely.h>
#include <folly/Portability.h> #include <folly/Portability.h>
...@@ -141,10 +142,7 @@ struct Getcpu { ...@@ -141,10 +142,7 @@ struct Getcpu {
static Func vdsoFunc(); static Func vdsoFunc();
}; };
/// A class that lazily binds a unique (for each implementation of Atom) #ifdef FOLLY_TLS
/// identifier to a thread. This is a fallback mechanism for the access
/// spreader if we are in testing (using DeterministicAtomic) or if
/// __vdso_getcpu can't be dynamically loaded
template <template<typename> class Atom> template <template<typename> class Atom>
struct SequentialThreadId { struct SequentialThreadId {
...@@ -157,11 +155,32 @@ struct SequentialThreadId { ...@@ -157,11 +155,32 @@ struct SequentialThreadId {
return rv; return rv;
} }
private:
static Atom<size_t> prevId;
static FOLLY_TLS size_t currentId;
};
#endif
/// Thread-id provider that derives a stable per-thread hash from
/// pthread_self(). Used where no FOLLY_TLS (__thread) support exists
/// (e.g. Apple platforms), so no thread-local storage is required.
struct HashingThreadId {
  static size_t get() {
    // pthread_t is opaque; copy its raw bytes into a fixed-width
    // integer (truncating or zero-padding as needed) before hashing.
    pthread_t self = pthread_self();
    uint64_t raw = 0;
    memcpy(&raw, &self, std::min(sizeof(self), sizeof(raw)));
    return hash::twang_32from64(raw);
  }
};
/// A class that lazily binds a unique (for each implementation of Atom)
/// identifier to a thread. This is a fallback mechanism for the access
/// spreader if __vdso_getcpu can't be loaded
template <typename ThreadId>
struct FallbackGetcpu {
/// Fills the thread id into the cpu and node out params (if they /// Fills the thread id into the cpu and node out params (if they
/// are non-null). This method is intended to act like getcpu when a /// are non-null). This method is intended to act like getcpu when a
/// fast-enough form of getcpu isn't available or isn't desired /// fast-enough form of getcpu isn't available or isn't desired
static int getcpu(unsigned* cpu, unsigned* node, void* unused) { static int getcpu(unsigned* cpu, unsigned* node, void* unused) {
auto id = get(); auto id = ThreadId::get();
if (cpu) { if (cpu) {
*cpu = id; *cpu = id;
} }
...@@ -170,13 +189,14 @@ struct SequentialThreadId { ...@@ -170,13 +189,14 @@ struct SequentialThreadId {
} }
return 0; return 0;
} }
private:
static Atom<size_t> prevId;
static FOLLY_TLS size_t currentId;
}; };
// Pick the fallback getcpu implementation for this platform: prefer the
// TLS-based sequential id when FOLLY_TLS is available; otherwise (e.g. on
// Apple, where __thread is unsupported) hash pthread_self() instead.
#ifdef FOLLY_TLS
typedef FallbackGetcpu<SequentialThreadId<std::atomic>> FallbackGetcpuType;
#else
typedef FallbackGetcpu<HashingThreadId> FallbackGetcpuType;
#endif
template <template<typename> class Atom, size_t kMaxCpus> template <template<typename> class Atom, size_t kMaxCpus>
struct AccessSpreaderArray; struct AccessSpreaderArray;
......
...@@ -317,13 +317,30 @@ TEST(Getcpu, VdsoGetcpu) { ...@@ -317,13 +317,30 @@ TEST(Getcpu, VdsoGetcpu) {
EXPECT_TRUE(cpu < CPU_SETSIZE); EXPECT_TRUE(cpu < CPU_SETSIZE);
} }
TEST(SequentialThreadId, Simple) { #ifdef FOLLY_TLS
TEST(ThreadId, SimpleTls) {
unsigned cpu = 0; unsigned cpu = 0;
auto rv = SequentialThreadId<std::atomic>::getcpu(&cpu, nullptr, nullptr); auto rv =
folly::detail::FallbackGetcpu<SequentialThreadId<std::atomic>>::getcpu(
&cpu, nullptr, nullptr);
EXPECT_EQ(rv, 0); EXPECT_EQ(rv, 0);
EXPECT_TRUE(cpu > 0); EXPECT_TRUE(cpu > 0);
unsigned again; unsigned again;
SequentialThreadId<std::atomic>::getcpu(&again, nullptr, nullptr); folly::detail::FallbackGetcpu<SequentialThreadId<std::atomic>>::getcpu(
&again, nullptr, nullptr);
EXPECT_EQ(cpu, again);
}
#endif
TEST(ThreadId, SimplePthread) {
unsigned cpu = 0;
auto rv = folly::detail::FallbackGetcpu<HashingThreadId>::getcpu(
&cpu, nullptr, nullptr);
EXPECT_EQ(rv, 0);
EXPECT_TRUE(cpu > 0);
unsigned again;
folly::detail::FallbackGetcpu<HashingThreadId>::getcpu(
&again, nullptr, nullptr);
EXPECT_EQ(cpu, again); EXPECT_EQ(cpu, again);
} }
...@@ -434,7 +451,7 @@ BENCHMARK(AccessSpreaderConstruction, iters) { ...@@ -434,7 +451,7 @@ BENCHMARK(AccessSpreaderConstruction, iters) {
} }
} }
enum class SpreaderType { GETCPU, SHARED, TLS_RR }; enum class SpreaderType { GETCPU, SHARED, TLS_RR, PTHREAD_SELF };
// Benchmark scores here reflect the time for 32 threads to perform an // Benchmark scores here reflect the time for 32 threads to perform an
// atomic increment on a dual-socket E5-2660 @ 2.2Ghz. Surprisingly, // atomic increment on a dual-socket E5-2660 @ 2.2Ghz. Surprisingly,
...@@ -472,42 +489,52 @@ enum class SpreaderType { GETCPU, SHARED, TLS_RR }; ...@@ -472,42 +489,52 @@ enum class SpreaderType { GETCPU, SHARED, TLS_RR };
// ============================================================================ // ============================================================================
// folly/test/CacheLocalityTest.cpp relative time/iter iters/s // folly/test/CacheLocalityTest.cpp relative time/iter iters/s
// ============================================================================ // ============================================================================
// contentionAtWidth(1_stripe_0_work_stub) 1.14us 873.64K // LocalAccessSpreaderUse 13.00ns 76.94M
// contentionAtWidth(2_stripe_0_work_getcpu) 495.58ns 2.02M // SharedAccessSpreaderUse 13.04ns 76.66M
// contentionAtWidth(4_stripe_0_work_getcpu) 232.99ns 4.29M // AccessSpreaderConstruction 366.00ns 2.73M
// contentionAtWidth(8_stripe_0_work_getcpu) 101.16ns 9.88M // ----------------------------------------------------------------------------
// contentionAtWidth(16_stripe_0_work_getcpu) 41.93ns 23.85M // contentionAtWidth(1_stripe_0_work_stub) 891.04ns 1.12M
// contentionAtWidth(32_stripe_0_work_getcpu) 42.04ns 23.79M // contentionAtWidth(2_stripe_0_work_getcpu) 403.45ns 2.48M
// contentionAtWidth(64_stripe_0_work_getcpu) 41.94ns 23.84M // contentionAtWidth(4_stripe_0_work_getcpu) 198.02ns 5.05M
// contentionAtWidth(2_stripe_0_work_tls_rr) 1.00us 997.41K // contentionAtWidth(8_stripe_0_work_getcpu) 90.54ns 11.04M
// contentionAtWidth(4_stripe_0_work_tls_rr) 694.41ns 1.44M // contentionAtWidth(16_stripe_0_work_getcpu) 31.21ns 32.04M
// contentionAtWidth(8_stripe_0_work_tls_rr) 590.27ns 1.69M // contentionAtWidth(32_stripe_0_work_getcpu) 29.15ns 34.31M
// contentionAtWidth(16_stripe_0_work_tls_rr) 222.13ns 4.50M // contentionAtWidth(64_stripe_0_work_getcpu) 32.41ns 30.86M
// contentionAtWidth(32_stripe_0_work_tls_rr) 169.49ns 5.90M // contentionAtWidth(2_stripe_0_work_tls_rr) 958.06ns 1.04M
// contentionAtWidth(64_stripe_0_work_tls_rr) 162.20ns 6.17M // contentionAtWidth(4_stripe_0_work_tls_rr) 494.31ns 2.02M
// contentionAtWidth(2_stripe_0_work_shared) 495.54ns 2.02M // contentionAtWidth(8_stripe_0_work_tls_rr) 362.34ns 2.76M
// contentionAtWidth(4_stripe_0_work_shared) 236.27ns 4.23M // contentionAtWidth(16_stripe_0_work_tls_rr) 231.37ns 4.32M
// contentionAtWidth(8_stripe_0_work_shared) 114.81ns 8.71M // contentionAtWidth(32_stripe_0_work_tls_rr) 128.26ns 7.80M
// contentionAtWidth(16_stripe_0_work_shared) 44.65ns 22.40M // contentionAtWidth(64_stripe_0_work_tls_rr) 115.08ns 8.69M
// contentionAtWidth(32_stripe_0_work_shared) 41.76ns 23.94M // contentionAtWidth(2_stripe_0_work_pthread_self) 856.63ns 1.17M
// contentionAtWidth(64_stripe_0_work_shared) 43.47ns 23.00M // contentionAtWidth(4_stripe_0_work_pthread_self) 623.43ns 1.60M
// atomicIncrBaseline(local_incr_0_work) 20.39ns 49.06M // contentionAtWidth(8_stripe_0_work_pthread_self) 419.69ns 2.38M
// contentionAtWidth(16_stripe_0_work_pthread_self 217.32ns 4.60M
// contentionAtWidth(32_stripe_0_work_pthread_self 157.69ns 6.34M
// contentionAtWidth(64_stripe_0_work_pthread_self 140.94ns 7.10M
// contentionAtWidth(2_stripe_0_work_shared) 406.55ns 2.46M
// contentionAtWidth(4_stripe_0_work_shared) 198.28ns 5.04M
// contentionAtWidth(8_stripe_0_work_shared) 90.11ns 11.10M
// contentionAtWidth(16_stripe_0_work_shared) 34.53ns 28.96M
// contentionAtWidth(32_stripe_0_work_shared) 30.08ns 33.25M
// contentionAtWidth(64_stripe_0_work_shared) 34.60ns 28.90M
// atomicIncrBaseline(local_incr_0_work) 17.51ns 57.12M
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// contentionAtWidth(1_stripe_500_work_stub) 2.04us 491.13K // contentionAtWidth(1_stripe_500_work_stub) 1.87us 534.36K
// contentionAtWidth(2_stripe_500_work_getcpu) 610.98ns 1.64M // contentionAtWidth(2_stripe_500_work_getcpu) 542.31ns 1.84M
// contentionAtWidth(4_stripe_500_work_getcpu) 507.72ns 1.97M // contentionAtWidth(4_stripe_500_work_getcpu) 409.18ns 2.44M
// contentionAtWidth(8_stripe_500_work_getcpu) 542.53ns 1.84M // contentionAtWidth(8_stripe_500_work_getcpu) 511.05ns 1.96M
// contentionAtWidth(16_stripe_500_work_getcpu) 496.55ns 2.01M // contentionAtWidth(16_stripe_500_work_getcpu) 399.14ns 2.51M
// contentionAtWidth(32_stripe_500_work_getcpu) 500.67ns 2.00M // contentionAtWidth(32_stripe_500_work_getcpu) 399.05ns 2.51M
// atomicIncrBaseline(local_incr_500_work) 484.69ns 2.06M // atomicIncrBaseline(local_incr_500_work) 399.41ns 2.50M
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// contentionAtWidth(1_stripe_1000_work_stub) 2.11us 473.78K // contentionAtWidth(1_stripe_1000_work_stub) 1.90us 525.73K
// contentionAtWidth(2_stripe_1000_work_getcpu) 970.64ns 1.03M // contentionAtWidth(2_stripe_1000_work_getcpu) 792.91ns 1.26M
// contentionAtWidth(4_stripe_1000_work_getcpu) 987.31ns 1.01M // contentionAtWidth(4_stripe_1000_work_getcpu) 788.14ns 1.27M
// contentionAtWidth(8_stripe_1000_work_getcpu) 1.01us 985.52K // contentionAtWidth(8_stripe_1000_work_getcpu) 794.16ns 1.26M
// contentionAtWidth(16_stripe_1000_work_getcpu) 986.09ns 1.01M // contentionAtWidth(16_stripe_1000_work_getcpu) 785.33ns 1.27M
// contentionAtWidth(32_stripe_1000_work_getcpu) 960.23ns 1.04M // contentionAtWidth(32_stripe_1000_work_getcpu) 786.56ns 1.27M
// atomicIncrBaseline(local_incr_1000_work) 950.63ns 1.05M // atomicIncrBaseline(local_incr_1000_work) 784.69ns 1.27M
// ============================================================================ // ============================================================================
static void contentionAtWidth(size_t iters, size_t stripes, size_t work, static void contentionAtWidth(size_t iters, size_t stripes, size_t work,
SpreaderType spreaderType, SpreaderType spreaderType,
...@@ -515,11 +542,18 @@ static void contentionAtWidth(size_t iters, size_t stripes, size_t work, ...@@ -515,11 +542,18 @@ static void contentionAtWidth(size_t iters, size_t stripes, size_t work,
size_t numThreads = 32) { size_t numThreads = 32) {
folly::BenchmarkSuspender braces; folly::BenchmarkSuspender braces;
folly::detail::Getcpu::Func getcpuFunc = nullptr;
if (spreaderType == SpreaderType::TLS_RR) {
getcpuFunc =
folly::detail::FallbackGetcpu<SequentialThreadId<std::atomic>>::getcpu;
}
if (spreaderType == SpreaderType::PTHREAD_SELF) {
getcpuFunc = folly::detail::FallbackGetcpu<HashingThreadId>::getcpu;
}
AccessSpreader<> spreader( AccessSpreader<> spreader(
stripes, stripes, CacheLocality::system<std::atomic>(), getcpuFunc);
CacheLocality::system<std::atomic>(),
spreaderType == SpreaderType::TLS_RR
? SequentialThreadId<std::atomic>::getcpu : nullptr);
std::atomic<size_t> ready(0); std::atomic<size_t> ready(0);
std::atomic<bool> go(false); std::atomic<bool> go(false);
...@@ -651,6 +685,36 @@ BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_0_work_tls_rr, ...@@ -651,6 +685,36 @@ BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_0_work_tls_rr,
32, 0, SpreaderType::TLS_RR) 32, 0, SpreaderType::TLS_RR)
BENCHMARK_NAMED_PARAM(contentionAtWidth, 64_stripe_0_work_tls_rr, BENCHMARK_NAMED_PARAM(contentionAtWidth, 64_stripe_0_work_tls_rr,
64, 0, SpreaderType::TLS_RR) 64, 0, SpreaderType::TLS_RR)
BENCHMARK_NAMED_PARAM(contentionAtWidth,
2_stripe_0_work_pthread_self,
2,
0,
SpreaderType::PTHREAD_SELF)
BENCHMARK_NAMED_PARAM(contentionAtWidth,
4_stripe_0_work_pthread_self,
4,
0,
SpreaderType::PTHREAD_SELF)
BENCHMARK_NAMED_PARAM(contentionAtWidth,
8_stripe_0_work_pthread_self,
8,
0,
SpreaderType::PTHREAD_SELF)
BENCHMARK_NAMED_PARAM(contentionAtWidth,
16_stripe_0_work_pthread_self,
16,
0,
SpreaderType::PTHREAD_SELF)
BENCHMARK_NAMED_PARAM(contentionAtWidth,
32_stripe_0_work_pthread_self,
32,
0,
SpreaderType::PTHREAD_SELF)
BENCHMARK_NAMED_PARAM(contentionAtWidth,
64_stripe_0_work_pthread_self,
64,
0,
SpreaderType::PTHREAD_SELF)
BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_0_work_shared, BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_0_work_shared,
2, 0, SpreaderType::SHARED) 2, 0, SpreaderType::SHARED)
BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_0_work_shared, BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_0_work_shared,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment