Commit 2d7a636e authored by Yedidya Feldblum's avatar Yedidya Feldblum Committed by Facebook Github Bot

Improve the SingletonThreadLocal fast path

Summary:
[Folly] Improve the `SingletonThreadLocal` fast path.

Principally, by having the thread-local cache checked before the static-local guard variable is checked, rather than after.

This change measurably improves the `EventBase` benchmark:

```name=branch
============================================================================
folly/io/async/test/EventBaseBenchmark.cpp      relative  time/iter  iters/s
============================================================================
timeMeasurementsOn                                           1.02us  980.21K
timeMeasurementsOff                              251.64%   405.41ns    2.47M
============================================================================
```
```name=master
============================================================================
folly/io/async/test/EventBaseBenchmark.cpp      relative  time/iter  iters/s
============================================================================
timeMeasurementsOn                                           1.03us  969.51K
timeMeasurementsOff                              247.08%   417.45ns    2.40M
============================================================================
```

This change shortens the fast-path in `folly::RequestContext::getStaticContext()` (manually cleaned up):

```name=branch
---- fast path ----
<+0>:     mov    rax,QWORD PTR fs:folly::SingletonThreadLocal<...>::get()::cache@tpoff
<+9>:     test   rax,rax
<+12>:    je     folly::RequestContext::getStaticContext()+16
<+14>:    ret
<+15>:    nop
---- slow path ----
<+16>:    push   rbp
<+17>:    mov    rbp,rsp
<+20>:    call   folly::SingletonThreadLocal<...>::getWrapperOutline()
<+25>:    mov    rdx,QWORD PTR fs:0x0
<+34>:    mov    QWORD PTR fs:folly::SingletonThreadLocal<...>::get()::cache@tpoff,rax
<+43>:    add    rdx,OFFSET FLAT:folly::SingletonThreadLocal<...>::get()::cache@tpoff
<+50>:    mov    QWORD PTR [rax+0x10],rdx
<+54>:    pop    rbp
<+55>:    ret
```
```name=master
---- fast path ----
<+0>:     push   rbp
<+1>:     mov    rbp,rsp
<+4>:     push   rbx
<+5>:     sub    rsp,0x8
<+9>:     cmp    BYTE PTR guard variable for folly::RequestContext::getStaticContext()::singleton[rip],0x0
<+16>:    je     folly::RequestContext::getStaticContext()+48
<+18>:    mov    rax,QWORD PTR fs:folly::SingletonThreadLocal<...>::get()::cache@tpoff
<+27>:    test   rax,rax
<+30>:    je     folly::RequestContext::getStaticContext()+96
<+32>:    mov    rbx,QWORD PTR [rbp-0x8]
<+36>:    leave
<+37>:    ret
<+38>:    nop    WORD PTR cs:[rax+rax*1+0x0]
---- slow path ----
<+48>:    mov    edi,OFFSET FLAT:guard variable for folly::RequestContext::getStaticContext()::singleton
<+53>:    call   __cxa_guard_acquire
<+58>:    test   eax,eax
<+60>:    je     folly::RequestContext::getStaticContext()+18
<+62>:    sub    rsp,0x8
<+66>:    mov    edi,OFFSET FLAT:folly::RequestContext::getStaticContext()::singleton
<+71>:    push   0x0
<+73>:    call   folly::SingletonThreadLocal<...>::SingletonThreadLocal({lambda()})
<+78>:    pop    rax
<+79>:    mov    edi,OFFSET FLAT:guard variable for folly::RequestContext::getStaticContext()::singleton
<+84>:    pop    rdx
<+85>:    call   __cxa_guard_release
<+90>:    jmp    folly::RequestContext::getStaticContext()+18
<+92>:    nop    DWORD PTR [rax+0x0]
<+96>:    call   folly::SingletonThreadLocal<...>::getSlow()
<+101>:   mov    rbx,QWORD PTR [rbp-0x8]
<+105>:   mov    QWORD PTR fs:folly::SingletonThreadLocal<...>::get()::cache@tpoff,rax
<+114>:   leave
<+115>:   ret
<+116>:   mov    rbx,rax
<+119>:   mov    edi,OFFSET FLAT:guard variable for folly::RequestContext::getStaticContext()::singleton
<+124>:   call   __cxa_guard_abort
<+129>:   mov    rdi,rbx
<+132>:   call   _Unwind_Resume
```

Reviewed By: andriigrynenko

Differential Revision: D6763655

fbshipit-source-id: 6f2d317ffd40a4e1f143b4bbbd087e85cc667b8c
parent 58791d65
......@@ -129,25 +129,15 @@ struct RandomTag {};
} // namespace
void Random::secureRandom(void* data, size_t size) {
static SingletonThreadLocal<BufferedRandomDevice, RandomTag>
bufferedRandomDevice;
bufferedRandomDevice.get().get(data, size);
using Single = SingletonThreadLocal<BufferedRandomDevice, RandomTag>;
Single::get().get(data, size);
}
class ThreadLocalPRNG::LocalInstancePRNG {
public:
LocalInstancePRNG() : rng(Random::create()) {}
Random::DefaultGenerator rng;
};
ThreadLocalPRNG::ThreadLocalPRNG() {
static SingletonThreadLocal<ThreadLocalPRNG::LocalInstancePRNG, RandomTag>
localInstancePRNG;
local_ = &localInstancePRNG.get();
}
uint32_t ThreadLocalPRNG::getImpl(LocalInstancePRNG* local) {
return local->rng();
ThreadLocalPRNG::result_type ThreadLocalPRNG::operator()() {
struct Wrapper {
Random::DefaultGenerator object{Random::create()};
};
using Single = SingletonThreadLocal<Wrapper, RandomTag>;
return Single::get().object();
}
} // namespace folly
......@@ -52,11 +52,7 @@ class ThreadLocalPRNG {
public:
using result_type = uint32_t;
result_type operator()() {
// Using a static method allows the compiler to avoid allocating stack space
// for this class.
return getImpl(local_);
}
result_type operator()();
static constexpr result_type min() {
return std::numeric_limits<result_type>::min();
......@@ -64,15 +60,6 @@ class ThreadLocalPRNG {
static constexpr result_type max() {
return std::numeric_limits<result_type>::max();
}
friend class Random;
ThreadLocalPRNG();
class LocalInstancePRNG;
private:
static result_type getImpl(LocalInstancePRNG* local);
LocalInstancePRNG* local_;
};
class Random {
......
......@@ -182,6 +182,15 @@ namespace detail {
struct DefaultTag {};
/// Default factory used as the `Make` parameter of SingletonThreadLocal.
///
/// Value-initializes a T directly inside caller-provided raw storage and
/// returns a pointer to the newly constructed object.
template <typename T>
struct DefaultMake {
  // Required form until C++17, which permits returning objects of types which
  // are neither copy-constructible nor move-constructible.
  //
  // @param buf  raw storage of exactly sizeof(T) bytes. The array type itself
  //             carries no alignment guarantee, so the caller must supply
  //             storage suitably aligned for T.
  // @return     pointer to the T constructed inside buf.
  T* operator()(unsigned char (&buf)[sizeof(T)]) const {
    // Qualify with `::` and pass a void* so the global non-allocating
    // placement form is always selected. An unqualified `new (buf) T()` looks
    // up operator new in T's scope first and fails to compile when T declares
    // its own (non-placement) operator new.
    return ::new (static_cast<void*>(buf)) T();
  }
};
// A TypeDescriptor is the unique handle for a given singleton. It is
// a combination of the type and of the optional name, and is used as
// a key in unordered_maps.
......
......@@ -18,78 +18,111 @@
#include <folly/Singleton.h>
#include <folly/ThreadLocal.h>
#include <folly/functional/Invoke.h>
namespace folly {
// SingletonThreadLocal
//
// This class can help you implement a per-thread leaky-singleton model within
// your application. Please read the usage block at the top of Singleton.h as
// the recommendations there are also generally applicable to this class.
//
// When we say this is "leaky" we mean that the T instances held by a
// SingletonThreadLocal<T> will survive until their owning thread exits,
// regardless of the lifetime of the singleton object holding them. That
// means that they can be safely used during process shutdown, and
// that they can also be safely used in an application that spawns many
// temporary threads throughout its life.
//
// Keywords to help people find this class in search:
// Thread Local Singleton ThreadLocalSingleton
template <typename T, typename Tag = detail::DefaultTag>
/// SingletonThreadLocal
///
/// Useful for a per-thread leaky-singleton model in libraries and applications.
///
/// By "leaky" it is meant that the T instances held by the instantiation
/// SingletonThreadLocal<T> will survive until their owning thread exits.
/// Therefore, they can safely be used before main() begins and after main()
/// ends, and they can also safely be used in an application that spawns many
/// temporary threads throughout its life.
///
/// Example:
///
/// struct UsefulButHasExpensiveCtor {
/// UsefulButHasExpensiveCtor(); // this is expensive
/// Result operator()(Arg arg);
/// };
///
/// Result useful(Arg arg) {
/// using Useful = UsefulButHasExpensiveCtor;
/// auto& useful = folly::SingletonThreadLocal<Useful>::get();
/// return useful(arg);
/// }
///
/// As an example use-case, the random generators in <random> are expensive to
/// construct. And their constructors are deterministic, but many cases require
/// that they be randomly seeded. So folly::Random makes good canonical use of
/// folly::SingletonThreadLocal so that a seed is computed from the secure
/// random device once per thread, and the random generator is constructed with
/// the seed once per thread.
///
/// Keywords to help people find this class in search:
/// Thread Local Singleton ThreadLocalSingleton
template <
typename T,
typename Tag = detail::DefaultTag,
typename Make = detail::DefaultMake<T>>
class SingletonThreadLocal {
public:
using CreateFunc = std::function<T*(void)>;
private:
SingletonThreadLocal() = delete;
SingletonThreadLocal() : SingletonThreadLocal([]() { return new T(); }) {}
struct Wrapper {
// keep as first field, to save 1 instr in the fast path
union {
alignas(alignof(T)) unsigned char storage[sizeof(T)];
T object;
};
Wrapper** cache{};
template <typename Create>
FOLLY_NOINLINE explicit SingletonThreadLocal(Create create)
: singleton_([create = std::move(create)]() mutable {
return new ThreadLocalT([create = std::move(create)]() mutable {
return new Wrapper(std::unique_ptr<T>(create()));
});
}) {}
/* implicit */ operator T&() {
return object;
}
FOLLY_ALWAYS_INLINE static T& get() {
#ifdef FOLLY_TLS
return *localPtr() ? **localPtr() : *(*localPtr() = &getSlow());
#else
return **SingletonT::get();
#endif
}
// normal make types
template <
typename S = T,
_t<std::enable_if<is_invocable_r<S, Make>::value, int>> = 0>
Wrapper() {
(void)new (storage) S(Make{}());
}
// default and special make types for non-move-constructible T, until C++17
template <
typename S = T,
_t<std::enable_if<!is_invocable_r<S, Make>::value, int>> = 0>
Wrapper() {
(void)Make{}(storage);
}
~Wrapper() {
if (cache) {
*cache = nullptr;
}
object.~T();
}
};
private:
FOLLY_NOINLINE static T& getSlow() {
return **SingletonT::get();
FOLLY_EXPORT FOLLY_ALWAYS_INLINE static Wrapper& getWrapperInline() {
static LeakySingleton<ThreadLocal<Wrapper>, Tag> singleton;
return *singleton.get();
}
#ifdef FOLLY_TLS
FOLLY_ALWAYS_INLINE static T** localPtr() {
static FOLLY_TLS T* localPtr = nullptr;
return &localPtr;
FOLLY_NOINLINE static Wrapper& getWrapperOutline() {
return getWrapperInline();
}
#endif
class Wrapper {
public:
explicit Wrapper(std::unique_ptr<T> t) : t_(std::move(t)) {}
/// Benchmarks indicate that getSlow being inline but containing a call to
/// getWrapperOutline is faster than getSlow being outline but containing
/// a call to getWrapperInline, which would otherwise produce smaller code.
FOLLY_ALWAYS_INLINE static Wrapper& getSlow(Wrapper*& cache) {
cache = &getWrapperOutline();
cache->cache = &cache;
return *cache;
}
~Wrapper() {
public:
FOLLY_EXPORT FOLLY_ALWAYS_INLINE static T& get() {
// the absolute minimal conditional-compilation
#ifdef FOLLY_TLS
*localPtr() = nullptr;
static FOLLY_TLS Wrapper* cache;
return FOLLY_LIKELY(!!cache) ? *cache : getSlow(cache);
#else
return getWrapperInline();
#endif
}
T& operator*() { return *t_; }
private:
std::unique_ptr<T> t_;
};
using ThreadLocalT = ThreadLocal<Wrapper>;
using SingletonT = LeakySingleton<ThreadLocalT, Tag>;
SingletonT singleton_;
}
};
} // namespace folly
......@@ -344,6 +344,9 @@ class ScopedAlternateSignalStack {
setAlternateStack(stack_->data(), stack_->size());
}
ScopedAlternateSignalStack(ScopedAlternateSignalStack&&) = default;
ScopedAlternateSignalStack& operator=(ScopedAlternateSignalStack&&) = default;
~ScopedAlternateSignalStack() {
if (stack_) {
unsetAlternateStack();
......@@ -357,8 +360,7 @@ class ScopedAlternateSignalStack {
} // namespace
void FiberManager::registerAlternateSignalStack() {
static folly::SingletonThreadLocal<ScopedAlternateSignalStack> singleton;
singleton.get();
SingletonThreadLocal<ScopedAlternateSignalStack>::get();
alternateSignalStackRegistered_ = true;
}
......
......@@ -142,9 +142,7 @@ std::shared_ptr<RequestContext> RequestContext::setContext(
std::shared_ptr<RequestContext>& RequestContext::getStaticContext() {
using SingletonT = SingletonThreadLocal<std::shared_ptr<RequestContext>>;
static SingletonT singleton;
return singleton.get();
return SingletonT::get();
}
RequestContext* RequestContext::get() {
......
......@@ -36,7 +36,6 @@ struct Foo {
}
};
using FooSingletonTL = SingletonThreadLocal<Foo>;
FooSingletonTL theFooSingleton;
} // namespace
TEST(SingletonThreadLocalTest, OneSingletonPerThread) {
......@@ -64,3 +63,37 @@ TEST(SingletonThreadLocalTest, OneSingletonPerThread) {
EXPECT_EQ(threads.size(), fooCreatedCount);
EXPECT_EQ(threads.size(), fooDeletedCount);
}
TEST(SingletonThreadLocalTest, MoveConstructibleMake) {
  // Payload type: no default constructor, but a defaulted move constructor,
  // so a factory is able to return it by value.
  struct Foo {
    int a;
    int b;
    Foo(int first, int second) : a(first), b(second) {}
    Foo(Foo&&) = default;
    Foo& operator=(Foo&&) = default;
  };
  // Local type passed as the Tag template argument.
  struct Tag {};
  // Nullary factory that produces the payload by value.
  struct Make {
    Foo operator()() const {
      return Foo{3, 4};
    }
  };
  using Single = SingletonThreadLocal<Foo, Tag, Make>;
  Foo& instance = Single::get();
  // The value the factory passed as the second constructor argument must be
  // visible through the reference returned by get().
  EXPECT_EQ(4, instance.b);
}
TEST(SingletonThreadLocalTest, NotMoveConstructibleMake) {
  // Payload type: no default constructor and explicitly deleted move
  // operations, so it cannot be returned by value from a factory.
  struct Foo {
    int a;
    int b;
    Foo(int first, int second) : a(first), b(second) {}
    Foo(Foo&&) = delete;
    Foo& operator=(Foo&&) = delete;
  };
  // Local type passed as the Tag template argument.
  struct Tag {};
  // Factory that constructs the payload in place inside the supplied buffer
  // and returns a pointer to it.
  struct Make {
    Foo* operator()(unsigned char (&storage)[sizeof(Foo)]) const {
      return new (storage) Foo{3, 4};
    }
  };
  using Single = SingletonThreadLocal<Foo, Tag, Make>;
  Foo& instance = Single::get();
  // The value the factory passed as the second constructor argument must be
  // visible through the reference returned by get().
  EXPECT_EQ(4, instance.b);
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment