Commit dc4be288 authored by Dave Watson's avatar Dave Watson Committed by Facebook Github Bot

Add a fast path to folly::ThreadLocal

Summary:
Currently folly::ThreadLocal[Ptr] is pretty heavy-weight for a get():

1) call instance(), take a static init guard, branch
2) call getThreadEntry, check if thread_local is not null, branch
3) check if id < threadEntry->capacity, branch
4) Finally, return threadEntry->elements[id]

If we have real thread_locals, we can do better by caching the capacity directly,
combining all three checks:

1) check if id < threadLocalCapacityCheck, branch.  If not, do slow path.
2) return threadEntry->elements[id].  threadEntry is never null if capacity > 0, and
    instance() setup work is done during the first getThreadEntry call, when threadLocalCapacityCheck == 0.

Reviewed By: yfeldblum

Differential Revision: D6379878

fbshipit-source-id: 4fc7564bbb2f319d65875124026aef28d910ef06
parent 8bfce3ed
...@@ -161,7 +161,7 @@ class ThreadLocalPtr { ...@@ -161,7 +161,7 @@ class ThreadLocalPtr {
} }
T* get() const { T* get() const {
threadlocal_detail::ElementWrapper& w = StaticMeta::instance().get(&id_); threadlocal_detail::ElementWrapper& w = StaticMeta::get(&id_);
return static_cast<T*>(w.ptr); return static_cast<T*>(w.ptr);
} }
...@@ -174,14 +174,14 @@ class ThreadLocalPtr { ...@@ -174,14 +174,14 @@ class ThreadLocalPtr {
} }
T* release() { T* release() {
threadlocal_detail::ElementWrapper& w = StaticMeta::instance().get(&id_); threadlocal_detail::ElementWrapper& w = StaticMeta::get(&id_);
return static_cast<T*>(w.release()); return static_cast<T*>(w.release());
} }
void reset(T* newPtr = nullptr) { void reset(T* newPtr = nullptr) {
auto guard = makeGuard([&] { delete newPtr; }); auto guard = makeGuard([&] { delete newPtr; });
threadlocal_detail::ElementWrapper& w = StaticMeta::instance().get(&id_); threadlocal_detail::ElementWrapper& w = StaticMeta::get(&id_);
w.dispose(TLPDestructionMode::THIS_THREAD); w.dispose(TLPDestructionMode::THIS_THREAD);
guard.dismiss(); guard.dismiss();
...@@ -235,7 +235,7 @@ class ThreadLocalPtr { ...@@ -235,7 +235,7 @@ class ThreadLocalPtr {
deleter(newPtr, TLPDestructionMode::THIS_THREAD); deleter(newPtr, TLPDestructionMode::THIS_THREAD);
} }
}); });
threadlocal_detail::ElementWrapper& w = StaticMeta::instance().get(&id_); threadlocal_detail::ElementWrapper& w = StaticMeta::get(&id_);
w.dispose(TLPDestructionMode::THIS_THREAD); w.dispose(TLPDestructionMode::THIS_THREAD);
guard.dismiss(); guard.dismiss();
w.set(newPtr, deleter); w.set(newPtr, deleter);
......
...@@ -290,7 +290,7 @@ struct StaticMetaBase { ...@@ -290,7 +290,7 @@ struct StaticMetaBase {
*/ */
void reserve(EntryID* id); void reserve(EntryID* id);
ElementWrapper& get(EntryID* ent); ElementWrapper& getElement(EntryID* ent);
static void initAtFork(); static void initAtFork();
static void registerAtFork( static void registerAtFork(
...@@ -335,7 +335,35 @@ struct StaticMeta : StaticMetaBase { ...@@ -335,7 +335,35 @@ struct StaticMeta : StaticMetaBase {
return *instance; return *instance;
} }
ElementWrapper& get(EntryID* ent) { #ifdef FOLLY_TLD_USE_FOLLY_TLS
// Eliminate as many branches as possible:
// One branch on capacityCache, vs. three:
// 1) instance() static initializer
// 2) getThreadEntry null check
// 3) elementsCapacity size check.
// 3 will never be true if 1 or 2 are false.
// Fast-path lookup of the ElementWrapper for `ent`.
//
// Reads the entry's id with getOrInvalid() and compares it against
// capacityCache_. If the cached capacity does not cover the id, the lookup is
// delegated to getSlow(ent); otherwise the element is read directly from
// threadEntryCache_->elements with no further checks.
FOLLY_ALWAYS_INLINE static ElementWrapper& get(EntryID* ent) {
uint32_t id = ent->getOrInvalid();
// capacityCache_ is defined with an initial value of 0, so this branch is
// taken for every id until getSlow() has stored a capacity.
// NOTE(review): an invalid id (uint32_t max, per the comment in getElement)
// is presumably also routed to the slow path here -- confirm.
if (UNLIKELY(capacityCache_ <= id)) {
return getSlow(ent);
} else {
// In the code visible here, capacityCache_ is only written by getSlow(),
// after getThreadEntry() has been called, so threadEntryCache_ is expected
// to be non-null whenever this branch runs.
return threadEntryCache_->elements[id];
}
}
// Slow path for get(): resolves the element through
// instance().getElement(ent), then stores the thread entry's
// elementsCapacity in capacityCache_ so that later get() calls whose id is
// below that capacity can skip this path.
static ElementWrapper& getSlow(EntryID* ent) {
ElementWrapper& res = instance().getElement(ent);
// Cache new capacity. It is read after getElement() has run.
// NOTE(review): presumably because getElement() may grow the thread entry's
// element storage -- that code is outside this view; confirm.
capacityCache_ = getThreadEntry()->elementsCapacity;
return res;
}
#else
// Variant of get() used when FOLLY_TLD_USE_FOLLY_TLS is not defined: there is
// no cached capacity or thread entry, so every lookup is forwarded to
// instance().getElement(ent).
static ElementWrapper& get(EntryID* ent) {
return instance().getElement(ent);
}
#endif
ElementWrapper& getElement(EntryID* ent) {
ThreadEntry* threadEntry = getThreadEntry(); ThreadEntry* threadEntry = getThreadEntry();
uint32_t id = ent->getOrInvalid(); uint32_t id = ent->getOrInvalid();
// if id is invalid, it is equal to uint32_t's max value. // if id is invalid, it is equal to uint32_t's max value.
...@@ -369,11 +397,10 @@ struct StaticMeta : StaticMetaBase { ...@@ -369,11 +397,10 @@ struct StaticMeta : StaticMetaBase {
inline static ThreadEntry* getThreadEntry() { inline static ThreadEntry* getThreadEntry() {
#ifdef FOLLY_TLD_USE_FOLLY_TLS #ifdef FOLLY_TLD_USE_FOLLY_TLS
static FOLLY_TLS ThreadEntry* threadEntryCache{nullptr}; if (UNLIKELY(threadEntryCache_ == nullptr)) {
if (UNLIKELY(threadEntryCache == nullptr)) { threadEntryCache_ = instance().threadEntry_();
threadEntryCache = instance().threadEntry_();
} }
return threadEntryCache; return threadEntryCache_;
#else #else
return instance().threadEntry_(); return instance().threadEntry_();
#endif #endif
...@@ -397,7 +424,18 @@ struct StaticMeta : StaticMetaBase { ...@@ -397,7 +424,18 @@ struct StaticMeta : StaticMetaBase {
} }
instance().lock_.unlock(); instance().lock_.unlock();
} }
#ifdef FOLLY_TLD_USE_FOLLY_TLS
static FOLLY_TLS ThreadEntry* threadEntryCache_;
static FOLLY_TLS size_t capacityCache_;
#endif
}; };
#ifdef FOLLY_TLD_USE_FOLLY_TLS
template <class Tag, class AccessMode>
FOLLY_TLS ThreadEntry* StaticMeta<Tag, AccessMode>::threadEntryCache_{nullptr};
template <class Tag, class AccessMode>
FOLLY_TLS size_t StaticMeta<Tag, AccessMode>::capacityCache_{0};
#endif
} // namespace threadlocal_detail } // namespace threadlocal_detail
} // namespace folly } // namespace folly
...@@ -132,13 +132,13 @@ int main(int argc, char** argv) { ...@@ -132,13 +132,13 @@ int main(int argc, char** argv) {
============================================================================ ============================================================================
folly/test/ThreadLocalBenchmark.cpp relative time/iter iters/s folly/test/ThreadLocalBenchmark.cpp relative time/iter iters/s
============================================================================ ============================================================================
BM_mt_tlp 2.30ns 434.53M BM_mt_tlp 1.92ns 520.02M
BM_mt_pthread_get_specific 2.69ns 371.75M BM_mt_pthread_get_specific 2.69ns 372.15M
BM_mt_boost_tsp 11.66ns 85.78M BM_mt_boost_tsp 11.81ns 84.67M
---------------------------------------------------------------------------- ----------------------------------------------------------------------------
BM_mt_tlp_multi 12.46ns 80.25M BM_mt_tlp_multi 7.53ns 132.79M
BM_mt_pthread_get_specific_multi 16.58ns 60.32M BM_mt_pthread_get_specific_multi 15.80ns 63.29M
BM_mt_boost_tsp_multi 70.85ns 14.12M BM_mt_boost_tsp_multi 71.70ns 13.95M
---------------------------------------------------------------------------- ----------------------------------------------------------------------------
============================================================================ ============================================================================
*/ */
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment