Commit 8f802235 authored by Yedidya Feldblum, committed by Facebook GitHub Bot

un-monomorphize parts of SingletonRelaxedCounter

Summary: SingletonRelaxedCounter is templated over tag types, but this leads to heavy monomorphization of its slow paths. Extract the common, tag-independent code into a shared base.

Reviewed By: aary

Differential Revision: D27383671

fbshipit-source-id: e419579e4d9ab84b5107670bd1e94e1768dcebf4
parent 0a8487bd
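The diff below keeps the per-tag template as a thin shell and moves the slow paths into a non-template base, which is reached through a constexpr "vtable": a struct of function references bound to the tag-specific accessors at compile time. Before the diff itself, here is a minimal standalone sketch of that trick; the names (CounterBase, Counter, add_slow) are illustrative and not the folly API, and C++17 is assumed so the constexpr static member needs no out-of-line definition:

```cpp
#include <atomic>

// Tag-independent slow path, compiled exactly once.
class CounterBase {
 protected:
  using GetGlobal = std::atomic<long>& ();

  // The constexpr "vtable": a bundle of function references through which the
  // shared code reaches the tag-specific state.
  struct Arg {
    GetGlobal& global;
  };

  // One shared definition; only the Arg passed in differs per tag.
  static void add_slow(long v, Arg const& arg) noexcept {
    arg.global().fetch_add(v, std::memory_order_relaxed);
  }
};

// Thin tag-templated shell: the only monomorphized code is trivial glue.
template <typename Tag>
class Counter : private CounterBase {
 public:
  static void add(long v) { add_slow(v, arg); }

 private:
  static std::atomic<long>& global() {
    static std::atomic<long> counter{0}; // one global counter per tag
    return counter;
  }
  static constexpr Arg arg{global}; // per-tag vtable instance (C++17 inline)
};

struct TagA {};
struct TagB {};

int main() {
  Counter<TagA>::add(1); // distinct counters per tag...
  Counter<TagB>::add(2); // ...but one shared copy of add_slow's object code
}
```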
@@ -30,55 +30,20 @@
 namespace folly {
 
-// SingletonRelaxedCounter
-//
-// A singleton-per-tag relaxed counter. Optimized for increment/decrement
-// runtime performance under contention and inlined fast path code size.
-//
-// The cost of computing the value of the counter is linear in the number of
-// threads which perform increments/decrements, and computing the value of the
-// counter is exclusive with thread exit and dlclose. The result of this
-// computation is not a point-in-time snapshot of increments and decrements
-// summed, but is an approximation which may exclude any subset of increments
-// and decrements that do not happen before the start of the computation.
-//
-// Templated over the integral types. When templated over an unsigned integral
-// type, it is assumed that decrements do not exceed increments, and if within
-// computation of the value of the counter more decrements are observed to
-// exceed increments then the excess decrements are ignored. This avoids the
-// scenario of incrementing and decrementing once each in different threads,
-// and concurrently observing a computed value of the counter of 2^64 - 1.
-//
-// Templated over the tag types. Each unique pair of integral type and tag type
-// is a different counter.
-//
-// Implementation:
-// Uses a thread-local counter when possible to avoid contention, and a global
-// counter as a fallback. The total count at any given time is computed by
-// summing over the global counter plus all of the thread-local counters; since
-// the total sum is not a snapshot of the value at any given point in time, it
-// is a relaxed sum; when the system quiesces (i.e., when no concurrent
-// increments or decrements are happening and no threads are going through
-// thread exit phase), the sum is exact.
-template <typename Int, typename Tag>
-class SingletonRelaxedCounter {
- public:
-  static void add(Int value) { mutate(+to_signed(value)); }
-  static void sub(Int value) { mutate(-to_signed(value)); }
-  static Int count() {
-    auto const& global = Global::instance();
-    auto count = global.fallback.load(std::memory_order_relaxed);
-    auto const tracking = global.tracking.rlock();
-    for (auto const& kvp : tracking->locals) {
-      count += kvp.first->load(std::memory_order_relaxed);
-    }
-    return std::is_unsigned<Int>::value
-        ? to_unsigned(std::max(Signed(0), count))
-        : count;
-  }
-
- private:
+namespace detail {
+
+// SingletonRelaxedCounterBase
+//
+// Extracts tag-independent functionality from SingletonRelaxedCounter below
+// to avoid the monomorphization the compiler would otherwise perform.
+//
+// Tricks:
+// * Use constexpr vtables as the polymorphization mechanism.
+// * Use double-noinline outer-only-noexcept definitions to shrink code size in
+//   inline or monomorphized slow paths.
+template <typename Int>
+class SingletonRelaxedCounterBase {
+ protected:
   using Signed = std::make_signed_t<Int>;
   using Counter = std::atomic<Signed>;
@@ -94,6 +59,10 @@ class SingletonRelaxedCounter {
 
   struct LocalLifetime;
 
+  // Global
+  //
+  // Tracks all of the per-thread/per-dso counters and lifetimes and maintains
+  // a global fallback counter.
   struct Global {
     struct Tracking {
       using CounterSet = std::unordered_set<Counter*>;
@@ -103,22 +72,33 @@
     Counter fallback; // used instead of local during thread destruction
     folly::Synchronized<Tracking> tracking;
-
-    static Global& instance() {
-      return folly::detail::createGlobal<Global, Tag>();
-    }
   };
 
+  using GetGlobal = Global&();
+  using GetLocal = CounterAndCache&();
+  using GetLifetime = LocalLifetime&();
+
+  struct Arg {
+    GetGlobal& global;
+    GetLocal& local;
+    GetLifetime& lifetime;
+  };
+
-  // manages local().cache, global().tracking, and moving outstanding counts
-  // from local().counter to global().counter during thread destruction
+  // LocalLifetime
+  //
+  // Manages local().cache, global().tracking, and moving outstanding counts
+  // from local().counter to global().counter during thread destruction.
   //
-  // the counter-set is within Global to reduce per-thread overhead for threads
+  // The counter-set is within Global to reduce per-thread overhead for threads
   // which do not participate in counter mutations, rather than being a member
-  // field of LocalLifetime; this comes at the cost of the slow path always
-  // acquiring a unique lock on the global mutex
+  // field of LocalLifetime. This comes at the cost of the slow path always
+  // acquiring a unique lock on the global mutex.
   struct LocalLifetime {
-    ~LocalLifetime() {
-      auto& global = Global::instance();
+    FOLLY_NOINLINE void destroy(GetGlobal& get_global) noexcept {
+      destroy_(get_global);
+    }
+    FOLLY_NOINLINE void destroy_(GetGlobal& get_global) {
+      auto& global = get_global();
       auto const tracking = global.tracking.wlock();
       auto& lifetimes = tracking->lifetimes[this];
       for (auto ctr : lifetimes) {
@@ -132,8 +112,10 @@ class SingletonRelaxedCounter {
       tracking->lifetimes.erase(this);
     }
 
-    void track(CounterAndCache& state) {
-      auto& global = Global::instance();
+    FOLLY_NOINLINE void track(Global& global, CounterAndCache& state) noexcept {
+      track_(global, state);
+    }
+    FOLLY_NOINLINE void track_(Global& global, CounterAndCache& state) {
      state.cache = &state.counter;
      auto const tracking = global.tracking.wlock();
      auto const inserted = tracking->lifetimes[this].insert(&state.counter);
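The track/track_ pair above (like destroy/destroy_ earlier) is the "double-noinline outer-only-noexcept" trick named in the header comment. A plausible reading, sketched below under assumed semantics with hypothetical names ([[gnu::noinline]] standing in for FOLLY_NOINLINE): the noexcept noinline outer function is what inlined fast paths call, so they emit no exception-propagation code for it, while the potentially-throwing body lives in its own noinline function so the noexcept wrapper guards a single call rather than the whole body:

```cpp
#include <mutex>
#include <unordered_set>

class Registry {
 public:
  // Outer: noinline + noexcept. Callers see a tiny, non-throwing call and
  // need no unwind paths for it.
  [[gnu::noinline]] void track(int* p) noexcept { track_(p); }

 private:
  // Inner: noinline, not noexcept. The potentially-throwing body lives here,
  // so the terminate guard in track() wraps one call, not all of this code.
  [[gnu::noinline]] void track_(int* p) {
    std::lock_guard<std::mutex> lock(mutex_);
    items_.insert(p); // may throw std::bad_alloc
  }

  std::mutex mutex_;
  std::unordered_set<int*> items_;
};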
@@ -141,56 +123,147 @@
     }
   };
 
+  FOLLY_NOINLINE static Int aggregate(GetGlobal& get_global) noexcept {
+    return aggregate_(get_global);
+  }
+  FOLLY_NOINLINE static Int aggregate_(GetGlobal& get_global) {
+    auto& global = get_global();
+    auto count = global.fallback.load(std::memory_order_relaxed);
+    auto const tracking = global.tracking.rlock();
+    for (auto const& kvp : tracking->locals) {
+      count += kvp.first->load(std::memory_order_relaxed);
+    }
+    return std::is_unsigned<Int>::value
+        ? to_unsigned(std::max(Signed(0), count))
+        : count;
+  }
+
   FOLLY_ERASE static void mutate(Signed v, CounterRefAndLocal cl) {
     auto& c = *cl.counter;
     if (cl.local) {
       // splitting load/store on the local counter is faster than fetch-and-add
       c.store(c.load(std::memory_order_relaxed) + v, std::memory_order_relaxed);
     } else {
       // but is not allowed on the global counter because mutations may be lost
       c.fetch_add(v, std::memory_order_relaxed);
     }
   }
 
-  FOLLY_NOINLINE static void mutate_slow(Signed v) { mutate(v, counter()); }
+  FOLLY_NOINLINE static void mutate_slow(Signed v, Arg const& arg) noexcept {
+    mutate(v, counter(arg));
+  }
+
+  FOLLY_NOINLINE static Counter& counter_slow(Arg const& arg) noexcept {
+    auto& global = arg.global();
+    if (threadlocal_detail::StaticMetaBase::dying()) {
+      return global.fallback;
+    }
+    auto& state = arg.local();
+    arg.lifetime().track(global, state); // idempotent
+    auto const cache = state.cache;
+    return FOLLY_LIKELY(!!cache) ? *cache : global.fallback;
+  }
+
+  FOLLY_ERASE static CounterRefAndLocal counter(Arg const& arg) {
+    auto& state = arg.local();
+    auto const cache = state.cache; // a copy! null before/after LocalLifetime
+    auto const counter = FOLLY_LIKELY(!!cache) ? cache : &counter_slow(arg);
+    // cache is a stale nullptr after the first call to counter_slow; this is
+    // intentional for the side-effect of shrinking the inline fast path
+    return CounterRefAndLocal{counter, !!cache};
+  }
+};
+
+} // namespace detail
+
+// SingletonRelaxedCounter
+//
+// A singleton-per-tag relaxed counter. Optimized for increment/decrement
+// runtime performance under contention and inlined fast path code size.
+//
+// The cost of computing the value of the counter is linear in the number of
+// threads which perform increments/decrements, and computing the value of the
+// counter is exclusive with thread exit and dlclose. The result of this
+// computation is not a point-in-time snapshot of increments and decrements
+// summed, but is an approximation which may exclude any subset of increments
+// and decrements that do not happen before the start of the computation.
+//
+// Templated over the integral types. When templated over an unsigned integral
+// type, it is assumed that decrements do not exceed increments, and if within
+// computation of the value of the counter more decrements are observed to
+// exceed increments then the excess decrements are ignored. This avoids the
+// scenario of incrementing and decrementing once each in different threads,
+// and concurrently observing a computed value of the counter of 2^64 - 1.
+//
+// Templated over the tag types. Each unique pair of integral type and tag type
+// is a different counter.
+//
+// Implementation:
+// Uses a thread-local counter when possible to avoid contention, and a global
+// counter as a fallback. The total count at any given time is computed by
+// summing over the global counter plus all of the thread-local counters; since
+// the total sum is not a snapshot of the value at any given point in time, it
+// is a relaxed sum; when the system quiesces (i.e., when no concurrent
+// increments or decrements are happening and no threads are going through
+// thread exit phase), the sum is exact.
+//
+// Most of the implementation is in SingletonRelaxedCounterBase to avoid excess
+// monomorphization.
+template <typename Int, typename Tag>
+class SingletonRelaxedCounter
+    : private detail::SingletonRelaxedCounterBase<Int> {
+ public:
+  static void add(Int value) { mutate(+to_signed(value)); }
+  static void sub(Int value) { mutate(-to_signed(value)); }
+  static Int count() { return aggregate(global); }
+
+ private:
+  using Base = detail::SingletonRelaxedCounterBase<Int>;
+  using Base::aggregate;
+  using Base::mutate;
+  using Base::mutate_slow;
+  using typename Base::Arg;
+  using typename Base::CounterAndCache;
+  using typename Base::GetGlobal;
+  using typename Base::Global;
+  using typename Base::LocalLifetime;
+  using typename Base::Signed;
+
+  struct MonoLocalLifetime : LocalLifetime {
+    ~MonoLocalLifetime() noexcept(false) { LocalLifetime::destroy(global); }
+  };
+
+  FOLLY_NOINLINE static void mutate_slow(Signed v) noexcept {
+    mutate_slow(v, arg);
+  }
 
   FOLLY_ERASE static void mutate(Signed v, void (&slow)(Signed) = mutate_slow) {
     auto const cache = local().cache; // a copy! null before/after LocalLifetime
     // fun-ref to trick compiler into emitting a tail call
     FOLLY_LIKELY(!!cache) ? mutate(v, {cache, true}) : slow(v);
   }
 
+  static constexpr GetGlobal& global = folly::detail::createGlobal<Global, Tag>;
+
   FOLLY_EXPORT FOLLY_ALWAYS_INLINE static CounterAndCache& local() {
     // this is a member function local instead of a class member because of
     // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66944
     static thread_local CounterAndCache instance;
     return instance;
   }
 
   FOLLY_EXPORT FOLLY_ALWAYS_INLINE static LocalLifetime& lifetime() {
-    static thread_local LocalLifetime lifetime;
+    static thread_local MonoLocalLifetime lifetime;
     return lifetime;
   }
 
-  FOLLY_NOINLINE static Counter* counterSlow(CounterAndCache& state) {
-    if (threadlocal_detail::StaticMetaBase::dying()) {
-      return &Global::instance().fallback;
-    }
-    lifetime().track(state); // idempotent
-    auto const cache = state.cache;
-    return FOLLY_LIKELY(!!cache) ? cache : &Global::instance().fallback;
-  }
-
-  FOLLY_ALWAYS_INLINE static CounterRefAndLocal counter() {
-    auto& state = local();
-    auto const cache = state.cache; // a copy! null before/after LocalLifetime
-    auto const counter = FOLLY_LIKELY(!!cache) ? cache : counterSlow(state);
-    // cache is a stale nullptr after the first call to counterSlow(); this is
-    // intentional for the side-effect of shrinking the inline fast path
-    return CounterRefAndLocal{counter, !!cache};
-  }
+  static constexpr Arg arg{global, local, lifetime};
 };
 
+template <typename Int, typename Tag>
+constexpr typename SingletonRelaxedCounter<Int, Tag>::Arg
+    SingletonRelaxedCounter<Int, Tag>::arg;
+
 template <typename Counted>
 class SingletonRelaxedCountableAccess;
@@ -206,20 +279,20 @@ class SingletonRelaxedCountableAccess;
 template <typename Counted>
 class SingletonRelaxedCountable {
  public:
-  SingletonRelaxedCountable() {
+  SingletonRelaxedCountable() noexcept {
     static_assert(
         std::is_base_of<SingletonRelaxedCountable, Counted>::value, "non-crtp");
     Counter::add(1);
   }
 
-  ~SingletonRelaxedCountable() {
+  ~SingletonRelaxedCountable() noexcept {
     static_assert(
         std::is_base_of<SingletonRelaxedCountable, Counted>::value, "non-crtp");
     Counter::sub(1);
   }
 
-  SingletonRelaxedCountable(const SingletonRelaxedCountable&)
+  SingletonRelaxedCountable(const SingletonRelaxedCountable&) noexcept
       : SingletonRelaxedCountable() {}
-  SingletonRelaxedCountable(SingletonRelaxedCountable&&)
+  SingletonRelaxedCountable(SingletonRelaxedCountable&&) noexcept
       : SingletonRelaxedCountable() {}
 
   SingletonRelaxedCountable& operator=(const SingletonRelaxedCountable&) =
...
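For reference, a usage sketch of the counter as exposed by this file. Only add, sub, and count are taken from the diff above; the include path and all other names are assumptions:

```cpp
#include <cstddef>

#include <folly/experimental/SingletonRelaxedCounter.h> // path assumed

struct ConnectionCountTag {}; // each (Int, Tag) pair is a distinct counter

using ConnectionCounter =
    folly::SingletonRelaxedCounter<std::size_t, ConnectionCountTag>;

void onConnectionOpened() { ConnectionCounter::add(1); }
void onConnectionClosed() { ConnectionCounter::sub(1); }

std::size_t approxOpenConnections() {
  // A relaxed sum, not a point-in-time snapshot; exact only when the system
  // quiesces. Cost is linear in the number of mutating threads.
  return ConnectionCounter::count();
}
```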