Commit cd58205e authored by Yedidya Feldblum, committed by Facebook GitHub Bot

remove AccessSpreader extern-template-struct

Summary:
Using `extern template struct` can be tricky; it is best avoided.

Keep static storage, but no longer depend on static initialization, avoiding all the issues surrounding ODR, global initializers, etc. Microbenchmarks show no significant hit.
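
As a rough, hypothetical sketch of that pattern (simplified names, not folly's actual code): the static object is constant-initialized to all zeros, so no dynamic initializer or destructor is ever registered, and the real setup happens lazily and is published through an atomic member.

```cpp
#include <atomic>
#include <cstdio>

// Hypothetical stand-ins; folly's real state holds a getcpu function
// pointer and a stripe table rather than a single function pointer.
using Func = int (*)();

struct State {
  // All-zeros is a valid "not yet initialized" state, and there is no
  // destructor to run at exit.
  std::atomic<Func> fn;
};

int realImpl() { return 42; }

State& state() {
  static State s;  // constant-initialized; no guard, no dynamic initializer
  if (s.fn.load(std::memory_order_acquire) == nullptr) {
    // Racy but idempotent: concurrent callers all publish the same value.
    s.fn.store(&realImpl, std::memory_order_release);
  }
  return s;
}

int main() {
  std::printf("%d\n", state().fn.load(std::memory_order_relaxed)());
  return 0;
}
```

The `GlobalState` in the diff below has the same shape: a trivial struct whose atomic `getcpu` member doubles as the "not yet initialized" flag.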

Add a global constructor so that initialization continues to be triggered at static-initialization time.
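
A minimal, self-contained sketch of that trick (made-up names; the diff's `AccessSpreaderStaticInit` is the real counterpart): a helper struct with a static instance whose constructor runs during static initialization and touches the lazily initialized state, while the lazy path ODR-uses the instance so the linker cannot discard it.

```cpp
#include <atomic>
#include <cstdio>

std::atomic<bool> ready;  // zero-initialized; no dynamic initializer

struct StaticInit {
  static StaticInit instance;
  StaticInit();  // defined after ensureInitialized()
};

void ensureInitialized() {     // stand-in for the lazy setup path
  (void)StaticInit::instance;  // ODR-use it so it is not dropped
  if (!ready.load(std::memory_order_acquire)) {
    std::puts("initializing");
    ready.store(true, std::memory_order_release);
  }
}

StaticInit::StaticInit() { ensureInitialized(); }
StaticInit StaticInit::instance;  // global constructor: runs before main()

int main() {
  ensureInitialized();  // by now the global constructor has already run
  return ready.load(std::memory_order_relaxed) ? 0 : 1;
}
```

In the diff, `AccessSpreaderBase::initialize` is the path that ODR-uses `AccessSpreaderStaticInit::instance`, and the instance's constructor calls `AccessSpreader<>::current(~size_t(0))` to force the work before main().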

Differential Revision: D27746718

fbshipit-source-id: c5288c33412a80bb813c8d615794b9773cef323c
parent c5651a00
@@ -62,8 +62,17 @@ static CacheLocality getSystemLocalityInfo() {
template <>
const CacheLocality& CacheLocality::system<std::atomic>() {
static auto* cache = new CacheLocality(getSystemLocalityInfo());
return *cache;
static std::atomic<const CacheLocality*> cache;
auto value = cache.load(std::memory_order_acquire);
if (value != nullptr) {
return *value;
}
auto next = new CacheLocality(getSystemLocalityInfo());
if (cache.compare_exchange_strong(value, next, std::memory_order_acq_rel)) {
return *next;
}
delete next;
return *value;
}
// Each level of cache has sharing sets, which are the set of cpus
@@ -346,44 +355,51 @@ int AccessSpreaderBase::degenerateGetcpu(unsigned* cpu, unsigned* node, void*) {
return 0;
}
struct AccessSpreaderStaticInit {
static AccessSpreaderStaticInit instance;
AccessSpreaderStaticInit() { (void)AccessSpreader<>::current(~size_t(0)); }
};
AccessSpreaderStaticInit AccessSpreaderStaticInit::instance;
bool AccessSpreaderBase::initialize(
Getcpu::Func& getcpuFunc,
GlobalState& state,
Getcpu::Func (&pickGetcpuFunc)(),
const CacheLocality& (&system)(),
CompactStripeTable& widthAndCpuToStripe) {
getcpuFunc = pickGetcpuFunc();
const CacheLocality& (&system)()) {
(void)AccessSpreaderStaticInit::instance; // ODR-use it so it is not dropped
auto& cacheLocality = system();
auto n = cacheLocality.numCpus;
for (size_t width = 0; width <= kMaxCpus; ++width) {
auto& row = widthAndCpuToStripe[width];
auto& row = state.table[width];
auto numStripes = std::max(size_t{1}, width);
for (size_t cpu = 0; cpu < kMaxCpus && cpu < n; ++cpu) {
auto index = cacheLocality.localityIndexByCpu[cpu];
assert(index < n);
// as index goes from 0..n, post-transform value goes from
// 0..numStripes
row[cpu] = static_cast<CompactStripe>((index * numStripes) / n);
row[cpu].store(
static_cast<CompactStripe>((index * numStripes) / n),
std::memory_order_relaxed);
assert(row[cpu] < numStripes);
}
size_t filled = n;
while (filled < kMaxCpus) {
size_t len = std::min(filled, kMaxCpus - filled);
std::memcpy(&row[filled], &row[0], len);
for (size_t i = 0; i < len; ++i) {
row[filled + i].store(
row[i].load(std::memory_order_relaxed), std::memory_order_relaxed);
}
filled += len;
}
for (size_t cpu = n; cpu < kMaxCpus; ++cpu) {
assert(row[cpu] == row[cpu - n]);
}
}
state.getcpu.exchange(pickGetcpuFunc(), std::memory_order_acq_rel);
return true;
}
} // namespace detail
/////////////// AccessSpreader
template struct AccessSpreader<std::atomic>;
SimpleAllocator::SimpleAllocator(size_t allocSize, size_t sz)
: allocSize_{allocSize}, sz_(sz) {}
@@ -201,16 +201,29 @@ class AccessSpreaderBase {
kMaxCpus - 1 <= std::numeric_limits<CompactStripe>::max(),
"stripeByCpu element type isn't wide enough");
using CompactStripeTable = CompactStripe[kMaxCpus + 1][kMaxCpus];
using CompactStripeTable = std::atomic<CompactStripe>[kMaxCpus + 1][kMaxCpus];
struct GlobalState {
/// For each level of splitting up to kMaxCpus, maps the cpu (mod
/// kMaxCpus) to the stripe. Rather than performing any inequalities
/// or modulo on the actual number of cpus, we just fill in the entire
/// array.
/// Keep as the first field to avoid extra + in the fastest path.
CompactStripeTable table;
/// Points to the getcpu-like function we are using to obtain the
/// current cpu. It should not be assumed that the returned cpu value
/// is in range.
std::atomic<Getcpu::Func> getcpu; // nullptr -> not initialized
};
static_assert(
std::is_trivial<GlobalState>::value || kCpplibVer, "not trivial");
/// Always claims to be on CPU zero, node zero
static int degenerateGetcpu(unsigned* cpu, unsigned* node, void*);
static bool initialize(
Getcpu::Func&,
Getcpu::Func (&)(),
const CacheLocality& (&)(),
CompactStripeTable&);
GlobalState& out, Getcpu::Func (&)(), const CacheLocality& (&)());
};
} // namespace detail
@@ -251,17 +264,31 @@ class AccessSpreaderBase {
/// all of the time.
template <template <typename> class Atom = std::atomic>
struct AccessSpreader : private detail::AccessSpreaderBase {
private:
struct GlobalState : detail::AccessSpreaderBase::GlobalState {};
static_assert(
std::is_trivial<GlobalState>::value || kCpplibVer, "not trivial");
public:
FOLLY_EXPORT static GlobalState& state() {
static GlobalState state; // trivial for zero ctor and zero dtor
if (FOLLY_UNLIKELY(!state.getcpu.load(std::memory_order_acquire))) {
initialize(state);
}
return state;
}
/// Returns the stripe associated with the current CPU. The returned
/// value will be < numStripes.
static size_t current(size_t numStripes) {
// widthAndCpuToStripe[0] will actually work okay (all zeros), but
static size_t current(size_t numStripes, const GlobalState& s = state()) {
// s.table[0] will actually work okay (all zeros), but
// something's wrong with the caller
assert(numStripes > 0);
unsigned cpu;
getcpuFunc(&cpu, nullptr, nullptr);
return widthAndCpuToStripe[std::min(size_t(kMaxCpus), numStripes)]
[cpu % kMaxCpus];
s.getcpu.load(std::memory_order_relaxed)(&cpu, nullptr, nullptr);
return s.table[std::min(size_t(kMaxCpus), numStripes)][cpu % kMaxCpus].load(
std::memory_order_relaxed);
}
#ifdef FOLLY_CL_USE_FOLLY_TLS
@@ -271,13 +298,17 @@ struct AccessSpreader : private detail::AccessSpreaderBase {
/// certain small number of calls, which can make the result imprecise, but
/// it is more efficient (amortized 2 ns on my dev box, compared to 12 ns for
/// current()).
static size_t cachedCurrent(size_t numStripes) {
return widthAndCpuToStripe[std::min(size_t(kMaxCpus), numStripes)]
[cpuCache().cpu()];
static size_t cachedCurrent(
size_t numStripes, const GlobalState& s = state()) {
return s.table[std::min(size_t(kMaxCpus), numStripes)][cpuCache().cpu(s)]
.load(std::memory_order_relaxed);
}
#else
/// Fallback implementation when thread-local storage isn't available.
static size_t cachedCurrent(size_t numStripes) { return current(numStripes); }
static size_t cachedCurrent(
size_t numStripes, const GlobalState& s = state()) {
return current(numStripes, s);
}
#endif
/// Returns the maximum stripe value that can be returned under any
@@ -285,27 +316,13 @@ struct AccessSpreader : private detail::AccessSpreaderBase {
static constexpr size_t maxStripeValue() { return kMaxCpus; }
private:
/// Points to the getcpu-like function we are using to obtain the
/// current cpu. It should not be assumed that the returned cpu value
/// is in range. We use a static for this so that we can prearrange a
/// valid value in the pre-constructed state and avoid the need for a
/// conditional on every subsequent invocation (not normally a big win,
/// but 20% on some inner loops here).
static Getcpu::Func getcpuFunc;
/// For each level of splitting up to kMaxCpus, maps the cpu (mod
/// kMaxCpus) to the stripe. Rather than performing any inequalities
/// or modulo on the actual number of cpus, we just fill in the entire
/// array.
static CompactStripeTable widthAndCpuToStripe;
/// Caches the current CPU and refreshes the cache every so often.
class CpuCache {
public:
unsigned cpu() {
unsigned cpu(GlobalState const& s) {
if (UNLIKELY(cachedCpuUses_-- == 0)) {
unsigned cpu;
AccessSpreader::getcpuFunc(&cpu, nullptr, nullptr);
s.getcpu.load(std::memory_order_relaxed)(&cpu, nullptr, nullptr);
cachedCpu_ = cpu % kMaxCpus;
cachedCpuUses_ = kMaxCachedCpuUses - 1;
}
@@ -326,8 +343,6 @@ struct AccessSpreader : private detail::AccessSpreaderBase {
}
#endif
static bool initialized;
/// Returns the best getcpu implementation for Atom
static Getcpu::Func pickGetcpuFunc() {
auto best = Getcpu::resolveVdsoFunc();
@@ -347,30 +362,12 @@ struct AccessSpreader : private detail::AccessSpreaderBase {
// zero stripe. Once a sanitizer gets smart enough to detect this as
// a race or undefined behavior, we can annotate it.
static bool initialize() {
static bool initialize(GlobalState& state) {
return detail::AccessSpreaderBase::initialize(
getcpuFunc,
pickGetcpuFunc,
CacheLocality::system<Atom>,
widthAndCpuToStripe);
state, pickGetcpuFunc, CacheLocality::system<Atom>);
}
};
template <template <typename> class Atom>
Getcpu::Func AccessSpreader<Atom>::getcpuFunc =
AccessSpreader<Atom>::degenerateGetcpu;
template <template <typename> class Atom>
typename AccessSpreader<Atom>::CompactStripe
AccessSpreader<Atom>::widthAndCpuToStripe[kMaxCpus + 1][kMaxCpus] = {};
template <template <typename> class Atom>
bool AccessSpreader<Atom>::initialized = AccessSpreader<Atom>::initialize();
// Suppress this instantiation in other translation units. It is
// instantiated in CacheLocality.cpp
extern template struct AccessSpreader<std::atomic>;
/**
* A simple freelist allocator. Allocates things of size sz, from
* slabs of size allocSize. Takes a lock on each
@@ -71,8 +71,14 @@ struct CachedCurrentTag {};
} // namespace
namespace folly {
template <>
size_t AccessSpreader<CachedCurrentTag>::current(size_t numStripes) {
return AccessSpreader<std::atomic>::cachedCurrent(numStripes);
const CacheLocality& CacheLocality::system<CachedCurrentTag>() {
return CacheLocality::system<>();
}
template <>
size_t AccessSpreader<CachedCurrentTag>::current(
size_t numStripes, const GlobalState& state) {
auto& alter = reinterpret_cast<const AccessSpreader::GlobalState&>(state);
return AccessSpreader::cachedCurrent(numStripes, alter);
}
} // namespace folly
@@ -83,6 +89,14 @@ BENCHMARK(AccessSpreaderUse, iters) {
}
}
BENCHMARK(StateAccessSpreaderUse, iters) {
auto& state = AccessSpreader<>::state();
for (unsigned long i = 0; i < iters; ++i) {
auto x = AccessSpreader<>::current(16, state);
folly::doNotOptimizeAway(x);
}
}
BENCHMARK(CachedAccessSpreaderUse, iters) {
for (unsigned long i = 0; i < iters; ++i) {
auto x = AccessSpreader<>::cachedCurrent(16);
@@ -90,6 +104,14 @@ BENCHMARK(CachedAccessSpreaderUse, iters) {
}
}
BENCHMARK(StateCachedAccessSpreaderUse, iters) {
auto& state = AccessSpreader<>::state();
for (unsigned long i = 0; i < iters; ++i) {
auto x = AccessSpreader<>::cachedCurrent(16, state);
folly::doNotOptimizeAway(x);
}
}
BENCHMARK(BaselineAtomicIncrement, iters) {
std::atomic<int> value;
for (unsigned long i = 0; i < iters; ++i) {
@@ -107,6 +129,16 @@ BENCHMARK(CachedAccessSpreaderAtomicIncrement, iters) {
}
}
BENCHMARK(StateCachedAccessSpreaderAtomicIncrement, iters) {
auto& state = AccessSpreader<>::state();
std::array<std::atomic<int>, 64> values;
for (unsigned long i = 0; i < iters; ++i) {
auto x = AccessSpreader<>::cachedCurrent(64, state);
++values[x];
folly::doNotOptimizeAway(values[x]);
}
}
// Benchmark scores here reflect the time for 32 threads to perform an
// atomic increment on a dual-socket E5-2660 @ 2.2Ghz. Surprisingly,
// if we don't separate the counters onto unique 128 byte stripes the