Commit 03fe7209, authored by Nick Terrell, committed by Facebook Github Bot

Add AccessSpreader<>::cachedCurrent()

Summary:
`AccessSpreader::cachedCurrent()` caches the result of `AccessSpreader::getcpuFunc()` in a thread-local for 32 calls. A cached call takes 2 ns, whereas a call to `current()` takes 12 ns. The trade-off is that the result is imprecise, until the next refresh, when a thread migrates to a new cpu.

I chose 32 as the number of calls because it only has a 10% overhead over never refreshing (2.05 ns vs 1.83 ns), where 16 has a 30% overhead, and it performs just as well as 64.

Reviewed By: ot

Differential Revision: D10151009

fbshipit-source-id: 07ed292dfdcdcedcb74c24279f7773a80ad09348
parent e0ec3bcd
......@@ -239,6 +239,24 @@ struct AccessSpreader {
[cpu % kMaxCpus];
}
#ifdef FOLLY_TLS
/// Returns the stripe associated with the current CPU. The returned
/// value will be < numStripes.
/// This function caches the current cpu in a thread-local variable for a
/// certain small number of calls, which can make the result imprecise, but
/// it is more efficient (amortized 2 ns on my dev box, compared to 12 ns for
/// current()).
static size_t cachedCurrent(size_t numStripes) {
  // Row index: any numStripes above kMaxCpus is clamped to kMaxCpus, which
  // is the last row of the (kMaxCpus + 1)-row lookup table.
  const size_t width = std::min(size_t(kMaxCpus), numStripes);
  // Column index: the cached CPU id, which may lag behind the real one
  // until the cache is next refreshed.
  const unsigned cpuIndex = cpuCache.cpu();
  return widthAndCpuToStripe[width][cpuIndex];
}
#else
/// Fallback implementation when thread-local storage isn't available.
static size_t cachedCurrent(size_t numStripes) {
  // Nothing is cached in this configuration: simply delegate to current().
  const size_t stripe = current(numStripes);
  return stripe;
}
#endif
private:
/// If there are more cpus than this nothing will crash, but there
/// might be unnecessary sharing
......@@ -267,6 +285,30 @@ struct AccessSpreader {
/// array.
static CompactStripe widthAndCpuToStripe[kMaxCpus + 1][kMaxCpus];
/// Caches the current CPU and refreshes the cache every so often.
class CpuCache {
 public:
  /// Returns the remembered CPU id, already reduced modulo kMaxCpus.
  /// When the countdown has run out (including on the very first call,
  /// since it starts at zero) the id is re-read through getcpuFunc and the
  /// countdown is re-armed, so one lookup serves kMaxCachedCpuUses calls.
  unsigned cpu() {
    // Snapshot the counter, then decrement it unconditionally. On the
    // refresh path the wrapped-around value is overwritten below.
    const unsigned usesLeft = cachedCpuUses_--;
    if (UNLIKELY(usesLeft == 0)) {
      unsigned freshCpu;
      AccessSpreader::getcpuFunc(&freshCpu, nullptr, nullptr);
      // This call counts as the first use of the fresh value, hence "- 1".
      cachedCpuUses_ = kMaxCachedCpuUses - 1;
      cachedCpu_ = freshCpu % kMaxCpus;
    }
    return cachedCpu_;
  }

 private:
  /// Number of cpu() calls answered per getcpuFunc lookup.
  static constexpr unsigned kMaxCachedCpuUses = 32;
  /// Last CPU id obtained from getcpuFunc, modulo kMaxCpus.
  unsigned cachedCpu_{0};
  /// Calls that may still be answered from cachedCpu_ before refreshing.
  unsigned cachedCpuUses_{0};
};
#ifdef FOLLY_TLS
/// The CpuCache instance consulted by cachedCurrent(). It is qualified with
/// FOLLY_TLS and only exists when that macro is defined.
static FOLLY_TLS CpuCache cpuCache;
#endif
static bool initialized;
/// Returns the best getcpu implementation for Atom
......@@ -331,6 +373,12 @@ template <template <typename> class Atom>
typename AccessSpreader<Atom>::CompactStripe
AccessSpreader<Atom>::widthAndCpuToStripe[kMaxCpus + 1][kMaxCpus] = {};
#ifdef FOLLY_TLS
// Out-of-class definition of the FOLLY_TLS-qualified static member
// AccessSpreader<Atom>::cpuCache. No initializer is given, so CpuCache's
// in-class member initializers apply (both fields start at 0), which makes
// the first cpu() call perform a real lookup.
template <template <typename> class Atom>
FOLLY_TLS
typename AccessSpreader<Atom>::CpuCache AccessSpreader<Atom>::cpuCache;
#endif
// Out-of-class definition of the static member `initialized`; its
// initializer is the result of calling AccessSpreader<Atom>::initialize().
template <template <typename> class Atom>
bool AccessSpreader<Atom>::initialized = AccessSpreader<Atom>::initialize();
......
......@@ -375,6 +375,30 @@ TEST(AccessSpreader, Simple) {
}
}
// Single-threaded check: for every stripe count in [1, 200) the stripe
// returned by cachedCurrent() must be strictly below that count.
TEST(AccessSpreader, SimpleCached) {
  size_t numStripes = 1;
  while (numStripes < 200) {
    EXPECT_LT(AccessSpreader<>::cachedCurrent(numStripes), numStripes);
    ++numStripes;
  }
}
// Multi-threaded check: four threads each call cachedCurrent() repeatedly
// for several stripe counts and verify that every result stays in range.
TEST(AccessSpreader, ConcurrentAccessCached) {
  // Per-thread workload: for each stripe count, issue 199 pairs of
  // back-to-back calls, then yield before moving to the next count.
  const auto worker = []() {
    for (size_t numStripes : {16, 32, 64}) {
      for (size_t round = 1; round < 200; ++round) {
        EXPECT_LT(AccessSpreader<>::cachedCurrent(numStripes), numStripes);
        EXPECT_LT(AccessSpreader<>::cachedCurrent(numStripes), numStripes);
      }
      std::this_thread::yield();
    }
  };

  std::vector<std::thread> workers;
  for (size_t started = 0; started < 4; ++started) {
    workers.emplace_back(worker);
  }
  for (auto& t : workers) {
    t.join();
  }
}
#ifdef FOLLY_TLS
#define DECLARE_SPREADER_TAG(tag, locality, func) \
namespace { \
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment