Commit 9b9fb1d6 authored by Nathan Bronson, committed by Jordan DeLong

centralize cache-line alignment goo into CacheLocality

Summary:
The alignment requirements and details required to avoid false
sharing belong in CacheLocality, so this diff moves them there.

@override-unit-failures

Test Plan: fbmake runtests

Reviewed By: davejwatson@fb.com

FB internal diff: D1086090
parent 7c3790ca
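
For orientation before the diff: the pattern being centralized is sketched below, where fields written by different threads are each forced onto their own false-sharing-free region. This is only an illustrative sketch; it borrows the CacheLocality::kFalseSharingRange constant and the FOLLY_ALIGN_TO_AVOID_FALSE_SHARING macro introduced in this diff, while the Counters struct itself is a made-up example, not code from the commit.

    // Illustrative sketch only; Counters is hypothetical.
    #include <atomic>
    #include <cstdint>
    #include <folly/detail/CacheLocality.h>

    struct Counters {
      // Each hot counter starts its own 128-byte region, so a producer
      // bumping pushes_ does not invalidate the cache line holding pops_.
      std::atomic<uint64_t> FOLLY_ALIGN_TO_AVOID_FALSE_SHARING pushes_;
      std::atomic<uint64_t> FOLLY_ALIGN_TO_AVOID_FALSE_SHARING pops_;
      // Alignment alone does not protect the last field from whatever is
      // allocated right after the struct, so pad out the trailing region.
      char padding_[folly::detail::CacheLocality::kFalseSharingRange
                    - sizeof(std::atomic<uint64_t>)];
    };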
@@ -25,9 +25,11 @@
 #include <linux/futex.h>
 #include <string.h>
 #include <sys/syscall.h>
+#include <type_traits>
 #include <unistd.h>
 #include <folly/Traits.h>
+#include <folly/detail/CacheLocality.h>
 #include <folly/detail/Futex.h>
 namespace folly {
@@ -80,11 +82,13 @@ template <typename T> class MPMCPipelineStageImpl;
 /// FOLLY_ASSUME_FBVECTOR_COMPATIBLE then your type can be put in
 /// MPMCQueue.
 template<typename T,
-         template<typename> class Atom = std::atomic,
-         typename = typename std::enable_if<
-             std::is_nothrow_constructible<T,T&&>::value ||
-             folly::IsRelocatable<T>::value>::type>
+         template<typename> class Atom = std::atomic>
 class MPMCQueue : boost::noncopyable {
+  static_assert(std::is_nothrow_constructible<T,T&&>::value ||
+                folly::IsRelocatable<T>::value,
+      "T must be relocatable or have a noexcept move constructor");
   friend class detail::MPMCPipelineStageImpl<T>;
  public:
   typedef T value_type;
@@ -100,7 +104,11 @@ class MPMCQueue : boost::noncopyable {
     , popSpinCutoff_(0)
   {
     // ideally this would be a static assert, but g++ doesn't allow it
-    assert(alignof(MPMCQueue<T,Atom>) >= kFalseSharingRange);
+    assert(alignof(MPMCQueue<T,Atom>)
+           >= detail::CacheLocality::kFalseSharingRange);
+    assert(static_cast<uint8_t*>(static_cast<void*>(&popTicket_))
+           - static_cast<uint8_t*>(static_cast<void*>(&pushTicket_))
+           >= detail::CacheLocality::kFalseSharingRange);
   }
   /// A default-constructed queue is useful because a usable (non-zero
@@ -324,27 +332,15 @@ class MPMCQueue : boost::noncopyable {
     /// the proper spin backoff
     kAdaptationFreq = 128,
-    /// Memory locations on the same cache line are subject to false
-    /// sharing, which is very bad for performance
-    kFalseSharingRange = 64,
     /// To avoid false sharing in slots_ with neighboring memory
     /// allocations, we pad it with this many SingleElementQueue-s at
     /// each end
-    kSlotPadding = 1 +
-        (kFalseSharingRange - 1) / sizeof(detail::SingleElementQueue<T,Atom>)
+    kSlotPadding = (detail::CacheLocality::kFalseSharingRange - 1)
+        / sizeof(detail::SingleElementQueue<T,Atom>) + 1
   };
-  static_assert(kFalseSharingRange == 64,
-                "FOLLY_ON_NEXT_CACHE_LINE must track kFalseSharingRange");
-  // This literal "64' should be kFalseSharingRange,
-  // but gcc-4.8.0 and 4.8.1 reject it.
-  // FIXME: s/64/kFalseSharingRange/ if that ever changes.
-#define FOLLY_ON_NEXT_CACHE_LINE __attribute__((aligned(64)))
   /// The maximum number of items in the queue at once
-  size_t capacity_ FOLLY_ON_NEXT_CACHE_LINE;
+  size_t FOLLY_ALIGN_TO_AVOID_FALSE_SHARING capacity_;
   /// An array of capacity_ SingleElementQueue-s, each of which holds
   /// either 0 or 1 item. We over-allocate by 2 * kSlotPadding and don't
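
As a quick check of the kSlotPadding arithmetic in the hunk above (the element sizes below are assumed purely for illustration): the expression is a ceiling division, so the padded SingleElementQueue slots at each end of slots_ always cover at least one full false-sharing range.

    // Illustrative only; slotPadding is a hypothetical standalone helper
    // mirroring the kSlotPadding expression.
    #include <cstddef>

    constexpr size_t slotPadding(size_t falseSharingRange, size_t slotSize) {
      // (range - 1) / size + 1 == ceil(range / size)
      return (falseSharingRange - 1) / slotSize + 1;
    }
    // With kFalseSharingRange == 128 (the new CacheLocality constant):
    static_assert(slotPadding(128, 64) == 2, "2 * 64 == 128 bytes of padding");
    static_assert(slotPadding(128, 48) == 3, "3 * 48 == 144 bytes of padding");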
@@ -357,24 +353,23 @@ class MPMCQueue : boost::noncopyable {
   int stride_;
   /// Enqueuers get tickets from here
-  Atom<uint64_t> pushTicket_ FOLLY_ON_NEXT_CACHE_LINE;
+  Atom<uint64_t> FOLLY_ALIGN_TO_AVOID_FALSE_SHARING pushTicket_;
   /// Dequeuers get tickets from here
-  Atom<uint64_t> popTicket_ FOLLY_ON_NEXT_CACHE_LINE;
+  Atom<uint64_t> FOLLY_ALIGN_TO_AVOID_FALSE_SHARING popTicket_;
   /// This is how many times we will spin before using FUTEX_WAIT when
   /// the queue is full on enqueue, adaptively computed by occasionally
   /// spinning for longer and smoothing with an exponential moving average
-  Atom<int> pushSpinCutoff_ FOLLY_ON_NEXT_CACHE_LINE;
+  Atom<int> FOLLY_ALIGN_TO_AVOID_FALSE_SHARING pushSpinCutoff_;
   /// The adaptive spin cutoff when the queue is empty on dequeue
-  Atom<int> popSpinCutoff_ FOLLY_ON_NEXT_CACHE_LINE;
+  Atom<int> FOLLY_ALIGN_TO_AVOID_FALSE_SHARING popSpinCutoff_;
-  /// Alignment doesn't avoid false sharing at the end of the struct,
+  /// Alignment doesn't prevent false sharing at the end of the struct,
   /// so fill out the last cache line
-  char padding_[kFalseSharingRange - sizeof(Atom<int>)];
-#undef FOLLY_ON_NEXT_CACHE_LINE
+  char padding_[detail::CacheLocality::kFalseSharingRange - sizeof(Atom<int>)];
   /// We assign tickets in increasing order, but we don't want to
   /// access neighboring elements of slots_ because that will lead to
@@ -771,7 +766,7 @@ struct SingleElementQueue {
                const bool updateSpinCutoff,
                Args&&... args) noexcept {
     sequencer_.waitForTurn(turn * 2, spinCutoff, updateSpinCutoff);
-    new (contents_) T(std::forward<Args>(args)...);
+    new (&contents_) T(std::forward<Args>(args)...);
     sequencer_.completeTurn(turn * 2);
   }
@@ -789,13 +784,13 @@ struct SingleElementQueue {
     if (std::is_nothrow_constructible<T,T&&>::value) {
       // this is preferred
       sequencer_.waitForTurn(turn * 2, spinCutoff, updateSpinCutoff);
-      new (contents_) T(std::move(goner));
+      new (&contents_) T(std::move(goner));
       sequencer_.completeTurn(turn * 2);
     } else {
       // simulate nothrow move with relocation, followed by default
       // construction to fill the gap we created
       sequencer_.waitForTurn(turn * 2, spinCutoff, updateSpinCutoff);
-      memcpy(contents_, &goner, sizeof(T));
+      memcpy(&contents_, &goner, sizeof(T));
       sequencer_.completeTurn(turn * 2);
       new (&goner) T();
     }
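
The else-branch above "simulates" a nothrow move for types that are folly::IsRelocatable but whose move constructor may throw: the bytes are relocated with memcpy and a fresh object is default-constructed over the vacated source so its destructor still runs on something valid. A minimal standalone sketch of that trick, under the same relocatability assumption (relocateInto is a hypothetical name, not part of this diff):

    // Illustrative only.  Requires T to be relocatable in the
    // folly::IsRelocatable sense and default-constructible.
    #include <cstring>
    #include <new>

    template <typename T>
    void relocateInto(void* dst, T& src) {
      std::memcpy(dst, &src, sizeof(T));  // bitwise "move" into dst
      new (&src) T();                     // refill the gap so src's eventual
                                          // destructor sees a valid object
    }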
@@ -818,7 +813,7 @@ struct SingleElementQueue {
         // unlikely, but if we don't complete our turn the queue will die
       }
       sequencer_.waitForTurn(turn * 2 + 1, spinCutoff, updateSpinCutoff);
-      memcpy(&elem, contents_, sizeof(T));
+      memcpy(&elem, &contents_, sizeof(T));
       sequencer_.completeTurn(turn * 2 + 1);
     } else {
       // use nothrow move assignment
@@ -835,13 +830,13 @@ struct SingleElementQueue {
  private:
   /// Storage for a T constructed with placement new
-  char contents_[sizeof(T)] __attribute__((aligned(alignof(T))));
+  typename std::aligned_storage<sizeof(T),alignof(T)>::type contents_;
   /// Even turns are pushes, odd turns are pops
   TurnSequencer<Atom> sequencer_;
   T* ptr() noexcept {
-    return static_cast<T*>(static_cast<void*>(contents_));
+    return static_cast<T*>(static_cast<void*>(&contents_));
   }
   void destroyContents() noexcept {
@@ -851,7 +846,7 @@ struct SingleElementQueue {
       // g++ doesn't seem to have std::is_nothrow_destructible yet
     }
 #ifndef NDEBUG
-    memset(contents_, 'Q', sizeof(T));
+    memset(&contents_, 'Q', sizeof(T));
 #endif
   }
 };
...
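
The storage change in the SingleElementQueue hunks above, from a manually aligned char array to std::aligned_storage, is also why every access gains a '&': contents_ is now a single object rather than an array, so its address must be taken explicitly. A minimal standalone sketch of the same pattern (Slot is a hypothetical name, not a type from this diff):

    // Illustrative only: aligned_storage + placement new, as used by
    // SingleElementQueue after this diff.
    #include <new>
    #include <type_traits>
    #include <utility>

    template <typename T>
    struct Slot {
      typename std::aligned_storage<sizeof(T), alignof(T)>::type storage_;

      template <typename... Args>
      void construct(Args&&... args) {
        new (&storage_) T(std::forward<Args>(args)...);  // note the '&'
      }
      T& get() {
        return *static_cast<T*>(static_cast<void*>(&storage_));
      }
      void destroy() { get().~T(); }
    };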
@@ -108,8 +108,26 @@ struct CacheLocality {
   /// CacheLocality structure with the specified number of cpus and a
   /// single cache level that associates one cpu per cache.
   static CacheLocality uniform(size_t numCpus);
+  enum {
+    /// Memory locations on the same cache line are subject to false
+    /// sharing, which is very bad for performance. Microbenchmarks
+    /// indicate that pairs of cache lines also see interference under
+    /// heavy use of atomic operations (observed for atomic increment on
+    /// Sandy Bridge). See FOLLY_ALIGN_TO_AVOID_FALSE_SHARING
+    kFalseSharingRange = 128
+  };
+  static_assert(kFalseSharingRange == 128,
+      "FOLLY_ALIGN_TO_AVOID_FALSE_SHARING should track kFalseSharingRange");
 };
+// TODO replace __attribute__ with alignas and 128 with kFalseSharingRange
+/// An attribute that will cause a variable or field to be aligned so that
+/// it doesn't have false sharing with anything at a smaller memory address.
+#define FOLLY_ALIGN_TO_AVOID_FALSE_SHARING __attribute__((aligned(128)))
 /// Holds a function pointer to the VDSO implementation of getcpu(2),
 /// if available
 struct Getcpu {
@@ -329,12 +347,6 @@ struct AccessSpreaderArray {
  private:
-  /// If we align the access spreaders at the beginning of a cache line
-  /// then getcpuFunc_ and the first 56 bytes of stripeByCpu will be on
-  /// the same cache line
-  enum { kAlignment = 64 };
   // AccessSpreader uses sharedInstance
   friend AccessSpreader<Atom>;
@@ -343,7 +355,8 @@ struct AccessSpreaderArray {
   /// aligned_storage is uninitialized, we use placement new since there
   /// is no AccessSpreader default constructor
-  typename std::aligned_storage<sizeof(AccessSpreader<Atom>),kAlignment>::type
+  typename std::aligned_storage<sizeof(AccessSpreader<Atom>),
+                                CacheLocality::kFalseSharingRange>::type
       raw[kMaxStripe + 1];
 };
...
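
Finally, the TODO added to CacheLocality.h anticipates moving from the GCC attribute to standard alignas. A hedged sketch of what that could eventually look like, assuming the targeted compilers accept alignas (and a non-literal operand) everywhere the macro is used; this is not part of the diff:

    // Illustrative only.  alignas is written at the front of the declaration,
    // and the literal 128 could become CacheLocality::kFalseSharingRange.
    #include <atomic>
    #include <cstdint>

    struct Example {  // hypothetical
      alignas(128) std::atomic<uint64_t> pushTicket_;
      alignas(128) std::atomic<uint64_t> popTicket_;
    };
    static_assert(alignof(Example) == 128,
                  "each ticket starts its own false-sharing range");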