Commit 5ad17f07 authored by Yedidya Feldblum, committed by Facebook Github Bot

Add hardware_destructive_interference_size

Summary:
[Folly] Add `hardware_destructive_interference_size` and `hardware_constructive_interference_size` to `folly/lang/Align.h`.

These are backports from C++17, which may require keeping depending on how standard libraries choose to implement them.

And replace `CacheLocality::kFalseSharingRange` with `hardware_destructive_interference_size`.
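
For illustration, a minimal sketch of how calling code might use the new constant in place of `CacheLocality::kFalseSharingRange`; `PaddedCounter` and its members are hypothetical and not part of this diff:

```cpp
#include <atomic>
#include <cstdint>

#include <folly/lang/Align.h>

// A hypothetical per-thread counter. Aligning and padding each instance to
// hardware_destructive_interference_size keeps neighboring counters on
// separate false-sharing ranges, so one thread's stores do not keep
// invalidating the cache lines holding another thread's counter.
struct alignas(folly::hardware_destructive_interference_size) PaddedCounter {
  std::atomic<std::uint64_t> value{0};
  char pad[folly::hardware_destructive_interference_size -
           sizeof(std::atomic<std::uint64_t>)];
};

static_assert(
    sizeof(PaddedCounter) == folly::hardware_destructive_interference_size,
    "each counter occupies exactly one false-sharing range");
```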

Reviewed By: ot

Differential Revision: D6554817

fbshipit-source-id: bff49f5ca8b01d38fa806076f99201355df76cd9
parent d1ef992f
@@ -17,9 +17,8 @@
#pragma once
#include <cstddef>
#include <utility>
#include <folly/Portability.h>
#include <folly/concurrency/CacheLocality.h>
#include <folly/lang/Align.h>
namespace folly {
@@ -68,8 +67,8 @@ class CachelinePadded {
private:
static constexpr size_t paddingSize() noexcept {
return CacheLocality::kFalseSharingRange -
(alignof(T) % CacheLocality::kFalseSharingRange);
return hardware_destructive_interference_size -
(alignof(T) % hardware_destructive_interference_size);
}
char paddingPre_[paddingSize()];
T inner_;
@@ -651,11 +651,12 @@ class MPMCQueueBase<Derived<T, Atom, Dynamic>> : boost::noncopyable {
}
// ideally this would be a static assert, but g++ doesn't allow it
assert(alignof(MPMCQueue<T, Atom>) >= CacheLocality::kFalseSharingRange);
assert(
alignof(MPMCQueue<T, Atom>) >= hardware_destructive_interference_size);
assert(
static_cast<uint8_t*>(static_cast<void*>(&popTicket_)) -
static_cast<uint8_t*>(static_cast<void*>(&pushTicket_)) >=
CacheLocality::kFalseSharingRange);
static_cast<ptrdiff_t>(hardware_destructive_interference_size));
}
/// A default-constructed queue is useful because a usable (non-zero
@@ -975,7 +976,8 @@ class MPMCQueueBase<Derived<T, Atom, Dynamic>> : boost::noncopyable {
/// To avoid false sharing in slots_ with neighboring memory
/// allocations, we pad it with this many SingleElementQueue-s at
/// each end
kSlotPadding = (CacheLocality::kFalseSharingRange - 1) / sizeof(Slot) + 1
kSlotPadding =
(hardware_destructive_interference_size - 1) / sizeof(Slot) + 1
};
/// The maximum number of items in the queue at once
@@ -1027,7 +1029,7 @@ class MPMCQueueBase<Derived<T, Atom, Dynamic>> : boost::noncopyable {
/// Alignment doesn't prevent false sharing at the end of the struct,
/// so fill out the last cache line
char padding_[CacheLocality::kFalseSharingRange - sizeof(Atom<uint32_t>)];
char pad_[hardware_destructive_interference_size - sizeof(Atom<uint32_t>)];
/// We assign tickets in increasing order, but we don't want to
/// access neighboring elements of slots_ because that will lead to
@@ -97,6 +97,12 @@ constexpr bool kHasUnalignedAccess = false;
# define FOLLY_X64 0
#endif
#if defined(__arm__)
#define FOLLY_ARM 1
#else
#define FOLLY_ARM 0
#endif
#if defined(__aarch64__)
# define FOLLY_AARCH64 1
#else
@@ -110,6 +116,7 @@ constexpr bool kHasUnalignedAccess = false;
#endif
namespace folly {
constexpr bool kIsArchArm = FOLLY_ARM == 1;
constexpr bool kIsArchAmd64 = FOLLY_X64 == 1;
constexpr bool kIsArchAArch64 = FOLLY_AARCH64 == 1;
constexpr bool kIsArchPPC64 = FOLLY_PPC64 == 1;
@@ -173,14 +173,14 @@ struct ProducerConsumerQueue {
}
private:
char pad0_[CacheLocality::kFalseSharingRange];
char pad0_[hardware_destructive_interference_size];
const uint32_t size_;
T* const records_;
FOLLY_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<unsigned int> readIndex_;
FOLLY_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<unsigned int> writeIndex_;
char pad1_[CacheLocality::kFalseSharingRange - sizeof(writeIndex_)];
char pad1_[hardware_destructive_interference_size - sizeof(writeIndex_)];
};
} // namespace folly
@@ -116,22 +116,9 @@ struct CacheLocality {
/// CacheLocality structure with the specified number of cpus and a
/// single cache level that associates one cpu per cache.
static CacheLocality uniform(size_t numCpus);
enum {
/// Memory locations on the same cache line are subject to false
/// sharing, which is very bad for performance. Microbenchmarks
/// indicate that pairs of cache lines also see interference under
/// heavy use of atomic operations (observed for atomic increment on
/// Sandy Bridge). See FOLLY_ALIGN_TO_AVOID_FALSE_SHARING
kFalseSharingRange = 128
};
static_assert(
kFalseSharingRange == 128,
"FOLLY_ALIGN_TO_AVOID_FALSE_SHARING should track kFalseSharingRange");
};
// TODO replace __attribute__ with alignas and 128 with kFalseSharingRange
// TODO replace with alignas(hardware_destructive_interference_size)
/// An attribute that will cause a variable or field to be aligned so that
/// it doesn't have false sharing with anything at a smaller memory address.
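
As a sketch of what the TODO above is pointing at, the attribute could eventually be replaced with standard `alignas` on the hot field; `TicketHolder` here is an invented example, not code from this diff:

```cpp
#include <atomic>
#include <cstdint>

#include <folly/lang/Align.h>

// Hypothetical field alignment: alignas places ticket_ at the start of a
// fresh false-sharing range, so nothing at a smaller address shares its
// cache line (pair).
struct TicketHolder {
  alignas(folly::hardware_destructive_interference_size)
      std::atomic<std::uint32_t> ticket_{0};
};
```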
@@ -451,14 +438,11 @@ class CoreAllocator {
void* allocate(size_t size) {
auto cl = sizeClass(size);
if (cl == 4) {
static_assert(
CacheLocality::kFalseSharingRange == 128,
"kFalseSharingRange changed");
// Align to a cacheline
size = size + (CacheLocality::kFalseSharingRange - 1);
size &= ~size_t(CacheLocality::kFalseSharingRange - 1);
void* mem =
detail::aligned_malloc(size, CacheLocality::kFalseSharingRange);
size = size + (hardware_destructive_interference_size - 1);
size &= ~size_t(hardware_destructive_interference_size - 1);
void* mem = detail::aligned_malloc(
size, hardware_destructive_interference_size);
if (!mem) {
std::__throw_bad_alloc();
}
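
The rounding in `allocate` is the usual power-of-two round-up idiom; a small self-contained sketch of the same arithmetic follows, with `round_up_to_interference_size` being an illustrative name rather than a folly API:

```cpp
#include <cstddef>

#include <folly/lang/Align.h>

// Round size up to the next multiple of the (power-of-two) interference
// size: adding (n - 1) carries any non-multiple past the next boundary,
// and masking with ~(n - 1) clears the low bits back down to that boundary.
constexpr std::size_t round_up_to_interference_size(std::size_t size) {
  constexpr std::size_t n = folly::hardware_destructive_interference_size;
  return (size + (n - 1)) & ~(n - 1);
}

static_assert(
    round_up_to_interference_size(1) ==
        folly::hardware_destructive_interference_size,
    "one byte still consumes a full false-sharing range");
static_assert(
    round_up_to_interference_size(200) %
            folly::hardware_destructive_interference_size ==
        0,
    "result is always a multiple of the interference size");
```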
@@ -18,6 +18,8 @@
#include <cstddef>
#include <folly/Portability.h>
namespace folly {
namespace detail {
@@ -88,4 +90,32 @@ using max_align_v_ = max_align_t_<
constexpr std::size_t max_align_v = detail::max_align_v_::value;
struct alignas(max_align_v) max_align_t {};
// Memory locations within the same cache line are subject to destructive
// interference, also known as false sharing, which is when concurrent
// accesses to these different memory locations from different cores, where at
// least one of the concurrent accesses is or involves a store operation,
// induce contention and harm performance.
//
// Microbenchmarks indicate that pairs of cache lines also see destructive
// interference under heavy use of atomic operations, as observed for atomic
// increment on Sandy Bridge.
//
// We assume a cache line size of 64, so we use a cache line pair size of 128
// to avoid destructive interference.
//
// mimic: std::hardware_destructive_interference_size, C++17
constexpr std::size_t hardware_destructive_interference_size =
kIsArchArm ? 64 : 128;
static_assert(hardware_destructive_interference_size >= max_align_v, "math?");
// Memory locations within the same cache line are subject to constructive
// interference, also known as true sharing, which is when accesses to some
// memory locations induce all memory locations within the same cache line to
// be cached, benefiting subsequent accesses to different memory locations
// within the same cache line and helping performance.
//
// mimic: std::hardware_constructive_interference_size, C++17
constexpr std::size_t hardware_constructive_interference_size = 64;
static_assert(hardware_constructive_interference_size >= max_align_v, "math?");
} // namespace folly
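
To make the contrast between the two constants concrete, here is a hedged sketch; `Tickets` and `HotReadFields` are invented for illustration:

```cpp
#include <atomic>
#include <cstdint>

#include <folly/lang/Align.h>

// Destructive interference: keep fields written by different threads at
// least hardware_destructive_interference_size apart, so a store to one
// does not keep invalidating the cache line (pair) holding the other.
struct Tickets {
  alignas(folly::hardware_destructive_interference_size)
      std::atomic<std::uint64_t> push{0};
  alignas(folly::hardware_destructive_interference_size)
      std::atomic<std::uint64_t> pop{0};
};

// Constructive interference: keep fields that are read together within
// hardware_constructive_interference_size, so one cache-line fill brings
// in all of them.
struct HotReadFields {
  std::uint32_t size;
  std::uint32_t mask;
  void* table;
};
static_assert(
    sizeof(HotReadFields) <= folly::hardware_constructive_interference_size,
    "fields read together should fit in one cache line");
```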
@@ -27,7 +27,8 @@ static_assert(
std::is_standard_layout<CachelinePadded<int>>::value,
"CachelinePadded<T> must be standard-layout if T is.");
static constexpr int kCachelineSize = folly::CacheLocality::kFalseSharingRange;
static constexpr int kCachelineSize =
folly::hardware_destructive_interference_size;
template <size_t dataSize, size_t alignment = alignof(void*)>
struct alignas(alignment) SizedData {