Commit 27929e3d authored by Nathan Bronson, committed by Facebook Github Bot

use vector intrinsics for F14 on 32-bit platforms

Summary:
This diff adds 32-bit support for F14, so SSE and NEON
intrinsics can be used on x86 and arm architectures (rather than just
x86_64 and aarch64).  The portability fallback to std::unordered_map
and std::unordered_set is now used only when vector intrinsics are not
available, or on PPC.

Reviewed By: shixiao

Differential Revision: D8586283

fbshipit-source-id: 1c4d090e80381fe7ad071c3059b3cb242c04c9f7
parent 3cb1c6d0
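With this change FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE becomes 1 whenever SSE2 or NEON is detected, independent of pointer width. A minimal sketch of how code can branch on the macro (usingVectorF14 is an illustrative name, not part of this diff):

#include <folly/container/detail/F14IntrinsicsAvailability.h>

// Reports which F14 backend this translation unit was built against.
constexpr bool usingVectorF14() {
#if FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE
  return true;   // SSE2 / NEON chunk implementation
#else
  return false;  // std::unordered_map / std::unordered_set fallback
#endif
}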
......@@ -301,6 +301,12 @@ constexpr auto kIsBigEndian = !kIsLittleEndian;
#define FOLLY_SSE_PREREQ(major, minor) \
(FOLLY_SSE > major || FOLLY_SSE == major && FOLLY_SSE_MINOR >= minor)
#ifndef FOLLY_NEON
# if defined(__ARM_NEON)
# define FOLLY_NEON 1
# endif
#endif
#if FOLLY_UNUSUAL_GFLAGS_NAMESPACE
namespace FOLLY_GFLAGS_NAMESPACE { }
namespace gflags {
......
......@@ -18,16 +18,16 @@
#include <folly/Portability.h>
// clang-format off
// F14 has been implemented for x86_64 SSE2 and AARCH64 NEON (so far)
#ifndef FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE
# if FOLLY_SSE >= 2 && FOLLY_X64 || FOLLY_AARCH64
# define FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE 1
# else
# define FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE 0
# pragma message \
"Vector intrinsics / F14 support unavailable on this platform, " \
// F14 has been implemented for SSE2 and NEON (so far)
#if FOLLY_SSE >= 2 || FOLLY_NEON
#define FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE 1
#else
#define FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE 0
#pragma message \
"Vector intrinsics / F14 support unavailable on this platform, " \
"falling back to std::unordered_map / set"
# endif
#endif
// clang-format on
#endif
......@@ -25,7 +25,9 @@
#include <folly/Unit.h>
#include <folly/container/detail/F14Table.h>
#include <folly/hash/Hash.h>
#include <folly/lang/Align.h>
#include <folly/lang/SafeAssert.h>
#include <folly/memory/Malloc.h>
#if FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE
......@@ -226,6 +228,10 @@ struct BasePolicy
std::size_t computeKeyHash(K const& key) const {
static_assert(
isAvalanchingHasher() == IsAvalanchingHasher<Hasher, K>::value, "");
static_assert(
!isAvalanchingHasher() ||
sizeof(decltype(hasher()(key))) >= sizeof(std::size_t),
"hasher is not avalanching if it doesn't return enough bits");
return hasher()(key);
}
......@@ -272,14 +278,13 @@ struct BasePolicy
std::size_t /*capacity*/,
P&& /*rhs*/) {}
// Rounds chunkBytes up to the next multiple of 16 if it is possible
// that a sub-Chunk allocation has a size that is not a multiple of 16.
static std::size_t alignedChunkAllocSize(std::size_t chunkAllocSize) {
if ((sizeof(Item) % 8) != 0) {
chunkAllocSize = -(-chunkAllocSize & ~std::size_t{0xf});
std::size_t alignedAllocSize(std::size_t n) const {
if (kRequiredVectorAlignment <= alignof(max_align_t) ||
std::is_same<ByteAlloc, std::allocator<uint8_t>>::value) {
return n;
} else {
return n + kRequiredVectorAlignment;
}
FOLLY_SAFE_DCHECK((chunkAllocSize % 16) == 0, "");
return chunkAllocSize;
}
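The removed alignedChunkAllocSize relied on the bit trick -(-n & ~std::size_t{0xf}) to round the chunk allocation up to the next multiple of 16 and trusted the allocator to hand back 16-byte-aligned memory; the replacement alignedAllocSize instead reserves kRequiredVectorAlignment extra bytes when the allocator cannot be assumed to align strongly enough, and lets allocateOverAligned place the chunk. A small sketch of the two size computations (roundUp16 and padded are illustrative helpers, not folly code):

#include <cstddef>

// Old approach: round n up to the next multiple of 16.
constexpr std::size_t roundUp16(std::size_t n) {
  return -(-n & ~std::size_t{0xf});
}

// New approach (simplified): keep n as-is when the allocator already
// guarantees enough alignment, otherwise reserve slack so the chunk start
// can be bumped to a 16-byte boundary by hand.
constexpr std::size_t padded(
    std::size_t n, std::size_t requiredAlign, std::size_t allocatorAlign) {
  return requiredAlign <= allocatorAlign ? n : n + requiredAlign;
}

static_assert(roundUp16(24) == 32, "");
static_assert(roundUp16(32) == 32, "");
static_assert(padded(24, 16, 16) == 24, "");
static_assert(padded(24, 16, 8) == 40, "");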
bool beforeRehash(
......@@ -288,9 +293,9 @@ struct BasePolicy
std::size_t /*newCapacity*/,
std::size_t chunkAllocSize,
BytePtr& outChunkAllocation) {
ByteAlloc a{alloc()};
outChunkAllocation =
ByteAllocTraits::allocate(a, alignedChunkAllocSize(chunkAllocSize));
allocateOverAligned<ByteAlloc, kRequiredVectorAlignment>(
alloc(), chunkAllocSize);
return false;
}
......@@ -304,9 +309,8 @@ struct BasePolicy
std::size_t chunkAllocSize) {
// on success, this will be the old allocation, on failure the new one
if (chunkAllocation != nullptr) {
ByteAlloc a{alloc()};
ByteAllocTraits::deallocate(
a, chunkAllocation, alignedChunkAllocSize(chunkAllocSize));
deallocateOverAligned<ByteAlloc, kRequiredVectorAlignment>(
alloc(), chunkAllocation, chunkAllocSize);
}
}
......@@ -321,10 +325,8 @@ struct BasePolicy
std::size_t /*capacity*/,
BytePtr chunkAllocation,
std::size_t chunkAllocSize) {
FOLLY_SAFE_DCHECK(chunkAllocation != nullptr, "");
ByteAlloc a{alloc()};
ByteAllocTraits::deallocate(
a, chunkAllocation, alignedChunkAllocSize(chunkAllocSize));
deallocateOverAligned<ByteAlloc, kRequiredVectorAlignment>(
alloc(), chunkAllocation, chunkAllocSize);
}
void prefetchValue(Item const&) const {
......@@ -459,6 +461,8 @@ class ValueContainerPolicy : public BasePolicy<
using typename Super::Value;
private:
using typename Super::ByteAlloc;
using Super::kIsMap;
public:
......@@ -571,7 +575,10 @@ class ValueContainerPolicy : public BasePolicy<
std::size_t /*capacity*/,
V&& visitor) const {
if (chunkAllocSize > 0) {
visitor(Super::alignedChunkAllocSize(chunkAllocSize), 1);
visitor(
allocationBytesForOverAligned<ByteAlloc, kRequiredVectorAlignment>(
chunkAllocSize),
1);
}
}
......@@ -698,6 +705,8 @@ class NodeContainerPolicy
using typename Super::Value;
private:
using typename Super::ByteAlloc;
using Super::kIsMap;
public:
......@@ -796,7 +805,10 @@ class NodeContainerPolicy
std::size_t /*capacity*/,
V&& visitor) const {
if (chunkAllocSize > 0) {
visitor(Super::alignedChunkAllocSize(chunkAllocSize), 1);
visitor(
allocationBytesForOverAligned<ByteAlloc, kRequiredVectorAlignment>(
chunkAllocSize),
1);
}
if (size > 0) {
visitor(sizeof(Value), size);
......@@ -1212,14 +1224,7 @@ class VectorContainerPolicy : public BasePolicy<
static std::size_t allocSize(
std::size_t prefixBytes,
std::size_t valueCapacity) {
auto n = valuesOffset(prefixBytes) + sizeof(Value) * valueCapacity;
if (alignof(Value) <= 8) {
// ensure that the result is a multiple of 16 to protect against
// allocators that don't always align to 16
n = -(-n & ~std::size_t{0xf});
}
FOLLY_SAFE_DCHECK((n % 16) == 0, "");
return n;
return valuesOffset(prefixBytes) + sizeof(Value) * valueCapacity;
}
public:
......@@ -1235,11 +1240,9 @@ class VectorContainerPolicy : public BasePolicy<
newCapacity <= (std::numeric_limits<Item>::max)(),
"");
{
ByteAlloc a{this->alloc()};
outChunkAllocation =
ByteAllocTraits::allocate(a, allocSize(chunkAllocSize, newCapacity));
}
outChunkAllocation =
allocateOverAligned<ByteAlloc, kRequiredVectorAlignment>(
Super::alloc(), allocSize(chunkAllocSize, newCapacity));
ValuePtr before = values_;
ValuePtr after = std::pointer_traits<ValuePtr>::pointer_to(
......@@ -1279,9 +1282,8 @@ class VectorContainerPolicy : public BasePolicy<
// on success, chunkAllocation is the old allocation, on failure it is the
// new one
if (chunkAllocation != nullptr) {
ByteAlloc a{this->alloc()};
ByteAllocTraits::deallocate(
a,
deallocateOverAligned<ByteAlloc, kRequiredVectorAlignment>(
Super::alloc(),
chunkAllocation,
allocSize(chunkAllocSize, (success ? oldCapacity : newCapacity)));
}
......@@ -1306,9 +1308,8 @@ class VectorContainerPolicy : public BasePolicy<
BytePtr chunkAllocation,
std::size_t chunkAllocSize) {
if (chunkAllocation != nullptr) {
ByteAlloc a{this->alloc()};
ByteAllocTraits::deallocate(
a, chunkAllocation, allocSize(chunkAllocSize, capacity));
deallocateOverAligned<ByteAlloc, kRequiredVectorAlignment>(
Super::alloc(), chunkAllocation, allocSize(chunkAllocSize, capacity));
values_ = nullptr;
}
}
......@@ -1321,7 +1322,10 @@ class VectorContainerPolicy : public BasePolicy<
V&& visitor) const {
FOLLY_SAFE_DCHECK((chunkAllocSize == 0) == (capacity == 0), "");
if (chunkAllocSize > 0) {
visitor(allocSize(chunkAllocSize, capacity), 1);
visitor(
allocationBytesForOverAligned<ByteAlloc, kRequiredVectorAlignment>(
allocSize(chunkAllocSize, capacity)),
1);
}
}
......
......@@ -24,7 +24,7 @@ namespace folly {
namespace f14 {
namespace detail {
TagVector kEmptyTagVector = {};
EmptyTagVectorType kEmptyTagVector = {};
} // namespace detail
} // namespace f14
......
......@@ -48,8 +48,8 @@
#include <folly/container/detail/F14IntrinsicsAvailability.h>
#if FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE
#if FOLLY_AARCH64
#if __ARM_FEATURE_CRC32
#if FOLLY_NEON
#ifdef __ARM_FEATURE_CRC32
#include <arm_acle.h> // __crc32cd
#endif
#include <arm_neon.h> // uint8x16t intrinsics
......@@ -246,12 +246,11 @@ using KeyTypeForEmplace = KeyTypeForEmplaceHelper<
template <typename T>
FOLLY_ALWAYS_INLINE static void prefetchAddr(T const* ptr) {
#if FOLLY_AARCH64
#ifndef _WIN32
__builtin_prefetch(static_cast<void const*>(ptr));
#elif FOLLY_NEON
__prefetch(static_cast<void const*>(ptr));
#else
// _mm_prefetch is x86_64-specific and comes from xmmintrin.h.
// It seems to compile to the same thing as __builtin_prefetch, but
// also works on windows.
_mm_prefetch(
static_cast<char const*>(static_cast<void const*>(ptr)), _MM_HINT_T0);
#endif
......@@ -267,21 +266,31 @@ FOLLY_ALWAYS_INLINE static unsigned findFirstSetNonZero(T mask) {
}
}
#if FOLLY_AARCH64
#if FOLLY_NEON
using TagVector = uint8x16_t;
using MaskType = uint64_t;
constexpr unsigned kMaskSpacing = 4;
#else
#else // SSE2
using TagVector = __m128i;
using MaskType = unsigned;
using MaskType = uint32_t;
constexpr unsigned kMaskSpacing = 1;
#endif
extern TagVector kEmptyTagVector;
// We could use unaligned loads to relax this requirement, but that
// would be both a performance penalty and require a bulkier packed
// ItemIter format
constexpr std::size_t kRequiredVectorAlignment =
constexpr_max(std::size_t{16}, alignof(max_align_t));
using EmptyTagVectorType = std::aligned_storage_t<
sizeof(TagVector) + kRequiredVectorAlignment,
alignof(max_align_t)>;
extern EmptyTagVectorType kEmptyTagVector;
template <unsigned BitCount>
struct FullMask {
......@@ -292,6 +301,74 @@ struct FullMask {
template <>
struct FullMask<1> : std::integral_constant<MaskType, 1> {};
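FullMask<BitCount> builds the all-slots mask recursively, placing one bit every kMaskSpacing positions, so the same template now serves both layouts (spacing 1 for the SSE2 movemask, spacing 4 for the NEON narrowing shift). An illustrative re-derivation of the recursion for a 14-slot chunk (fullMask here is a stand-in, not the folly template):

#include <cstdint>

constexpr uint64_t fullMask(unsigned bitCount, unsigned spacing) {
  return bitCount == 1 ? 1 : (fullMask(bitCount - 1, spacing) << spacing) | 1;
}

static_assert(fullMask(14, 1) == 0x3fff, "");            // SSE2: 14 adjacent bits
static_assert(fullMask(14, 4) == 0x11111111111111, "");  // NEON: 14 nibbles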
#if FOLLY_ARM
// Mask iteration is different for ARM because that is the only platform
// for which the mask is bigger than a register.
// Iterates a mask, optimized for the case that only a few bits are set
class SparseMaskIter {
static_assert(kMaskSpacing == 4, "");
uint32_t interleavedMask_;
public:
explicit SparseMaskIter(MaskType mask)
: interleavedMask_{static_cast<uint32_t>(((mask >> 32) << 2) | mask)} {}
bool hasNext() {
return interleavedMask_ != 0;
}
unsigned next() {
FOLLY_SAFE_DCHECK(hasNext(), "");
unsigned i = findFirstSetNonZero(interleavedMask_);
interleavedMask_ &= (interleavedMask_ - 1);
return ((i >> 2) | (i << 2)) & 0xf;
}
};
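On NEON the match and occupied masks carry one bit per slot at 4-bit spacing, so a 16-slot chunk needs 64 mask bits, which is wider than a register on 32-bit ARM. The constructor above folds the high half over the low half with a 2-bit offset so everything fits in 32 bits, and next() undoes the fold. A worked example assuming slots 1 and 9 are set (decode mirrors the expression in next(); names are illustrative):

#include <cstdint>

// Slot i contributes bit 4*i of the 64-bit mask (kMaskSpacing == 4).
constexpr uint64_t mask = (uint64_t{1} << (4 * 1)) | (uint64_t{1} << (4 * 9));

// Fold: slots 0-7 keep bit 4*i, slots 8-15 land on bit 4*(i-8)+2.
constexpr uint32_t interleaved =
    static_cast<uint32_t>(((mask >> 32) << 2) | mask);

// Same recovery expression as SparseMaskIter::next().
constexpr unsigned decode(unsigned bit) {
  return ((bit >> 2) | (bit << 2)) & 0xf;
}

static_assert(interleaved == 0x50, "");
static_assert(decode(4) == 1, "");  // lowest set bit -> slot 1
static_assert(decode(6) == 9, "");  // next set bit   -> slot 9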
// Iterates a mask, optimized for the case that most bits are set
class DenseMaskIter {
static_assert(kMaskSpacing == 4, "");
std::size_t count_;
unsigned index_;
uint8_t const* tags_;
public:
explicit DenseMaskIter(uint8_t const* tags, MaskType mask) {
if (mask == 0) {
count_ = 0;
} else {
count_ =
folly::popcount(static_cast<uint32_t>(((mask >> 32) << 2) | mask));
if (LIKELY((mask & 1) != 0)) {
index_ = 0;
} else {
index_ = findFirstSetNonZero(mask) / kMaskSpacing;
}
tags_ = tags;
}
}
bool hasNext() {
return count_ > 0;
}
unsigned next() {
auto rv = index_;
--count_;
if (count_ > 0) {
do {
++index_;
} while ((tags_[index_] & 0x80) == 0);
}
FOLLY_SAFE_DCHECK(index_ < 16, "");
return rv;
}
};
#else
// Iterates a mask, optimized for the case that only a few bits are set
class SparseMaskIter {
MaskType mask_;
......@@ -317,7 +394,7 @@ class DenseMaskIter {
unsigned index_{0};
public:
explicit DenseMaskIter(MaskType mask) : mask_{mask} {}
explicit DenseMaskIter(uint8_t const*, MaskType mask) : mask_{mask} {}
bool hasNext() {
return mask_ != 0;
......@@ -337,6 +414,7 @@ class DenseMaskIter {
}
}
};
#endif
// Iterates a mask, returning pairs of [begin,end) index covering blocks
// of set bits
......@@ -405,10 +483,10 @@ class FirstEmptyInMask {
};
template <typename ItemType>
struct alignas(16) F14Chunk {
struct alignas(kRequiredVectorAlignment) F14Chunk {
using Item = ItemType;
// Assuming alignof(max_align_t) == 16 (and assuming alignof(Item) >=
// For our 16 byte vector alignment (and assuming alignof(Item) >=
// 4) kCapacity of 14 is the most space efficient. Slightly smaller
// or larger capacities can help with cache alignment in a couple of
// cases without wasting too much space, but once the items are larger
......@@ -447,11 +525,15 @@ struct alignas(16) F14Chunk {
rawItems_;
static F14Chunk* emptyInstance() {
auto rv = static_cast<F14Chunk*>(static_cast<void*>(&kEmptyTagVector));
auto raw = reinterpret_cast<char*>(&kEmptyTagVector);
if (kRequiredVectorAlignment > alignof(max_align_t)) {
auto delta = kRequiredVectorAlignment -
(reinterpret_cast<uintptr_t>(raw) % kRequiredVectorAlignment);
raw += delta;
}
auto rv = reinterpret_cast<F14Chunk*>(raw);
FOLLY_SAFE_DCHECK(
!rv->occupied(0) && rv->chunk0Capacity() == 0 &&
rv->outboundOverflowCount() == 0,
"");
(reinterpret_cast<uintptr_t>(rv) % kRequiredVectorAlignment) == 0, "");
return rv;
}
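kEmptyTagVector is now only alignof(max_align_t)-aligned but carries kRequiredVectorAlignment bytes of padding, so on platforms where max_align_t is too weak the code above advances the raw pointer by kRequiredVectorAlignment - (addr % kRequiredVectorAlignment). That delta is always between 1 and kRequiredVectorAlignment (an already-aligned address still moves forward a full step), which the extra padding absorbs. A tiny sketch of the computation under those assumptions (bump is an illustrative helper):

#include <cstdint>

constexpr std::uintptr_t bump(std::uintptr_t addr, std::uintptr_t a) {
  return a - (addr % a);
}

static_assert(bump(0x1008, 16) == 8, "");   // misaligned: lands on 0x1010
static_assert(bump(0x1010, 16) == 16, "");  // aligned: still advances 16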
......@@ -534,9 +616,9 @@ struct alignas(16) F14Chunk {
tags_[index] = 0;
}
#if FOLLY_AARCH64
#if FOLLY_NEON
////////
// Tag filtering using AArch64 Advanced SIMD (NEON) intrinsics
// Tag filtering using NEON intrinsics
SparseMaskIter tagMatchIter(uint8_t needle) const {
FOLLY_SAFE_DCHECK((needle & 0x80) != 0, "");
......@@ -550,7 +632,7 @@ struct alignas(16) F14Chunk {
return SparseMaskIter(mask);
}
uint64_t occupiedMask() const {
MaskType occupiedMask() const {
uint8x16_t tagV = vld1q_u8(&tags_[0]);
// signed shift extends top bit to all bits
auto occupiedV =
......@@ -560,7 +642,7 @@ struct alignas(16) F14Chunk {
}
#else
////////
// Tag filtering using x86_64 SSE2 intrinsics
// Tag filtering using SSE2 intrinsics
TagVector const* tagVector() const {
return static_cast<TagVector const*>(static_cast<void const*>(&tags_[0]));
......@@ -575,14 +657,14 @@ struct alignas(16) F14Chunk {
return SparseMaskIter{mask};
}
unsigned occupiedMask() const {
MaskType occupiedMask() const {
auto tagV = _mm_load_si128(tagVector());
return _mm_movemask_epi8(tagV) & kFullMask;
}
#endif
DenseMaskIter occupiedIter() const {
return DenseMaskIter{occupiedMask()};
return DenseMaskIter{&tags_[0], occupiedMask()};
}
MaskRangeIter occupiedRangeIter() const {
......@@ -637,8 +719,7 @@ struct alignas(16) F14Chunk {
template <typename Ptr>
class PackedChunkItemPtr {
public:
PackedChunkItemPtr(Ptr p, std::size_t i) noexcept
: ptr_{p}, index_{static_cast<unsigned>(i)} {
PackedChunkItemPtr(Ptr p, std::size_t i) noexcept : ptr_{p}, index_{i} {
FOLLY_SAFE_DCHECK(ptr_ != nullptr || index_ == 0, "");
}
......@@ -666,7 +747,7 @@ class PackedChunkItemPtr {
private:
Ptr ptr_;
unsigned index_;
std::size_t index_;
};
// Bare pointer form, packed into a uintptr_t. Uses only bits wasted by
......@@ -1131,17 +1212,19 @@ class F14Table : public Policy {
// For hash functions we don't trust to avalanche, we repair things by
// applying a bit mixer to the user-supplied hash.
#if FOLLY_X64 || FOLLY_AARCH64
// 64-bit
static HashPair splitHash(std::size_t hash) {
static_assert(sizeof(std::size_t) == sizeof(uint64_t), "");
uint8_t tag;
if (!Policy::isAvalanchingHasher()) {
#if FOLLY_SSE > 4 || (FOLLY_SSE == 4 && FOLLY_SSE_MINOR >= 2)
#if FOLLY_SSE_PREREQ(4, 2)
// SSE4.2 CRC
auto c = _mm_crc32_u64(0, hash);
tag = static_cast<uint8_t>(~(c >> 25));
hash += c;
#elif FOLLY_AARCH64 && __ARM_FEATURE_CRC32
// AARCH64 CRC is Optional on armv8 (-march=armv8-a+crc), standard
// on armv8.1
#elif __ARM_FEATURE_CRC32
// CRC is optional on armv8 (-march=armv8-a+crc), standard on armv8.1
auto c = __crc32cd(0, hash);
tag = static_cast<uint8_t>(~(c >> 25));
hash += c;
......@@ -1178,6 +1261,35 @@ class F14Table : public Policy {
}
return std::make_pair(hash, tag);
}
#else
// 32-bit
static HashPair splitHash(std::size_t hash) {
static_assert(sizeof(std::size_t) == sizeof(uint32_t), "");
uint8_t tag;
if (!Policy::isAvalanchingHasher()) {
#if FOLLY_SSE_PREREQ(4, 2)
// SSE4.2 CRC
auto c = _mm_crc32_u32(0, hash);
tag = static_cast<uint8_t>(~(c >> 25));
hash += c;
#elif __ARM_FEATURE_CRC32
auto c = __crc32cw(0, hash);
tag = static_cast<uint8_t>(~(c >> 25));
hash += c;
#else
// finalizer for 32-bit murmur2
hash ^= hash >> 13;
hash *= 0x5bd1e995;
hash ^= hash >> 15;
tag = static_cast<uint8_t>(~(hash >> 25));
#endif
} else {
// we don't trust the top bit
tag = (hash >> 24) | 0x80;
}
return std::make_pair(hash, tag);
}
#endif
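Both the 64-bit and the new 32-bit splitHash preserve the same invariant: the 8-bit tag always has its top bit set, so a stored tag can never be confused with the 0x00 of an empty slot, and the remaining bits come from high-order, well-mixed hash bits. A quick illustration of the avalanching 32-bit tag step (tag32 is a hypothetical helper mirroring the expression above, not folly code):

#include <cstdint>

// Take high bits and force bit 7, as in the "we don't trust the top bit" path.
constexpr std::uint8_t tag32(std::uint32_t hash) {
  return static_cast<std::uint8_t>((hash >> 24) | 0x80);
}

static_assert((tag32(0x00000000u) & 0x80) != 0, "");
static_assert((tag32(0xdeadbeefu) & 0x80) != 0, "");
static_assert(tag32(0xdeadbeefu) == 0xde, "");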
//////// memory management helpers
......
......@@ -1126,23 +1126,28 @@ TEST(F14VectorMap, destructuringErase) {
0);
}
TEST(F14ValueMap, vectorMaxSize) {
TEST(F14ValueMap, maxSize) {
F14ValueMap<int, int> m;
EXPECT_EQ(
m.max_size(),
std::numeric_limits<uint64_t>::max() / sizeof(std::pair<int, int>));
std::numeric_limits<std::size_t>::max() / sizeof(std::pair<int, int>));
}
TEST(F14NodeMap, vectorMaxSize) {
TEST(F14NodeMap, maxSize) {
F14NodeMap<int, int> m;
EXPECT_EQ(
m.max_size(),
std::numeric_limits<uint64_t>::max() / sizeof(std::pair<int, int>));
std::numeric_limits<std::size_t>::max() / sizeof(std::pair<int, int>));
}
TEST(F14VectorMap, vectorMaxSize) {
F14VectorMap<int, int> m;
EXPECT_EQ(m.max_size(), std::numeric_limits<uint32_t>::max());
EXPECT_EQ(
m.max_size(),
std::min(
std::size_t{std::numeric_limits<uint32_t>::max()},
std::numeric_limits<std::size_t>::max() /
sizeof(std::pair<int, int>)));
}
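The extra std::min term reflects that the vector policies are presumably capped by their 32-bit item indices into the contiguous value array, while on 32-bit builds the std::size_t / sizeof(std::pair<int, int>) bound is the smaller of the two; taking the minimum keeps the expectation correct for both pointer widths.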
template <typename M>
......
......@@ -100,19 +100,31 @@ TEST(F14Set, getAllocatedMemorySize) {
{
folly::F14ValueSet<int> set;
set.insert(10);
EXPECT_EQ(sizeof(set), 32);
EXPECT_EQ(set.getAllocatedMemorySize(), 32);
EXPECT_EQ(sizeof(set), 4 * sizeof(void*));
if (alignof(folly::max_align_t) == 16) {
// chunks will be allocated as 2 max_align_t-s
EXPECT_EQ(set.getAllocatedMemorySize(), 32);
} else {
// chunks will be allocated using aligned_malloc with the true size
EXPECT_EQ(set.getAllocatedMemorySize(), 24);
}
}
{
folly::F14NodeSet<int> set;
set.insert(10);
EXPECT_EQ(sizeof(set), 32);
EXPECT_EQ(set.getAllocatedMemorySize(), 36);
EXPECT_EQ(sizeof(set), 4 * sizeof(void*));
if (alignof(folly::max_align_t) == 16) {
// chunks will be allocated as 2 max_align_t-s
EXPECT_EQ(set.getAllocatedMemorySize(), 36);
} else {
// chunks will be allocated using aligned_malloc with the true size
EXPECT_EQ(set.getAllocatedMemorySize(), 20 + 2 * sizeof(void*));
}
}
{
folly::F14VectorSet<int> set;
set.insert(10);
EXPECT_EQ(sizeof(set), 24);
EXPECT_EQ(sizeof(set), 8 + 2 * sizeof(void*));
EXPECT_EQ(set.getAllocatedMemorySize(), 32);
}
}
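The expected numbers above can be read, under the assumption that the first allocation is a single chunk with 16 bytes of tag/metadata plus two in-place item slots: F14ValueSet<int> needs 16 + 2 * 4 = 24 bytes, reported as two 16-byte units (32) when alignof(max_align_t) == 16 and as the exact 24 otherwise; F14NodeSet<int> stores two pointers in the chunk plus one 4-byte heap node, giving 36 on a 16-aligned 64-bit platform and 20 + 2 * sizeof(void*) elsewhere.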
......@@ -735,19 +747,25 @@ TEST(F14VectorSet, destructuring) {
runInsertAndEmplace<F14VectorSet<Tracked<0>>>();
}
TEST(F14ValueSet, vectorMaxSize) {
TEST(F14ValueSet, maxSize) {
F14ValueSet<int> s;
EXPECT_EQ(s.max_size(), std::numeric_limits<uint64_t>::max() / sizeof(int));
EXPECT_EQ(
s.max_size(), std::numeric_limits<std::size_t>::max() / sizeof(int));
}
TEST(F14NodeSet, vectorMaxSize) {
TEST(F14NodeSet, maxSize) {
F14NodeSet<int> s;
EXPECT_EQ(s.max_size(), std::numeric_limits<uint64_t>::max() / sizeof(int));
EXPECT_EQ(
s.max_size(), std::numeric_limits<std::size_t>::max() / sizeof(int));
}
TEST(F14VectorSet, vectorMaxSize) {
TEST(F14VectorSet, maxSize) {
F14VectorSet<int> s;
EXPECT_EQ(s.max_size(), std::numeric_limits<uint32_t>::max());
EXPECT_EQ(
s.max_size(),
std::min(
std::size_t{std::numeric_limits<uint32_t>::max()},
std::numeric_limits<std::size_t>::max() / sizeof(int)));
}
template <typename S>
......