Commit 27929e3d authored by Nathan Bronson, committed by Facebook GitHub Bot

use vector intrinsics for F14 on 32-bit platforms

Summary:
This diff adds 32-bit support for F14, so SSE and NEON
intrinsics can now be used on 32-bit x86 and ARM (rather than only
x86_64 and AArch64). The portability fallback to std::unordered_map
and std::unordered_set is now used only when vector intrinsics are not
available, or on PPC.

Reviewed By: shixiao

Differential Revision: D8586283

fbshipit-source-id: 1c4d090e80381fe7ad071c3059b3cb242c04c9f7
parent 3cb1c6d0
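
For context, a minimal usage sketch (not part of this commit; it assumes a folly build is on the include path and link line). Caller code does not change; with this diff the SIMD table is also selected on 32-bit x86 and ARM, and FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE tells you which backend you got:

    #include <folly/container/F14Map.h>
    #include <folly/container/detail/F14IntrinsicsAvailability.h>

    #include <cstdio>

    int main() {
      folly::F14ValueMap<int, int> m;
      m.emplace(1, 2);
    #if FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE
      std::printf("using the SIMD F14 table\n");
    #else
      std::printf("using the std::unordered_map fallback\n");
    #endif
      return m.count(1) == 1 ? 0 : 1;
    }
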
@@ -301,6 +301,12 @@ constexpr auto kIsBigEndian = !kIsLittleEndian;
 #define FOLLY_SSE_PREREQ(major, minor) \
   (FOLLY_SSE > major || FOLLY_SSE == major && FOLLY_SSE_MINOR >= minor)
 
+#ifndef FOLLY_NEON
+# if defined(__ARM_NEON)
+# define FOLLY_NEON 1
+# endif
+#endif
+
 #if FOLLY_UNUSUAL_GFLAGS_NAMESPACE
 namespace FOLLY_GFLAGS_NAMESPACE { }
 namespace gflags {
@@ -18,16 +18,16 @@
 #include <folly/Portability.h>
 
-// F14 has been implemented for x86_64 SSE2 and AARCH64 NEON (so far)
+// clang-format off
 #ifndef FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE
-# if FOLLY_SSE >= 2 && FOLLY_X64 || FOLLY_AARCH64
-# define FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE 1
-# else
-# define FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE 0
-# pragma message \
+// F14 has been implemented for SSE2 and NEON (so far)
+#if FOLLY_SSE >= 2 || FOLLY_NEON
+#define FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE 1
+#else
+#define FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE 0
+#pragma message \
     "Vector intrinsics / F14 support unavailable on this platform, " \
     "falling back to std::unordered_map / set"
-# endif
+#endif
+// clang-format on
 #endif
@@ -25,7 +25,9 @@
 #include <folly/Unit.h>
 #include <folly/container/detail/F14Table.h>
 #include <folly/hash/Hash.h>
+#include <folly/lang/Align.h>
 #include <folly/lang/SafeAssert.h>
+#include <folly/memory/Malloc.h>
 
 #if FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE
@@ -226,6 +228,10 @@ struct BasePolicy
   std::size_t computeKeyHash(K const& key) const {
     static_assert(
         isAvalanchingHasher() == IsAvalanchingHasher<Hasher, K>::value, "");
+    static_assert(
+        !isAvalanchingHasher() ||
+            sizeof(decltype(hasher()(key))) >= sizeof(std::size_t),
+        "hasher is not avalanching if it doesn't return enough bits");
     return hasher()(key);
   }
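
To make the new static_assert concrete, here is a hypothetical hasher (not from folly) of the kind it is designed to reject:

    #include <cstddef>
    #include <cstdint>

    // Hypothetical example: this hasher produces only 32 bits of output.
    // If an IsAvalanchingHasher specialization ever claimed it avalanches,
    // the assert above would fire wherever sizeof(std::size_t) == 8, since
    // a 32-bit result cannot avalanche all bits of a 64-bit std::size_t.
    struct Narrow32Hasher {
      std::uint32_t operator()(int key) const {
        return static_cast<std::uint32_t>(key) * 0x9e3779b9u;
      }
    };
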
@@ -272,14 +278,13 @@ struct BasePolicy
       std::size_t /*capacity*/,
       P&& /*rhs*/) {}
 
-  // Rounds chunkBytes up to the next multiple of 16 if it is possible
-  // that a sub-Chunk allocation has a size that is not a multiple of 16.
-  static std::size_t alignedChunkAllocSize(std::size_t chunkAllocSize) {
-    if ((sizeof(Item) % 8) != 0) {
-      chunkAllocSize = -(-chunkAllocSize & ~std::size_t{0xf});
+  std::size_t alignedAllocSize(std::size_t n) const {
+    if (kRequiredVectorAlignment <= alignof(max_align_t) ||
+        std::is_same<ByteAlloc, std::allocator<uint8_t>>::value) {
+      return n;
+    } else {
+      return n + kRequiredVectorAlignment;
     }
-    FOLLY_SAFE_DCHECK((chunkAllocSize % 16) == 0, "");
-    return chunkAllocSize;
   }
 
   bool beforeRehash(
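
The over-aligned allocation helpers used below (allocateOverAligned, deallocateOverAligned, allocationBytesForOverAligned, presumably provided by the newly included folly/memory/Malloc.h and folly/lang/Align.h) are not shown in this diff. When the default std::allocator is used, or when max_align_t is already as strict as the vector alignment, no padding is needed; otherwise alignedAllocSize budgets one extra alignment's worth of bytes so the pointer can be rounded up inside the block. A rough sketch of that second case (illustration only, not the folly implementation):

    #include <cstddef>
    #include <cstdint>

    // `alignment` must be a power of two; the caller must have allocated at
    // least n + alignment bytes, which is what alignedAllocSize() budgets for.
    inline void* alignUpWithinPaddedBlock(void* raw, std::size_t alignment) {
      auto addr = reinterpret_cast<std::uintptr_t>(raw);
      auto aligned = (addr + alignment - 1) & ~(alignment - 1);
      return reinterpret_cast<void*>(aligned);
    }
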
@@ -288,9 +293,9 @@ struct BasePolicy
       std::size_t /*newCapacity*/,
       std::size_t chunkAllocSize,
       BytePtr& outChunkAllocation) {
-    ByteAlloc a{alloc()};
     outChunkAllocation =
-        ByteAllocTraits::allocate(a, alignedChunkAllocSize(chunkAllocSize));
+        allocateOverAligned<ByteAlloc, kRequiredVectorAlignment>(
+            alloc(), chunkAllocSize);
     return false;
   }
@@ -304,9 +309,8 @@ struct BasePolicy
       std::size_t chunkAllocSize) {
     // on success, this will be the old allocation, on failure the new one
     if (chunkAllocation != nullptr) {
-      ByteAlloc a{alloc()};
-      ByteAllocTraits::deallocate(
-          a, chunkAllocation, alignedChunkAllocSize(chunkAllocSize));
+      deallocateOverAligned<ByteAlloc, kRequiredVectorAlignment>(
+          alloc(), chunkAllocation, chunkAllocSize);
     }
   }
@@ -321,10 +325,8 @@ struct BasePolicy
       std::size_t /*capacity*/,
       BytePtr chunkAllocation,
       std::size_t chunkAllocSize) {
-    FOLLY_SAFE_DCHECK(chunkAllocation != nullptr, "");
-    ByteAlloc a{alloc()};
-    ByteAllocTraits::deallocate(
-        a, chunkAllocation, alignedChunkAllocSize(chunkAllocSize));
+    deallocateOverAligned<ByteAlloc, kRequiredVectorAlignment>(
+        alloc(), chunkAllocation, chunkAllocSize);
   }
 
   void prefetchValue(Item const&) const {
@@ -459,6 +461,8 @@ class ValueContainerPolicy : public BasePolicy<
   using typename Super::Value;
 
  private:
+  using typename Super::ByteAlloc;
+
   using Super::kIsMap;
 
  public:
@@ -571,7 +575,10 @@ class ValueContainerPolicy : public BasePolicy<
       std::size_t /*capacity*/,
       V&& visitor) const {
     if (chunkAllocSize > 0) {
-      visitor(Super::alignedChunkAllocSize(chunkAllocSize), 1);
+      visitor(
+          allocationBytesForOverAligned<ByteAlloc, kRequiredVectorAlignment>(
+              chunkAllocSize),
+          1);
     }
   }
@@ -698,6 +705,8 @@ class NodeContainerPolicy
   using typename Super::Value;
 
  private:
+  using typename Super::ByteAlloc;
+
   using Super::kIsMap;
 
  public:
@@ -796,7 +805,10 @@ class NodeContainerPolicy
       std::size_t /*capacity*/,
       V&& visitor) const {
     if (chunkAllocSize > 0) {
-      visitor(Super::alignedChunkAllocSize(chunkAllocSize), 1);
+      visitor(
+          allocationBytesForOverAligned<ByteAlloc, kRequiredVectorAlignment>(
+              chunkAllocSize),
+          1);
     }
     if (size > 0) {
       visitor(sizeof(Value), size);
@@ -1212,14 +1224,7 @@ class VectorContainerPolicy : public BasePolicy<
   static std::size_t allocSize(
       std::size_t prefixBytes,
       std::size_t valueCapacity) {
-    auto n = valuesOffset(prefixBytes) + sizeof(Value) * valueCapacity;
-    if (alignof(Value) <= 8) {
-      // ensure that the result is a multiple of 16 to protect against
-      // allocators that don't always align to 16
-      n = -(-n & ~std::size_t{0xf});
-    }
-    FOLLY_SAFE_DCHECK((n % 16) == 0, "");
-    return n;
+    return valuesOffset(prefixBytes) + sizeof(Value) * valueCapacity;
   }
 
  public:
@@ -1235,11 +1240,9 @@ class VectorContainerPolicy : public BasePolicy<
         newCapacity <= (std::numeric_limits<Item>::max)(),
         "");
 
-    {
-      ByteAlloc a{this->alloc()};
-      outChunkAllocation =
-          ByteAllocTraits::allocate(a, allocSize(chunkAllocSize, newCapacity));
-    }
+    outChunkAllocation =
+        allocateOverAligned<ByteAlloc, kRequiredVectorAlignment>(
+            Super::alloc(), allocSize(chunkAllocSize, newCapacity));
 
     ValuePtr before = values_;
     ValuePtr after = std::pointer_traits<ValuePtr>::pointer_to(
@@ -1279,9 +1282,8 @@ class VectorContainerPolicy : public BasePolicy<
     // on success, chunkAllocation is the old allocation, on failure it is the
     // new one
     if (chunkAllocation != nullptr) {
-      ByteAlloc a{this->alloc()};
-      ByteAllocTraits::deallocate(
-          a,
+      deallocateOverAligned<ByteAlloc, kRequiredVectorAlignment>(
+          Super::alloc(),
           chunkAllocation,
           allocSize(chunkAllocSize, (success ? oldCapacity : newCapacity)));
     }
@@ -1306,9 +1308,8 @@ class VectorContainerPolicy : public BasePolicy<
       BytePtr chunkAllocation,
       std::size_t chunkAllocSize) {
     if (chunkAllocation != nullptr) {
-      ByteAlloc a{this->alloc()};
-      ByteAllocTraits::deallocate(
-          a, chunkAllocation, allocSize(chunkAllocSize, capacity));
+      deallocateOverAligned<ByteAlloc, kRequiredVectorAlignment>(
+          Super::alloc(), chunkAllocation, allocSize(chunkAllocSize, capacity));
       values_ = nullptr;
     }
   }
@@ -1321,7 +1322,10 @@ class VectorContainerPolicy : public BasePolicy<
       V&& visitor) const {
     FOLLY_SAFE_DCHECK((chunkAllocSize == 0) == (capacity == 0), "");
     if (chunkAllocSize > 0) {
-      visitor(allocSize(chunkAllocSize, capacity), 1);
+      visitor(
+          allocationBytesForOverAligned<ByteAlloc, kRequiredVectorAlignment>(
+              allocSize(chunkAllocSize, capacity)),
+          1);
     }
   }
@@ -24,7 +24,7 @@ namespace folly {
 namespace f14 {
 namespace detail {
 
-TagVector kEmptyTagVector = {};
+EmptyTagVectorType kEmptyTagVector = {};
 
 } // namespace detail
 } // namespace f14
@@ -48,8 +48,8 @@
 #include <folly/container/detail/F14IntrinsicsAvailability.h>
 
 #if FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE
-#if FOLLY_AARCH64
-#if __ARM_FEATURE_CRC32
+#if FOLLY_NEON
+#ifdef __ARM_FEATURE_CRC32
 #include <arm_acle.h> // __crc32cd
 #endif
 #include <arm_neon.h> // uint8x16t intrinsics
@@ -246,12 +246,11 @@ using KeyTypeForEmplace = KeyTypeForEmplaceHelper<
 template <typename T>
 FOLLY_ALWAYS_INLINE static void prefetchAddr(T const* ptr) {
-#if FOLLY_AARCH64
+#ifndef _WIN32
   __builtin_prefetch(static_cast<void const*>(ptr));
+#elif FOLLY_NEON
+  __prefetch(static_cast<void const*>(ptr));
 #else
-  // _mm_prefetch is x86_64-specific and comes from xmmintrin.h.
-  // It seems to compile to the same thing as __builtin_prefetch, but
-  // also works on windows.
   _mm_prefetch(
       static_cast<char const*>(static_cast<void const*>(ptr)), _MM_HINT_T0);
 #endif
@@ -267,21 +266,31 @@ FOLLY_ALWAYS_INLINE static unsigned findFirstSetNonZero(T mask) {
   }
 }
 
-#if FOLLY_AARCH64
+#if FOLLY_NEON
 using TagVector = uint8x16_t;
 using MaskType = uint64_t;
 constexpr unsigned kMaskSpacing = 4;
-#else
+#else // SSE2
 using TagVector = __m128i;
-using MaskType = unsigned;
+using MaskType = uint32_t;
 constexpr unsigned kMaskSpacing = 1;
 #endif
 
-extern TagVector kEmptyTagVector;
+// We could use unaligned loads to relax this requirement, but that
+// would be both a performance penalty and require a bulkier packed
+// ItemIter format
+constexpr std::size_t kRequiredVectorAlignment =
+    constexpr_max(std::size_t{16}, alignof(max_align_t));
+
+using EmptyTagVectorType = std::aligned_storage_t<
+    sizeof(TagVector) + kRequiredVectorAlignment,
+    alignof(max_align_t)>;
+
+extern EmptyTagVectorType kEmptyTagVector;
 
 template <unsigned BitCount>
 struct FullMask {
@@ -292,6 +301,74 @@ struct FullMask {
 template <>
 struct FullMask<1> : std::integral_constant<MaskType, 1> {};
 
+#if FOLLY_ARM
+// Mask iteration is different for ARM because that is the only platform
+// for which the mask is bigger than a register.
+
+// Iterates a mask, optimized for the case that only a few bits are set
+class SparseMaskIter {
+  static_assert(kMaskSpacing == 4, "");
+
+  uint32_t interleavedMask_;
+
+ public:
+  explicit SparseMaskIter(MaskType mask)
+      : interleavedMask_{static_cast<uint32_t>(((mask >> 32) << 2) | mask)} {}
+
+  bool hasNext() {
+    return interleavedMask_ != 0;
+  }
+
+  unsigned next() {
+    FOLLY_SAFE_DCHECK(hasNext(), "");
+    unsigned i = findFirstSetNonZero(interleavedMask_);
+    interleavedMask_ &= (interleavedMask_ - 1);
+    return ((i >> 2) | (i << 2)) & 0xf;
+  }
+};
+
+// Iterates a mask, optimized for the case that most bits are set
+class DenseMaskIter {
+  static_assert(kMaskSpacing == 4, "");
+
+  std::size_t count_;
+  unsigned index_;
+  uint8_t const* tags_;
+
+ public:
+  explicit DenseMaskIter(uint8_t const* tags, MaskType mask) {
+    if (mask == 0) {
+      count_ = 0;
+    } else {
+      count_ =
+          folly::popcount(static_cast<uint32_t>(((mask >> 32) << 2) | mask));
+      if (LIKELY((mask & 1) != 0)) {
+        index_ = 0;
+      } else {
+        index_ = findFirstSetNonZero(mask) / kMaskSpacing;
+      }
+      tags_ = tags;
+    }
+  }
+
+  bool hasNext() {
+    return count_ > 0;
+  }
+
+  unsigned next() {
+    auto rv = index_;
+    --count_;
+    if (count_ > 0) {
+      do {
+        ++index_;
+      } while ((tags_[index_] & 0x80) == 0);
+    }
+    FOLLY_SAFE_DCHECK(index_ < 16, "");
+    return rv;
+  }
+};
+
+#else
+
 // Iterates a mask, optimized for the case that only a few bits are set
 class SparseMaskIter {
   MaskType mask_;
@@ -317,7 +394,7 @@ class DenseMaskIter {
   unsigned index_{0};
 
  public:
-  explicit DenseMaskIter(MaskType mask) : mask_{mask} {}
+  explicit DenseMaskIter(uint8_t const*, MaskType mask) : mask_{mask} {}
 
   bool hasNext() {
     return mask_ != 0;
@@ -337,6 +414,7 @@ class DenseMaskIter {
     }
   }
 };
+#endif
 
 // Iterates a mask, returning pairs of [begin,end) index covering blocks
 // of set bits
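
The interleaving in the ARM SparseMaskIter above deserves a note. With kMaskSpacing == 4, and assuming (as the decode step in next() implies, and as the SSE2 path's kFullMask masking shows) each occupied slot k contributes a single bit at position 4*k of the 64-bit mask, scanning that mask is awkward with 32-bit registers. Folding the high word over the low word with a 2-bit offset gives every slot its own bit inside one 32-bit value, and ((i >> 2) | (i << 2)) & 0xf inverts the mapping. A standalone round-trip check of that trick (illustration only; assumes GCC/Clang, with __builtin_ctz standing in for findFirstSetNonZero):

    #include <cassert>
    #include <cstdint>

    int main() {
      for (unsigned k = 0; k < 16; ++k) {
        // a mask with only slot k occupied: one bit at position 4 * k
        std::uint64_t mask = std::uint64_t{1} << (4 * k);
        auto interleaved =
            static_cast<std::uint32_t>(((mask >> 32) << 2) | mask);
        unsigned i = static_cast<unsigned>(__builtin_ctz(interleaved));
        unsigned slot = ((i >> 2) | (i << 2)) & 0xf;
        assert(slot == k); // the decode step recovers the slot index
      }
      return 0;
    }
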
@@ -405,10 +483,10 @@ class FirstEmptyInMask {
 };
 
 template <typename ItemType>
-struct alignas(16) F14Chunk {
+struct alignas(kRequiredVectorAlignment) F14Chunk {
   using Item = ItemType;
 
-  // Assuming alignof(max_align_t) == 16 (and assuming alignof(Item) >=
+  // For our 16 byte vector alignment (and assuming alignof(Item) >=
   // 4) kCapacity of 14 is the most space efficient. Slightly smaller
   // or larger capacities can help with cache alignment in a couple of
   // cases without wasting too much space, but once the items are larger
@@ -447,11 +525,15 @@ struct alignas(kRequiredVectorAlignment) F14Chunk {
       rawItems_;
 
   static F14Chunk* emptyInstance() {
-    auto rv = static_cast<F14Chunk*>(static_cast<void*>(&kEmptyTagVector));
+    auto raw = reinterpret_cast<char*>(&kEmptyTagVector);
+    if (kRequiredVectorAlignment > alignof(max_align_t)) {
+      auto delta = kRequiredVectorAlignment -
+          (reinterpret_cast<uintptr_t>(raw) % kRequiredVectorAlignment);
+      raw += delta;
+    }
+    auto rv = reinterpret_cast<F14Chunk*>(raw);
     FOLLY_SAFE_DCHECK(
-        !rv->occupied(0) && rv->chunk0Capacity() == 0 &&
-            rv->outboundOverflowCount() == 0,
-        "");
+        (reinterpret_cast<uintptr_t>(rv) % kRequiredVectorAlignment) == 0, "");
     return rv;
   }
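
kEmptyTagVector is now only max_align_t-aligned storage with kRequiredVectorAlignment bytes of slack, so emptyInstance() fixes up the alignment by hand whenever alignof(max_align_t) is smaller than the vector alignment. A minimal sketch of that adjustment (illustration, not the folly code; the helper name is invented):

    #include <cstddef>
    #include <cstdint>

    // Mirrors the arithmetic above.  When the storage happens to be aligned
    // already, the computed delta is a full `alignment` rather than 0; that is
    // safe only because EmptyTagVectorType reserves
    // sizeof(TagVector) + kRequiredVectorAlignment bytes, so the aligned
    // TagVector always still fits inside the storage.
    inline char* roundUpForEmptyVector(char* raw, std::size_t alignment) {
      auto rem = reinterpret_cast<std::uintptr_t>(raw) % alignment;
      return raw + (alignment - rem);
    }
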
@@ -534,9 +616,9 @@ struct alignas(16) F14Chunk {
     tags_[index] = 0;
   }
 
-#if FOLLY_AARCH64
+#if FOLLY_NEON
   ////////
-  // Tag filtering using AArch64 Advanced SIMD (NEON) intrinsics
+  // Tag filtering using NEON intrinsics
 
   SparseMaskIter tagMatchIter(uint8_t needle) const {
     FOLLY_SAFE_DCHECK((needle & 0x80) != 0, "");
@@ -550,7 +632,7 @@ struct alignas(16) F14Chunk {
     return SparseMaskIter(mask);
   }
 
-  uint64_t occupiedMask() const {
+  MaskType occupiedMask() const {
     uint8x16_t tagV = vld1q_u8(&tags_[0]);
     // signed shift extends top bit to all bits
     auto occupiedV =
@@ -560,7 +642,7 @@ struct alignas(16) F14Chunk {
   }
 #else
   ////////
-  // Tag filtering using x86_64 SSE2 intrinsics
+  // Tag filtering using SSE2 intrinsics
 
   TagVector const* tagVector() const {
     return static_cast<TagVector const*>(static_cast<void const*>(&tags_[0]));
@@ -575,14 +657,14 @@ struct alignas(16) F14Chunk {
     return SparseMaskIter{mask};
   }
 
-  unsigned occupiedMask() const {
+  MaskType occupiedMask() const {
     auto tagV = _mm_load_si128(tagVector());
     return _mm_movemask_epi8(tagV) & kFullMask;
   }
 #endif
 
   DenseMaskIter occupiedIter() const {
-    return DenseMaskIter{occupiedMask()};
+    return DenseMaskIter{&tags_[0], occupiedMask()};
   }
 
   MaskRangeIter occupiedRangeIter() const {
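
For readers less familiar with the SSE2 path: _mm_movemask_epi8 gathers the high bit of each of the 16 tag bytes into the low 16 bits of an int, and occupied slots always have tags with the high bit set, so the result is one bit per occupied slot (kMaskSpacing == 1). A scalar sketch of the same computation (illustration only; the real code above additionally masks with kFullMask to drop the lanes beyond kCapacity):

    #include <cstdint>

    inline std::uint32_t scalarOccupiedMask(const std::uint8_t tags[16]) {
      std::uint32_t mask = 0;
      for (unsigned i = 0; i < 16; ++i) {
        // take the high bit of each tag byte, place it at bit i
        mask |= static_cast<std::uint32_t>(tags[i] >> 7) << i;
      }
      return mask;
    }
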
@@ -637,8 +719,7 @@ struct alignas(16) F14Chunk {
 template <typename Ptr>
 class PackedChunkItemPtr {
  public:
-  PackedChunkItemPtr(Ptr p, std::size_t i) noexcept
-      : ptr_{p}, index_{static_cast<unsigned>(i)} {
+  PackedChunkItemPtr(Ptr p, std::size_t i) noexcept : ptr_{p}, index_{i} {
     FOLLY_SAFE_DCHECK(ptr_ != nullptr || index_ == 0, "");
   }
@@ -666,7 +747,7 @@ class PackedChunkItemPtr {
  private:
   Ptr ptr_;
-  unsigned index_;
+  std::size_t index_;
 };
 
 // Bare pointer form, packed into a uintptr_t. Uses only bits wasted by
@@ -1131,17 +1212,19 @@ class F14Table : public Policy {
   // For hash functions we don't trust to avalanche, we repair things by
   // applying a bit mixer to the user-supplied hash.
 
+#if FOLLY_X64 || FOLLY_AARCH64
+  // 64-bit
   static HashPair splitHash(std::size_t hash) {
+    static_assert(sizeof(std::size_t) == sizeof(uint64_t), "");
     uint8_t tag;
     if (!Policy::isAvalanchingHasher()) {
-#if FOLLY_SSE > 4 || (FOLLY_SSE == 4 && FOLLY_SSE_MINOR >= 2)
+#if FOLLY_SSE_PREREQ(4, 2)
       // SSE4.2 CRC
       auto c = _mm_crc32_u64(0, hash);
       tag = static_cast<uint8_t>(~(c >> 25));
       hash += c;
-#elif FOLLY_AARCH64 && __ARM_FEATURE_CRC32
-      // AARCH64 CRC is Optional on armv8 (-march=armv8-a+crc), standard
-      // on armv8.1
+#elif __ARM_FEATURE_CRC32
      // CRC is optional on armv8 (-march=armv8-a+crc), standard on armv8.1
       auto c = __crc32cd(0, hash);
       tag = static_cast<uint8_t>(~(c >> 25));
       hash += c;
@@ -1178,6 +1261,35 @@ class F14Table : public Policy {
     }
     return std::make_pair(hash, tag);
   }
+#else
+  // 32-bit
+  static HashPair splitHash(std::size_t hash) {
+    static_assert(sizeof(std::size_t) == sizeof(uint32_t), "");
+    uint8_t tag;
+    if (!Policy::isAvalanchingHasher()) {
+#if FOLLY_SSE_PREREQ(4, 2)
+      // SSE4.2 CRC
+      auto c = _mm_crc32_u32(0, hash);
+      tag = static_cast<uint8_t>(~(c >> 25));
+      hash += c;
+#elif __ARM_FEATURE_CRC32
+      auto c = __crc32cw(0, hash);
+      tag = static_cast<uint8_t>(~(c >> 25));
+      hash += c;
+#else
+      // finalizer for 32-bit murmur2
+      hash ^= hash >> 13;
+      hash *= 0x5bd1e995;
+      hash ^= hash >> 15;
+      tag = static_cast<uint8_t>(~(hash >> 25));
+#endif
+    } else {
+      // we don't trust the top bit
+      tag = (hash >> 24) | 0x80;
+    }
+    return std::make_pair(hash, tag);
+  }
+#endif
 
   //////// memory management helpers
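
A standalone mirror of the new 32-bit fallback branch (the murmur2-finalizer path taken when neither SSE4.2 CRC nor ARM CRC32 is available and the hasher is not avalanching), for illustration. In each of the branches shown here the resulting tag ends up with its top bit set, which keeps occupied tags distinct from the zero tag of an empty slot:

    #include <cstdint>
    #include <utility>

    // Mirror of the #else branch above.  Returns the mixed hash plus the
    // 8-bit tag; hash >> 25 is at most 127, so ~(hash >> 25) truncated to
    // uint8_t always has its high bit set.
    inline std::pair<std::uint32_t, std::uint8_t> splitHash32Fallback(
        std::uint32_t hash) {
      hash ^= hash >> 13;
      hash *= 0x5bd1e995;
      hash ^= hash >> 15;
      auto tag = static_cast<std::uint8_t>(~(hash >> 25));
      return {hash, tag};
    }
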
@@ -1126,23 +1126,28 @@ TEST(F14VectorMap, destructuringErase) {
       0);
 }
 
-TEST(F14ValueMap, vectorMaxSize) {
+TEST(F14ValueMap, maxSize) {
   F14ValueMap<int, int> m;
   EXPECT_EQ(
       m.max_size(),
-      std::numeric_limits<uint64_t>::max() / sizeof(std::pair<int, int>));
+      std::numeric_limits<std::size_t>::max() / sizeof(std::pair<int, int>));
 }
 
-TEST(F14NodeMap, vectorMaxSize) {
+TEST(F14NodeMap, maxSize) {
   F14NodeMap<int, int> m;
   EXPECT_EQ(
       m.max_size(),
-      std::numeric_limits<uint64_t>::max() / sizeof(std::pair<int, int>));
+      std::numeric_limits<std::size_t>::max() / sizeof(std::pair<int, int>));
 }
 
 TEST(F14VectorMap, vectorMaxSize) {
   F14VectorMap<int, int> m;
-  EXPECT_EQ(m.max_size(), std::numeric_limits<uint32_t>::max());
+  EXPECT_EQ(
+      m.max_size(),
+      std::min(
+          std::size_t{std::numeric_limits<uint32_t>::max()},
+          std::numeric_limits<std::size_t>::max() /
+              sizeof(std::pair<int, int>)));
 }
 
 template <typename M>
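
The max_size expectations in both the map and set tests change for the same reason: max_size() is now expressed in 32-bit-friendly terms. The vector-policy containers are additionally bounded by what is presumably a 32-bit item index (hence the uint32_t limit), and every container is bounded by how many elements fit in the address space, so the expected value is the smaller of the two. A sketch of that bound (illustrative helper, not part of the tests):

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <limits>

    inline std::size_t expectedVectorMaxSize(std::size_t elementSize) {
      return std::min(
          std::size_t{std::numeric_limits<std::uint32_t>::max()},
          std::numeric_limits<std::size_t>::max() / elementSize);
    }
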
@@ -100,19 +100,31 @@ TEST(F14Set, getAllocatedMemorySize) {
   {
     folly::F14ValueSet<int> set;
     set.insert(10);
-    EXPECT_EQ(sizeof(set), 32);
-    EXPECT_EQ(set.getAllocatedMemorySize(), 32);
+    EXPECT_EQ(sizeof(set), 4 * sizeof(void*));
+    if (alignof(folly::max_align_t) == 16) {
+      // chunks will be allocated as 2 max_align_t-s
+      EXPECT_EQ(set.getAllocatedMemorySize(), 32);
+    } else {
+      // chunks will be allocated using aligned_malloc with the true size
+      EXPECT_EQ(set.getAllocatedMemorySize(), 24);
+    }
   }
   {
     folly::F14NodeSet<int> set;
     set.insert(10);
-    EXPECT_EQ(sizeof(set), 32);
-    EXPECT_EQ(set.getAllocatedMemorySize(), 36);
+    EXPECT_EQ(sizeof(set), 4 * sizeof(void*));
+    if (alignof(folly::max_align_t) == 16) {
+      // chunks will be allocated as 2 max_align_t-s
+      EXPECT_EQ(set.getAllocatedMemorySize(), 36);
+    } else {
+      // chunks will be allocated using aligned_malloc with the true size
+      EXPECT_EQ(set.getAllocatedMemorySize(), 20 + 2 * sizeof(void*));
+    }
   }
   {
     folly::F14VectorSet<int> set;
     set.insert(10);
-    EXPECT_EQ(sizeof(set), 24);
+    EXPECT_EQ(sizeof(set), 8 + 2 * sizeof(void*));
     EXPECT_EQ(set.getAllocatedMemorySize(), 32);
   }
 }
@@ -735,19 +747,25 @@ TEST(F14VectorSet, destructuring) {
   runInsertAndEmplace<F14VectorSet<Tracked<0>>>();
 }
 
-TEST(F14ValueSet, vectorMaxSize) {
+TEST(F14ValueSet, maxSize) {
   F14ValueSet<int> s;
-  EXPECT_EQ(s.max_size(), std::numeric_limits<uint64_t>::max() / sizeof(int));
+  EXPECT_EQ(
+      s.max_size(), std::numeric_limits<std::size_t>::max() / sizeof(int));
 }
 
-TEST(F14NodeSet, vectorMaxSize) {
+TEST(F14NodeSet, maxSize) {
   F14NodeSet<int> s;
-  EXPECT_EQ(s.max_size(), std::numeric_limits<uint64_t>::max() / sizeof(int));
+  EXPECT_EQ(
+      s.max_size(), std::numeric_limits<std::size_t>::max() / sizeof(int));
 }
 
-TEST(F14VectorSet, vectorMaxSize) {
+TEST(F14VectorSet, maxSize) {
   F14VectorSet<int> s;
-  EXPECT_EQ(s.max_size(), std::numeric_limits<uint32_t>::max());
+  EXPECT_EQ(
+      s.max_size(),
+      std::min(
+          std::size_t{std::numeric_limits<uint32_t>::max()},
+          std::numeric_limits<std::size_t>::max() / sizeof(int)));
 }
 
 template <typename S>