Commit 19c316ae authored by Nathan Bronson, committed by Facebook Github Bot

native aarch64 support for F14 hash tables

Summary:
This diff adds support for the F14 algorithm on aarch64, using the
Advanced SIMD (NEON) instructions that are part of that platform.

Reviewed By: yfeldblum

Differential Revision: D7765273

fbshipit-source-id: 866a8a3481ad60b8aadcfb39718d6a5e62bbe07c
parent 25f4df21
......@@ -2,11 +2,12 @@
F14 is a 14-way probing hash table that resolves collisions by double
hashing. Up to 14 keys are stored in a chunk at a single hash table
position. SSE2 vector instructions are used to filter within a chunk;
intra-chunk search takes only a handful of instructions. **F14** refers
to the fact that the algorithm **F**ilters up to **14** keys at a time.
This strategy allows the hash table to be operated at a high maximum
load factor (12/14) while still keeping probe chains very short.
position. Vector instructions (SSE2 on x86_64, NEON on aarch64)
are used to filter within a chunk; intra-chunk search takes only a
handful of instructions. **F14** refers to the fact that the algorithm
**F**ilters up to **14** keys at a time. This strategy allows the hash
table to be operated at a high maximum load factor (12/14) while still
keeping probe chains very short.
F14 provides compelling replacements for most of the hash tables we use in
production at Facebook. Switching to it can improve memory efficiency
......@@ -157,10 +158,12 @@ unlikely to perform any key comparisons, successful searches are likely
to perform exactly 1 comparison, and all of the resulting branches are
pretty predictable.
The vector search uses SSE2 intrinsics. SSE2 is a non-optional part
of the x86_64 platform, so every 64-bit x86 platform supports them.
AARCH64's vector instructions will allow a similar strategy, although
the lack of a movemask operation complicates things a bit.
The vector search is coded using SIMD intrinsics, SSE2 on x86_64 and
NEON on aarch64. These instructions are a non-optional part of those
platforms (unlike later SIMD instruction sets like AVX2 or SVE), so no
special compilation flags are required. The exact vector operations
performed differ between x86_64 and aarch64 because aarch64 lacks a
movemask instruction, but the F14 algorithm is the same.
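To make the movemask point concrete, here is a minimal standalone sketch
(not the exact code in this diff; `neonMatchMask` is an illustrative name)
of how a movemask-style bitmask can be synthesized with NEON: compress each
0xff/0x00 comparison byte to a single bit, view the vector as two 64-bit
lanes, and fold the high lane over the low one with a 4-bit offset, as the
new `tagMatchIter` in this diff does.

```cpp
#include <arm_neon.h>
#include <cstdint>

// Sketch: build a 64-bit match mask for 16 byte-sized tags on aarch64.
// Tag i sets bit ((i * 8) % 64) + (i >= 8 ? 4 : 0) -- an "interleaved"
// layout rather than the dense 16-bit mask _mm_movemask_epi8 would give.
uint64_t neonMatchMask(uint8x16_t tags, uint8_t needle) {
  uint8x16_t eq = vceqq_u8(tags, vdupq_n_u8(needle)); // 0xff where equal
  // shift each comparison byte down to a single 0x01 bit
  uint64x2_t bits = vreinterpretq_u64_u8(vshrq_n_u8(eq, 7));
  uint64_t lo = vgetq_lane_u64(bits, 0);
  uint64_t hi = vgetq_lane_u64(bits, 1);
  // the two halves occupy disjoint bit positions, so OR merges losslessly
  return (hi << 4) | lo;
}
```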
## WHAT ABOUT MEMORY OVERHEAD FOR SMALL TABLES?
......
......@@ -19,9 +19,9 @@
#include <folly/Portability.h>
// clang-format off
// F14 is only available on x86 with SSE2 intrinsics (so far)
// F14 has been implemented for SSE2 and AARCH64 NEON (so far)
#ifndef FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE
# if FOLLY_SSE >= 2
# if FOLLY_SSE >= 2 || FOLLY_AARCH64
# define FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE 1
# else
# define FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE 0
......
......@@ -89,7 +89,7 @@ struct BasePolicy
return IsAvalanchingHasher<Hasher, Key>::value;
}
using Chunk = SSE2Chunk<Item>;
using Chunk = F14Chunk<Item>;
using ChunkPtr = typename std::pointer_traits<
typename AllocTraits::pointer>::template rebind<Chunk>;
using ItemIter = F14ItemIter<ChunkPtr>;
......@@ -240,7 +240,7 @@ class BaseIter : public std::iterator<
ValuePtr,
decltype(*std::declval<ValuePtr>())> {
protected:
using Chunk = SSE2Chunk<Item>;
using Chunk = F14Chunk<Item>;
using ChunkPtr =
typename std::pointer_traits<ValuePtr>::template rebind<Chunk>;
using ItemIter = F14ItemIter<ChunkPtr>;
......
......@@ -24,7 +24,7 @@ namespace folly {
namespace f14 {
namespace detail {
__m128i kEmptyTagVector = {};
TagVector kEmptyTagVector = {};
} // namespace detail
} // namespace f14
......
......@@ -35,7 +35,9 @@
#include <folly/Portability.h>
#include <folly/ScopeGuard.h>
#include <folly/Traits.h>
#include <folly/functional/ApplyTuple.h>
#include <folly/functional/Invoke.h>
#include <folly/lang/Align.h>
#include <folly/lang/Assume.h>
#include <folly/lang/Exception.h>
#include <folly/lang/Launder.h>
......@@ -48,9 +50,13 @@
#include <folly/container/detail/F14Memory.h>
#if FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE
#if FOLLY_AARCH64
#include <arm_neon.h>
#else // SSE2
#include <immintrin.h> // __m128i intrinsics
#include <xmmintrin.h> // _mm_prefetch
#endif
#endif
#ifdef _WIN32
#include <intrin.h> // for _mul128
......@@ -198,26 +204,196 @@ struct EnableIfIsTransparent<
template <typename T>
FOLLY_ALWAYS_INLINE static void prefetchAddr(T const* ptr) {
#if FOLLY_AARCH64
__builtin_prefetch(static_cast<void const*>(ptr));
#else
// _mm_prefetch is x86_64-specific and comes from xmmintrin.h.
// It compiles to the same thing as __builtin_prefetch.
// It seems to compile to the same thing as __builtin_prefetch, but
// also works on windows.
_mm_prefetch(
static_cast<char const*>(static_cast<void const*>(ptr)), _MM_HINT_T0);
#endif
}
extern __m128i kEmptyTagVector;
#if FOLLY_AARCH64
using TagVector = uint8x16_t;
#else
using TagVector = __m128i;
#endif
extern TagVector kEmptyTagVector;
// Iterates a 64-bit mask where elements are strided by 8 and the elements
// at indexes 8 and higher are layered back over the bottom 64-bits with
// a 4-bit offset.
//
// bitIndex = ((tagIndex * 8) % 64) + (tagIndex >= 8 ? 4 : 0)
//
// Iteration occurs in bitIndex order, not tagIndex. That should be fine
// for a sparse iterator, where we expect either 0 or 1 tag.
class Sparse8Interleaved4MaskIter {
uint64_t mask_;
public:
explicit Sparse8Interleaved4MaskIter(uint64_t mask) : mask_{mask} {}
bool hasNext() {
return mask_ != 0;
}
unsigned next() {
FOLLY_SAFE_DCHECK(hasNext(), "");
unsigned mixed = __builtin_ctzll(mask_);
FOLLY_SAFE_DCHECK((mixed % 4) == 0, "");
mask_ &= (mask_ - 1);
// mixed >> 3 has the bottom 3 bits of the result (no masking needed
// because all of the higher bits will be empty). mixed & 4 holds the
// bit that should be result & 8. We can merge it in either before or
// after sliding. Merging it before means we need to shift it left 4
// (so that the right shift 3 turns it into a left 1), which happens
// to be the same as multiplication by 17.
return ((mixed * 0x11) >> 3) & 0xf;
}
};
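As a sanity check of the decode arithmetic in next() above, this portable
snippet (with an illustrative decodeTagIndex helper, not part of the diff)
round-trips every tag index through the bitIndex formula:

```cpp
#include <cassert>

// ((bitIndex * 0x11) >> 3) & 0xf inverts
// bitIndex = ((tagIndex * 8) % 64) + (tagIndex >= 8 ? 4 : 0).
// Example: tagIndex 9 -> bitIndex 12; 12 * 0x11 == 204; 204 >> 3 == 25;
// 25 & 0xf == 9.
unsigned decodeTagIndex(unsigned bitIndex) {
  return ((bitIndex * 0x11) >> 3) & 0xf;
}

int main() {
  for (unsigned tag = 0; tag < 16; ++tag) {
    unsigned bit = ((tag * 8) % 64) + (tag >= 8 ? 4 : 0);
    assert(decodeTagIndex(bit) == tag);
  }
  return 0;
}
```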
// Iterates downward on occupied indexes by just checking tags[i] instead
// of using a mask
class TagCheckingIter {
uint8_t const* tags_;
int nextIndex_;
public:
explicit TagCheckingIter(uint8_t const* tags, int maxIndex)
: tags_{tags}, nextIndex_{maxIndex} {}
bool hasNext() {
return nextIndex_ >= 0;
}
unsigned next() {
auto rv = static_cast<unsigned>(nextIndex_);
do {
--nextIndex_;
} while (nextIndex_ >= 0 && tags_[nextIndex_] == 0);
return rv;
}
};
// Holds the result of an index query that has an optional result,
// interpreting an index of -1 to be the empty answer
class IndexHolder {
int index_;
public:
explicit IndexHolder(int index) : index_{index} {}
bool hasIndex() const {
return index_ >= 0;
}
unsigned index() const {
FOLLY_SAFE_DCHECK(hasIndex(), "");
return static_cast<unsigned>(index_);
}
};
// Iterates a mask, optimized for the case that only a few bits are set
class SparseMaskIter {
unsigned mask_;
public:
explicit SparseMaskIter(unsigned mask) : mask_{mask} {}
bool hasNext() {
return mask_ != 0;
}
unsigned next() {
FOLLY_SAFE_DCHECK(hasNext(), "");
unsigned i = __builtin_ctz(mask_);
mask_ &= (mask_ - 1);
return i;
}
};
// Iterates a mask, optimized for the case that most bits are set
class DenseMaskIter {
unsigned mask_;
unsigned index_{0};
public:
explicit DenseMaskIter(unsigned mask) : mask_{mask} {}
bool hasNext() {
return mask_ != 0;
}
unsigned next() {
FOLLY_SAFE_DCHECK(hasNext(), "");
if (LIKELY((mask_ & 1) != 0)) {
mask_ >>= 1;
return index_++;
} else {
unsigned s = __builtin_ctz(mask_);
unsigned rv = index_ + s;
mask_ >>= (s + 1);
index_ = rv + 1;
return rv;
}
}
};
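A quick illustration of the two mask iterators (assuming the definitions
above are in scope): both visit set bits in ascending order; SparseMaskIter
clears the lowest set bit each step, while DenseMaskIter walks bit by bit
and uses a ctz only to skip gaps.

```cpp
#include <cassert>

int main() {
  // bits 0, 1, and 3 set
  SparseMaskIter sparse{0b1011u};
  assert(sparse.next() == 0 && sparse.next() == 1 && sparse.next() == 3);
  assert(!sparse.hasNext());

  DenseMaskIter dense{0b1011u};
  assert(dense.next() == 0 && dense.next() == 1 && dense.next() == 3);
  assert(!dense.hasNext());
  return 0;
}
```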
// Holds the result of an index query that has an optional result,
// interpreting a mask of 0 to be the empty answer and the index of the
// last set bit to be the non-empty answer
class LastOccupiedInMask {
unsigned mask_;
public:
explicit LastOccupiedInMask(unsigned mask) : mask_{mask} {}
bool hasIndex() const {
return mask_ != 0;
}
unsigned index() const {
folly::assume(mask_ != 0);
return folly::findLastSet(mask_) - 1;
}
};
// Holds the result of an index query that has an optional result,
// interpreting a mask of 0 to be the empty answer and the index of the
// first set bit to be the non-empty answer
class FirstEmptyInMask {
unsigned mask_;
public:
explicit FirstEmptyInMask(unsigned mask) : mask_{mask} {}
bool hasIndex() const {
return mask_ != 0;
}
unsigned index() const {
FOLLY_SAFE_DCHECK(mask_ != 0, "");
return __builtin_ctz(mask_);
}
};
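And the corresponding holder semantics (again assuming the definitions
above are in scope; recall that folly::findLastSet(m) returns 1 plus the
index of the highest set bit):

```cpp
#include <cassert>

int main() {
  assert(LastOccupiedInMask{0b1011u}.index() == 3); // highest set bit
  assert(!LastOccupiedInMask{0u}.hasIndex());
  assert(FirstEmptyInMask{0b0100u}.index() == 2); // lowest set bit
  assert(!FirstEmptyInMask{0u}.hasIndex());
  return 0;
}
```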
template <typename ItemType>
struct alignas(std::max_align_t) SSE2Chunk {
struct alignas(max_align_t) F14Chunk {
using Item = ItemType;
// Assuming alignof(std::max_align_t) == 16 (and assuming alignof(Item)
// >= 4) kCapacity of 14 is always most space efficient. Slightly
// smaller or larger capacities can help with cache alignment in a
// couple of cases without wasting too much space, but once the items
// are larger then we're unlikely to get much benefit anyway. The only
// case we optimize is using kCapacity of 12 for 4 byte items, which
// makes the chunk take exactly 1 cache line, and adding 16 bytes of
// padding for 16 byte items so that a chunk takes exactly 4 cache lines.
// Assuming alignof(max_align_t) == 16 (and assuming alignof(Item) >=
// 4) kCapacity of 14 is the most space efficient. Slightly smaller
// or larger capacities can help with cache alignment in a couple of
cases without wasting too much space, but once the items are larger
we're unlikely to get much benefit anyway. The only case we
// optimize is using kCapacity of 12 for 4 byte items, which makes the
// chunk take exactly 1 cache line, and adding 16 bytes of padding for
// 16 byte items so that a chunk takes exactly 4 cache lines.
static constexpr unsigned kCapacity = sizeof(Item) == 4 ? 12 : 14;
static constexpr unsigned kDesiredCapacity = kCapacity - 2;
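For concreteness, the cache-line arithmetic in that comment, assuming the
16-byte metadata block (tags_, control_, outboundOverflowCount_) at the
front of each chunk, which is exactly what the 16-byte memset in clear()
below zeroes:

```cpp
// 4-byte items use kCapacity 12: 16 + 12 * 4 == 64, one cache line.
static_assert(16 + 12 * 4 == 64, "");
// 16-byte items use kCapacity 14 plus 16 bytes of padding:
// 16 + 14 * 16 + 16 == 256, exactly four 64-byte cache lines.
static_assert(16 + 14 * 16 + 16 == 256, "");
```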
......@@ -248,24 +424,32 @@ struct alignas(std::max_align_t) SSE2Chunk {
kAllocatedCapacity>
rawItems_;
static SSE2Chunk* emptyInstance() {
auto rv = static_cast<SSE2Chunk*>(static_cast<void*>(&kEmptyTagVector));
static F14Chunk* emptyInstance() {
auto rv = static_cast<F14Chunk*>(static_cast<void*>(&kEmptyTagVector));
FOLLY_SAFE_DCHECK(
rv->occupiedMask() == 0 && rv->chunk0Capacity() == 0 &&
!rv->occupied(0) && rv->chunk0Capacity() == 0 &&
rv->outboundOverflowCount() == 0,
"");
return rv;
}
void clear() {
// tags_ = {}; control_ = 0; outboundOverflowCount_ = 0;
// gcc < 6 doesn't exploit chunk alignment to generate the optimal
// SSE clear from memset. This is very hot code, so it is worth
// handling that case specially.
#if FOLLY_SSE >= 2 && __GNUC__ <= 5 && !__clang__
// this doesn't violate strict aliasing rules because __m128i is
// tagged as __may_alias__
auto* v = static_cast<__m128i*>(static_cast<void*>(&tags_[0]));
_mm_store_si128(v, _mm_setzero_si128());
// tags_ = {}; control_ = 0; outboundOverflowCount_ = 0;
#else
std::memset(&tags_[0], '\0', 16);
#endif
}
void copyOverflowInfoFrom(SSE2Chunk const& rhs) {
void copyOverflowInfoFrom(F14Chunk const& rhs) {
FOLLY_SAFE_DCHECK(hostedOverflowCount() == 0, "");
control_ += rhs.control_ & 0xf0;
outboundOverflowCount_ = rhs.outboundOverflowCount_;
......@@ -328,16 +512,84 @@ struct alignas(std::max_align_t) SSE2Chunk {
tags_[index] = 0;
}
__m128i const* tagVector() const {
return static_cast<__m128i const*>(static_cast<void const*>(&tags_[0]));
#if FOLLY_AARCH64
////////
// Tag filtering using AArch64 Advanced SIMD (NEON) intrinsics
Sparse8Interleaved4MaskIter tagMatchIter(uint8_t needle) const {
FOLLY_SAFE_DCHECK((needle & 0x80) != 0, "");
uint8x16_t tagV = vld1q_u8(&tags_[0]);
auto needleV = vdupq_n_u8(needle);
auto eqV = vceqq_u8(tagV, needleV);
auto bitsV = vreinterpretq_u64_u8(vshrq_n_u8(eqV, 7));
auto hi = vgetq_lane_u64(bitsV, 1);
auto lo = vgetq_lane_u64(bitsV, 0);
static_assert(kCapacity >= 8, "");
hi &= ((uint64_t{1} << (8 * (kCapacity - 8))) - 1);
auto mixed = (hi << 4) | lo;
return Sparse8Interleaved4MaskIter{mixed};
}
template <typename F, std::size_t... I>
static constexpr uint8x16_t fixedVectorHelper(
F const& func,
index_sequence<I...>) {
return uint8x16_t{func(I)...};
}
template <typename F>
static constexpr uint8x16_t fixedVector(F const& func) {
return fixedVectorHelper(
[&](std::size_t i) { return i < kCapacity ? func(i) : uint8_t{0}; },
make_index_sequence<16>{});
}
int lastOccupiedIndex() const {
uint8x16_t tagV = vld1q_u8(&tags_[0]);
// signed shift extends top bit to all bits
auto occupiedV =
vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_u8(tagV), 7));
auto indexV =
fixedVector([](std::size_t i) { return static_cast<uint8_t>(i + 1); });
auto occupiedIndexV = vandq_u8(occupiedV, indexV);
return vmaxvq_u8(occupiedIndexV) - 1;
}
TagCheckingIter occupiedIter() const {
return TagCheckingIter{&tags_[0], lastOccupiedIndex()};
}
IndexHolder lastOccupied() const {
return IndexHolder{lastOccupiedIndex()};
}
IndexHolder firstEmpty() const {
uint8x16_t tagV = vld1q_u8(&tags_[0]);
// occupied tags have sign bit set when interpreted as int8_t, so
// empty ones are non-negative
auto emptyV = vcgeq_s8(vreinterpretq_s8_u8(tagV), vdupq_n_s8(0));
auto indexV =
fixedVector([](std::size_t i) { return static_cast<uint8_t>(~i); });
auto emptyIndexV = vandq_u8(emptyV, indexV);
// none empty -> i == 0xff == int8_t{-1}
int8_t i = static_cast<int8_t>(~vmaxvq_u8(emptyIndexV));
return IndexHolder{i};
}
#else
////////
// Tag filtering using x86_64 SSE2 intrinsics
unsigned tagMatchMask(uint8_t needle) const {
TagVector const* tagVector() const {
return static_cast<TagVector const*>(static_cast<void const*>(&tags_[0]));
}
SparseMaskIter tagMatchIter(uint8_t needle) const {
FOLLY_SAFE_DCHECK((needle & 0x80) != 0, "");
auto tagV = _mm_load_si128(tagVector());
auto needleV = _mm_set1_epi8(needle);
auto eqV = _mm_cmpeq_epi8(tagV, needleV);
return _mm_movemask_epi8(eqV) & kFullMask;
auto mask = _mm_movemask_epi8(eqV) & kFullMask;
return SparseMaskIter{mask};
}
unsigned occupiedMask() const {
......@@ -345,22 +597,22 @@ struct alignas(std::max_align_t) SSE2Chunk {
return _mm_movemask_epi8(tagV) & kFullMask;
}
bool occupied(std::size_t index) const {
FOLLY_SAFE_DCHECK(tags_[index] == 0 || (tags_[index] & 0x80) != 0, "");
return tags_[index] != 0;
DenseMaskIter occupiedIter() const {
return DenseMaskIter{occupiedMask()};
}
unsigned emptyMask() const {
return occupiedMask() ^ kFullMask;
LastOccupiedInMask lastOccupied() const {
return LastOccupiedInMask{occupiedMask()};
}
unsigned lastOccupiedIndex() const {
auto m = occupiedMask();
// assume + findLastSet results in optimal __builtin_clz on gcc
folly::assume(m != 0);
unsigned i = folly::findLastSet(m) - 1;
FOLLY_SAFE_DCHECK(occupied(i), "");
return i;
FirstEmptyInMask firstEmpty() const {
return FirstEmptyInMask{occupiedMask() ^ kFullMask};
}
#endif
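The two aarch64 reductions above (lastOccupiedIndex() and firstEmpty())
share one idiom: AND a per-lane payload with an all-ones/all-zeros
predicate and let vmaxvq_u8 pick the winning lane. Here is a scalar model
of the firstEmpty() case (firstEmptyModel is an illustrative name, not
part of the diff), where the payload ~i makes the smallest qualifying
index win the max and a no-match result decodes to -1:

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>

int firstEmptyModel(bool const empty[], int n) {
  uint8_t best = 0;
  for (int i = 0; i < n; ++i) {
    // payload 0xff, 0xfe, ... ANDed with the "is empty" predicate
    uint8_t payload = empty[i] ? static_cast<uint8_t>(~i) : uint8_t{0};
    best = std::max(best, payload); // stands in for vmaxvq_u8
  }
  return static_cast<int8_t>(~best); // best == 0 -> -1 (nothing empty)
}

int main() {
  bool someEmpty[4] = {false, false, true, true};
  assert(firstEmptyModel(someEmpty, 4) == 2);
  bool noneEmpty[2] = {false, false};
  assert(firstEmptyModel(noneEmpty, 2) == -1);
  return 0;
}
```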
bool occupied(std::size_t index) const {
FOLLY_SAFE_DCHECK(tags_[index] == 0 || (tags_[index] & 0x80) != 0, "");
return tags_[index] != 0;
}
Item* itemAddr(std::size_t i) const {
......@@ -378,60 +630,16 @@ struct alignas(std::max_align_t) SSE2Chunk {
return *folly::launder(itemAddr(i));
}
static SSE2Chunk& owner(Item& item, std::size_t index) {
static F14Chunk& owner(Item& item, std::size_t index) {
auto rawAddr =
static_cast<uint8_t*>(static_cast<void*>(std::addressof(item))) -
offsetof(SSE2Chunk, rawItems_) - index * sizeof(Item);
auto chunkAddr = static_cast<SSE2Chunk*>(static_cast<void*>(rawAddr));
offsetof(F14Chunk, rawItems_) - index * sizeof(Item);
auto chunkAddr = static_cast<F14Chunk*>(static_cast<void*>(rawAddr));
FOLLY_SAFE_DCHECK(std::addressof(item) == chunkAddr->itemAddr(index), "");
return *chunkAddr;
}
};
class SparseMaskIter {
unsigned mask_;
public:
explicit SparseMaskIter(unsigned mask) : mask_{mask} {}
bool hasNext() {
return mask_ != 0;
}
unsigned next() {
FOLLY_SAFE_DCHECK(hasNext(), "");
unsigned i = __builtin_ctz(mask_);
mask_ &= (mask_ - 1);
return i;
}
};
class DenseMaskIter {
unsigned mask_;
unsigned index_{0};
public:
explicit DenseMaskIter(unsigned mask) : mask_{mask} {}
bool hasNext() {
return mask_ != 0;
}
unsigned next() {
FOLLY_SAFE_DCHECK(hasNext(), "");
if (LIKELY((mask_ & 1) != 0)) {
mask_ >>= 1;
return index_++;
} else {
unsigned s = __builtin_ctz(mask_);
unsigned rv = index_ + s;
mask_ >>= (s + 1);
index_ = rv + 1;
return rv;
}
}
};
////////////////
template <typename ChunkPtr>
......@@ -507,9 +715,9 @@ class F14ItemIter {
return;
}
--c;
auto m = c->occupiedMask();
if (LIKELY(m != 0)) {
index_ = folly::findLastSet(m) - 1;
auto last = c->lastOccupied();
if (LIKELY(last.hasIndex())) {
index_ = last.index();
itemPtr_ = std::pointer_traits<ItemPtr>::pointer_to(c->item(index_));
return;
}
......@@ -534,9 +742,9 @@ class F14ItemIter {
// exhausted the current chunk
FOLLY_SAFE_DCHECK(!c->eof(), "");
--c;
auto m = c->occupiedMask();
if (LIKELY(m != 0)) {
index_ = folly::findLastSet(m) - 1;
auto last = c->lastOccupied();
if (LIKELY(last.hasIndex())) {
index_ = last.index();
itemPtr_ = std::pointer_traits<ItemPtr>::pointer_to(c->item(index_));
return;
}
......@@ -627,7 +835,7 @@ class F14Table : public Policy {
private:
using HashPair = typename F14HashToken::HashPair;
using Chunk = SSE2Chunk<Item>;
using Chunk = F14Chunk<Item>;
using ChunkAlloc = typename std::allocator_traits<
allocator_type>::template rebind_alloc<Chunk>;
using ChunkPtr = typename std::allocator_traits<ChunkAlloc>::pointer;
......@@ -864,7 +1072,7 @@ class F14Table : public Policy {
ByteAlloc a{this->alloc()};
uint8_t* raw = &*std::allocator_traits<ByteAlloc>::allocate(
a, allocSize(chunkCount, maxSizeWithoutRehash));
static_assert(std::is_trivial<Chunk>::value, "SSE2Chunk should be POD");
static_assert(std::is_trivial<Chunk>::value, "F14Chunk should be POD");
auto chunks = static_cast<Chunk*>(static_cast<void*>(raw));
for (std::size_t i = 0; i < chunkCount; ++i) {
chunks[i].clear();
......@@ -998,8 +1206,7 @@ class F14Table : public Policy {
if (sizeof(Chunk) > 64) {
prefetchAddr(chunk->itemAddr(8));
}
auto mask = chunk->tagMatchMask(hp.second);
SparseMaskIter hits{mask};
auto hits = chunk->tagMatchIter(hp.second);
while (hits.hasNext()) {
auto i = hits.next();
if (LIKELY(this->keyMatchesItem(key, chunk->item(i)))) {
......@@ -1188,15 +1395,15 @@ class F14Table : public Policy {
do {
dstChunk->copyOverflowInfoFrom(*srcChunk);
auto mask = srcChunk->occupiedMask();
auto iter = srcChunk->occupiedIter();
if (Policy::prefetchBeforeCopy()) {
for (DenseMaskIter iter{mask}; iter.hasNext();) {
this->prefetchValue(srcChunk->citem(iter.next()));
for (auto piter = iter; piter.hasNext();) {
this->prefetchValue(srcChunk->citem(piter.next()));
}
}
std::size_t dstI = 0;
for (DenseMaskIter iter{mask}; iter.hasNext(); ++dstI) {
for (; iter.hasNext(); ++dstI) {
auto srcI = iter.next();
auto&& srcValue = src.valueAtItemForCopy(srcChunk->citem(srcI));
auto dst = dstChunk->itemAddr(dstI);
......@@ -1215,7 +1422,7 @@ class F14Table : public Policy {
if (Policy::kEnableItemIteration) {
sizeAndPackedBegin_.packedBegin() =
ItemIter{chunks_ + maxChunkIndex,
folly::popcount(chunks_[maxChunkIndex].occupiedMask()) - 1}
chunks_[maxChunkIndex].lastOccupied().index()}
.pack();
}
}
......@@ -1268,16 +1475,16 @@ class F14Table : public Policy {
std::size_t srcChunkIndex = src.lastOccupiedChunk() - src.chunks_;
while (true) {
Chunk const* srcChunk = &src.chunks_[srcChunkIndex];
auto mask = srcChunk->occupiedMask();
auto iter = srcChunk->occupiedIter();
if (Policy::prefetchBeforeRehash()) {
for (DenseMaskIter iter{mask}; iter.hasNext();) {
this->prefetchValue(srcChunk->citem(iter.next()));
for (auto piter = iter; piter.hasNext();) {
this->prefetchValue(srcChunk->citem(piter.next()));
}
}
if (srcChunk->hostedOverflowCount() == 0) {
// all items are in their preferred chunk (no probing), so we
// don't need to compute any hash values
for (DenseMaskIter iter{mask}; iter.hasNext();) {
while (iter.hasNext()) {
auto i = iter.next();
auto& srcItem = srcChunk->citem(i);
auto&& srcValue = src.valueAtItemForCopy(srcItem);
......@@ -1289,7 +1496,7 @@ class F14Table : public Policy {
}
} else {
// any chunk's items might be in here
for (DenseMaskIter iter{mask}; iter.hasNext();) {
while (iter.hasNext()) {
auto i = iter.next();
auto& srcItem = srcChunk->citem(i);
auto&& srcValue = src.valueAtItemForCopy(srcItem);
......@@ -1407,13 +1614,13 @@ class F14Table : public Policy {
auto srcChunk = origChunks + origChunkCount - 1;
std::size_t remaining = size();
while (remaining > 0) {
auto mask = srcChunk->occupiedMask();
auto iter = srcChunk->occupiedIter();
if (Policy::prefetchBeforeRehash()) {
for (DenseMaskIter iter{mask}; iter.hasNext();) {
this->prefetchValue(srcChunk->item(iter.next()));
for (auto piter = iter; piter.hasNext();) {
this->prefetchValue(srcChunk->item(piter.next()));
}
}
for (DenseMaskIter iter{mask}; iter.hasNext();) {
while (iter.hasNext()) {
--remaining;
auto srcI = iter.next();
Item& srcItem = srcChunk->item(srcI);
......@@ -1515,19 +1722,19 @@ class F14Table : public Policy {
std::size_t index = hp.first;
ChunkPtr chunk = chunks_ + (index & chunkMask_);
auto emptyMask = chunk->emptyMask();
auto firstEmpty = chunk->firstEmpty();
if (emptyMask == 0) {
if (!firstEmpty.hasIndex()) {
std::size_t delta = probeDelta(hp);
do {
chunk->incrOutboundOverflowCount();
index += delta;
chunk = chunks_ + (index & chunkMask_);
emptyMask = chunk->emptyMask();
} while (emptyMask == 0);
firstEmpty = chunk->firstEmpty();
} while (!firstEmpty.hasIndex());
chunk->adjustHostedOverflowCount(Chunk::kIncrHostedOverflowCount);
}
std::size_t itemIndex = __builtin_ctz(emptyMask);
std::size_t itemIndex = firstEmpty.index();
chunk->setTag(itemIndex, hp.second);
ItemIter iter{chunk, itemIndex};
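This probe loop relies on the double hashing described in F14.md. A
minimal sketch of why the walk terminates, assuming a power-of-two chunk
count (as chunkMask_ implies) and an odd probe delta, in which case the
sequence index, index + delta, index + 2*delta, ... visits every chunk:

```cpp
#include <cassert>
#include <cstddef>

int main() {
  std::size_t const chunkCount = 8; // power of two
  std::size_t const mask = chunkCount - 1;
  std::size_t index = 5, delta = 3; // any odd delta is coprime to 2^k
  bool seen[8] = {};
  for (std::size_t i = 0; i < chunkCount; ++i) {
    assert(!seen[index & mask]); // no chunk repeats within one cycle
    seen[index & mask] = true;
    index += delta;
  }
  return 0;
}
```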
......@@ -1559,13 +1766,13 @@ class F14Table : public Policy {
if (Policy::destroyItemOnClear()) {
for (std::size_t ci = 0; ci <= chunkMask_; ++ci) {
ChunkPtr chunk = chunks_ + ci;
auto mask = chunk->occupiedMask();
auto iter = chunk->occupiedIter();
if (Policy::prefetchBeforeDestroy()) {
for (DenseMaskIter iter{mask}; iter.hasNext();) {
this->prefetchValue(chunk->item(iter.next()));
for (auto piter = iter; piter.hasNext();) {
this->prefetchValue(chunk->item(piter.next()));
}
}
for (DenseMaskIter iter{mask}; iter.hasNext();) {
while (iter.hasNext()) {
this->destroyItem(chunk->item(iter.next()));
}
}
......@@ -1699,15 +1906,20 @@ class F14Table : public Policy {
ChunkPtr chunk = chunks_ + ci;
FOLLY_SAFE_DCHECK(chunk->eof() == (ci == 0), "");
auto mask = chunk->occupiedMask();
n1 += folly::popcount(mask);
auto iter = chunk->occupiedIter();
std::size_t chunkOccupied = 0;
for (auto piter = iter; piter.hasNext(); piter.next()) {
++chunkOccupied;
}
n1 += chunkOccupied;
histoAt(stats.chunkOccupancyHisto, folly::popcount(mask))++;
histoAt(stats.chunkOccupancyHisto, chunkOccupied)++;
histoAt(
stats.chunkOutboundOverflowHisto, chunk->outboundOverflowCount())++;
histoAt(stats.chunkHostedOverflowHisto, chunk->hostedOverflowCount())++;
for (DenseMaskIter iter{mask}; iter.hasNext();) {
while (iter.hasNext()) {
auto ii = iter.next();
++n2;
......