Commit 19c316ae authored by Nathan Bronson, committed by Facebook Github Bot

native aarch64 support for F14 hash tables

Summary:
This diff adds support for the F14 algorithm on aarch64, using the
Advanced SIMD (NEON) instructions that are part of that platform.

Reviewed By: yfeldblum

Differential Revision: D7765273

fbshipit-source-id: 866a8a3481ad60b8aadcfb39718d6a5e62bbe07c
parent 25f4df21
......@@ -2,11 +2,12 @@
F14 is a 14-way probing hash table that resolves collisions by double
hashing. Up to 14 keys are stored in a chunk at a single hash table
position. SSE2 vector instructions are used to filter within a chunk;
intra-chunk search takes only a handful of instructions. **F14** refers
to the fact that the algorithm **F**ilters up to **14** keys at a time.
This strategy allows the hash table to be operated at a high maximum
load factor (12/14) while still keeping probe chains very short.
position. Vector instructions (SSE2 on x86_64, NEON on aarch64)
are used to filter within a chunk; intra-chunk search takes only a
handful of instructions. **F14** refers to the fact that the algorithm
**F**ilters up to **14** keys at a time. This strategy allows the hash
table to be operated at a high maximum load factor (12/14) while still
keeping probe chains very short.
F14 provides compelling replacements for most of the hash tables we use in
production at Facebook. Switching to it can improve memory efficiency
......@@ -157,10 +158,12 @@ unlikely to perform any key comparisons, successful searches are likely
to perform exactly 1 comparison, and all of the resulting branches are
pretty predictable.
The vector search uses SSE2 intrinsics. SSE2 is a non-optional part
of the x86_64 platform, so every 64-bit x86 platform supports them.
AARCH64's vector instructions will allow a similar strategy, although
the lack of a movemask operation complicates things a bit.
The vector search is coded using SIMD intrinsics, SSE2 on x86_64 and
NEON on aarch64. These instructions are a non-optional part of those
platforms (unlike later SIMD instruction sets like AVX2 or SVE), so no
special compilation flags are required. The exact vector operations
performed differ between x86_64 and aarch64 because aarch64 lacks a
movemask instruction, but the F14 algorithm is the same.
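To make the movemask point concrete, here is a minimal standalone sketch
(not the exact code in this diff; `neonMatchMask` is an illustrative name)
of how a movemask-style bitmask can be synthesized with NEON: compress each
0xff/0x00 comparison byte to a single bit, view the vector as two 64-bit
lanes, and fold the high lane over the low one with a 4-bit offset, as the
new `tagMatchIter` in this diff does.

```cpp
#include <arm_neon.h>
#include <cstdint>

// Sketch: build a 64-bit match mask for 16 byte-sized tags on aarch64.
// Tag i sets bit ((i * 8) % 64) + (i >= 8 ? 4 : 0) -- an "interleaved"
// layout rather than the dense 16-bit mask _mm_movemask_epi8 would give.
uint64_t neonMatchMask(uint8x16_t tags, uint8_t needle) {
  uint8x16_t eq = vceqq_u8(tags, vdupq_n_u8(needle)); // 0xff where equal
  // shift each comparison byte down to a single 0x01 bit
  uint64x2_t bits = vreinterpretq_u64_u8(vshrq_n_u8(eq, 7));
  uint64_t lo = vgetq_lane_u64(bits, 0);
  uint64_t hi = vgetq_lane_u64(bits, 1);
  // the two halves occupy disjoint bit positions, so OR merges losslessly
  return (hi << 4) | lo;
}
```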
## WHAT ABOUT MEMORY OVERHEAD FOR SMALL TABLES?
......
......@@ -19,9 +19,9 @@
#include <folly/Portability.h>
// clang-format off
// F14 is only available on x86 with SSE2 intrinsics (so far)
// F14 has been implemented for SSE2 and AARCH64 NEON (so far)
#ifndef FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE
# if FOLLY_SSE >= 2
# if FOLLY_SSE >= 2 || FOLLY_AARCH64
# define FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE 1
# else
# define FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE 0
......
......@@ -89,7 +89,7 @@ struct BasePolicy
return IsAvalanchingHasher<Hasher, Key>::value;
}
using Chunk = SSE2Chunk<Item>;
using Chunk = F14Chunk<Item>;
using ChunkPtr = typename std::pointer_traits<
typename AllocTraits::pointer>::template rebind<Chunk>;
using ItemIter = F14ItemIter<ChunkPtr>;
......@@ -240,7 +240,7 @@ class BaseIter : public std::iterator<
ValuePtr,
decltype(*std::declval<ValuePtr>())> {
protected:
using Chunk = SSE2Chunk<Item>;
using Chunk = F14Chunk<Item>;
using ChunkPtr =
typename std::pointer_traits<ValuePtr>::template rebind<Chunk>;
using ItemIter = F14ItemIter<ChunkPtr>;
......
......@@ -24,7 +24,7 @@ namespace folly {
namespace f14 {
namespace detail {
__m128i kEmptyTagVector = {};
TagVector kEmptyTagVector = {};
} // namespace detail
} // namespace f14
......
......@@ -35,7 +35,9 @@
#include <folly/Portability.h>
#include <folly/ScopeGuard.h>
#include <folly/Traits.h>
#include <folly/functional/ApplyTuple.h>
#include <folly/functional/Invoke.h>
#include <folly/lang/Align.h>
#include <folly/lang/Assume.h>
#include <folly/lang/Exception.h>
#include <folly/lang/Launder.h>
......@@ -48,9 +50,13 @@
#include <folly/container/detail/F14Memory.h>
#if FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE
#if FOLLY_AARCH64
#include <arm_neon.h>
#else // SSE2
#include <immintrin.h> // __m128i intrinsics
#include <xmmintrin.h> // _mm_prefetch
#endif
#endif
#ifdef _WIN32
#include <intrin.h> // for _mul128
......@@ -198,26 +204,196 @@ struct EnableIfIsTransparent<
template <typename T>
FOLLY_ALWAYS_INLINE static void prefetchAddr(T const* ptr) {
#if FOLLY_AARCH64
__builtin_prefetch(static_cast<void const*>(ptr));
#else
// _mm_prefetch is x86_64-specific and comes from xmmintrin.h.
// It compiles to the same thing as __builtin_prefetch.
// It seems to compile to the same thing as __builtin_prefetch, but
// also works on windows.
_mm_prefetch(
static_cast<char const*>(static_cast<void const*>(ptr)), _MM_HINT_T0);
#endif
}
extern __m128i kEmptyTagVector;
#if FOLLY_AARCH64
using TagVector = uint8x16_t;
#else
using TagVector = __m128i;
#endif
extern TagVector kEmptyTagVector;
// Iterates a 64-bit mask where elements are strided by 8 and the elements
// at indexes 8 and higher are layered back over the bottom 64-bits with
// a 4-bit offset.
//
// bitIndex = ((tagIndex * 8) % 64) + (tagIndex >= 8 ? 4 : 0)
//
// Iteration occurs in bitIndex order, not tagIndex. That should be fine
// for a sparse iterator, where we expect either 0 or 1 tag.
class Sparse8Interleaved4MaskIter {
uint64_t mask_;
public:
explicit Sparse8Interleaved4MaskIter(uint64_t mask) : mask_{mask} {}
bool hasNext() {
return mask_ != 0;
}
unsigned next() {
FOLLY_SAFE_DCHECK(hasNext(), "");
unsigned mixed = __builtin_ctzll(mask_);
FOLLY_SAFE_DCHECK((mixed % 4) == 0, "");
mask_ &= (mask_ - 1);
// mixed >> 3 has the bottom 3 bits of the result (no masking needed
// because all of the higher bits will be empty). mixed & 4 holds the
// bit that should be result & 8. We can merge it in either before or
// after sliding. Merging it before means we need to shift it left 4
// (so that the right shift 3 turns it into a left 1), which happens
// to be the same as multiplication by 17.
return ((mixed * 0x11) >> 3) & 0xf;
}
};
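As a sanity check of the decode arithmetic in next() above, this portable
snippet (with an illustrative decodeTagIndex helper, not part of the diff)
round-trips every tag index through the bitIndex formula:

```cpp
#include <cassert>

// ((bitIndex * 0x11) >> 3) & 0xf inverts
// bitIndex = ((tagIndex * 8) % 64) + (tagIndex >= 8 ? 4 : 0).
// Example: tagIndex 9 -> bitIndex 12; 12 * 0x11 == 204; 204 >> 3 == 25;
// 25 & 0xf == 9.
unsigned decodeTagIndex(unsigned bitIndex) {
  return ((bitIndex * 0x11) >> 3) & 0xf;
}

int main() {
  for (unsigned tag = 0; tag < 16; ++tag) {
    unsigned bit = ((tag * 8) % 64) + (tag >= 8 ? 4 : 0);
    assert(decodeTagIndex(bit) == tag);
  }
  return 0;
}
```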
// Iterates downward on occupied indexes by just checking tags[i] instead
// of using a mask
class TagCheckingIter {
uint8_t const* tags_;
int nextIndex_;
public:
explicit TagCheckingIter(uint8_t const* tags, int maxIndex)
: tags_{tags}, nextIndex_{maxIndex} {}
bool hasNext() {
return nextIndex_ >= 0;
}
unsigned next() {
auto rv = static_cast<unsigned>(nextIndex_);
do {
--nextIndex_;
} while (nextIndex_ >= 0 && tags_[nextIndex_] == 0);
return rv;
}
};
// Holds the result of an index query that has an optional result,
// interpreting an index of -1 to be the empty answer
class IndexHolder {
int index_;
public:
explicit IndexHolder(int index) : index_{index} {}
bool hasIndex() const {
return index_ >= 0;
}
unsigned index() const {
FOLLY_SAFE_DCHECK(hasIndex(), "");
return static_cast<unsigned>(index_);
}
};
// Iterates a mask, optimized for the case that only a few bits are set
class SparseMaskIter {
unsigned mask_;
public:
explicit SparseMaskIter(unsigned mask) : mask_{mask} {}
bool hasNext() {
return mask_ != 0;
}
unsigned next() {
FOLLY_SAFE_DCHECK(hasNext(), "");
unsigned i = __builtin_ctz(mask_);
mask_ &= (mask_ - 1);
return i;
}
};
// Iterates a mask, optimized for the case that most bits are set
class DenseMaskIter {
unsigned mask_;
unsigned index_{0};
public:
explicit DenseMaskIter(unsigned mask) : mask_{mask} {}
bool hasNext() {
return mask_ != 0;
}
unsigned next() {
FOLLY_SAFE_DCHECK(hasNext(), "");
if (LIKELY((mask_ & 1) != 0)) {
mask_ >>= 1;
return index_++;
} else {
unsigned s = __builtin_ctz(mask_);
unsigned rv = index_ + s;
mask_ >>= (s + 1);
index_ = rv + 1;
return rv;
}
}
};
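A quick illustration of the two mask iterators (assuming the definitions
above are in scope): both visit set bits in ascending order; SparseMaskIter
clears the lowest set bit each step, while DenseMaskIter walks bit by bit
and uses a ctz only to skip gaps.

```cpp
#include <cassert>

int main() {
  // bits 0, 1, and 3 set
  SparseMaskIter sparse{0b1011u};
  assert(sparse.next() == 0 && sparse.next() == 1 && sparse.next() == 3);
  assert(!sparse.hasNext());

  DenseMaskIter dense{0b1011u};
  assert(dense.next() == 0 && dense.next() == 1 && dense.next() == 3);
  assert(!dense.hasNext());
  return 0;
}
```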
// Holds the result of an index query that has an optional result,
// interpreting a mask of 0 to be the empty answer and the index of the
// last set bit to be the non-empty answer
class LastOccupiedInMask {
unsigned mask_;
public:
explicit LastOccupiedInMask(unsigned mask) : mask_{mask} {}
bool hasIndex() const {
return mask_ != 0;
}
unsigned index() const {
folly::assume(mask_ != 0);
return folly::findLastSet(mask_) - 1;
}
};
// Holds the result of an index query that has an optional result,
// interpreting a mask of 0 to be the empty answer and the index of the
// first set bit to be the non-empty answer
class FirstEmptyInMask {
unsigned mask_;
public:
explicit FirstEmptyInMask(unsigned mask) : mask_{mask} {}
bool hasIndex() const {
return mask_ != 0;
}
unsigned index() const {
FOLLY_SAFE_DCHECK(mask_ != 0, "");
return __builtin_ctz(mask_);
}
};
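And the corresponding holder semantics (again assuming the definitions
above are in scope; recall that folly::findLastSet(m) returns 1 plus the
index of the highest set bit):

```cpp
#include <cassert>

int main() {
  assert(LastOccupiedInMask{0b1011u}.index() == 3); // highest set bit
  assert(!LastOccupiedInMask{0u}.hasIndex());
  assert(FirstEmptyInMask{0b0100u}.index() == 2); // lowest set bit
  assert(!FirstEmptyInMask{0u}.hasIndex());
  return 0;
}
```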
template <typename ItemType>
struct alignas(std::max_align_t) SSE2Chunk {
struct alignas(max_align_t) F14Chunk {
using Item = ItemType;
// Assuming alignof(std::max_align_t) == 16 (and assuming alignof(Item)
// >= 4) kCapacity of 14 is always most space efficient. Slightly
// smaller or larger capacities can help with cache alignment in a
// couple of cases without wasting too much space, but once the items
// are larger then we're unlikely to get much benefit anyway. The only
// case we optimize is using kCapacity of 12 for 4 byte items, which
// makes the chunk take exactly 1 cache line, and adding 16 bytes of
// padding for 16 byte items so that a chunk takes exactly 4 cache lines.
// Assuming alignof(max_align_t) == 16 (and assuming alignof(Item) >=
// 4) kCapacity of 14 is the most space efficient. Slightly smaller
// or larger capacities can help with cache alignment in a couple of
cases without wasting too much space, but once the items are larger
we're unlikely to get much benefit anyway. The only case we
// optimize is using kCapacity of 12 for 4 byte items, which makes the
// chunk take exactly 1 cache line, and adding 16 bytes of padding for
// 16 byte items so that a chunk takes exactly 4 cache lines.
static constexpr unsigned kCapacity = sizeof(Item) == 4 ? 12 : 14;
static constexpr unsigned kDesiredCapacity = kCapacity - 2;
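For concreteness, the cache-line arithmetic in that comment, assuming the
16-byte metadata block (tags_, control_, outboundOverflowCount_) at the
front of each chunk, which is exactly what the 16-byte memset in clear()
below zeroes:

```cpp
// 4-byte items use kCapacity 12: 16 + 12 * 4 == 64, one cache line.
static_assert(16 + 12 * 4 == 64, "");
// 16-byte items use kCapacity 14 plus 16 bytes of padding:
// 16 + 14 * 16 + 16 == 256, exactly four 64-byte cache lines.
static_assert(16 + 14 * 16 + 16 == 256, "");
```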
......@@ -248,24 +424,32 @@ struct alignas(std::max_align_t) SSE2Chunk {
kAllocatedCapacity>
rawItems_;
static SSE2Chunk* emptyInstance() {
auto rv = static_cast<SSE2Chunk*>(static_cast<void*>(&kEmptyTagVector));
static F14Chunk* emptyInstance() {
auto rv = static_cast<F14Chunk*>(static_cast<void*>(&kEmptyTagVector));
FOLLY_SAFE_DCHECK(
rv->occupiedMask() == 0 && rv->chunk0Capacity() == 0 &&
!rv->occupied(0) && rv->chunk0Capacity() == 0 &&
rv->outboundOverflowCount() == 0,
"");
return rv;
}
void clear() {
// tags_ = {}; control_ = 0; outboundOverflowCount_ = 0;
// gcc < 6 doesn't exploit chunk alignment to generate the optimal
// SSE clear from memset. This is very hot code, so it is worth
// handling that case specially.
#if FOLLY_SSE >= 2 && __GNUC__ <= 5 && !__clang__
// this doesn't violate strict aliasing rules because __m128i is
// tagged as __may_alias__
auto* v = static_cast<__m128i*>(static_cast<void*>(&tags_[0]));
_mm_store_si128(v, _mm_setzero_si128());
// tags_ = {}; control_ = 0; outboundOverflowCount_ = 0;
#else
std::memset(&tags_[0], '\0', 16);
#endif
}
void copyOverflowInfoFrom(SSE2Chunk const& rhs) {
void copyOverflowInfoFrom(F14Chunk const& rhs) {
FOLLY_SAFE_DCHECK(hostedOverflowCount() == 0, "");
control_ += rhs.control_ & 0xf0;
outboundOverflowCount_ = rhs.outboundOverflowCount_;
......@@ -328,16 +512,84 @@ struct alignas(std::max_align_t) SSE2Chunk {
tags_[index] = 0;
}
__m128i const* tagVector() const {
return static_cast<__m128i const*>(static_cast<void const*>(&tags_[0]));
#if FOLLY_AARCH64
////////
// Tag filtering using AArch64 Advanced SIMD (NEON) intrinsics
Sparse8Interleaved4MaskIter tagMatchIter(uint8_t needle) const {
FOLLY_SAFE_DCHECK((needle & 0x80) != 0, "");
uint8x16_t tagV = vld1q_u8(&tags_[0]);
auto needleV = vdupq_n_u8(needle);
auto eqV = vceqq_u8(tagV, needleV);
auto bitsV = vreinterpretq_u64_u8(vshrq_n_u8(eqV, 7));
auto hi = vgetq_lane_u64(bitsV, 1);
auto lo = vgetq_lane_u64(bitsV, 0);
static_assert(kCapacity >= 8, "");
hi &= ((uint64_t{1} << (8 * (kCapacity - 8))) - 1);
auto mixed = (hi << 4) | lo;
return Sparse8Interleaved4MaskIter{mixed};
}
template <typename F, std::size_t... I>
static constexpr uint8x16_t fixedVectorHelper(
F const& func,
index_sequence<I...>) {
return uint8x16_t{func(I)...};
}
template <typename F>
static constexpr uint8x16_t fixedVector(F const& func) {
return fixedVectorHelper(
[&](std::size_t i) { return i < kCapacity ? func(i) : uint8_t{0}; },
make_index_sequence<16>{});
}
int lastOccupiedIndex() const {
uint8x16_t tagV = vld1q_u8(&tags_[0]);
// signed shift extends top bit to all bits
auto occupiedV =
vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_u8(tagV), 7));
auto indexV =
fixedVector([](std::size_t i) { return static_cast<uint8_t>(i + 1); });
auto occupiedIndexV = vandq_u8(occupiedV, indexV);
return vmaxvq_u8(occupiedIndexV) - 1;
}
TagCheckingIter occupiedIter() const {
return TagCheckingIter{&tags_[0], lastOccupiedIndex()};
}
IndexHolder lastOccupied() const {
return IndexHolder{lastOccupiedIndex()};
}
IndexHolder firstEmpty() const {
uint8x16_t tagV = vld1q_u8(&tags_[0]);
// occupied tags have sign bit set when interpreted as int8_t, so
// empty ones are non-negative
auto emptyV = vcgeq_s8(vreinterpretq_s8_u8(tagV), vdupq_n_s8(0));
auto indexV =
fixedVector([](std::size_t i) { return static_cast<uint8_t>(~i); });
auto emptyIndexV = vandq_u8(emptyV, indexV);
// none empty -> i == 0xff == int8_t{-1}
int8_t i = static_cast<int8_t>(~vmaxvq_u8(emptyIndexV));
return IndexHolder{i};
}
#else
////////
// Tag filtering using x86_64 SSE2 intrinsics
unsigned tagMatchMask(uint8_t needle) const {
TagVector const* tagVector() const {
return static_cast<TagVector const*>(static_cast<void const*>(&tags_[0]));
}
SparseMaskIter tagMatchIter(uint8_t needle) const {
FOLLY_SAFE_DCHECK((needle & 0x80) != 0, "");
auto tagV = _mm_load_si128(tagVector());
auto needleV = _mm_set1_epi8(needle);
auto eqV = _mm_cmpeq_epi8(tagV, needleV);
return _mm_movemask_epi8(eqV) & kFullMask;
auto mask = _mm_movemask_epi8(eqV) & kFullMask;
return SparseMaskIter{mask};
}
unsigned occupiedMask() const {
......@@ -345,22 +597,22 @@ struct alignas(std::max_align_t) SSE2Chunk {
return _mm_movemask_epi8(tagV) & kFullMask;
}
bool occupied(std::size_t index) const {
FOLLY_SAFE_DCHECK(tags_[index] == 0 || (tags_[index] & 0x80) != 0, "");
return tags_[index] != 0;
DenseMaskIter occupiedIter() const {
return DenseMaskIter{occupiedMask()};
}
unsigned emptyMask() const {
return occupiedMask() ^ kFullMask;
LastOccupiedInMask lastOccupied() const {
return LastOccupiedInMask{occupiedMask()};
}
unsigned lastOccupiedIndex() const {
auto m = occupiedMask();
// assume + findLastSet results in optimal __builtin_clz on gcc
folly::assume(m != 0);
unsigned i = folly::findLastSet(m) - 1;
FOLLY_SAFE_DCHECK(occupied(i), "");
return i;
FirstEmptyInMask firstEmpty() const {
return FirstEmptyInMask{occupiedMask() ^ kFullMask};
}
#endif
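The two aarch64 reductions above (lastOccupiedIndex() and firstEmpty())
share one idiom: AND a per-lane payload with an all-ones/all-zeros
predicate and let vmaxvq_u8 pick the winning lane. Here is a scalar model
of the firstEmpty() case (firstEmptyModel is an illustrative name, not
part of the diff), where the payload ~i makes the smallest qualifying
index win the max and a no-match result decodes to -1:

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>

int firstEmptyModel(bool const empty[], int n) {
  uint8_t best = 0;
  for (int i = 0; i < n; ++i) {
    // payload 0xff, 0xfe, ... ANDed with the "is empty" predicate
    uint8_t payload = empty[i] ? static_cast<uint8_t>(~i) : uint8_t{0};
    best = std::max(best, payload); // stands in for vmaxvq_u8
  }
  return static_cast<int8_t>(~best); // best == 0 -> -1 (nothing empty)
}

int main() {
  bool someEmpty[4] = {false, false, true, true};
  assert(firstEmptyModel(someEmpty, 4) == 2);
  bool noneEmpty[2] = {false, false};
  assert(firstEmptyModel(noneEmpty, 2) == -1);
  return 0;
}
```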
bool occupied(std::size_t index) const {
FOLLY_SAFE_DCHECK(tags_[index] == 0 || (tags_[index] & 0x80) != 0, "");
return tags_[index] != 0;
}
Item* itemAddr(std::size_t i) const {
......@@ -378,60 +630,16 @@ struct alignas(std::max_align_t) SSE2Chunk {
return *folly::launder(itemAddr(i));
}
static SSE2Chunk& owner(Item& item, std::size_t index) {
static F14Chunk& owner(Item& item, std::size_t index) {
auto rawAddr =
static_cast<uint8_t*>(static_cast<void*>(std::addressof(item))) -
offsetof(SSE2Chunk, rawItems_) - index * sizeof(Item);
auto chunkAddr = static_cast<SSE2Chunk*>(static_cast<void*>(rawAddr));
offsetof(F14Chunk, rawItems_) - index * sizeof(Item);
auto chunkAddr = static_cast<F14Chunk*>(static_cast<void*>(rawAddr));
FOLLY_SAFE_DCHECK(std::addressof(item) == chunkAddr->itemAddr(index), "");
return *chunkAddr;
}
};
class SparseMaskIter {
unsigned mask_;
public:
explicit SparseMaskIter(unsigned mask) : mask_{mask} {}
bool hasNext() {
return mask_ != 0;
}
unsigned next() {
FOLLY_SAFE_DCHECK(hasNext(), "");
unsigned i = __builtin_ctz(mask_);
mask_ &= (mask_ - 1);
return i;
}
};
class DenseMaskIter {
unsigned mask_;
unsigned index_{0};
public:
explicit DenseMaskIter(unsigned mask) : mask_{mask} {}
bool hasNext() {
return mask_ != 0;
}
unsigned next() {
FOLLY_SAFE_DCHECK(hasNext(), "");
if (LIKELY((mask_ & 1) != 0)) {
mask_ >>= 1;
return index_++;
} else {
unsigned s = __builtin_ctz(mask_);
unsigned rv = index_ + s;
mask_ >>= (s + 1);
index_ = rv + 1;
return rv;
}
}
};
////////////////
template <typename ChunkPtr>
......@@ -507,9 +715,9 @@ class F14ItemIter {
return;
}
--c;
auto m = c->occupiedMask();
if (LIKELY(m != 0)) {
index_ = folly::findLastSet(m) - 1;
auto last = c->lastOccupied();
if (LIKELY(last.hasIndex())) {
index_ = last.index();
itemPtr_ = std::pointer_traits<ItemPtr>::pointer_to(c->item(index_));
return;
}
......@@ -534,9 +742,9 @@ class F14ItemIter {
// exhausted the current chunk
FOLLY_SAFE_DCHECK(!c->eof(), "");
--c;
auto m = c->occupiedMask();
if (LIKELY(m != 0)) {
index_ = folly::findLastSet(m) - 1;
auto last = c->lastOccupied();
if (LIKELY(last.hasIndex())) {
index_ = last.index();
itemPtr_ = std::pointer_traits<ItemPtr>::pointer_to(c->item(index_));
return;
}
......@@ -627,7 +835,7 @@ class F14Table : public Policy {
private:
using HashPair = typename F14HashToken::HashPair;
using Chunk = SSE2Chunk<Item>;
using Chunk = F14Chunk<Item>;
using ChunkAlloc = typename std::allocator_traits<
allocator_type>::template rebind_alloc<Chunk>;
using ChunkPtr = typename std::allocator_traits<ChunkAlloc>::pointer;
......@@ -864,7 +1072,7 @@ class F14Table : public Policy {
ByteAlloc a{this->alloc()};
uint8_t* raw = &*std::allocator_traits<ByteAlloc>::allocate(
a, allocSize(chunkCount, maxSizeWithoutRehash));
static_assert(std::is_trivial<Chunk>::value, "SSE2Chunk should be POD");
static_assert(std::is_trivial<Chunk>::value, "F14Chunk should be POD");
auto chunks = static_cast<Chunk*>(static_cast<void*>(raw));
for (std::size_t i = 0; i < chunkCount; ++i) {
chunks[i].clear();
......@@ -998,8 +1206,7 @@ class F14Table : public Policy {
if (sizeof(Chunk) > 64) {
prefetchAddr(chunk->itemAddr(8));
}
auto mask = chunk->tagMatchMask(hp.second);
SparseMaskIter hits{mask};
auto hits = chunk->tagMatchIter(hp.second);
while (hits.hasNext()) {
auto i = hits.next();
if (LIKELY(this->keyMatchesItem(key, chunk->item(i)))) {
......@@ -1188,15 +1395,15 @@ class F14Table : public Policy {
do {
dstChunk->copyOverflowInfoFrom(*srcChunk);
auto mask = srcChunk->occupiedMask();
auto iter = srcChunk->occupiedIter();
if (Policy::prefetchBeforeCopy()) {
for (DenseMaskIter iter{mask}; iter.hasNext();) {
this->prefetchValue(srcChunk->citem(iter.next()));
for (auto piter = iter; piter.hasNext();) {
this->prefetchValue(srcChunk->citem(piter.next()));
}
}
std::size_t dstI = 0;
for (DenseMaskIter iter{mask}; iter.hasNext(); ++dstI) {
for (; iter.hasNext(); ++dstI) {
auto srcI = iter.next();
auto&& srcValue = src.valueAtItemForCopy(srcChunk->citem(srcI));
auto dst = dstChunk->itemAddr(dstI);
......@@ -1215,7 +1422,7 @@ class F14Table : public Policy {
if (Policy::kEnableItemIteration) {
sizeAndPackedBegin_.packedBegin() =
ItemIter{chunks_ + maxChunkIndex,
folly::popcount(chunks_[maxChunkIndex].occupiedMask()) - 1}
chunks_[maxChunkIndex].lastOccupied().index()}
.pack();
}
}
......@@ -1268,16 +1475,16 @@ class F14Table : public Policy {
std::size_t srcChunkIndex = src.lastOccupiedChunk() - src.chunks_;
while (true) {
Chunk const* srcChunk = &src.chunks_[srcChunkIndex];
auto mask = srcChunk->occupiedMask();
auto iter = srcChunk->occupiedIter();
if (Policy::prefetchBeforeRehash()) {
for (DenseMaskIter iter{mask}; iter.hasNext();) {
this->prefetchValue(srcChunk->citem(iter.next()));
for (auto piter = iter; piter.hasNext();) {
this->prefetchValue(srcChunk->citem(piter.next()));
}
}
if (srcChunk->hostedOverflowCount() == 0) {
// all items are in their preferred chunk (no probing), so we
// don't need to compute any hash values
for (DenseMaskIter iter{mask}; iter.hasNext();) {
while (iter.hasNext()) {
auto i = iter.next();
auto& srcItem = srcChunk->citem(i);
auto&& srcValue = src.valueAtItemForCopy(srcItem);
......@@ -1289,7 +1496,7 @@ class F14Table : public Policy {
}
} else {
// any chunk's items might be in here
for (DenseMaskIter iter{mask}; iter.hasNext();) {
while (iter.hasNext()) {
auto i = iter.next();
auto& srcItem = srcChunk->citem(i);
auto&& srcValue = src.valueAtItemForCopy(srcItem);
......@@ -1407,13 +1614,13 @@ class F14Table : public Policy {
auto srcChunk = origChunks + origChunkCount - 1;
std::size_t remaining = size();
while (remaining > 0) {
auto mask = srcChunk->occupiedMask();
auto iter = srcChunk->occupiedIter();
if (Policy::prefetchBeforeRehash()) {
for (DenseMaskIter iter{mask}; iter.hasNext();) {
this->prefetchValue(srcChunk->item(iter.next()));
for (auto piter = iter; piter.hasNext();) {
this->prefetchValue(srcChunk->item(piter.next()));
}
}
for (DenseMaskIter iter{mask}; iter.hasNext();) {
while (iter.hasNext()) {
--remaining;
auto srcI = iter.next();
Item& srcItem = srcChunk->item(srcI);
......@@ -1515,19 +1722,19 @@ class F14Table : public Policy {
std::size_t index = hp.first;
ChunkPtr chunk = chunks_ + (index & chunkMask_);
auto emptyMask = chunk->emptyMask();
auto firstEmpty = chunk->firstEmpty();
if (emptyMask == 0) {
if (!firstEmpty.hasIndex()) {
std::size_t delta = probeDelta(hp);
do {
chunk->incrOutboundOverflowCount();
index += delta;
chunk = chunks_ + (index & chunkMask_);
emptyMask = chunk->emptyMask();
} while (emptyMask == 0);
firstEmpty = chunk->firstEmpty();
} while (!firstEmpty.hasIndex());
chunk->adjustHostedOverflowCount(Chunk::kIncrHostedOverflowCount);
}
std::size_t itemIndex = __builtin_ctz(emptyMask);
std::size_t itemIndex = firstEmpty.index();
chunk->setTag(itemIndex, hp.second);
ItemIter iter{chunk, itemIndex};
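This probe loop relies on the double hashing described in F14.md. A
minimal sketch of why the walk terminates, assuming a power-of-two chunk
count (as chunkMask_ implies) and an odd probe delta, in which case the
sequence index, index + delta, index + 2*delta, ... visits every chunk:

```cpp
#include <cassert>
#include <cstddef>

int main() {
  std::size_t const chunkCount = 8; // power of two
  std::size_t const mask = chunkCount - 1;
  std::size_t index = 5, delta = 3; // any odd delta is coprime to 2^k
  bool seen[8] = {};
  for (std::size_t i = 0; i < chunkCount; ++i) {
    assert(!seen[index & mask]); // no chunk repeats within one cycle
    seen[index & mask] = true;
    index += delta;
  }
  return 0;
}
```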
......@@ -1559,13 +1766,13 @@ class F14Table : public Policy {
if (Policy::destroyItemOnClear()) {
for (std::size_t ci = 0; ci <= chunkMask_; ++ci) {
ChunkPtr chunk = chunks_ + ci;
auto mask = chunk->occupiedMask();
auto iter = chunk->occupiedIter();
if (Policy::prefetchBeforeDestroy()) {
for (DenseMaskIter iter{mask}; iter.hasNext();) {
this->prefetchValue(chunk->item(iter.next()));
for (auto piter = iter; piter.hasNext();) {
this->prefetchValue(chunk->item(piter.next()));
}
}
for (DenseMaskIter iter{mask}; iter.hasNext();) {
while (iter.hasNext()) {
this->destroyItem(chunk->item(iter.next()));
}
}
......@@ -1699,15 +1906,20 @@ class F14Table : public Policy {
ChunkPtr chunk = chunks_ + ci;
FOLLY_SAFE_DCHECK(chunk->eof() == (ci == 0), "");
auto mask = chunk->occupiedMask();
n1 += folly::popcount(mask);
auto iter = chunk->occupiedIter();
std::size_t chunkOccupied = 0;
for (auto piter = iter; piter.hasNext(); piter.next()) {
++chunkOccupied;
}
n1 += chunkOccupied;
histoAt(stats.chunkOccupancyHisto, folly::popcount(mask))++;
histoAt(stats.chunkOccupancyHisto, chunkOccupied)++;
histoAt(
stats.chunkOutboundOverflowHisto, chunk->outboundOverflowCount())++;
histoAt(stats.chunkHostedOverflowHisto, chunk->hostedOverflowCount())++;
for (DenseMaskIter iter{mask}; iter.hasNext();) {
while (iter.hasNext()) {
auto ii = iter.next();
++n2;
......