Commit 27929e3d authored by Nathan Bronson, committed by Facebook Github Bot

use vector intrinsics for F14 on 32-bit platforms

Summary:
This diff adds 32-bit support for F14, so SSE and NEON
intrinsics can be used on x86 and arm architectures (rather than just
x86_64 and aarch64).  The portability fallback to std::unordered_map
and std::unordered_set is now used only when vector intrinsics are not
available, or on PPC.

Reviewed By: shixiao

Differential Revision: D8586283

fbshipit-source-id: 1c4d090e80381fe7ad071c3059b3cb242c04c9f7
parent 3cb1c6d0
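With this change FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE becomes 1 whenever SSE2 or NEON is detected, independent of pointer width. A minimal sketch of how code can branch on the macro (usingVectorF14 is an illustrative name, not part of this diff):

#include <folly/container/detail/F14IntrinsicsAvailability.h>

// Reports which F14 backend this translation unit was built against.
constexpr bool usingVectorF14() {
#if FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE
  return true;   // SSE2 / NEON chunk implementation
#else
  return false;  // std::unordered_map / std::unordered_set fallback
#endif
}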
......@@ -301,6 +301,12 @@ constexpr auto kIsBigEndian = !kIsLittleEndian;
#define FOLLY_SSE_PREREQ(major, minor) \
(FOLLY_SSE > major || FOLLY_SSE == major && FOLLY_SSE_MINOR >= minor)
#ifndef FOLLY_NEON
# if defined(__ARM_NEON)
# define FOLLY_NEON 1
# endif
#endif
#if FOLLY_UNUSUAL_GFLAGS_NAMESPACE
namespace FOLLY_GFLAGS_NAMESPACE { }
namespace gflags {
......
......@@ -18,16 +18,16 @@
#include <folly/Portability.h>
// clang-format off
// F14 has been implemented for x86_64 SSE2 and AARCH64 NEON (so far)
#ifndef FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE
# if FOLLY_SSE >= 2 && FOLLY_X64 || FOLLY_AARCH64
# define FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE 1
# else
# define FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE 0
# pragma message \
"Vector intrinsics / F14 support unavailable on this platform, " \
// F14 has been implemented for SSE2 and NEON (so far)
#if FOLLY_SSE >= 2 || FOLLY_NEON
#define FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE 1
#else
#define FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE 0
#pragma message \
"Vector intrinsics / F14 support unavailable on this platform, " \
"falling back to std::unordered_map / set"
# endif
#endif
// clang-format on
#endif
......@@ -25,7 +25,9 @@
#include <folly/Unit.h>
#include <folly/container/detail/F14Table.h>
#include <folly/hash/Hash.h>
#include <folly/lang/Align.h>
#include <folly/lang/SafeAssert.h>
#include <folly/memory/Malloc.h>
#if FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE
......@@ -226,6 +228,10 @@ struct BasePolicy
std::size_t computeKeyHash(K const& key) const {
static_assert(
isAvalanchingHasher() == IsAvalanchingHasher<Hasher, K>::value, "");
static_assert(
!isAvalanchingHasher() ||
sizeof(decltype(hasher()(key))) >= sizeof(std::size_t),
"hasher is not avalanching if it doesn't return enough bits");
return hasher()(key);
}
......@@ -272,14 +278,13 @@ struct BasePolicy
std::size_t /*capacity*/,
P&& /*rhs*/) {}
// Rounds chunkBytes up to the next multiple of 16 if it is possible
// that a sub-Chunk allocation has a size that is not a multiple of 16.
static std::size_t alignedChunkAllocSize(std::size_t chunkAllocSize) {
if ((sizeof(Item) % 8) != 0) {
chunkAllocSize = -(-chunkAllocSize & ~std::size_t{0xf});
std::size_t alignedAllocSize(std::size_t n) const {
if (kRequiredVectorAlignment <= alignof(max_align_t) ||
std::is_same<ByteAlloc, std::allocator<uint8_t>>::value) {
return n;
} else {
return n + kRequiredVectorAlignment;
}
FOLLY_SAFE_DCHECK((chunkAllocSize % 16) == 0, "");
return chunkAllocSize;
}
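The removed alignedChunkAllocSize relied on the bit trick -(-n & ~std::size_t{0xf}) to round the chunk allocation up to the next multiple of 16 and trusted the allocator to hand back 16-byte-aligned memory; the replacement alignedAllocSize instead reserves kRequiredVectorAlignment extra bytes when the allocator cannot be assumed to align strongly enough, and lets allocateOverAligned place the chunk. A small sketch of the two size computations (roundUp16 and padded are illustrative helpers, not folly code):

#include <cstddef>

// Old approach: round n up to the next multiple of 16.
constexpr std::size_t roundUp16(std::size_t n) {
  return -(-n & ~std::size_t{0xf});
}

// New approach (simplified): keep n as-is when the allocator already
// guarantees enough alignment, otherwise reserve slack so the chunk start
// can be bumped to a 16-byte boundary by hand.
constexpr std::size_t padded(
    std::size_t n, std::size_t requiredAlign, std::size_t allocatorAlign) {
  return requiredAlign <= allocatorAlign ? n : n + requiredAlign;
}

static_assert(roundUp16(24) == 32, "");
static_assert(roundUp16(32) == 32, "");
static_assert(padded(24, 16, 16) == 24, "");
static_assert(padded(24, 16, 8) == 40, "");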
bool beforeRehash(
......@@ -288,9 +293,9 @@ struct BasePolicy
std::size_t /*newCapacity*/,
std::size_t chunkAllocSize,
BytePtr& outChunkAllocation) {
ByteAlloc a{alloc()};
outChunkAllocation =
ByteAllocTraits::allocate(a, alignedChunkAllocSize(chunkAllocSize));
allocateOverAligned<ByteAlloc, kRequiredVectorAlignment>(
alloc(), chunkAllocSize);
return false;
}
......@@ -304,9 +309,8 @@ struct BasePolicy
std::size_t chunkAllocSize) {
// on success, this will be the old allocation, on failure the new one
if (chunkAllocation != nullptr) {
ByteAlloc a{alloc()};
ByteAllocTraits::deallocate(
a, chunkAllocation, alignedChunkAllocSize(chunkAllocSize));
deallocateOverAligned<ByteAlloc, kRequiredVectorAlignment>(
alloc(), chunkAllocation, chunkAllocSize);
}
}
......@@ -321,10 +325,8 @@ struct BasePolicy
std::size_t /*capacity*/,
BytePtr chunkAllocation,
std::size_t chunkAllocSize) {
FOLLY_SAFE_DCHECK(chunkAllocation != nullptr, "");
ByteAlloc a{alloc()};
ByteAllocTraits::deallocate(
a, chunkAllocation, alignedChunkAllocSize(chunkAllocSize));
deallocateOverAligned<ByteAlloc, kRequiredVectorAlignment>(
alloc(), chunkAllocation, chunkAllocSize);
}
void prefetchValue(Item const&) const {
......@@ -459,6 +461,8 @@ class ValueContainerPolicy : public BasePolicy<
using typename Super::Value;
private:
using typename Super::ByteAlloc;
using Super::kIsMap;
public:
......@@ -571,7 +575,10 @@ class ValueContainerPolicy : public BasePolicy<
std::size_t /*capacity*/,
V&& visitor) const {
if (chunkAllocSize > 0) {
visitor(Super::alignedChunkAllocSize(chunkAllocSize), 1);
visitor(
allocationBytesForOverAligned<ByteAlloc, kRequiredVectorAlignment>(
chunkAllocSize),
1);
}
}
......@@ -698,6 +705,8 @@ class NodeContainerPolicy
using typename Super::Value;
private:
using typename Super::ByteAlloc;
using Super::kIsMap;
public:
......@@ -796,7 +805,10 @@ class NodeContainerPolicy
std::size_t /*capacity*/,
V&& visitor) const {
if (chunkAllocSize > 0) {
visitor(Super::alignedChunkAllocSize(chunkAllocSize), 1);
visitor(
allocationBytesForOverAligned<ByteAlloc, kRequiredVectorAlignment>(
chunkAllocSize),
1);
}
if (size > 0) {
visitor(sizeof(Value), size);
......@@ -1212,14 +1224,7 @@ class VectorContainerPolicy : public BasePolicy<
static std::size_t allocSize(
std::size_t prefixBytes,
std::size_t valueCapacity) {
auto n = valuesOffset(prefixBytes) + sizeof(Value) * valueCapacity;
if (alignof(Value) <= 8) {
// ensure that the result is a multiple of 16 to protect against
// allocators that don't always align to 16
n = -(-n & ~std::size_t{0xf});
}
FOLLY_SAFE_DCHECK((n % 16) == 0, "");
return n;
return valuesOffset(prefixBytes) + sizeof(Value) * valueCapacity;
}
public:
......@@ -1235,11 +1240,9 @@ class VectorContainerPolicy : public BasePolicy<
newCapacity <= (std::numeric_limits<Item>::max)(),
"");
{
ByteAlloc a{this->alloc()};
outChunkAllocation =
ByteAllocTraits::allocate(a, allocSize(chunkAllocSize, newCapacity));
}
outChunkAllocation =
allocateOverAligned<ByteAlloc, kRequiredVectorAlignment>(
Super::alloc(), allocSize(chunkAllocSize, newCapacity));
ValuePtr before = values_;
ValuePtr after = std::pointer_traits<ValuePtr>::pointer_to(
......@@ -1279,9 +1282,8 @@ class VectorContainerPolicy : public BasePolicy<
// on success, chunkAllocation is the old allocation, on failure it is the
// new one
if (chunkAllocation != nullptr) {
ByteAlloc a{this->alloc()};
ByteAllocTraits::deallocate(
a,
deallocateOverAligned<ByteAlloc, kRequiredVectorAlignment>(
Super::alloc(),
chunkAllocation,
allocSize(chunkAllocSize, (success ? oldCapacity : newCapacity)));
}
......@@ -1306,9 +1308,8 @@ class VectorContainerPolicy : public BasePolicy<
BytePtr chunkAllocation,
std::size_t chunkAllocSize) {
if (chunkAllocation != nullptr) {
ByteAlloc a{this->alloc()};
ByteAllocTraits::deallocate(
a, chunkAllocation, allocSize(chunkAllocSize, capacity));
deallocateOverAligned<ByteAlloc, kRequiredVectorAlignment>(
Super::alloc(), chunkAllocation, allocSize(chunkAllocSize, capacity));
values_ = nullptr;
}
}
......@@ -1321,7 +1322,10 @@ class VectorContainerPolicy : public BasePolicy<
V&& visitor) const {
FOLLY_SAFE_DCHECK((chunkAllocSize == 0) == (capacity == 0), "");
if (chunkAllocSize > 0) {
visitor(allocSize(chunkAllocSize, capacity), 1);
visitor(
allocationBytesForOverAligned<ByteAlloc, kRequiredVectorAlignment>(
allocSize(chunkAllocSize, capacity)),
1);
}
}
......
......@@ -24,7 +24,7 @@ namespace folly {
namespace f14 {
namespace detail {
TagVector kEmptyTagVector = {};
EmptyTagVectorType kEmptyTagVector = {};
} // namespace detail
} // namespace f14
......
......@@ -48,8 +48,8 @@
#include <folly/container/detail/F14IntrinsicsAvailability.h>
#if FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE
#if FOLLY_AARCH64
#if __ARM_FEATURE_CRC32
#if FOLLY_NEON
#ifdef __ARM_FEATURE_CRC32
#include <arm_acle.h> // __crc32cd
#endif
#include <arm_neon.h> // uint8x16t intrinsics
......@@ -246,12 +246,11 @@ using KeyTypeForEmplace = KeyTypeForEmplaceHelper<
template <typename T>
FOLLY_ALWAYS_INLINE static void prefetchAddr(T const* ptr) {
#if FOLLY_AARCH64
#ifndef _WIN32
__builtin_prefetch(static_cast<void const*>(ptr));
#elif FOLLY_NEON
__prefetch(static_cast<void const*>(ptr));
#else
// _mm_prefetch is x86_64-specific and comes from xmmintrin.h.
// It seems to compile to the same thing as __builtin_prefetch, but
// also works on windows.
_mm_prefetch(
static_cast<char const*>(static_cast<void const*>(ptr)), _MM_HINT_T0);
#endif
......@@ -267,21 +266,31 @@ FOLLY_ALWAYS_INLINE static unsigned findFirstSetNonZero(T mask) {
}
}
#if FOLLY_AARCH64
#if FOLLY_NEON
using TagVector = uint8x16_t;
using MaskType = uint64_t;
constexpr unsigned kMaskSpacing = 4;
#else
#else // SSE2
using TagVector = __m128i;
using MaskType = unsigned;
using MaskType = uint32_t;
constexpr unsigned kMaskSpacing = 1;
#endif
extern TagVector kEmptyTagVector;
// We could use unaligned loads to relax this requirement, but that
// would be both a performance penalty and require a bulkier packed
// ItemIter format
constexpr std::size_t kRequiredVectorAlignment =
constexpr_max(std::size_t{16}, alignof(max_align_t));
using EmptyTagVectorType = std::aligned_storage_t<
sizeof(TagVector) + kRequiredVectorAlignment,
alignof(max_align_t)>;
extern EmptyTagVectorType kEmptyTagVector;
template <unsigned BitCount>
struct FullMask {
......@@ -292,6 +301,74 @@ struct FullMask {
template <>
struct FullMask<1> : std::integral_constant<MaskType, 1> {};
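FullMask<BitCount> builds the all-slots mask recursively, placing one bit every kMaskSpacing positions, so the same template now serves both layouts (spacing 1 for the SSE2 movemask, spacing 4 for the NEON narrowing shift). An illustrative re-derivation of the recursion for a 14-slot chunk (fullMask here is a stand-in, not the folly template):

#include <cstdint>

constexpr uint64_t fullMask(unsigned bitCount, unsigned spacing) {
  return bitCount == 1 ? 1 : (fullMask(bitCount - 1, spacing) << spacing) | 1;
}

static_assert(fullMask(14, 1) == 0x3fff, "");            // SSE2: 14 adjacent bits
static_assert(fullMask(14, 4) == 0x11111111111111, "");  // NEON: 14 nibbles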
#if FOLLY_ARM
// Mask iteration is different for ARM because that is the only platform
// for which the mask is bigger than a register.
// Iterates a mask, optimized for the case that only a few bits are set
class SparseMaskIter {
static_assert(kMaskSpacing == 4, "");
uint32_t interleavedMask_;
public:
explicit SparseMaskIter(MaskType mask)
: interleavedMask_{static_cast<uint32_t>(((mask >> 32) << 2) | mask)} {}
bool hasNext() {
return interleavedMask_ != 0;
}
unsigned next() {
FOLLY_SAFE_DCHECK(hasNext(), "");
unsigned i = findFirstSetNonZero(interleavedMask_);
interleavedMask_ &= (interleavedMask_ - 1);
return ((i >> 2) | (i << 2)) & 0xf;
}
};
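On NEON the match and occupied masks carry one bit per slot at 4-bit spacing, so a 16-slot chunk needs 64 mask bits, which is wider than a register on 32-bit ARM. The constructor above folds the high half over the low half with a 2-bit offset so everything fits in 32 bits, and next() undoes the fold. A worked example assuming slots 1 and 9 are set (decode mirrors the expression in next(); names are illustrative):

#include <cstdint>

// Slot i contributes bit 4*i of the 64-bit mask (kMaskSpacing == 4).
constexpr uint64_t mask = (uint64_t{1} << (4 * 1)) | (uint64_t{1} << (4 * 9));

// Fold: slots 0-7 keep bit 4*i, slots 8-15 land on bit 4*(i-8)+2.
constexpr uint32_t interleaved =
    static_cast<uint32_t>(((mask >> 32) << 2) | mask);

// Same recovery expression as SparseMaskIter::next().
constexpr unsigned decode(unsigned bit) {
  return ((bit >> 2) | (bit << 2)) & 0xf;
}

static_assert(interleaved == 0x50, "");
static_assert(decode(4) == 1, "");  // lowest set bit -> slot 1
static_assert(decode(6) == 9, "");  // next set bit   -> slot 9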
// Iterates a mask, optimized for the case that most bits are set
class DenseMaskIter {
static_assert(kMaskSpacing == 4, "");
std::size_t count_;
unsigned index_;
uint8_t const* tags_;
public:
explicit DenseMaskIter(uint8_t const* tags, MaskType mask) {
if (mask == 0) {
count_ = 0;
} else {
count_ =
folly::popcount(static_cast<uint32_t>(((mask >> 32) << 2) | mask));
if (LIKELY((mask & 1) != 0)) {
index_ = 0;
} else {
index_ = findFirstSetNonZero(mask) / kMaskSpacing;
}
tags_ = tags;
}
}
bool hasNext() {
return count_ > 0;
}
unsigned next() {
auto rv = index_;
--count_;
if (count_ > 0) {
do {
++index_;
} while ((tags_[index_] & 0x80) == 0);
}
FOLLY_SAFE_DCHECK(index_ < 16, "");
return rv;
}
};
#else
// Iterates a mask, optimized for the case that only a few bits are set
class SparseMaskIter {
MaskType mask_;
......@@ -317,7 +394,7 @@ class DenseMaskIter {
unsigned index_{0};
public:
explicit DenseMaskIter(MaskType mask) : mask_{mask} {}
explicit DenseMaskIter(uint8_t const*, MaskType mask) : mask_{mask} {}
bool hasNext() {
return mask_ != 0;
......@@ -337,6 +414,7 @@ class DenseMaskIter {
}
}
};
#endif
// Iterates a mask, returning pairs of [begin,end) index covering blocks
// of set bits
......@@ -405,10 +483,10 @@ class FirstEmptyInMask {
};
template <typename ItemType>
struct alignas(16) F14Chunk {
struct alignas(kRequiredVectorAlignment) F14Chunk {
using Item = ItemType;
// Assuming alignof(max_align_t) == 16 (and assuming alignof(Item) >=
// For our 16 byte vector alignment (and assuming alignof(Item) >=
// 4) kCapacity of 14 is the most space efficient. Slightly smaller
// or larger capacities can help with cache alignment in a couple of
// cases without wasting too much space, but once the items are larger
......@@ -447,11 +525,15 @@ struct alignas(16) F14Chunk {
rawItems_;
static F14Chunk* emptyInstance() {
auto rv = static_cast<F14Chunk*>(static_cast<void*>(&kEmptyTagVector));
auto raw = reinterpret_cast<char*>(&kEmptyTagVector);
if (kRequiredVectorAlignment > alignof(max_align_t)) {
auto delta = kRequiredVectorAlignment -
(reinterpret_cast<uintptr_t>(raw) % kRequiredVectorAlignment);
raw += delta;
}
auto rv = reinterpret_cast<F14Chunk*>(raw);
FOLLY_SAFE_DCHECK(
!rv->occupied(0) && rv->chunk0Capacity() == 0 &&
rv->outboundOverflowCount() == 0,
"");
(reinterpret_cast<uintptr_t>(rv) % kRequiredVectorAlignment) == 0, "");
return rv;
}
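kEmptyTagVector is now only alignof(max_align_t)-aligned but carries kRequiredVectorAlignment bytes of padding, so on platforms where max_align_t is too weak the code above advances the raw pointer by kRequiredVectorAlignment - (addr % kRequiredVectorAlignment). That delta is always between 1 and kRequiredVectorAlignment (an already-aligned address still moves forward a full step), which the extra padding absorbs. A tiny sketch of the computation under those assumptions (bump is an illustrative helper):

#include <cstdint>

constexpr std::uintptr_t bump(std::uintptr_t addr, std::uintptr_t a) {
  return a - (addr % a);
}

static_assert(bump(0x1008, 16) == 8, "");   // misaligned: lands on 0x1010
static_assert(bump(0x1010, 16) == 16, "");  // aligned: still advances 16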
......@@ -534,9 +616,9 @@ struct alignas(16) F14Chunk {
tags_[index] = 0;
}
#if FOLLY_AARCH64
#if FOLLY_NEON
////////
// Tag filtering using AArch64 Advanced SIMD (NEON) intrinsics
// Tag filtering using NEON intrinsics
SparseMaskIter tagMatchIter(uint8_t needle) const {
FOLLY_SAFE_DCHECK((needle & 0x80) != 0, "");
......@@ -550,7 +632,7 @@ struct alignas(16) F14Chunk {
return SparseMaskIter(mask);
}
uint64_t occupiedMask() const {
MaskType occupiedMask() const {
uint8x16_t tagV = vld1q_u8(&tags_[0]);
// signed shift extends top bit to all bits
auto occupiedV =
......@@ -560,7 +642,7 @@ struct alignas(16) F14Chunk {
}
#else
////////
// Tag filtering using x86_64 SSE2 intrinsics
// Tag filtering using SSE2 intrinsics
TagVector const* tagVector() const {
return static_cast<TagVector const*>(static_cast<void const*>(&tags_[0]));
......@@ -575,14 +657,14 @@ struct alignas(16) F14Chunk {
return SparseMaskIter{mask};
}
unsigned occupiedMask() const {
MaskType occupiedMask() const {
auto tagV = _mm_load_si128(tagVector());
return _mm_movemask_epi8(tagV) & kFullMask;
}
#endif
DenseMaskIter occupiedIter() const {
return DenseMaskIter{occupiedMask()};
return DenseMaskIter{&tags_[0], occupiedMask()};
}
MaskRangeIter occupiedRangeIter() const {
......@@ -637,8 +719,7 @@ struct alignas(16) F14Chunk {
template <typename Ptr>
class PackedChunkItemPtr {
public:
PackedChunkItemPtr(Ptr p, std::size_t i) noexcept
: ptr_{p}, index_{static_cast<unsigned>(i)} {
PackedChunkItemPtr(Ptr p, std::size_t i) noexcept : ptr_{p}, index_{i} {
FOLLY_SAFE_DCHECK(ptr_ != nullptr || index_ == 0, "");
}
......@@ -666,7 +747,7 @@ class PackedChunkItemPtr {
private:
Ptr ptr_;
unsigned index_;
std::size_t index_;
};
// Bare pointer form, packed into a uintptr_t. Uses only bits wasted by
......@@ -1131,17 +1212,19 @@ class F14Table : public Policy {
// For hash functions we don't trust to avalanche, we repair things by
// applying a bit mixer to the user-supplied hash.
#if FOLLY_X64 || FOLLY_AARCH64
// 64-bit
static HashPair splitHash(std::size_t hash) {
static_assert(sizeof(std::size_t) == sizeof(uint64_t), "");
uint8_t tag;
if (!Policy::isAvalanchingHasher()) {
#if FOLLY_SSE > 4 || (FOLLY_SSE == 4 && FOLLY_SSE_MINOR >= 2)
#if FOLLY_SSE_PREREQ(4, 2)
// SSE4.2 CRC
auto c = _mm_crc32_u64(0, hash);
tag = static_cast<uint8_t>(~(c >> 25));
hash += c;
#elif FOLLY_AARCH64 && __ARM_FEATURE_CRC32
// AARCH64 CRC is Optional on armv8 (-march=armv8-a+crc), standard
// on armv8.1
#elif __ARM_FEATURE_CRC32
// CRC is optional on armv8 (-march=armv8-a+crc), standard on armv8.1
auto c = __crc32cd(0, hash);
tag = static_cast<uint8_t>(~(c >> 25));
hash += c;
......@@ -1178,6 +1261,35 @@ class F14Table : public Policy {
}
return std::make_pair(hash, tag);
}
#else
// 32-bit
static HashPair splitHash(std::size_t hash) {
static_assert(sizeof(std::size_t) == sizeof(uint32_t), "");
uint8_t tag;
if (!Policy::isAvalanchingHasher()) {
#if FOLLY_SSE_PREREQ(4, 2)
// SSE4.2 CRC
auto c = _mm_crc32_u32(0, hash);
tag = static_cast<uint8_t>(~(c >> 25));
hash += c;
#elif __ARM_FEATURE_CRC32
auto c = __crc32cw(0, hash);
tag = static_cast<uint8_t>(~(c >> 25));
hash += c;
#else
// finalizer for 32-bit murmur2
hash ^= hash >> 13;
hash *= 0x5bd1e995;
hash ^= hash >> 15;
tag = static_cast<uint8_t>(~(hash >> 25));
#endif
} else {
// we don't trust the top bit
tag = (hash >> 24) | 0x80;
}
return std::make_pair(hash, tag);
}
#endif
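Both the 64-bit and the new 32-bit splitHash preserve the same invariant: the 8-bit tag always has its top bit set, so a stored tag can never be confused with the 0x00 of an empty slot, and the remaining bits come from high-order, well-mixed hash bits. A quick illustration of the avalanching 32-bit tag step (tag32 is a hypothetical helper mirroring the expression above, not folly code):

#include <cstdint>

// Take high bits and force bit 7, as in the "we don't trust the top bit" path.
constexpr std::uint8_t tag32(std::uint32_t hash) {
  return static_cast<std::uint8_t>((hash >> 24) | 0x80);
}

static_assert((tag32(0x00000000u) & 0x80) != 0, "");
static_assert((tag32(0xdeadbeefu) & 0x80) != 0, "");
static_assert(tag32(0xdeadbeefu) == 0xde, "");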
//////// memory management helpers
......
......@@ -1126,23 +1126,28 @@ TEST(F14VectorMap, destructuringErase) {
0);
}
TEST(F14ValueMap, vectorMaxSize) {
TEST(F14ValueMap, maxSize) {
F14ValueMap<int, int> m;
EXPECT_EQ(
m.max_size(),
std::numeric_limits<uint64_t>::max() / sizeof(std::pair<int, int>));
std::numeric_limits<std::size_t>::max() / sizeof(std::pair<int, int>));
}
TEST(F14NodeMap, vectorMaxSize) {
TEST(F14NodeMap, maxSize) {
F14NodeMap<int, int> m;
EXPECT_EQ(
m.max_size(),
std::numeric_limits<uint64_t>::max() / sizeof(std::pair<int, int>));
std::numeric_limits<std::size_t>::max() / sizeof(std::pair<int, int>));
}
TEST(F14VectorMap, vectorMaxSize) {
F14VectorMap<int, int> m;
EXPECT_EQ(m.max_size(), std::numeric_limits<uint32_t>::max());
EXPECT_EQ(
m.max_size(),
std::min(
std::size_t{std::numeric_limits<uint32_t>::max()},
std::numeric_limits<std::size_t>::max() /
sizeof(std::pair<int, int>)));
}
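The extra std::min term reflects that the vector policies are presumably capped by their 32-bit item indices into the contiguous value array, while on 32-bit builds the std::size_t / sizeof(std::pair<int, int>) bound is the smaller of the two; taking the minimum keeps the expectation correct for both pointer widths.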
template <typename M>
......
......@@ -100,19 +100,31 @@ TEST(F14Set, getAllocatedMemorySize) {
{
folly::F14ValueSet<int> set;
set.insert(10);
EXPECT_EQ(sizeof(set), 32);
EXPECT_EQ(set.getAllocatedMemorySize(), 32);
EXPECT_EQ(sizeof(set), 4 * sizeof(void*));
if (alignof(folly::max_align_t) == 16) {
// chunks will be allocated as 2 max_align_t-s
EXPECT_EQ(set.getAllocatedMemorySize(), 32);
} else {
// chunks will be allocated using aligned_malloc with the true size
EXPECT_EQ(set.getAllocatedMemorySize(), 24);
}
}
{
folly::F14NodeSet<int> set;
set.insert(10);
EXPECT_EQ(sizeof(set), 32);
EXPECT_EQ(set.getAllocatedMemorySize(), 36);
EXPECT_EQ(sizeof(set), 4 * sizeof(void*));
if (alignof(folly::max_align_t) == 16) {
// chunks will be allocated as 2 max_align_t-s
EXPECT_EQ(set.getAllocatedMemorySize(), 36);
} else {
// chunks will be allocated using aligned_malloc with the true size
EXPECT_EQ(set.getAllocatedMemorySize(), 20 + 2 * sizeof(void*));
}
}
{
folly::F14VectorSet<int> set;
set.insert(10);
EXPECT_EQ(sizeof(set), 24);
EXPECT_EQ(sizeof(set), 8 + 2 * sizeof(void*));
EXPECT_EQ(set.getAllocatedMemorySize(), 32);
}
}
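The expected numbers above can be read, under the assumption that the first allocation is a single chunk with 16 bytes of tag/metadata plus two in-place item slots: F14ValueSet<int> needs 16 + 2 * 4 = 24 bytes, reported as two 16-byte units (32) when alignof(max_align_t) == 16 and as the exact 24 otherwise; F14NodeSet<int> stores two pointers in the chunk plus one 4-byte heap node, giving 36 on a 16-aligned 64-bit platform and 20 + 2 * sizeof(void*) elsewhere.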
......@@ -735,19 +747,25 @@ TEST(F14VectorSet, destructuring) {
runInsertAndEmplace<F14VectorSet<Tracked<0>>>();
}
TEST(F14ValueSet, vectorMaxSize) {
TEST(F14ValueSet, maxSize) {
F14ValueSet<int> s;
EXPECT_EQ(s.max_size(), std::numeric_limits<uint64_t>::max() / sizeof(int));
EXPECT_EQ(
s.max_size(), std::numeric_limits<std::size_t>::max() / sizeof(int));
}
TEST(F14NodeSet, vectorMaxSize) {
TEST(F14NodeSet, maxSize) {
F14NodeSet<int> s;
EXPECT_EQ(s.max_size(), std::numeric_limits<uint64_t>::max() / sizeof(int));
EXPECT_EQ(
s.max_size(), std::numeric_limits<std::size_t>::max() / sizeof(int));
}
TEST(F14VectorSet, vectorMaxSize) {
TEST(F14VectorSet, maxSize) {
F14VectorSet<int> s;
EXPECT_EQ(s.max_size(), std::numeric_limits<uint32_t>::max());
EXPECT_EQ(
s.max_size(),
std::min(
std::size_t{std::numeric_limits<uint32_t>::max()},
std::numeric_limits<std::size_t>::max() / sizeof(int)));
}
template <typename S>
......