Commit a0e3a745 authored by Ilya Maykov, committed by Facebook Github Bot

Implemented LtHash in folly/experimental/crypto

Summary:
Added LtHash, a cryptographic homomorphic hash, to folly/experimental/crypto.
This has a soft dependency on libsodium; the code will not be compiled if libsodium is not detected by CMake.

Reviewed By: djwatson

Differential Revision: D13390825

fbshipit-source-id: f7597ced7bcc7b403e8bbaa733837b795128f1b3
parent f42dd787
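
A minimal usage sketch of the new API, based on LtHash.h from this diff (the object contents are illustrative; as in the benchmark below, sodium_init() must succeed before hashing):

#include <stdexcept>
#include <folly/Range.h>
#include <folly/experimental/crypto/LtHash.h>
#include <glog/logging.h>
#include <sodium.h>

int main() {
  if (sodium_init() < 0) {
    throw std::runtime_error("Failed to initialize libsodium");
  }
  using folly::crypto::LtHash16_1024;
  // addObject() is commutative: the same set of objects yields the same
  // checksum regardless of insertion order.
  LtHash16_1024 h1;
  h1.addObject(folly::range("apple"));
  h1.addObject(folly::range("banana"));
  LtHash16_1024 h2;
  h2.addObject(folly::range("banana"));
  h2.addObject(folly::range("apple"));
  CHECK(h1 == h2);
  // removeObject() is the inverse of addObject(): removing "banana" from h1
  // leaves the checksum of "apple" alone.
  LtHash16_1024 h3;
  h3.addObject(folly::range("apple"));
  h1.removeObject(folly::range("banana"));
  CHECK(h1 == h3);
  return 0;
}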
@@ -170,12 +170,52 @@ if (NOT ${LIBAIO_FOUND})
     ${FOLLY_DIR}/experimental/io/AsyncIO.h
   )
 endif()
-if (NOT ${LIBSODIUM_FOUND})
+if (${LIBSODIUM_FOUND})
+  string(FIND ${CMAKE_LIBRARY_ARCHITECTURE} "x86_64" IS_X86_64_ARCH)
+  if (${IS_X86_64_ARCH} STREQUAL "-1")
+    message(
+      STATUS
+      "arch ${CMAKE_LIBRARY_ARCHITECTURE} does not match x86_64, "
+      "skipping setting SSE2/AVX2 compile flags for LtHash SIMD code"
+    )
+  else()
+    message(
+      STATUS
+      "arch ${CMAKE_LIBRARY_ARCHITECTURE} matches x86_64, "
+      "setting SSE2/AVX2 compile flags for LtHash SIMD code"
+    )
+    set_source_files_properties(
+      ${FOLLY_DIR}/experimental/crypto/detail/MathOperation_AVX2.cpp
+      PROPERTIES
+        COMPILE_FLAGS
+        -mavx -mavx2 -msse2
+    )
+    set_source_files_properties(
+      ${FOLLY_DIR}/experimental/crypto/detail/MathOperation_Simple.cpp
+      PROPERTIES
+        COMPILE_FLAGS
+        -mno-avx -mno-avx2 -mno-sse2
+    )
+    set_source_files_properties(
+      ${FOLLY_DIR}/experimental/crypto/detail/MathOperation_SSE2.cpp
+      PROPERTIES
+        COMPILE_FLAGS
+        -mno-avx -mno-avx2 -msse2
+    )
+  endif()
+else()
   list(REMOVE_ITEM files
     ${FOLLY_DIR}/experimental/crypto/Blake2xb.cpp
+    ${FOLLY_DIR}/experimental/crypto/detail/MathOperation_AVX2.cpp
+    ${FOLLY_DIR}/experimental/crypto/detail/MathOperation_Simple.cpp
+    ${FOLLY_DIR}/experimental/crypto/detail/MathOperation_SSE2.cpp
+    ${FOLLY_DIR}/experimental/crypto/LtHash.cpp
   )
   list(REMOVE_ITEM hfiles
     ${FOLLY_DIR}/experimental/crypto/Blake2xb.h
+    ${FOLLY_DIR}/experimental/crypto/detail/LtHashInternal.h
+    ${FOLLY_DIR}/experimental/crypto/LtHash-inl.h
+    ${FOLLY_DIR}/experimental/crypto/LtHash.h
   )
 endif()
 if(CMAKE_SYSTEM_NAME STREQUAL "Windows")
This diff is collapsed.
/*
* Copyright 2017-present Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <folly/experimental/crypto/LtHash.h>
#include <folly/CpuId.h>
#ifdef __SSE2__
#include <emmintrin.h>
#endif
#ifdef __AVX2__
#include <immintrin.h>
#endif
#include <folly/Memory.h>
namespace folly {
namespace crypto {
namespace detail {
folly::IOBuf allocateCacheAlignedIOBuf(size_t size) {
void* ptr = folly::aligned_malloc(size, kCacheLineSize);
if (ptr == nullptr) {
throw std::bad_alloc();
}
return folly::IOBuf(
folly::IOBuf::TAKE_OWNERSHIP,
ptr,
static_cast<uint64_t>(size), // capacity
0ULL, // initial size
[](void* addr, void* /* userData*/) { folly::aligned_free(addr); });
}
std::unique_ptr<folly::IOBuf> allocateCacheAlignedIOBufUnique(size_t size) {
return std::make_unique<folly::IOBuf>(allocateCacheAlignedIOBuf(size));
}
bool isCacheAlignedAddress(const void* addr) {
size_t addrValue = reinterpret_cast<size_t>(addr);
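// kCacheLineSize is a power of 2 (static_assert'd in LtHashInternal.h), so
// an address is cache-line-aligned iff its low-order bits are all zero.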
return (addrValue & (kCacheLineSize - 1)) == 0;
}
// static
template <>
bool MathOperation<MathEngine::SIMPLE>::isAvailable() {
return true;
}
// static
template <>
bool MathOperation<MathEngine::SSE2>::isAvailable() {
static const bool kIsAvailable =
CpuId().sse2() && MathOperation<MathEngine::SSE2>::isImplemented();
return kIsAvailable;
}
// static
template <>
bool MathOperation<MathEngine::AVX2>::isAvailable() {
static const bool kIsAvailable =
CpuId().avx2() && MathOperation<MathEngine::AVX2>::isImplemented();
return kIsAvailable;
}
// static
template <>
bool MathOperation<MathEngine::AUTO>::isAvailable() {
return true;
}
// static
template <>
bool MathOperation<MathEngine::AUTO>::isImplemented() {
return true;
}
// static
template <>
void MathOperation<MathEngine::AUTO>::add(
uint64_t dataMask,
size_t bitsPerElement,
folly::ByteRange b1,
folly::ByteRange b2,
folly::MutableByteRange out) {
// Note: implementation is a function pointer that is initialized to point
// at the fastest available implementation the first time this function is
// called.
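// (C++11 guarantees thread-safe initialization of function-local statics,
// so concurrent first calls are safe. The same pattern is used by the
// other MathEngine::AUTO operations below.)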
static auto implementation = []() {
if (MathOperation<MathEngine::AVX2>::isAvailable()) {
LOG(INFO) << "Selected AVX2 MathEngine for add() operation";
return MathOperation<MathEngine::AVX2>::add;
} else if (MathOperation<MathEngine::SSE2>::isAvailable()) {
LOG(INFO) << "Selected SSE2 MathEngine for add() operation";
return MathOperation<MathEngine::SSE2>::add;
} else {
LOG(INFO) << "Selected SIMPLE MathEngine for add() operation";
return MathOperation<MathEngine::SIMPLE>::add;
}
}();
implementation(dataMask, bitsPerElement, b1, b2, out);
}
// static
template <>
void MathOperation<MathEngine::AUTO>::sub(
uint64_t dataMask,
size_t bitsPerElement,
folly::ByteRange b1,
folly::ByteRange b2,
folly::MutableByteRange out) {
// Note: implementation is a function pointer that is initialized to point
// at the fastest available implementation the first time this function is
// called.
static auto implementation = []() {
if (MathOperation<MathEngine::AVX2>::isAvailable()) {
LOG(INFO) << "Selected AVX2 MathEngine for sub() operation";
return MathOperation<MathEngine::AVX2>::sub;
} else if (MathOperation<MathEngine::SSE2>::isAvailable()) {
LOG(INFO) << "Selected SSE2 MathEngine for sub() operation";
return MathOperation<MathEngine::SSE2>::sub;
} else {
LOG(INFO) << "Selected SIMPLE MathEngine for sub() operation";
return MathOperation<MathEngine::SIMPLE>::sub;
}
}();
implementation(dataMask, bitsPerElement, b1, b2, out);
}
// static
template <>
void MathOperation<MathEngine::AUTO>::clearPaddingBits(
uint64_t dataMask,
folly::MutableByteRange buf) {
// Note: implementation is a function pointer that is initialized to point
// at the fastest available implementation the first time this function is
// called.
static auto implementation = []() {
if (MathOperation<MathEngine::AVX2>::isAvailable()) {
LOG(INFO) << "Selected AVX2 MathEngine for clearPaddingBits() operation";
return MathOperation<MathEngine::AVX2>::clearPaddingBits;
} else if (MathOperation<MathEngine::SSE2>::isAvailable()) {
LOG(INFO) << "Selected SSE2 MathEngine for clearPaddingBits() operation";
return MathOperation<MathEngine::SSE2>::clearPaddingBits;
} else {
LOG(INFO)
<< "Selected SIMPLE MathEngine for clearPaddingBits() operation";
return MathOperation<MathEngine::SIMPLE>::clearPaddingBits;
}
}();
implementation(dataMask, buf);
}
// static
template <>
bool MathOperation<MathEngine::AUTO>::checkPaddingBits(
uint64_t dataMask,
folly::ByteRange buf) {
// Note: implementation is a function pointer that is initialized to point
// at the fastest available implementation the first time this function is
// called.
static auto implementation = []() {
if (MathOperation<MathEngine::AVX2>::isAvailable()) {
LOG(INFO) << "Selected AVX2 MathEngine for checkPaddingBits() operation";
return MathOperation<MathEngine::AVX2>::checkPaddingBits;
} else if (MathOperation<MathEngine::SSE2>::isAvailable()) {
LOG(INFO) << "Selected SSE2 MathEngine for checkPaddingBits() operation";
return MathOperation<MathEngine::SSE2>::checkPaddingBits;
} else {
LOG(INFO)
<< "Selected SIMPLE MathEngine for checkPaddingBits() operation";
return MathOperation<MathEngine::SIMPLE>::checkPaddingBits;
}
}();
return implementation(dataMask, buf);
}
template struct MathOperation<MathEngine::AUTO>;
} // namespace detail
} // namespace crypto
} // namespace folly
/*
* Copyright 2017-present Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <cstddef>
#include <memory>
#include <folly/Range.h>
#include <folly/experimental/crypto/Blake2xb.h>
#include <folly/io/IOBuf.h>
namespace folly {
namespace crypto {
namespace detail {
/**
* Allocates an IOBuf of the given size, aligned on a cache line boundary.
* Similar to folly::IOBuf::create(), the returned IOBuf has an initial
* capacity == size and an initial length == 0.
*/
folly::IOBuf allocateCacheAlignedIOBuf(size_t size);
/**
* Similar to allocateCacheAlignedIOBuf(), but returns a unique_ptr to an IOBuf
* instead of an IOBuf.
*/
std::unique_ptr<folly::IOBuf> allocateCacheAlignedIOBufUnique(size_t size);
/**
* Returns true if the given memory address is aligned on a cache line boundary
* and false if it isn't.
*/
bool isCacheAlignedAddress(const void* addr);
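// Example (sketch; the 2048-byte size is illustrative): allocate an aligned
// buffer and verify its alignment:
//   auto buf = allocateCacheAlignedIOBufUnique(2048);
//   DCHECK(isCacheAlignedAddress(buf->data()));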
} // namespace detail
/**
* Templated homomorphic hash, using LtHash (lattice-based crypto).
* Template parameters: B = element size in bits, N = number of elements.
*
* Current constraints (checked at compile time with static asserts):
* (1) B must be 16, 20 or 32.
* (2) N must be > 999.
* (3) when B is 16, N must be divisible by 32.
* (4) when B is 20, N must be divisible by 24.
* (5) when B is 32, N must be divisible by 16.
*/
template <std::size_t B, std::size_t N>
class LtHash {
public:
explicit LtHash(const folly::IOBuf& initialChecksum = {});
/**
* Like the above constructor but takes ownership of the checksum buffer,
* avoiding a copy if these conditions about the input buffer are met:
* - initialChecksum->isChained() is false
* - initialChecksum->isShared() is false
* - detail::isCacheAlignedAddress(initialChecksum->data()) is true
*
* If you want to take advantage of this and need to make sure your IOBuf
* address is aligned on a cache line boundary, you can use the
* function detail::allocateCacheAlignedIOBufUnique() to do it.
*/
explicit LtHash(std::unique_ptr<folly::IOBuf> initialChecksum);
// Note: we explicitly implement copy constructor and copy assignment
// operator to make sure the checksum_ IOBuf is deep-copied.
LtHash(const LtHash<B, N>& that);
LtHash<B, N>& operator=(const LtHash<B, N>& that);
LtHash(LtHash<B, N>&& that) noexcept = default;
LtHash<B, N>& operator=(LtHash<B, N>&& that) noexcept = default;
~LtHash() = default;
/**
* Resets the checksum in this LtHash. This puts the hash into the same
* state as if it was just constructed with the zero-argument constructor.
*/
void reset();
/**
* IMPORTANT: Unlike a regular hash, the incremental hash functions operate
* on individual objects, not a stream of data. For example, the following
* two calls lead to different checksum values:
* (1) addObject("Hello"); addObject(" World");
* (2) addObject("Hello World");
* because in (1) addObject() hashes the two words separately and
* aggregates the hashes to update the checksum.
*
* addObject() is commutative. LtHash generates the same checksum over a
* given set of objects regardless of the order they were added.
* Example: H(a + b + c) = H(b + c + a)
*
* addObject() can be called with multiple ByteRange parameters, in which
* case it will behave as if it was called with a single ByteRange which
* contained the concatenation of all the input ByteRanges. This allows
* adding an object whose hash is computed from several non-contiguous
* ranges of data, without having to copy the data to a contiguous
* piece of memory.
*
* Example: addObject(r1, r2, r3) is equivalent to
* addObject(r4) where r4 contains the concatenation of r1 + r2 + r3.
*/
template <typename... Args>
LtHash<B, N>& addObject(folly::ByteRange firstRange, Args&&... moreRanges);
/**
* removeObject() is the inverse of addObject(). Note that it does NOT
* check whether the object was actually added to the LtHash. The caller
* should ensure that the object was previously added.
*
* Example: H(a - a + b - b + c - c) = H(a + b + c - a - b - c) = H()
*
* Similar to addObject(), removeObject() can be called with more than one
* ByteRange parameter.
*/
template <typename... Args>
LtHash<B, N>& removeObject(folly::ByteRange firstRange, Args&&... moreRanges);
/**
* Because the addObject() operation in LtHash is commutative and
* associative, it's possible to break down a large LtHash computation
* (e.g. adding 100k objects) into several parallel steps, each of which
* computes the LtHash of a subset of the objects, and then add the
* partial LtHash objects together.
* Pseudocode:
*
* std::vector<std::string> objects = ...;
* Future<LtHash<20, 1008>> h1 = computeInBackgroundThread(
* &objects[0], &objects[10000]);
* Future<LtHash<20, 1008>> h2 = computeInBackgroundThread(
* &objects[10000], &objects[20000]);
* LtHash<20, 1008> result = h1.get() + h2.get();
*/
LtHash<B, N>& operator+=(const LtHash<B, N>& rhs);
friend LtHash<B, N> operator+(
const LtHash<B, N>& lhs,
const LtHash<B, N>& rhs) {
LtHash<B, N> result = lhs;
result += rhs;
return result;
}
friend LtHash<B, N> operator+(LtHash<B, N>&& lhs, const LtHash<B, N>& rhs) {
LtHash<B, N> result = std::move(lhs);
result += rhs;
return result;
}
friend LtHash<B, N> operator+(const LtHash<B, N>& lhs, LtHash<B, N>&& rhs) {
// addition is commutative so we can just swap the two arguments
return std::move(rhs) + lhs;
}
friend LtHash<B, N> operator+(LtHash<B, N>&& lhs, LtHash<B, N>&& rhs) {
LtHash<B, N> result = std::move(lhs);
result += rhs;
return result;
}
/**
* The subtraction operator is provided for symmetry, but I'm not sure if
* anyone will ever actually use it outside of tests.
*/
LtHash<B, N>& operator-=(const LtHash<B, N>& rhs);
friend LtHash<B, N> operator-(
const LtHash<B, N>& lhs,
const LtHash<B, N>& rhs) {
LtHash<B, N> result = lhs;
result -= rhs;
return result;
}
friend LtHash<B, N> operator-(LtHash<B, N>&& lhs, const LtHash<B, N>& rhs) {
LtHash<B, N> result = std::move(lhs);
result -= rhs;
return result;
}
/**
* Equality comparison operator, implemented in a data-independent way to
* guard against timing attacks. Always use this to check if two LtHash
* values are equal instead of manually comparing checksum buffers.
*/
bool operator==(const LtHash<B, N>& that) const;
/**
* Equality comparison operator for checksum in ByteRange, implemented in a
* data-independent way to guard against timing attacks.
*/
bool checksumEquals(folly::ByteRange otherChecksum) const;
/**
* Inequality comparison operator.
*/
bool operator!=(const LtHash<B, N>& that) const;
/**
* Sets the initial checksum value to use for processing objects in the
* xxxObject() calls.
*/
void setChecksum(const folly::IOBuf& checksum);
/**
* Like the above method but takes ownership of the checksum buffer,
* avoiding a copy if these conditions about the input buffer are met:
* - checksum->isChained() is false
* - checksum->isShared() is false
* - detail::isCacheAlignedAddress(checksum->data()) is true
*
* If you want to take advantage of this and need to make sure your IOBuf
* address is aligned on a cache line boundary, you can use the
* function detail::allocateCacheAlignedIOBufUnique() to do it.
*/
void setChecksum(std::unique_ptr<folly::IOBuf> checksum);
/**
* Returns the total length of the checksum (element_count * element_length)
*/
static constexpr size_t getChecksumSizeBytes();
/**
* Returns the template parameter B.
*/
static constexpr size_t getElementSizeInBits();
/**
* Returns the number of elements that get packed into a single uint64_t.
*/
static constexpr size_t getElementsPerUint64();
/**
* Returns the template parameter N.
*/
static constexpr size_t getElementCount();
/**
* Returns true if the internal checksum uses padding bits between elements.
*/
static constexpr bool hasPaddingBits();
/**
* Returns a copy of the current checksum value
*/
std::unique_ptr<folly::IOBuf> getChecksum() const;
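// Example (sketch): persist a checksum and restore it later.
//   std::unique_ptr<folly::IOBuf> saved = hash.getChecksum();
//   LtHash<B, N> restored;
//   restored.setChecksum(*saved);
//   // restored == hash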
private:
template <typename... Args>
void hashObject(
folly::MutableByteRange out,
folly::ByteRange firstRange,
Args&&... moreRanges);
template <typename... Args>
void
updateDigest(Blake2xb& digest, folly::ByteRange range, Args&&... moreRanges);
void updateDigest(Blake2xb& digest);
// current checksum
folly::IOBuf checksum_;
};
} // namespace crypto
} // namespace folly
#include <folly/experimental/crypto/LtHash-inl.h>
namespace folly {
namespace crypto {
// This is the fastest and smallest specialization and should be
// preferred in most cases. It provides over 200 bits of security,
// which should be sufficient for most applications.
using LtHash16_1024 = LtHash<16, 1024>;
// These specializations are available to users who want a higher
// level of cryptographic security. They are slower and larger than
// the one above.
using LtHash20_1008 = LtHash<20, 1008>;
using LtHash32_1024 = LtHash<32, 1024>;
} // namespace crypto
} // namespace folly
/*
* Copyright 2017-present Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <folly/Range.h>
namespace folly {
namespace crypto {
namespace detail {
// As of 2019, most (or all?) modern Intel CPUs have 64-byte L1 cache lines,
// and aligning data buffers on cache line boundaries on such CPUs
// noticeably benefits performance (up to 10% difference).
//
// If you change this, code that depends on it in MathOperation_*.cpp may
// break and could need fixing.
constexpr size_t kCacheLineSize = 64;
// Invariants about kCacheLineSize that other logic depends on: it must be
// a power of 2 and cannot be zero.
static_assert(kCacheLineSize > 0, "kCacheLineSize cannot be 0");
static_assert(
(kCacheLineSize & (kCacheLineSize - 1)) == 0,
"kCacheLineSize must be a power of 2");
/**
* Defines available math engines that we can use to perform element-wise
* modular addition or subtraction of element vectors.
* - AUTO: pick the best available, from best to worst: AVX2, SSE2, SIMPLE
* - SIMPLE: perform addition/subtraction using uint64_t values
* - SSE2: perform addition/subtraction using 128-bit __m128i values.
* Intel only, requires SSE2 instruction support.
* - AVX2: perform addition/subtraction using 256-bit __m256i values.
* Intel only, requires AVX2 instruction support.
*/
enum class MathEngine { AUTO, SIMPLE, SSE2, AVX2 };
/**
* This actually implements the bulk addition/subtraction operations.
*/
template <MathEngine E>
struct MathOperation {
/**
* Returns true if the math engine E is supported by the CPU and OS and is
* implemented.
*/
static bool isAvailable();
/**
* Returns true if the math engine E is implemented.
*/
static bool isImplemented();
/**
* Performs element-wise modular addition of 2 vectors of elements packed
* into the buffers b1 and b2. Writes the output into the buffer out. The
* output buffer may be the same as one of the input buffers. The dataMask
* parameter should be Bits<B>::kDataMask() where B is the element size
* in bits.
*/
static void add(
uint64_t dataMask,
size_t bitsPerElement,
ByteRange b1,
ByteRange b2,
MutableByteRange out);
/**
* Performs element-wise modular subtraction of 2 groups of elements packed
* into the buffers b1 and b2. Note that (a - b) % M == (a + (M - b)) % M,
* which is how we actually implement it to avoid underflow issues. The
* dataMask parameter should be Bits<B>::kDataMask() where B is the element
* size in bits.
*/
static void sub(
uint64_t dataMask,
size_t bitsPerElement,
ByteRange b1,
ByteRange b2,
MutableByteRange out);
/**
* Clears the padding bits of the given buffer according to the given
* data mask: for each uint64_t in the input buffer, all 0 bits in the
* data mask are cleared, and all 1 bits in the data mask are preserved.
*/
static void clearPaddingBits(uint64_t dataMask, MutableByteRange buf);
/**
* Returns true if the given checksum buffer contains 0 bits at the padding
* bit positions, according to the given data mask.
*/
static bool checkPaddingBits(uint64_t dataMask, ByteRange buf);
};
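// Usage sketch (b1, b2, out and dataMask are caller-supplied; the mask must
// be Bits<B>::kDataMask() for the element size in use):
//   MathOperation<MathEngine::AUTO>::add(dataMask, 20, b1, b2, out);
// For AUTO, the first call selects AVX2, SSE2 or SIMPLE based on CpuId
// (see LtHash.cpp) and caches the choice in a function pointer.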
// These forward declarations of explicit template instantiations seem to be
// required to get things to compile. I tried to get things to work without
// them, but the compiler complained when I had any AVX2 types in this header,
// so I think they need to be hidden in the .cpp file for some reason.
#define FORWARD_DECLARE_EXTERN_TEMPLATE(E) \
template <> \
bool MathOperation<E>::isAvailable(); \
template <> \
bool MathOperation<E>::isImplemented(); \
template <> \
void MathOperation<E>::add( \
uint64_t dataMask, \
size_t bitsPerElement, \
ByteRange b1, \
ByteRange b2, \
MutableByteRange out); \
template <> \
void MathOperation<E>::sub( \
uint64_t dataMask, \
size_t bitsPerElement, \
ByteRange b1, \
ByteRange b2, \
MutableByteRange out); \
template <> \
void MathOperation<E>::clearPaddingBits( \
uint64_t dataMask, MutableByteRange buf); \
template <> \
bool MathOperation<E>::checkPaddingBits(uint64_t dataMask, ByteRange buf); \
extern template struct MathOperation<E>
FORWARD_DECLARE_EXTERN_TEMPLATE(MathEngine::AUTO);
FORWARD_DECLARE_EXTERN_TEMPLATE(MathEngine::SIMPLE);
FORWARD_DECLARE_EXTERN_TEMPLATE(MathEngine::SSE2);
FORWARD_DECLARE_EXTERN_TEMPLATE(MathEngine::AVX2);
#undef FORWARD_DECLARE_EXTERN_TEMPLATE
} // namespace detail
} // namespace crypto
} // namespace folly
/*
* Copyright 2017-present Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Implementation of the MathOperation<MathEngine::AVX2> template
// specializations.
#include <folly/experimental/crypto/detail/LtHashInternal.h>
#ifdef __AVX2__
#include <immintrin.h>
#include <sodium.h>
#include <folly/lang/Bits.h>
#endif // __AVX2__
#include <folly/Memory.h>
namespace folly {
namespace crypto {
namespace detail {
#ifdef __AVX2__
// static
template <>
bool MathOperation<MathEngine::AVX2>::isImplemented() {
return true;
}
// static
template <>
void MathOperation<MathEngine::AVX2>::add(
uint64_t dataMask,
size_t bitsPerElement,
ByteRange b1,
ByteRange b2,
MutableByteRange out) {
DCHECK_EQ(b1.size(), b2.size());
DCHECK_EQ(b1.size(), out.size());
DCHECK_EQ(0, b1.size() % kCacheLineSize);
static_assert(
kCacheLineSize % sizeof(__m256i) == 0,
"kCacheLineSize must be a multiple of sizeof(__m256i)");
static constexpr size_t kValsPerCacheLine = kCacheLineSize / sizeof(__m256i);
static_assert(
kValsPerCacheLine > 0, "kCacheLineSize must be >= sizeof(__m256i)");
// gcc issues 'ignoring attributes on template argument' warning if
// __m256i is used below, so have to type explicitly
alignas(kCacheLineSize) std::array<
long long __attribute__((__vector_size__(sizeof(__m256i)))),
kValsPerCacheLine>
results;
// Note: AVX2 is Intel x86_64 only which is little-endian, so we don't need
// the Endian::little() conversions when loading or storing data.
if (bitsPerElement == 16 || bitsPerElement == 32) {
for (size_t pos = 0; pos < b1.size(); pos += kCacheLineSize) {
const __m256i* v1p = reinterpret_cast<const __m256i*>(b1.data() + pos);
const __m256i* v2p = reinterpret_cast<const __m256i*>(b2.data() + pos);
for (size_t i = 0; i < kValsPerCacheLine; ++i) {
__m256i v1 = _mm256_load_si256(v1p + i);
__m256i v2 = _mm256_load_si256(v2p + i);
if (bitsPerElement == 16) {
results[i] = _mm256_add_epi16(v1, v2);
} else { // bitsPerElement == 32
results[i] = _mm256_add_epi32(v1, v2);
}
}
std::memcpy(out.data() + pos, results.data(), sizeof(results));
}
} else {
__m256i mask = _mm256_set1_epi64x(dataMask);
for (size_t pos = 0; pos < b1.size(); pos += kCacheLineSize) {
const __m256i* v1p = reinterpret_cast<const __m256i*>(b1.data() + pos);
const __m256i* v2p = reinterpret_cast<const __m256i*>(b2.data() + pos);
for (size_t i = 0; i < kValsPerCacheLine; ++i) {
__m256i v1 = _mm256_load_si256(v1p + i);
__m256i v2 = _mm256_load_si256(v2p + i);
results[i] = _mm256_and_si256(_mm256_add_epi64(v1, v2), mask);
}
std::memcpy(out.data() + pos, results.data(), sizeof(results));
}
}
}
// static
template <>
void MathOperation<MathEngine::AVX2>::sub(
uint64_t dataMask,
size_t bitsPerElement,
ByteRange b1,
ByteRange b2,
MutableByteRange out) {
DCHECK_EQ(b1.size(), b2.size());
DCHECK_EQ(b1.size(), out.size());
DCHECK_EQ(0, b1.size() % kCacheLineSize);
static_assert(
kCacheLineSize % sizeof(__m256i) == 0,
"kCacheLineSize must be a multiple of sizeof(__m256i)");
static constexpr size_t kValsPerCacheLine = kCacheLineSize / sizeof(__m256i);
static_assert(
kValsPerCacheLine > 0, "kCacheLineSize must be >= sizeof(__m256i)");
// gcc issues 'ignoring attributes on template argument' warning if
// __m256i is used below, so have to type explicitly
alignas(kCacheLineSize) std::array<
long long __attribute__((__vector_size__(sizeof(__m256i)))),
kValsPerCacheLine>
results;
// Note: AVX2 is Intel x86_64 only which is little-endian, so we don't need
// the Endian::little() conversions when loading or storing data.
if (bitsPerElement == 16 || bitsPerElement == 32) {
for (size_t pos = 0; pos < b1.size(); pos += kCacheLineSize) {
const __m256i* v1p = reinterpret_cast<const __m256i*>(b1.data() + pos);
const __m256i* v2p = reinterpret_cast<const __m256i*>(b2.data() + pos);
for (size_t i = 0; i < kValsPerCacheLine; ++i) {
__m256i v1 = _mm256_load_si256(v1p + i);
__m256i v2 = _mm256_load_si256(v2p + i);
if (bitsPerElement == 16) {
results[i] = _mm256_sub_epi16(v1, v2);
} else { // bitsPerElement == 32
results[i] = _mm256_sub_epi32(v1, v2);
}
}
std::memcpy(out.data() + pos, results.data(), sizeof(results));
}
} else {
__m256i mask = _mm256_set1_epi64x(dataMask);
__m256i paddingMask = _mm256_set1_epi64x(~dataMask);
for (size_t pos = 0; pos < b1.size(); pos += kCacheLineSize) {
const __m256i* v1p = reinterpret_cast<const __m256i*>(b1.data() + pos);
const __m256i* v2p = reinterpret_cast<const __m256i*>(b2.data() + pos);
for (size_t i = 0; i < kValsPerCacheLine; ++i) {
__m256i v1 = _mm256_load_si256(v1p + i);
__m256i v2 = _mm256_load_si256(v2p + i);
__m256i negV2 =
_mm256_and_si256(_mm256_sub_epi64(paddingMask, v2), mask);
results[i] = _mm256_and_si256(_mm256_add_epi64(v1, negV2), mask);
}
std::memcpy(out.data() + pos, results.data(), sizeof(results));
}
}
}
template <>
void MathOperation<MathEngine::AVX2>::clearPaddingBits(
uint64_t dataMask,
MutableByteRange buf) {
if (dataMask == 0xffffffffffffffffULL) {
return;
}
DCHECK_EQ(0, buf.size() % kCacheLineSize);
static_assert(
kCacheLineSize % sizeof(__m256i) == 0,
"kCacheLineSize must be a multiple of sizeof(__m256i)");
static constexpr size_t kValsPerCacheLine = kCacheLineSize / sizeof(__m256i);
static_assert(
kValsPerCacheLine > 0, "kCacheLineSize must be >= sizeof(__m256i)");
// gcc issues 'ignoring attributes on template argument' warning if
// __m256i is used below, so have to type explicitly
alignas(kCacheLineSize) std::array<
long long __attribute__((__vector_size__(sizeof(__m256i)))),
kValsPerCacheLine>
results;
__m256i mask = _mm256_set1_epi64x(dataMask);
for (size_t pos = 0; pos < buf.size(); pos += kCacheLineSize) {
const __m256i* p = reinterpret_cast<const __m256i*>(buf.data() + pos);
for (size_t i = 0; i < kValsPerCacheLine; ++i) {
results[i] = _mm256_and_si256(_mm256_load_si256(p + i), mask);
}
std::memcpy(buf.data() + pos, results.data(), sizeof(results));
}
}
template <>
bool MathOperation<MathEngine::AVX2>::checkPaddingBits(
uint64_t dataMask,
ByteRange buf) {
if (dataMask == 0xffffffffffffffffULL) {
return true;
}
DCHECK_EQ(0, buf.size() % sizeof(__m256i));
__m256i paddingMask = _mm256_set1_epi64x(~dataMask);
static const __m256i kZero = _mm256_setzero_si256();
for (size_t pos = 0; pos < buf.size(); pos += sizeof(__m256i)) {
__m256i val =
_mm256_load_si256(reinterpret_cast<const __m256i*>(buf.data() + pos));
__m256i paddingBits = _mm256_and_si256(val, paddingMask);
if (sodium_memcmp(&paddingBits, &kZero, sizeof(kZero)) != 0) {
return false;
}
}
return true;
}
#else // !__AVX2__
// static
template <>
bool MathOperation<MathEngine::AVX2>::isImplemented() {
return false;
}
// static
template <>
void MathOperation<MathEngine::AVX2>::add(
uint64_t /* dataMask */,
size_t bitsPerElement,
ByteRange /* b1 */,
ByteRange /* b2 */,
MutableByteRange /* out */) {
if (bitsPerElement != 0) { // hack to defeat [[noreturn]] compiler warning
LOG(FATAL) << "Unimplemented function MathOperation<MathEngine::AVX2>::"
<< "add() called";
}
}
// static
template <>
void MathOperation<MathEngine::AVX2>::sub(
uint64_t /* dataMask */,
size_t bitsPerElement,
ByteRange /* b1 */,
ByteRange /* b2 */,
MutableByteRange /* out */) {
if (bitsPerElement != 0) { // hack to defeat [[noreturn]] compiler warning
LOG(FATAL) << "Unimplemented function MathOperation<MathEngine::AVX2>::"
<< "sub() called";
}
}
template <>
void MathOperation<MathEngine::AVX2>::clearPaddingBits(
uint64_t /* dataMask */,
MutableByteRange buf) {
if (buf.data() != nullptr) { // hack to defeat [[noreturn]] compiler warning
LOG(FATAL) << "Unimplemented function MathOperation<MathEngine::AVX2>::"
<< "clearPaddingBits() called";
}
}
template <>
bool MathOperation<MathEngine::AVX2>::checkPaddingBits(
uint64_t /* dataMask */,
ByteRange buf) {
if (buf.data() != nullptr) { // hack to defeat [[noreturn]] compiler warning
LOG(FATAL) << "Unimplemented function MathOperation<MathEngine::AVX2>::"
<< "checkPaddingBits() called";
}
return false;
}
#endif // __AVX2__
template struct MathOperation<MathEngine::AVX2>;
} // namespace detail
} // namespace crypto
} // namespace folly
/*
* Copyright 2017-present Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Implementation of the MathOperation<MathEngine::SSE2> template
// specializations.
#include <folly/experimental/crypto/detail/LtHashInternal.h>
#ifdef __SSE2__
#include <emmintrin.h>
#include <sodium.h>
#include <folly/lang/Bits.h>
#endif // __SSE2__
#include <folly/Memory.h>
namespace folly {
namespace crypto {
namespace detail {
#ifdef __SSE2__
// static
template <>
bool MathOperation<MathEngine::SSE2>::isImplemented() {
return true;
}
// static
template <>
void MathOperation<MathEngine::SSE2>::add(
uint64_t dataMask,
size_t bitsPerElement,
ByteRange b1,
ByteRange b2,
MutableByteRange out) {
DCHECK_EQ(b1.size(), b2.size());
DCHECK_EQ(b1.size(), out.size());
DCHECK_EQ(0, b1.size() % kCacheLineSize);
static_assert(
kCacheLineSize % sizeof(__m128i) == 0,
"kCacheLineSize must be a multiple of sizeof(__m128i)");
static constexpr size_t kValsPerCacheLine = kCacheLineSize / sizeof(__m128i);
static_assert(
kValsPerCacheLine > 0, "kCacheLineSize must be >= sizeof(__m128i)");
// gcc issues 'ignoring attributes on template argument' warning if
// __m128i is used below, so have to type explicitly
alignas(kCacheLineSize) std::array<
long long __attribute__((__vector_size__(sizeof(__m128i)))),
kValsPerCacheLine>
results;
// Note: SSE2 is Intel x86(_64) only which is little-endian, so we don't need
// the Endian::little() conversions when loading or storing data.
if (bitsPerElement == 16 || bitsPerElement == 32) {
for (size_t pos = 0; pos < b1.size(); pos += kCacheLineSize) {
const __m128i* v1p = reinterpret_cast<const __m128i*>(b1.data() + pos);
const __m128i* v2p = reinterpret_cast<const __m128i*>(b2.data() + pos);
for (size_t i = 0; i < kValsPerCacheLine; ++i) {
__m128i v1 = _mm_load_si128(v1p + i);
__m128i v2 = _mm_load_si128(v2p + i);
if (bitsPerElement == 16) {
results[i] = _mm_add_epi16(v1, v2);
} else { // bitsPerElement == 32
results[i] = _mm_add_epi32(v1, v2);
}
}
std::memcpy(out.data() + pos, results.data(), sizeof(results));
}
} else {
__m128i mask = _mm_set_epi64x(dataMask, dataMask);
for (size_t pos = 0; pos < b1.size(); pos += kCacheLineSize) {
const __m128i* v1p = reinterpret_cast<const __m128i*>(b1.data() + pos);
const __m128i* v2p = reinterpret_cast<const __m128i*>(b2.data() + pos);
for (size_t i = 0; i < kValsPerCacheLine; ++i) {
__m128i v1 = _mm_load_si128(v1p + i);
__m128i v2 = _mm_load_si128(v2p + i);
results[i] = _mm_and_si128(_mm_add_epi64(v1, v2), mask);
}
std::memcpy(out.data() + pos, results.data(), sizeof(results));
}
}
}
// static
template <>
void MathOperation<MathEngine::SSE2>::sub(
uint64_t dataMask,
size_t bitsPerElement,
ByteRange b1,
ByteRange b2,
MutableByteRange out) {
DCHECK_EQ(b1.size(), b2.size());
DCHECK_EQ(b1.size(), out.size());
DCHECK_EQ(0, b1.size() % kCacheLineSize);
static_assert(
kCacheLineSize % sizeof(__m128i) == 0,
"kCacheLineSize must be a multiple of sizeof(__m128i)");
static constexpr size_t kValsPerCacheLine = kCacheLineSize / sizeof(__m128i);
static_assert(
kValsPerCacheLine > 0, "kCacheLineSize must be >= sizeof(__m128i)");
// gcc issues 'ignoring attributes on template argument' warning if
// __m128i is used below, so have to type explicitly
alignas(kCacheLineSize) std::array<
long long __attribute__((__vector_size__(sizeof(__m128i)))),
kValsPerCacheLine>
results;
// Note: SSE2 is Intel x86(_64) only which is little-endian, so we don't need
// the Endian::little() conversions when loading or storing data.
if (bitsPerElement == 16 || bitsPerElement == 32) {
for (size_t pos = 0; pos < b1.size(); pos += kCacheLineSize) {
const __m128i* v1p = reinterpret_cast<const __m128i*>(b1.data() + pos);
const __m128i* v2p = reinterpret_cast<const __m128i*>(b2.data() + pos);
for (size_t i = 0; i < kValsPerCacheLine; ++i) {
__m128i v1 = _mm_load_si128(v1p + i);
__m128i v2 = _mm_load_si128(v2p + i);
if (bitsPerElement == 16) {
results[i] = _mm_sub_epi16(v1, v2);
} else { // bitsPerElement == 32
results[i] = _mm_sub_epi32(v1, v2);
}
}
std::memcpy(out.data() + pos, results.data(), sizeof(results));
}
} else {
__m128i mask = _mm_set_epi64x(dataMask, dataMask);
__m128i paddingMask = _mm_set_epi64x(~dataMask, ~dataMask);
for (size_t pos = 0; pos < b1.size(); pos += kCacheLineSize) {
const __m128i* v1p = reinterpret_cast<const __m128i*>(b1.data() + pos);
const __m128i* v2p = reinterpret_cast<const __m128i*>(b2.data() + pos);
for (size_t i = 0; i < kValsPerCacheLine; ++i) {
__m128i v1 = _mm_load_si128(v1p + i);
__m128i v2 = _mm_load_si128(v2p + i);
__m128i negV2 = _mm_and_si128(_mm_sub_epi64(paddingMask, v2), mask);
results[i] = _mm_and_si128(_mm_add_epi64(v1, negV2), mask);
}
std::memcpy(out.data() + pos, results.data(), sizeof(results));
}
}
}
template <>
void MathOperation<MathEngine::SSE2>::clearPaddingBits(
uint64_t dataMask,
MutableByteRange buf) {
if (dataMask == 0xffffffffffffffffULL) {
return;
}
DCHECK_EQ(0, buf.size() % kCacheLineSize);
static_assert(
kCacheLineSize % sizeof(__m128i) == 0,
"kCacheLineSize must be a multiple of sizeof(__m128i)");
static constexpr size_t kValsPerCacheLine = kCacheLineSize / sizeof(__m128i);
static_assert(
kValsPerCacheLine > 0, "kCacheLineSize must be >= sizeof(__m128i)");
// gcc issues 'ignoring attributes on template argument' warning if
// __m128i is used below, so have to type explicitly
alignas(kCacheLineSize) std::array<
long long __attribute__((__vector_size__(sizeof(__m128i)))),
kValsPerCacheLine>
results;
__m128i mask = _mm_set_epi64x(dataMask, dataMask);
for (size_t pos = 0; pos < buf.size(); pos += kCacheLineSize) {
const __m128i* p = reinterpret_cast<const __m128i*>(buf.data() + pos);
for (size_t i = 0; i < kValsPerCacheLine; ++i) {
results[i] = _mm_and_si128(_mm_load_si128(p + i), mask);
}
std::memcpy(buf.data() + pos, results.data(), sizeof(results));
}
}
template <>
bool MathOperation<MathEngine::SSE2>::checkPaddingBits(
uint64_t dataMask,
ByteRange buf) {
if (dataMask == 0xffffffffffffffffULL) {
return true;
}
DCHECK_EQ(0, buf.size() % sizeof(__m128i));
__m128i paddingMask = _mm_set_epi64x(~dataMask, ~dataMask);
static const __m128i kZero = _mm_setzero_si128();
for (size_t pos = 0; pos < buf.size(); pos += sizeof(__m128i)) {
__m128i val =
_mm_load_si128(reinterpret_cast<const __m128i*>(buf.data() + pos));
__m128i paddingBits = _mm_and_si128(val, paddingMask);
if (sodium_memcmp(&paddingBits, &kZero, sizeof(kZero)) != 0) {
return false;
}
}
return true;
}
#else // !__SSE2__
// static
template <>
bool MathOperation<MathEngine::SSE2>::isImplemented() {
return false;
}
// static
template <>
void MathOperation<MathEngine::SSE2>::add(
uint64_t /* dataMask */,
size_t bitsPerElement,
ByteRange /* b1 */,
ByteRange /* b2 */,
MutableByteRange /* out */) {
if (bitsPerElement != 0) { // hack to defeat [[noreturn]] compiler warning
LOG(FATAL) << "Unimplemented function MathOperation<MathEngine::SSE2>::"
<< "add() called";
}
}
// static
template <>
void MathOperation<MathEngine::SSE2>::sub(
uint64_t /* dataMask */,
size_t bitsPerElement,
ByteRange /* b1 */,
ByteRange /* b2 */,
MutableByteRange /* out */) {
if (bitsPerElement != 0) { // hack to defeat [[noreturn]] compiler warning
LOG(FATAL) << "Unimplemented function MathOperation<MathEngine::SSE2>::"
<< "sub() called";
}
}
template <>
void MathOperation<MathEngine::SSE2>::clearPaddingBits(
uint64_t /* dataMask */,
MutableByteRange buf) {
if (buf.data() != nullptr) { // hack to defeat [[noreturn]] compiler warning
LOG(FATAL) << "Unimplemented function MathOperation<MathEngine::SSE2>::"
<< "clearPaddingBits() called";
}
return; // not reached
}
template <>
bool MathOperation<MathEngine::SSE2>::checkPaddingBits(
uint64_t /* dataMask */,
ByteRange buf) {
if (buf.data() != nullptr) { // hack to defeat [[noreturn]] compiler warning
LOG(FATAL) << "Unimplemented function MathOperation<MathEngine::SSE2>::"
<< "checkPaddingBits() called";
}
return false;
}
#endif // __SSE2__
template struct MathOperation<MathEngine::SSE2>;
} // namespace detail
} // namespace crypto
} // namespace folly
/*
* Copyright 2017-present Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Implementation of the MathOperation<MathEngine::SIMPLE> template
// specializations.
#include <folly/experimental/crypto/detail/LtHashInternal.h>
#include <folly/Memory.h>
#include <folly/lang/Bits.h>
namespace folly {
namespace crypto {
namespace detail {
// static
template <>
bool MathOperation<MathEngine::SIMPLE>::isImplemented() {
return true;
}
// static
template <>
void MathOperation<MathEngine::SIMPLE>::add(
uint64_t dataMask,
size_t bitsPerElement,
ByteRange b1,
ByteRange b2,
MutableByteRange out) {
DCHECK_EQ(b1.size(), b2.size());
DCHECK_EQ(b1.size(), out.size());
DCHECK_EQ(0, b1.size() % kCacheLineSize);
static_assert(
kCacheLineSize % sizeof(uint64_t) == 0,
"kCacheLineSize must be a multiple of sizeof(uint64_t)");
static constexpr size_t kValsPerCacheLine = kCacheLineSize / sizeof(uint64_t);
static_assert(
kValsPerCacheLine > 0, "kCacheLineSize must be >= sizeof(uint64_t)");
alignas(kCacheLineSize) std::array<uint64_t, kValsPerCacheLine> results;
if (bitsPerElement == 16 || bitsPerElement == 32) {
// When bitsPerElement is 16:
// There are no padding bits, 4x 16-bit values fit exactly into a uint64_t:
// uint64_t U = [ uint16_t W, uint16_t X, uint16_t Y, uint16_t Z ].
// We break them up into A and B groups, with each group containing
// alternating elements, such that A | B = the original number:
// uint64_t A = [ uint16_t W, 0, uint16_t Y, 0 ]
// uint64_t B = [ 0, uint16_t X, 0, uint16_t Z ]
// Then we add the A group and B group independently, and bitwise-OR
// the results.
// When bitsPerElement is 32:
// There are no padding bits, 2x 32-bit values fit exactly into a uint64_t.
// We independently add the high and low halves and then bitwise-OR them
// together.
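// Worked example (hypothetical values, bitsPerElement == 16, lanes
// [W, X, Y, Z] from most to least significant):
//   v1 = [0x0001, 0xffff, 0x0003, 0x0004]
//   v2 = [0x0001, 0x0001, 0x0003, 0x0004]
// A naive 64-bit add would carry out of lane X into lane W. With the
// split, (v1b + v2b) & kMaskB wraps lane X to 0x0000 without touching
// lane W, and (v1a + v2a) & kMaskA adds lanes W and Y independently,
// so the bitwise-OR yields [0x0002, 0x0000, 0x0006, 0x0008].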
const uint64_t kMaskA =
bitsPerElement == 16 ? 0xffff0000ffff0000ULL : 0xffffffff00000000ULL;
const uint64_t kMaskB = ~kMaskA;
for (size_t pos = 0; pos < b1.size(); pos += kCacheLineSize) {
const uint64_t* v1p = reinterpret_cast<const uint64_t*>(b1.data() + pos);
const uint64_t* v2p = reinterpret_cast<const uint64_t*>(b2.data() + pos);
for (size_t i = 0; i < kValsPerCacheLine; ++i) {
uint64_t v1 = Endian::little(*(v1p + i));
uint64_t v2 = Endian::little(*(v2p + i));
uint64_t v1a = v1 & kMaskA;
uint64_t v1b = v1 & kMaskB;
uint64_t v2a = v2 & kMaskA;
uint64_t v2b = v2 & kMaskB;
uint64_t v3a = (v1a + v2a) & kMaskA;
uint64_t v3b = (v1b + v2b) & kMaskB;
results[i] = Endian::little(v3a | v3b);
}
std::memcpy(out.data() + pos, results.data(), sizeof(results));
}
} else {
for (size_t pos = 0; pos < b1.size(); pos += kCacheLineSize) {
const uint64_t* v1p = reinterpret_cast<const uint64_t*>(b1.data() + pos);
const uint64_t* v2p = reinterpret_cast<const uint64_t*>(b2.data() + pos);
for (size_t i = 0; i < kValsPerCacheLine; ++i) {
uint64_t v1 = Endian::little(*(v1p + i));
uint64_t v2 = Endian::little(*(v2p + i));
results[i] = Endian::little((v1 + v2) & dataMask);
}
std::memcpy(out.data() + pos, results.data(), sizeof(results));
}
}
}
// static
template <>
void MathOperation<MathEngine::SIMPLE>::sub(
uint64_t dataMask,
size_t bitsPerElement,
ByteRange b1,
ByteRange b2,
MutableByteRange out) {
DCHECK_EQ(b1.size(), b2.size());
DCHECK_EQ(b1.size(), out.size());
DCHECK_EQ(0, b1.size() % kCacheLineSize);
static_assert(
kCacheLineSize % sizeof(uint64_t) == 0,
"kCacheLineSize must be a multiple of sizeof(uint64_t)");
static constexpr size_t kValsPerCacheLine = kCacheLineSize / sizeof(uint64_t);
static_assert(
kValsPerCacheLine > 0, "kCacheLineSize must be >= sizeof(uint64_t)");
alignas(kCacheLineSize) std::array<uint64_t, kValsPerCacheLine> results;
if (bitsPerElement == 16 || bitsPerElement == 32) {
// When bitsPerElement is 16:
// There are no padding bits, 4x 16-bit values fit exactly into a uint64_t:
// uint64_t U = [ uint16_t W, uint16_t X, uint16_t Y, uint16_t Z ].
// We break them up into A and B groups, with each group containing
// alternating elements, such that A | B = the original number:
// uint64_t A = [ uint16_t W, 0, uint16_t Y, 0 ]
// uint64_t B = [ 0, uint16_t X, 0, uint16_t Z ]
// Then we subtract the A group and B group independently (using the
// identity a - b == a + (M - b) to avoid underflow) and bitwise-OR
// the results.
// When bitsPerElement is 32:
// There are no padding bits, 2x 32-bit values fit exactly into a uint64_t.
// We independently subtract the high and low halves and then bitwise-OR
// them together.
const uint64_t kMaskA =
bitsPerElement == 16 ? 0xffff0000ffff0000ULL : 0xffffffff00000000ULL;
const uint64_t kMaskB = ~kMaskA;
for (size_t pos = 0; pos < b1.size(); pos += kCacheLineSize) {
const uint64_t* v1p = reinterpret_cast<const uint64_t*>(b1.data() + pos);
const uint64_t* v2p = reinterpret_cast<const uint64_t*>(b2.data() + pos);
for (size_t i = 0; i < kValsPerCacheLine; ++i) {
uint64_t v1 = Endian::little(*(v1p + i));
uint64_t v2 = Endian::little(*(v2p + i));
uint64_t v1a = v1 & kMaskA;
uint64_t v1b = v1 & kMaskB;
uint64_t v2a = v2 & kMaskA;
uint64_t v2b = v2 & kMaskB;
uint64_t v3a = (v1a + (kMaskB - v2a)) & kMaskA;
uint64_t v3b = (v1b + (kMaskA - v2b)) & kMaskB;
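// Note: (kMaskB - v2a) and (kMaskA - v2b) wrap modulo 2^64, leaving each
// subtracted lane holding (2^bitsPerElement - lane value) mod 2^bitsPerElement
// and the other group's lanes holding all-ones (minus any propagated
// borrow). Those all-ones lanes absorb carries out of the lanes below them
// during the add, so lanes never contaminate each other; the final mask
// discards the other group's bits.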
results[i] = Endian::little(v3a | v3b);
}
std::memcpy(out.data() + pos, results.data(), sizeof(results));
}
} else {
for (size_t pos = 0; pos < b1.size(); pos += kCacheLineSize) {
const uint64_t* v1p = reinterpret_cast<const uint64_t*>(b1.data() + pos);
const uint64_t* v2p = reinterpret_cast<const uint64_t*>(b2.data() + pos);
for (size_t i = 0; i < kValsPerCacheLine; ++i) {
uint64_t v1 = Endian::little(*(v1p + i));
uint64_t v2 = Endian::little(*(v2p + i));
results[i] =
Endian::little((v1 + ((~dataMask - v2) & dataMask)) & dataMask);
}
std::memcpy(out.data() + pos, results.data(), sizeof(results));
}
}
}
template <>
void MathOperation<MathEngine::SIMPLE>::clearPaddingBits(
uint64_t dataMask,
MutableByteRange buf) {
if (dataMask == 0xffffffffffffffffULL) {
return;
}
DCHECK_EQ(0, buf.size() % kCacheLineSize);
static_assert(
kCacheLineSize % sizeof(uint64_t) == 0,
"kCacheLineSize must be a multiple of sizeof(uint64_t)");
static constexpr size_t kValsPerCacheLine = kCacheLineSize / sizeof(uint64_t);
static_assert(
kValsPerCacheLine > 0, "kCacheLineSize must be >= sizeof(uint64_t)");
alignas(kCacheLineSize) std::array<uint64_t, kValsPerCacheLine> results;
for (size_t pos = 0; pos < buf.size(); pos += kCacheLineSize) {
const uint64_t* p = reinterpret_cast<const uint64_t*>(buf.data() + pos);
for (size_t i = 0; i < kValsPerCacheLine; ++i) {
results[i] = Endian::little(Endian::little(*(p + i)) & dataMask);
}
std::memcpy(buf.data() + pos, results.data(), sizeof(results));
}
}
template <>
bool MathOperation<MathEngine::SIMPLE>::checkPaddingBits(
uint64_t dataMask,
ByteRange buf) {
if (dataMask == 0xffffffffffffffffULL) {
return true;
}
DCHECK_EQ(0, buf.size() % sizeof(uint64_t));
for (size_t pos = 0; pos < buf.size(); pos += sizeof(uint64_t)) {
uint64_t val =
Endian::little(*reinterpret_cast<const uint64_t*>(buf.data() + pos));
if ((val & ~dataMask) != 0ULL) {
return false;
}
}
return true;
}
template struct MathOperation<MathEngine::SIMPLE>;
} // namespace detail
} // namespace crypto
} // namespace folly
/*
* Copyright 2017-present Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <folly/Benchmark.h>
#include <folly/Random.h>
#include <folly/experimental/crypto/LtHash.h>
#include <folly/init/Init.h>
#include <folly/io/IOBuf.h>
#include <glog/logging.h>
#include <sodium.h>
using namespace ::folly::crypto;
namespace {
constexpr size_t kObjectCount = 1000;
constexpr size_t kObjectSize = 150;
std::vector<std::unique_ptr<const folly::IOBuf>> kObjects;
} // namespace
std::unique_ptr<folly::IOBuf> makeRandomData(size_t length) {
auto data = std::make_unique<folly::IOBuf>(
folly::crypto::detail::allocateCacheAlignedIOBuf(length));
data->append(length);
randombytes_buf(data->writableData(), data->length());
return data;
}
template <std::size_t B, std::size_t N>
void runBenchmark(size_t n) {
LtHash<B, N> ltHash;
for (size_t i = 0; i < static_cast<size_t>(n); ++i) {
const folly::IOBuf& obj = *(kObjects[i % kObjects.size()]);
ltHash.addObject({obj.data(), obj.length()});
}
}
BENCHMARK(single_blake2b, n) {
std::array<unsigned char, crypto_generichash_blake2b_BYTES_MAX> result;
for (size_t i = 0; i < static_cast<size_t>(n); ++i) {
const folly::IOBuf& obj = *(kObjects[i % kObjects.size()]);
int res = crypto_generichash_blake2b(
result.data(), sizeof(result), obj.data(), obj.length(), nullptr, 0);
if (res != 0) {
throw std::runtime_error("blake2b hash failed");
}
}
}
BENCHMARK_RELATIVE(LtHash_element_count_1024_length_16, n) {
runBenchmark<16, 1024>(static_cast<size_t>(n));
}
BENCHMARK_RELATIVE(LtHash_element_count_1008_length_20, n) {
runBenchmark<20, 1008>(static_cast<size_t>(n));
}
BENCHMARK_RELATIVE(LtHash_element_count_1024_length_32, n) {
runBenchmark<32, 1024>(static_cast<size_t>(n));
}
BENCHMARK_RELATIVE(LtHash_element_count_2048_length_32, n) {
runBenchmark<32, 2048>(static_cast<size_t>(n));
}
BENCHMARK(calculateChecksumFor100KObjects_B20_N1008) {
LtHash<20, 1008> ltHash;
for (auto i = 0; i < 100000; ++i) {
const folly::IOBuf& obj = *(kObjects[i % kObjects.size()]);
ltHash.addObject({obj.data(), obj.length()});
}
}
BENCHMARK_RELATIVE(calculateChecksumFor100KObjects_B16_N1024) {
LtHash<16, 1024> ltHash;
for (auto i = 0; i < 100000; ++i) {
const folly::IOBuf& obj = *(kObjects[i % kObjects.size()]);
ltHash.addObject({obj.data(), obj.length()});
}
}
BENCHMARK_RELATIVE(calculateChecksumFor100KObjects_B32_N1024) {
LtHash<32, 1024> ltHash;
for (auto i = 0; i < 100000; ++i) {
const folly::IOBuf& obj = *(kObjects[i % kObjects.size()]);
ltHash.addObject({obj.data(), obj.length()});
}
}
BENCHMARK(subtractChecksumFor100KObjects_B20_N1008) {
LtHash<20, 1008> ltHash;
for (auto i = 0; i < 100000; ++i) {
const folly::IOBuf& obj = *(kObjects[i % kObjects.size()]);
ltHash.removeObject({obj.data(), obj.length()});
}
}
BENCHMARK_RELATIVE(subtractChecksumFor100KObjects_B16_N1024) {
LtHash<16, 1024> ltHash;
for (auto i = 0; i < 100000; ++i) {
const folly::IOBuf& obj = *(kObjects[i % kObjects.size()]);
ltHash.removeObject({obj.data(), obj.length()});
}
}
BENCHMARK_RELATIVE(subtractChecksumFor100KObjects_B32_N1024) {
LtHash<32, 1024> ltHash;
for (auto i = 0; i < 100000; ++i) {
const folly::IOBuf& obj = *(kObjects[i % kObjects.size()]);
ltHash.removeObject({obj.data(), obj.length()});
}
}
int main(int argc, char** argv) {
folly::init(&argc, &argv);
if (sodium_init() < 0) {
throw std::runtime_error("Failed to initialize libsodium");
}
// pre-generate objects with random length to hash
for (size_t i = 0; i < kObjectCount; i++) {
kObjects.push_back(makeRandomData(kObjectSize));
}
// Trigger the implementation selection of AUTO math operations before
// starting the benchmark, so log messages don't pollute the output table.
LtHash<20, 1008> ltHash;
ltHash.addObject(folly::range("hello world"));
ltHash.removeObject(folly::range("hello world"));
folly::runBenchmarks();
return 0;
}
This diff is collapsed.