crc32_combine

Summary: Adds a crc32_combine function to folly (and crc32c hardware) Reviewed By: yfeldblum Differential Revision: D7687302 fbshipit-source-id: 86393c54776fa63ecfb34e9a589256e92505eeae

crc32_combine
Summary: Adds a crc32_combine function to folly (and crc32c hardware) Reviewed By: yfeldblum Differential Revision: D7687302 fbshipit-source-id: 86393c54776fa63ecfb34e9a589256e92505eeae
c670567d · Dave Watson · Facebook Github Bot · a54efbc6 · c670567d · c670567d
Commit c670567d authored Aug 15, 2018 by Dave Watson Committed by Facebook Github Bot Aug 15, 2018
6 changed files
--- a/folly/Makefile.am
+++ b/folly/Makefile.am
@@ -529,7 +529,8 @@ nobase_follyinclude_HEADERS = \
 libfollybasesse42_la_SOURCES = \
 	detail/RangeSse42.cpp \
 	hash/detail/ChecksumDetail.cpp \
-	hash/detail/Crc32cDetail.cpp
+	hash/detail/Crc32cDetail.cpp \
+	hash/detail/Crc32CombineDetail.cpp
 libfollybase_la_SOURCES = \
 	Conv.cpp \

--- a/folly/hash/Checksum.cpp
+++ b/folly/hash/Checksum.cpp
@@ -22,6 +22,7 @@
 #include <stdexcept>
 #if FOLLY_SSE_PREREQ(4, 2)
+#include <emmintrin.h>
 #include <nmmintrin.h>
 #endif
@@ -147,4 +148,34 @@ crc32_type(const uint8_t* data, size_t nbytes, uint32_t startingChecksum) {
  return ~crc32(data, nbytes, startingChecksum);
 }
+/*
+ * Combines two crc32 checksums into the checksum of the concatenated data.
+ * See Checksum.h for the full description and examples.
+ *
+ * crc1    - checksum of the leading part of the buffer.
+ * crc2    - checksum of the trailing part of the buffer.
+ * crc2len - length in bytes of the trailing part.
+ *
+ * Returns the combined checksum.
+ */
+uint32_t crc32_combine(uint32_t crc1, uint32_t crc2, size_t crc2len) {
+  // Append the leftover 0-3 zero bytes (crc2len mod 4) in the normal way
+  uint8_t data[4] = {0, 0, 0, 0};
+  auto len = crc2len & 3;
+  if (len) {
+    crc1 = crc32(data, len, crc1);
+  }
+  // The remainder has been consumed above, so hand the detail routines only
+  // the remaining multiple of four, exactly as crc32c_combine does.
+  if (detail::crc32_hw_supported()) {
+    return detail::crc32_combine_hw(crc1, crc2, crc2len - len);
+  } else {
+    return detail::crc32_combine_sw(crc1, crc2, crc2len - len);
+  }
+}
+/*
+ * crc32c flavour of crc32_combine: combines two crc32c checksums into the
+ * checksum of the concatenated data. See Checksum.h for details.
+ *
+ * crc1    - checksum of the leading part of the buffer.
+ * crc2    - checksum of the trailing part of the buffer.
+ * crc2len - length in bytes of the trailing part.
+ *
+ * Returns the combined checksum.
+ */
+uint32_t crc32c_combine(uint32_t crc1, uint32_t crc2, size_t crc2len) {
+  // Bytes left over once crc2len is rounded down to a multiple of four.
+  const auto tail = crc2len & 3;
+  if (tail != 0) {
+    // Push that many zero bytes through the ordinary checksum routine.
+    const uint8_t zeroes[4] = {0, 0, 0, 0};
+    crc1 = crc32c(zeroes, tail, crc1);
+  }
+  const auto aligned = crc2len - tail;
+  // NOTE(review): this gates on crc32_hw_supported() rather than
+  // crc32c_hw_supported(); presumably because the hardware multiply relies
+  // on carry-less multiplication as well as the crc32 instruction - confirm.
+  return detail::crc32_hw_supported()
+      ? detail::crc32c_combine_hw(crc1, crc2, aligned)
+      : detail::crc32c_combine_sw(crc1, crc2, aligned);
+}
 } // namespace folly
--- a/folly/hash/Checksum.h
+++ b/folly/hash/Checksum.h
@@ -57,4 +57,37 @@ crc32(const uint8_t* data, size_t nbytes, uint32_t startingChecksum = ~0U);
 uint32_t
 crc32_type(const uint8_t* data, size_t nbytes, uint32_t startingChecksum = ~0U);
+/**
+ * Given two checksums, combine them into one checksum.
+ *
+ * Example:
+ *                     len1            len2
+ * Given a buffer [  checksum 1  |  checksum 2  ]
+ * such that the first part of the buffer has crc checksum1 and length len1,
+ * and the remainder of the buffer has crc checksum2 and length len2,
+ * a total checksum over the whole buffer can be made by:
+ *
+ * crc32_combine(checksum1, checksum2, len2); // len1 not needed.
+ *
+ * Note that this is equivalent to:
+ *
+ * crc32(buffer2, len2, crc32(buffer1, len1));
+ *
+ * However, this allows calculating the checksums in parallel
+ * or calculating checksum 2 before checksum 1.
+ *
+ * Additionally, this is also equivalent, but much slower:
+ * crc2 = crc32(buffer2, len2, 0);
+ * crc1 = crc32(buffer1, len1, 0);
+ * combined = crc2 ^ crc32(buffer_of_all_zeros, len2, crc1);
+ *
+ * NOTE(review): the example above computes checksum 2 with a starting
+ * checksum of 0 rather than the default ~0U; presumably that is required
+ * for the combination to be valid - confirm.
+ *
+ * crc32[c]_combine is roughly 10x faster than either of the other
+ * two examples above.
+ *
+ * @param crc1    checksum of the first part of the buffer.
+ * @param crc2    checksum of the remainder of the buffer.
+ * @param crc2len length in bytes of the remainder (len2 above).
+ * @return the checksum over the whole buffer.
+ */
+uint32_t crc32_combine(uint32_t crc1, uint32_t crc2, size_t crc2len);
+/**
+ * crc32c_combine is the same as crc32_combine, but uses the crc32c
+ * polynomial, i.e. it combines checksums produced by crc32c() rather than
+ * crc32(). Parameters and return value have the same meaning.
+ */
+uint32_t crc32c_combine(uint32_t crc1, uint32_t crc2, size_t crc2len);
 } // namespace folly
--- a/folly/hash/detail/ChecksumDetail.h
+++ b/folly/hash/detail/ChecksumDetail.h
@@ -97,5 +97,15 @@ bool crc32_hw_supported();
 */
 uint32_t
 crc32_sw(const uint8_t* data, size_t nbytes, uint32_t startingChecksum = ~0U);
+/* See Checksum.h for details.
+ *
+ * Software (_sw) and hardware (_hw) back ends of crc32_combine and
+ * crc32c_combine.
+ *
+ * crc2len is treated as a whole number of 4-byte words: the implementation
+ * shifts it right by two, so the low two bits are ignored. Any multiple of
+ * 4 works, not only powers of two. The public wrappers in Checksum.cpp
+ * append the leftover 0-3 zero bytes to crc1 themselves before calling in.
+ *
+ * The _hw variants only compute a real product when built with
+ * FOLLY_SSE_PREREQ(4, 2); otherwise their multiply is a stub returning 0.
+ * The public wrappers select them only when crc32_hw_supported() is true.
+ */
+uint32_t crc32_combine_sw(uint32_t crc1, uint32_t crc2, size_t crc2len);
+uint32_t crc32_combine_hw(uint32_t crc1, uint32_t crc2, size_t crc2len);
+uint32_t crc32c_combine_sw(uint32_t crc1, uint32_t crc2, size_t crc2len);
+uint32_t crc32c_combine_hw(uint32_t crc1, uint32_t crc2, size_t crc2len);
 } // namespace detail
 } // namespace folly
--- a/folly/hash/detail/Crc32CombineDetail.cpp
+++ b/folly/hash/detail/Crc32CombineDetail.cpp
+/*
+ * Copyright 2018-present Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <folly/hash/detail/ChecksumDetail.h>
+#include <folly/Bits.h>
+namespace folly {
+// Textbook Galois-field multiplication of a by b modulo m, with the one
+// twist that a, b, m and the product are all held in bit-reflected form
+// (so b is consumed from its top bit and a is advanced by shifting right).
+//
+//   a, b - the two factors.
+//   m    - the reduction polynomial.
+//
+// Returns the reduced 32-bit product.
+//
+// https://en.wikipedia.org/wiki/Finite_field_arithmetic
+static uint32_t gf_multiply_sw(uint32_t a, uint32_t b, uint32_t m) {
+  uint32_t product = 0;
+  int remaining = 32;
+  while (remaining-- > 0) {
+    // All-ones when the current top bit of b is set, zero otherwise.
+    const uint32_t take = uint32_t{0} - (b >> 31);
+    product ^= take & a;
+    // All-ones when the bit about to be shifted out of a is set, in which
+    // case the polynomial has to be folded back in.
+    const uint32_t fold = uint32_t{0} - (a & 1u);
+    a = (a >> 1) ^ (fold & m);
+    b <<= 1;
+  }
+  return product;
+}
+#if FOLLY_SSE_PREREQ(4, 2)
+// Hardware counterpart of gf_multiply_sw for the crc32c polynomial: returns
+// the product of crc1 and crc2 reduced to 32 bits. The third parameter (the
+// polynomial slot of the shared multiply signature) is unnamed and unused.
+//
+// Reduction taken from
+// https://www.nicst.de/crc.pdf
+//
+// This is an intrinsics-based implementation of listing 3.
+static uint32_t gf_multiply_crc32c_hw(uint64_t crc1, uint64_t crc2, uint32_t) {
+  // Place each operand in the low 64-bit lane of a vector (high lane zero).
+  const auto crc1_xmm = _mm_set_epi64x(0, crc1);
+  const auto crc2_xmm = _mm_set_epi64x(0, crc2);
+  const auto count = _mm_set_epi64x(0, 1);
+  // Carry-less multiply of the two low lanes.
+  const auto res0 = _mm_clmulepi64_si128(crc2_xmm, crc1_xmm, 0x00);
+  // Shift left by one bit; presumably realigns the bit-reflected product
+  // (NOTE(review): confirm against the referenced listing).
+  const auto res1 = _mm_sll_epi64(res0, count);
+  // Use hardware crc32c to do reduction from 64 -> 32 bits
+  const auto res2 = _mm_cvtsi128_si64(res1);
+  const auto res3 = _mm_crc32_u32(0, res2);
+  // Upper 32 bits of the shifted 64-bit product.
+  const auto res4 = _mm_extract_epi32(res1, 1);
+  return res3 ^ res4;
+}
+// Hardware counterpart of gf_multiply_sw for the crc32 polynomial: returns
+// the product of crc1 and crc2 reduced to 32 bits. Unlike the crc32c
+// version, the final reduction is done with two further carry-less
+// multiplies instead of the crc32 instruction. The third parameter (the
+// polynomial slot of the shared multiply signature) is unnamed and unused.
+static uint32_t gf_multiply_crc32_hw(uint64_t crc1, uint64_t crc2, uint32_t) {
+  // Place each operand in the low 64-bit lane of a vector (high lane zero).
+  const auto crc1_xmm = _mm_set_epi64x(0, crc1);
+  const auto crc2_xmm = _mm_set_epi64x(0, crc2);
+  const auto count = _mm_set_epi64x(0, 1);
+  // Carry-less multiply of the two low lanes, then shift left by one bit.
+  const auto res0 = _mm_clmulepi64_si128(crc2_xmm, crc1_xmm, 0x00);
+  const auto res1 = _mm_sll_epi64(res0, count);
+  // Do barrett reduction of 64 -> 32 bits
+  const auto mask32 = _mm_set_epi32(0, 0, 0, 0xFFFFFFFF);
+  // Low 64 bits hold 0x1F7011641, high 64 bits hold 0x1DB710641.
+  // NOTE(review): presumably the Barrett constant and the polynomial for
+  // crc32 - confirm against the reference the reduction was taken from.
+  const auto barrett_reduction_constants =
+      _mm_set_epi32(0x1, 0xDB710641, 0x1, 0xF7011641);
+  // First step: low 32 bits of the product times the low-lane constant.
+  const auto res2 = _mm_clmulepi64_si128(
+      _mm_and_si128(res1, mask32), barrett_reduction_constants, 0x00);
+  // Second step: low 32 bits of that result times the high-lane constant.
+  const auto res3 = _mm_clmulepi64_si128(
+      _mm_and_si128(res2, mask32), barrett_reduction_constants, 0x10);
+  // Xor with the shifted product and return bits 32..63 of the outcome.
+  return _mm_cvtsi128_si32(_mm_srli_si128(_mm_xor_si128(res3, res1), 4));
+}
+#else
+// Placeholders used when FOLLY_SSE_PREREQ(4, 2) is false, so that the _hw
+// combine functions below still compile and link. They ignore all of their
+// arguments and always return 0, i.e. they do not compute a product.
+static uint32_t gf_multiply_crc32c_hw(uint64_t, uint64_t, uint32_t) {
+  return 0;
+}
+static uint32_t gf_multiply_crc32_hw(uint64_t, uint64_t, uint32_t) {
+  return 0;
+}
+#endif
+/*
+ * Pre-calculated powers tables for crc32c and crc32.
+ *
+ * Each table has 62 entries. It starts with the polynomial itself and every
+ * following entry is the previous entry multiplied by itself.
+ * crc32_append_zeroes applies entry i when bit i of (len >> 2) is set.
+ *
+ * Calculated using (gf_multiply is presumably gf_multiply_sw above):
+ *
+ * printf("Powers for 0x%08x\n", polynomial);
+ * auto power = polynomial;
+ * for (int i = 0; i < 62; i++) {
+ *   printf("%i 0x%08x\n", i, power);
+ *   power = gf_multiply(power, power, polynomial);
+ * }
+ * printf("-------------\n");
+ */
+// Table for the crc32c polynomial 0x82f63b78.
+static const uint32_t crc32c_powers[] = {
+    0x82f63b78, 0x6ea2d55c, 0x18b8ea18, 0x510ac59a, 0xb82be955, 0xb8fdb1e7,
+    0x88e56f72, 0x74c360a4, 0xe4172b16, 0x0d65762a, 0x35d73a62, 0x28461564,
+    0xbf455269, 0xe2ea32dc, 0xfe7740e6, 0xf946610b, 0x3c204f8f, 0x538586e3,
+    0x59726915, 0x734d5309, 0xbc1ac763, 0x7d0722cc, 0xd289cabe, 0xe94ca9bc,
+    0x05b74f3f, 0xa51e1f42, 0x40000000, 0x20000000, 0x08000000, 0x00800000,
+    0x00008000, 0x82f63b78, 0x6ea2d55c, 0x18b8ea18, 0x510ac59a, 0xb82be955,
+    0xb8fdb1e7, 0x88e56f72, 0x74c360a4, 0xe4172b16, 0x0d65762a, 0x35d73a62,
+    0x28461564, 0xbf455269, 0xe2ea32dc, 0xfe7740e6, 0xf946610b, 0x3c204f8f,
+    0x538586e3, 0x59726915, 0x734d5309, 0xbc1ac763, 0x7d0722cc, 0xd289cabe,
+    0xe94ca9bc, 0x05b74f3f, 0xa51e1f42, 0x40000000, 0x20000000, 0x08000000,
+    0x00800000, 0x00008000,
+};
+// Table for the crc32 polynomial 0xedb88320: 62 entries, the first being
+// the polynomial itself. crc32_append_zeroes applies entry i when bit i of
+// (len >> 2) is set.
+static const uint32_t crc32_powers[] = {
+    0xedb88320, 0xb1e6b092, 0xa06a2517, 0xed627dae, 0x88d14467, 0xd7bbfe6a,
+    0xec447f11, 0x8e7ea170, 0x6427800e, 0x4d47bae0, 0x09fe548f, 0x83852d0f,
+    0x30362f1a, 0x7b5a9cc3, 0x31fec169, 0x9fec022a, 0x6c8dedc4, 0x15d6874d,
+    0x5fde7a4e, 0xbad90e37, 0x2e4e5eef, 0x4eaba214, 0xa8a472c0, 0x429a969e,
+    0x148d302a, 0xc40ba6d0, 0xc4e22c3c, 0x40000000, 0x20000000, 0x08000000,
+    0x00800000, 0x00008000, 0xedb88320, 0xb1e6b092, 0xa06a2517, 0xed627dae,
+    0x88d14467, 0xd7bbfe6a, 0xec447f11, 0x8e7ea170, 0x6427800e, 0x4d47bae0,
+    0x09fe548f, 0x83852d0f, 0x30362f1a, 0x7b5a9cc3, 0x31fec169, 0x9fec022a,
+    0x6c8dedc4, 0x15d6874d, 0x5fde7a4e, 0xbad90e37, 0x2e4e5eef, 0x4eaba214,
+    0xa8a472c0, 0x429a969e, 0x148d302a, 0xc40ba6d0, 0xc4e22c3c, 0x40000000,
+    0x20000000, 0x08000000,
+};
+// Advances crc as though len zero bytes had been appended to the
+// checksummed data, by multiplying with precomputed table entries instead
+// of processing the zero bytes one at a time.
+//
+//   mult       - callable invoked as mult(crc, power, polynomial); its
+//                result becomes the new crc.
+//   crc        - checksum to advance.
+//   len        - number of zero bytes; the low two bits are discarded.
+//   polynomial - forwarded untouched to mult.
+//   powers     - table whose entry i is applied when bit i of (len >> 2)
+//                is set.
+//
+// Returns the advanced checksum (crc itself when len < 4).
+template <typename F>
+static uint32_t crc32_append_zeroes(
+    F mult,
+    uint32_t crc,
+    size_t len,
+    uint32_t polynomial,
+    uint32_t const* powers) {
+  // Work in units of four bytes. Each pass handles one set bit, and the
+  // loop step then moves past that bit and its table entry.
+  for (size_t words = len >> 2; words != 0; words >>= 1, ++powers) {
+    // Jump straight over any run of clear low bits.
+    const auto skipped = findFirstSet(words) - 1;
+    words >>= skipped;
+    powers += skipped;
+    crc = mult(crc, *powers, polynomial);
+  }
+  return crc;
+}
+namespace detail {
+// Back ends for crc32_combine / crc32c_combine (see Checksum.h). Each one
+// advances crc1 past crc2len zero bytes with crc32_append_zeroes and xors
+// the result with crc2. They differ only in the multiply routine, the
+// polynomial and the powers table they pass along.
+
+// crc32 polynomial, software multiply.
+uint32_t crc32_combine_sw(uint32_t crc1, uint32_t crc2, size_t crc2len) {
+  const uint32_t advanced = crc32_append_zeroes(
+      gf_multiply_sw, crc1, crc2len, 0xEDB88320, crc32_powers);
+  return advanced ^ crc2;
+}
+
+// crc32 polynomial, hardware multiply.
+uint32_t crc32_combine_hw(uint32_t crc1, uint32_t crc2, size_t crc2len) {
+  const uint32_t advanced = crc32_append_zeroes(
+      gf_multiply_crc32_hw, crc1, crc2len, 0xEDB88320, crc32_powers);
+  return advanced ^ crc2;
+}
+
+// crc32c polynomial, software multiply.
+uint32_t crc32c_combine_sw(uint32_t crc1, uint32_t crc2, size_t crc2len) {
+  const uint32_t advanced = crc32_append_zeroes(
+      gf_multiply_sw, crc1, crc2len, 0x82F63B78, crc32c_powers);
+  return advanced ^ crc2;
+}
+
+// crc32c polynomial, hardware multiply.
+uint32_t crc32c_combine_hw(uint32_t crc1, uint32_t crc2, size_t crc2len) {
+  const uint32_t advanced = crc32_append_zeroes(
+      gf_multiply_crc32c_hw, crc1, crc2len, 0x82F63B78, crc32c_powers);
+  return advanced ^ crc2;
+}
+} // namespace detail
+} // namespace folly
--- a/folly/hash/test/ChecksumTest.cpp
+++ b/folly/hash/test/ChecksumTest.cpp
@@ -19,6 +19,7 @@
 #include <boost/crc.hpp>
 #include <folly/Benchmark.h>
+#include <folly/Random.h>
 #include <folly/hash/Hash.h>
 #include <folly/hash/detail/ChecksumDetail.h>
 #include <folly/portability/GFlags.h>
@@ -199,6 +200,28 @@ TEST(Checksum, crc32_type) {
  testMatchesBoost32Type();
 }
+// For several prefix lengths of the file's test buffer: split the prefix at
+// a random point, checksum both parts independently with a starting
+// checksum of 0, and check that crc32_combine reproduces the checksum of
+// the whole prefix.
+TEST(Checksum, crc32_combine) {
+  for (size_t totlen = 1024; totlen < BUFFER_SIZE; totlen += BUFFER_SIZE / 8) {
+    auto mid = folly::Random::rand64(0, totlen);
+    auto crc1 = folly::crc32(&buffer[0], mid, 0);
+    auto crc2 = folly::crc32(&buffer[mid], totlen - mid, 0);
+    auto crcfull = folly::crc32(&buffer[0], totlen, 0);
+    auto combined = folly::crc32_combine(crc1, crc2, totlen - mid);
+    // mid is random, so report it: otherwise a failure cannot be reproduced.
+    EXPECT_EQ(combined, crcfull) << "totlen=" << totlen << " mid=" << mid;
+  }
+}
+// For several prefix lengths of the file's test buffer: split the prefix at
+// a random point, checksum both parts independently with a starting
+// checksum of 0, and check that crc32c_combine reproduces the checksum of
+// the whole prefix.
+TEST(Checksum, crc32c_combine) {
+  for (size_t totlen = 1024; totlen < BUFFER_SIZE; totlen += BUFFER_SIZE / 8) {
+    auto mid = folly::Random::rand64(0, totlen);
+    auto crc1 = folly::crc32c(&buffer[0], mid, 0);
+    auto crc2 = folly::crc32c(&buffer[mid], totlen - mid, 0);
+    auto crcfull = folly::crc32c(&buffer[0], totlen, 0);
+    auto combined = folly::crc32c_combine(crc1, crc2, totlen - mid);
+    // mid is random, so report it: otherwise a failure cannot be reproduced.
+    EXPECT_EQ(combined, crcfull) << "totlen=" << totlen << " mid=" << mid;
+  }
+}
 void benchmarkHardwareCRC32C(unsigned long iters, size_t blockSize) {
  if (folly::detail::crc32c_hw_supported()) {
    uint32_t checksum;
@@ -241,6 +264,43 @@ void benchmarkSoftwareCRC32(unsigned long iters, size_t blockSize) {
  }
 }
+// Benchmark body: calls folly::crc32_combine iters times on two fixed
+// checksums, with blockSize passed as the length of the second block.
+void benchmarkCombineHardwareCrc32(unsigned long iters, size_t blockSize) {
+  // Arbitrarily chosen checksums
+  uint32_t first = 0xEDB88320;
+  uint32_t second = 0x82F63B78;
+  uint32_t combined;
+  for (unsigned long remaining = iters; remaining != 0; --remaining) {
+    combined = folly::crc32_combine(first, second, blockSize);
+    // Keep the call from being optimized out.
+    folly::doNotOptimizeAway(combined);
+  }
+}
+// Benchmark body for the "linear" way of combining: runs folly::crc32c over
+// blockSize zero bytes starting from checksum1 and xors in checksum2,
+// iters times.
+void benchmarkCombineSoftwareLinear(unsigned long iters, size_t blockSize) {
+  // blockSize value-initialized (zero) bytes. The vector must actually have
+  // that size: reserve() alone leaves size() at 0, so touching
+  // data()[0..blockSize) would be out of bounds.
+  std::vector<uint8_t> zbuffer(blockSize);
+  // Arbitrarily chosen checksums
+  uint32_t checksum1 = 0xEDB88320;
+  uint32_t checksum2 = 0x82F63B78;
+  uint32_t result;
+  for (unsigned long i = 0; i < iters; i++) {
+    result = folly::crc32c(zbuffer.data(), blockSize, checksum1);
+    result ^= checksum2;
+    folly::doNotOptimizeAway(result);
+  }
+}
+// Benchmark body: calls folly::crc32c_combine iters times on two fixed
+// checksums, with blockSize passed as the length of the second block.
+void benchmarkCombineHardwareCrc32c(unsigned long iters, size_t blockSize) {
+  // Arbitrarily chosen checksums
+  uint32_t first = 0xEDB88320;
+  uint32_t second = 0x82F63B78;
+  uint32_t combined;
+  for (unsigned long remaining = iters; remaining != 0; --remaining) {
+    combined = folly::crc32c_combine(first, second, blockSize);
+    // Keep the call from being optimized out.
+    folly::doNotOptimizeAway(combined);
+  }
+}
 // This test fits easily in the L1 cache on modern server processors,
 // and thus it mainly measures the speed of the checksum computation.
 BENCHMARK(crc32c_hardware_1KB_block, iters) {
@@ -297,6 +357,20 @@ BENCHMARK(crc32_software_512KB_block, iters) {
  benchmarkSoftwareCRC32(iters, 512 * 1024);
 }
+// Combine benchmarks, all with a 512KB second block.
+BENCHMARK_DRAW_LINE();
+// Baseline: checksum 512KB of zero bytes and xor
+// (benchmarkCombineSoftwareLinear).
+BENCHMARK(crc32_combine_linear_512KB_block, iters) {
+  benchmarkCombineSoftwareLinear(iters, 512 * 1024);
+}
+// folly::crc32_combine.
+BENCHMARK(crc32_combine_512KB_block, iters) {
+  benchmarkCombineHardwareCrc32(iters, 512 * 1024);
+}
+// folly::crc32c_combine.
+BENCHMARK(crc32c_combine_512KB_block, iters) {
+  benchmarkCombineHardwareCrc32c(iters, 512 * 1024);
+}
 int main(int argc, char** argv) {
  testing::InitGoogleTest(&argc, argv);
  gflags::ParseCommandLineFlags(&argc, &argv, true);