3-way crc32c

Summary: The current folly implementation computes CRC-32C with a single dependent chain of crc32c instructions. The crc32c instruction has a latency of 3 cycles, so pipelining three independent crc32c computations makes it nearly 3x faster (for data that is already in cache).
Reviewed By: yfeldblum
Differential Revision: D5418228
fbshipit-source-id: d3a250e1b4fe1f0bc99b44c660df94cf233aebd6

3-way crc32c
Summary: The current folly implementation computes CRC-32C with a single dependent chain of crc32c instructions. The crc32c instruction has a latency of 3 cycles, so pipelining three independent crc32c computations makes it nearly 3x faster (for data that is already in cache).
Reviewed By: yfeldblum
Differential Revision: D5418228
fbshipit-source-id: d3a250e1b4fe1f0bc99b44c660df94cf233aebd6
74102328 · Dave Watson · Facebook Github Bot · 98df5db5 · 74102328 · 74102328
Commit 74102328 authored Jul 19, 2017 by Dave Watson Committed by Facebook Github Bot Jul 19, 2017
4 changed files
--- a/folly/Checksum.cpp
+++ b/folly/Checksum.cpp
@@ -33,40 +33,6 @@ uint32_t
 crc32c_sw(const uint8_t* data, size_t nbytes, uint32_t startingChecksum);
 #if FOLLY_SSE_PREREQ(4, 2)
-// Fast SIMD implementation of CRC-32C for x86 with SSE 4.2
-FOLLY_TARGET_ATTRIBUTE("sse4.2")
-uint32_t crc32c_hw(const uint8_t *data, size_t nbytes,
-    uint32_t startingChecksum) {
-  uint32_t sum = startingChecksum;
-  size_t offset = 0;
-  // Process bytes one at a time until we reach an 8-byte boundary and can
-  // start doing aligned 64-bit reads.
-  static uintptr_t ALIGN_MASK = sizeof(uint64_t) - 1;
-  size_t mask = (size_t)((uintptr_t)data & ALIGN_MASK);
-  if (mask != 0) {
-    size_t limit = std::min(nbytes, sizeof(uint64_t) - mask);
-    while (offset < limit) {
-      sum = (uint32_t)_mm_crc32_u8(sum, data[offset]);
-      offset++;
-    }
-  }
-  // Process 8 bytes at a time until we have fewer than 8 bytes left.
-  while (offset + sizeof(uint64_t) <= nbytes) {
-    const uint64_t* src = (const uint64_t*)(data + offset);
-    sum = uint32_t(_mm_crc32_u64(sum, *src));
-    offset += sizeof(uint64_t);
-  }
-  // Process any bytes remaining after the last aligned 8-byte block.
-  while (offset < nbytes) {
-    sum = (uint32_t)_mm_crc32_u8(sum, data[offset]);
-    offset++;
-  }
-  return sum;
-}
 uint32_t
 crc32_sw(const uint8_t* data, size_t nbytes, uint32_t startingChecksum);
@@ -106,11 +72,6 @@ bool crc32_hw_supported() {
 #else
-uint32_t crc32c_hw(const uint8_t *data, size_t nbytes,
-    uint32_t startingChecksum) {
-  throw std::runtime_error("crc32_hw is not implemented on this platform");
-}
 uint32_t crc32_hw(const uint8_t *data, size_t nbytes,
    uint32_t startingChecksum) {
  throw std::runtime_error("crc32_hw is not implemented on this platform");

--- a/folly/Makefile.am
+++ b/folly/Makefile.am
@@ -440,6 +440,7 @@ GroupVarintTables.cpp: build/generate_varint_tables.py
 CLEANFILES += GroupVarintTables.cpp
 libfollybasesse42_la_SOURCES = \
+	detail/Crc32cDetail.cpp \
 	detail/ChecksumDetail.cpp \
 	detail/RangeSse42.cpp

--- a/folly/detail/Crc32cDetail.cpp
+++ b/folly/detail/Crc32cDetail.cpp
--- a/folly/test/ChecksumTest.cpp
+++ b/folly/test/ChecksumTest.cpp
@@ -121,6 +121,19 @@ TEST(Checksum, crc32c_hardware) {
  }
 }
+TEST(Checksum, crc32c_hardware_eq) {
+  if (folly::detail::crc32c_hw_supported()) {
+    for (int i = 0; i < 1000; i++) {
+      auto sw = folly::detail::crc32c_sw(buffer, i, 0);
+      auto hw = folly::detail::crc32c_hw(buffer, i, 0);
+      EXPECT_EQ(sw, hw);
+    }
+  } else {
+    LOG(WARNING) << "skipping hardware-accelerated CRC-32C tests"
+                 << " (not supported on this CPU)";
+  }
+}
 TEST(Checksum, crc32c_continuation_hardware) {
  if (folly::detail::crc32c_hw_supported()) {
    testCRC32CContinuation(folly::detail::crc32c_hw);