Commit 74102328 authored by Dave Watson's avatar Dave Watson Committed by Facebook Github Bot

3-way crc32c

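Summary: The current folly implementation issues a single dependent chain of crc32c instructions. The crc32c instruction has a latency of 3 cycles, so interleaving three independent crc32c streams makes it nearly 3x faster (for data that is entirely in cache).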

Reviewed By: yfeldblum

Differential Revision: D5418228

fbshipit-source-id: d3a250e1b4fe1f0bc99b44c660df94cf233aebd6
parent 98df5db5
......@@ -33,40 +33,6 @@ uint32_t
crc32c_sw(const uint8_t* data, size_t nbytes, uint32_t startingChecksum);
#if FOLLY_SSE_PREREQ(4, 2)
// CRC-32C computed with the SSE 4.2 crc32 intrinsics.
//
// Folds the `nbytes` bytes starting at `data` into `startingChecksum` and
// returns the updated checksum. The input is consumed in three phases:
// single bytes up to the first 8-byte boundary, then whole 64-bit words,
// then whatever bytes are left over.
FOLLY_TARGET_ATTRIBUTE("sse4.2")
uint32_t crc32c_hw(const uint8_t *data, size_t nbytes,
                   uint32_t startingChecksum) {
  constexpr size_t kWordSize = sizeof(uint64_t);

  uint32_t crc = startingChecksum;
  size_t pos = 0;

  // Phase 1: byte-at-a-time until `data + pos` sits on an 8-byte boundary
  // (or the input runs out first), so the word loop below reads aligned.
  const size_t misalignment = static_cast<size_t>(
      reinterpret_cast<uintptr_t>(data) & (kWordSize - 1));
  if (misalignment != 0) {
    const size_t headEnd = std::min(nbytes, kWordSize - misalignment);
    for (; pos < headEnd; ++pos) {
      crc = static_cast<uint32_t>(_mm_crc32_u8(crc, data[pos]));
    }
  }

  // Phase 2: one 64-bit word per step while a full word still fits.
  for (; pos + kWordSize <= nbytes; pos += kWordSize) {
    const uint64_t* word = reinterpret_cast<const uint64_t*>(data + pos);
    crc = static_cast<uint32_t>(_mm_crc32_u64(crc, *word));
  }

  // Phase 3: the trailing bytes that did not fill a whole word.
  for (; pos < nbytes; ++pos) {
    crc = static_cast<uint32_t>(_mm_crc32_u8(crc, data[pos]));
  }

  return crc;
}
uint32_t
crc32_sw(const uint8_t* data, size_t nbytes, uint32_t startingChecksum);
......@@ -106,11 +72,6 @@ bool crc32_hw_supported() {
#else
// Fallback for builds without the hardware CRC-32C implementation: there is
// nothing to compute with, so every call fails.
//
// The arguments are ignored. Always throws std::runtime_error; never returns.
uint32_t crc32c_hw(const uint8_t *data, size_t nbytes,
                   uint32_t startingChecksum) {
  // Name this function (crc32c_hw) in the message, not the unrelated CRC-32
  // routine, so the error identifies the call that actually failed.
  throw std::runtime_error("crc32c_hw is not implemented on this platform");
}
uint32_t crc32_hw(const uint8_t *data, size_t nbytes,
uint32_t startingChecksum) {
throw std::runtime_error("crc32_hw is not implemented on this platform");
......
......@@ -440,6 +440,7 @@ GroupVarintTables.cpp: build/generate_varint_tables.py
CLEANFILES += GroupVarintTables.cpp
libfollybasesse42_la_SOURCES = \
detail/Crc32cDetail.cpp \
detail/ChecksumDetail.cpp \
detail/RangeSse42.cpp
......
This diff is collapsed.
......@@ -121,6 +121,19 @@ TEST(Checksum, crc32c_hardware) {
}
}
// Cross-checks the hardware CRC-32C against the software implementation:
// for every length from 0 through 999, both are run over the leading bytes of
// `buffer` with a starting checksum of 0 and must produce the same value.
// When the CPU lacks hardware support the test only logs a warning.
TEST(Checksum, crc32c_hardware_eq) {
  if (!folly::detail::crc32c_hw_supported()) {
    LOG(WARNING) << "skipping hardware-accelerated CRC-32C tests"
                 << " (not supported on this CPU)";
    return;
  }

  for (int len = 0; len < 1000; ++len) {
    const auto swSum = folly::detail::crc32c_sw(buffer, len, 0);
    const auto hwSum = folly::detail::crc32c_hw(buffer, len, 0);
    EXPECT_EQ(swSum, hwSum);
  }
}
TEST(Checksum, crc32c_continuation_hardware) {
if (folly::detail::crc32c_hw_supported()) {
testCRC32CContinuation(folly::detail::crc32c_hw);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment