Commit 5a1dec3a authored by Yedidya Feldblum, committed by Facebook Github Bot

Generate group varint tables at compile time in C++

Summary: [Folly] Generate group varint tables at compile time in C++, rather than as a separate custom build step in Python.

Reviewed By: simpkins

Differential Revision: D6832563

fbshipit-source-id: fa218c512cbac383f153e7b6fd4df7f181bcb0de
parent 82d6df58
......@@ -30,5 +30,4 @@ folly/m4/ltsugar.m4
folly/m4/ltversion.m4
folly/m4/lt~obsolete.m4
folly/generate_fingerprint_tables
folly/GroupVarintTables.cpp
folly/FingerprintTables.cpp
......@@ -57,17 +57,6 @@ endif()
set(FOLLY_DIR "${CMAKE_CURRENT_SOURCE_DIR}/folly")
# Build rule that produces GroupVarintTables.cpp in the build tree by running
# the Python table generator (generate_varint_tables.py).
add_custom_command(
OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/folly/build/GroupVarintTables.cpp"
# First make sure the directory the generator writes into exists.
COMMAND
${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/folly/build
# Then run the generator, pointing --install_dir at that directory.
COMMAND
${PYTHON_EXECUTABLE} "${FOLLY_DIR}/build/generate_varint_tables.py"
--install_dir ${CMAKE_CURRENT_BINARY_DIR}/folly/build
# The generator script itself is the only declared input of this rule.
DEPENDS ${FOLLY_DIR}/build/generate_varint_tables.py
COMMENT "Generating the group varint tables..." VERBATIM
)
include(folly-deps) # Find the required packages
if(CMAKE_SYSTEM_NAME STREQUAL "Windows")
......@@ -189,7 +178,6 @@ endif()
add_library(folly_base OBJECT
${files} ${hfiles}
${CMAKE_CURRENT_BINARY_DIR}/folly/folly-config.h
${CMAKE_CURRENT_BINARY_DIR}/folly/build/GroupVarintTables.cpp
)
if (BUILD_SHARED_LIBS)
set_property(TARGET folly_base PROPERTY POSITION_INDEPENDENT_CODE ON)
......@@ -201,7 +189,6 @@ target_include_directories(folly_base PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
source_group("folly" FILES ${CMAKE_CURRENT_BINARY_DIR}/folly/folly-config.h)
source_group("folly\\build" FILES
${CMAKE_CURRENT_BINARY_DIR}/folly/build/FingerprintTables.cpp
${CMAKE_CURRENT_BINARY_DIR}/folly/build/GroupVarintTables.cpp
)
if(CMAKE_SYSTEM_NAME STREQUAL "Windows")
......
......@@ -16,6 +16,8 @@
#include <folly/GroupVarint.h>
#include <folly/container/Array.h>
#if HAVE_GROUP_VARINT
namespace folly {
......@@ -29,5 +31,114 @@ const uint64_t GroupVarint64::kMask[] = {
0xffffffffffffffffULL
};
namespace detail {
/// Helpers shared by the compile-time group-varint table generators.
///
/// A key byte (`index`) packs four 2-bit fields; field `j` stores the byte
/// length minus one of the j'th integer in the group.
struct group_varint_table_base_make_item {
  /// Byte length (1 to 4) of the j'th integer encoded under key `index`.
  constexpr std::size_t get_d(std::size_t index, std::size_t j) const {
    const std::size_t field = (index >> (j * 2)) & 3u;
    return field + 1u;
  }

  /// Combined byte length of the integers that come before position `j`,
  /// i.e. the offset of the j'th integer within the data bytes. Only the four
  /// integers of a group are ever counted, so `j == 4` yields the length of
  /// all data bytes.
  constexpr std::size_t get_offset(std::size_t index, std::size_t j) const {
    std::size_t total = 0;
    for (std::size_t i = 0; i < 4 && i < j; ++i) {
      total += get_d(index, i);
    }
    return total;
  }
};
/// Make-item functor for the encoded-length table: maps a key byte to the
/// total number of bytes the group occupies, key byte included.
struct group_varint_table_length_make_item : group_varint_table_base_make_item {
  constexpr std::uint8_t operator()(std::size_t index) const {
    // Offset "past" the fourth integer equals the size of all data bytes.
    const std::size_t data_bytes = get_offset(index, 4);
    // One extra byte for the key itself.
    return static_cast<std::uint8_t>(data_bytes + 1u);
  }
};
// Reference: http://www.stepanovpapers.com/CIKM_2011.pdf
//
// From 17 encoded bytes, we may use between 5 and 17 bytes to encode 4
// integers. The first byte is a key that indicates how many bytes each of
// the 4 integers takes:
//
// bit 0..1: length-1 of first integer
// bit 2..3: length-1 of second integer
// bit 4..5: length-1 of third integer
// bit 6..7: length-1 of fourth integer
//
// The value of the first byte is used as the index in a table which returns
// a mask value for the SSSE3 PSHUFB instruction, which takes an XMM register
// (16 bytes) and shuffles bytes from it into a destination XMM register
// (optionally setting some of them to 0)
//
// For example, if the key has value 4, that means that the first integer
// uses 1 byte, the second uses 2 bytes, the third and fourth use 1 byte each,
// so we set the mask value so that
//
// r[0] = a[0]
// r[1] = 0
// r[2] = 0
// r[3] = 0
//
// r[4] = a[1]
// r[5] = a[2]
// r[6] = 0
// r[7] = 0
//
// r[8] = a[3]
// r[9] = 0
// r[10] = 0
// r[11] = 0
//
// r[12] = a[4]
// r[13] = 0
// r[14] = 0
// r[15] = 0
/// Make-item functor for the PSHUFB mask table: maps a key byte to the four
/// 32-bit mask words (one per decoded integer) of its shuffle mask.
struct group_varint_table_sse_mask_make_item
    : group_varint_table_base_make_item {
  /// Mask byte for output byte `k` of an integer that is `d` bytes long and
  /// starts at source byte `offset`, already shifted into position `k` of the
  /// 32-bit word.
  constexpr auto partial_item(std::size_t d, std::size_t offset, std::size_t k)
      const {
    // Bytes below the integer's length are pulled from the source; the rest
    // get selector 0xff, which sets the corresponding result byte to 0.
    const std::size_t selector = k < d ? offset + k : std::size_t(0xff);
    return std::uint32_t(selector << (8 * k));
  }

  /// 32-bit mask word for one integer: the four per-byte selectors combined.
  constexpr auto item_impl(std::size_t d, std::size_t offset) const {
    std::uint32_t word = 0;
    for (std::size_t k = 0; k < 4; ++k) {
      word |= partial_item(d, offset, k);
    }
    return word;
  }

  /// Mask word for the j'th integer encoded under key `index`.
  constexpr auto item(std::size_t index, std::size_t j) const {
    const std::size_t length = get_d(index, j);
    const std::size_t start = get_offset(index, j);
    return item_impl(length, start);
  }

  /// Full 16-byte mask for key `index`, as four 32-bit words.
  constexpr auto operator()(std::size_t index) const {
    using mask_words = std::array<std::uint32_t, 4>;
    return mask_words{{
        item(index, 0),
        item(index, 1),
        item(index, 2),
        item(index, 3),
    }};
  }
};
#if FOLLY_SSE >= 3
// PSHUFB mask table indexed by key byte, built from
// group_varint_table_sse_mask_make_item (presumably invoked once per index
// 0..255 by make_array_with; confirm against folly/container/Array.h).
// The 16-byte alignment matters: the decoder reads entries with the aligned
// load _mm_load_si128.
alignas(16) constexpr decltype(groupVarintSSEMasks) groupVarintSSEMasks =
make_array_with<256>(group_varint_table_sse_mask_make_item{});
#endif
// Table indexed by key byte giving the total encoded size of the group in
// bytes (key byte included), built from group_varint_table_length_make_item.
constexpr decltype(groupVarintLengths) groupVarintLengths =
make_array_with<256>(group_varint_table_length_make_item{});
} // namespace detail
} // namespace folly
#endif
......@@ -39,14 +39,14 @@
#include <nmmintrin.h>
namespace folly {
namespace detail {
alignas(16) extern const uint64_t groupVarintSSEMasks[];
extern const std::array<std::array<std::uint32_t, 4>, 256> groupVarintSSEMasks;
} // namespace detail
} // namespace folly
#endif
namespace folly {
namespace detail {
extern const uint8_t groupVarintLengths[];
extern const std::array<std::uint8_t, 256> groupVarintLengths;
} // namespace detail
} // namespace folly
......@@ -208,7 +208,7 @@ class GroupVarint<uint32_t> : public detail::GroupVarintBase<uint32_t> {
uint8_t key = uint8_t(p[0]);
__m128i val = _mm_loadu_si128((const __m128i*)(p+1));
__m128i mask =
_mm_load_si128((const __m128i*)&detail::groupVarintSSEMasks[key * 2]);
_mm_load_si128((const __m128i*)detail::groupVarintSSEMasks[key].data());
__m128i r = _mm_shuffle_epi8(val, mask);
_mm_storeu_si128((__m128i*)dest, r);
return p + detail::groupVarintLengths[key];
......@@ -223,7 +223,7 @@ class GroupVarint<uint32_t> : public detail::GroupVarintBase<uint32_t> {
uint8_t key = uint8_t(p[0]);
__m128i val = _mm_loadu_si128((const __m128i*)(p+1));
__m128i mask =
_mm_load_si128((const __m128i*)&detail::groupVarintSSEMasks[key * 2]);
_mm_load_si128((const __m128i*)detail::groupVarintSSEMasks[key].data());
__m128i r = _mm_shuffle_epi8(val, mask);
// Extracting 32 bits at a time out of an XMM register is a SSE4 feature
......
......@@ -483,10 +483,6 @@ nobase_follyinclude_HEADERS = \
Utility.h \
Varint.h
GroupVarintTables.cpp: build/generate_varint_tables.py
$(PYTHON) build/generate_varint_tables.py
CLEANFILES += GroupVarintTables.cpp
libfollybasesse42_la_SOURCES = \
detail/RangeSse42.cpp \
hash/detail/ChecksumDetail.cpp \
......@@ -541,7 +537,6 @@ libfolly_la_SOURCES = \
experimental/hazptr/hazptr.cpp \
experimental/hazptr/memory_resource.cpp \
GroupVarint.cpp \
GroupVarintTables.cpp \
hash/Checksum.cpp \
hash/SpookyHashV1.cpp \
hash/SpookyHashV2.cpp \
......
#!/usr/bin/env python
#
# Generate tables for GroupVarint32
# Copyright 2011 Facebook
#
# @author Tudor Bosman (tudorb@fb.com)
#
# Reference: http://www.stepanovpapers.com/CIKM_2011.pdf
#
# From 17 encoded bytes, we may use between 5 and 17 bytes to encode 4
# integers. The first byte is a key that indicates how many bytes each of
# the 4 integers takes:
#
# bit 0..1: length-1 of first integer
# bit 2..3: length-1 of second integer
# bit 4..5: length-1 of third integer
# bit 6..7: length-1 of fourth integer
#
# The value of the first byte is used as the index in a table which returns
# a mask value for the SSSE3 PSHUFB instruction, which takes an XMM register
# (16 bytes) and shuffles bytes from it into a destination XMM register
# (optionally setting some of them to 0)
#
# For example, if the key has value 4, that means that the first integer
# uses 1 byte, the second uses 2 bytes, the third and fourth use 1 byte each,
# so we set the mask value so that
#
# r[0] = a[0]
# r[1] = 0
# r[2] = 0
# r[3] = 0
#
# r[4] = a[1]
# r[5] = a[2]
# r[6] = 0
# r[7] = 0
#
# r[8] = a[3]
# r[9] = 0
# r[10] = 0
# r[11] = 0
#
# r[12] = a[4]
# r[13] = 0
# r[14] = 0
# r[15] = 0
import os
from optparse import OptionParser
OUTPUT_FILE = "GroupVarintTables.cpp"
def _sse_mask_words(key):
    """Return the four 32-bit PSHUFB mask words for one key byte.

    Word j describes the j'th decoded integer: its low d bytes select
    consecutive source bytes, the remaining bytes are 0xff.
    """
    words = []
    offset = 0
    for j in range(4):
        # the j'th integer uses d bytes
        d = 1 + ((key >> (2 * j)) & 3)
        word = 0
        for k in range(4):
            # 0xff: set corresponding byte in result to 0
            selector = offset + k if k < d else 0xff
            word |= selector << (8 * k)
        words.append(word)
        # consume the d source bytes of this integer
        offset += d
    return words


def _encoded_length(key):
    """Return the total encoded length for one key byte, including the key."""
    return 1 + sum(1 + ((key >> (2 * j)) & 3) for j in range(4))


def generate(f):
    """Write the GroupVarint lookup tables as C++ source to `f`.

    `f` is any object with a `write(str)` method. Two tables are emitted,
    each with one entry per possible key byte (0..255): the SSE shuffle
    masks (two 64-bit literals per key) and the total encoded lengths.
    """
    f.write("""
#include <folly/Portability.h>
#include <stdint.h>
namespace folly {
namespace detail {
#if (FOLLY_X64 || defined(__i386__)) && (FOLLY_SSE >= 2)
alignas(16) extern const uint64_t groupVarintSSEMasks[512] = {
""")
    # SSE masks: words 0/1 and 2/3 are paired into 64-bit literals, with the
    # higher-numbered word in the upper half.
    for key in range(256):
        f.write(" 0x{1:08x}{0:08x}ULL, "
                "0x{3:08x}{2:08x}ULL,\n".format(*_sse_mask_words(key)))
    f.write("};\n"
            "#endif /*#if (FOLLY_X64 || defined(__i386__)) && (FOLLY_SSE >= 2)*/\n"
            "\n"
            "extern const uint8_t groupVarintLengths[] = {\n")
    # Total encoded lengths, including key byte
    for key in range(256):
        f.write(" {0},\n".format(_encoded_length(key)))
    f.write("""
};
} // namespace detail
} // namespace folly
""")
def main():
    """Parse command-line options and write the tables file.

    The output is OUTPUT_FILE inside the directory given by --install_dir
    (default: the current directory).
    """
    parser = OptionParser()
    parser.add_option("--install_dir", dest="install_dir", default=".",
                      help="write output to DIR", metavar="DIR")
    # Accepted on the command line but not read by this script.
    parser.add_option("--fbcode_dir")
    options, _ = parser.parse_args()
    # The context manager closes the file even if generate() raises.
    with open(os.path.join(options.install_dir, OUTPUT_FILE), "w") as f:
        generate(f)
if __name__ == "__main__":
main()
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment