Commit b2818624 authored by Dmitry Pleshkov's avatar Dmitry Pleshkov Committed by facebook-github-bot-4

Haswell-specific implementation of Select64 routine.

Summary: k-th bit selection could be efficiently implemented via new BMI2 instruction set.

Reviewed By: ot, philippv

Differential Revision: D2843311

fb-gh-sync-id: 4c0cf52176a03422aef276ce5f677080f67f5fdf
parent f00612cf
...@@ -19,6 +19,8 @@ ...@@ -19,6 +19,8 @@
#include <glog/logging.h> #include <glog/logging.h>
#include <folly/experimental/Instructions.h>
namespace folly { namespace folly {
namespace detail { namespace detail {
...@@ -61,5 +63,19 @@ inline uint64_t select64(uint64_t x, uint64_t k) { ...@@ -61,5 +63,19 @@ inline uint64_t select64(uint64_t x, uint64_t k) {
return place + detail::kSelectInByte[((x >> place) & 0xFF) | (byteRank << 8)]; return place + detail::kSelectInByte[((x >> place) & 0xFF) | (byteRank << 8)];
} }
template <>
inline uint64_t select64<compression::instructions::Haswell>(uint64_t x,
uint64_t k) {
uint64_t result = uint64_t(1) << k;
asm("pdep %1, %0, %0\n\t"
"tzcnt %0, %0"
: "+r"(result)
: "r"(x));
return result;
} }
} // namespace folly
#endif #endif
...@@ -69,6 +69,19 @@ TEST_F(EliasFanoCodingTest, SkipForwardPointers) { ...@@ -69,6 +69,19 @@ TEST_F(EliasFanoCodingTest, SkipForwardPointers) {
doTestAll<128, 128>(); doTestAll<128, 128>();
} }
TEST_F(EliasFanoCodingTest, Select64) {
typedef instructions::EF_TEST_ARCH instr;
constexpr uint64_t kPrime = uint64_t(-59);
for (uint64_t x = kPrime, i = 0; i < (1 << 20); x *= kPrime, i += 1) {
size_t w = instr::popcount(x);
for (size_t k = 0; k < w; ++k) {
auto pos = folly::select64<instr>(x, k);
CHECK_EQ((x >> pos) & 1, 1);
CHECK_EQ(instr::popcount(x & ((uint64_t(1) << pos) - 1)), k);
}
}
}
namespace bm { namespace bm {
constexpr size_t k1M = 1000000; constexpr size_t k1M = 1000000;
...@@ -158,46 +171,33 @@ BENCHMARK(Encode) { ...@@ -158,46 +171,33 @@ BENCHMARK(Encode) {
list.free(); list.free();
} }
BENCHMARK_DRAW_LINE();
BENCHMARK(Select64, iters) {
typedef instructions::EF_TEST_ARCH instr;
constexpr uint64_t kPrime = uint64_t(-59);
for (uint64_t x = kPrime, i = 0; i < iters; x *= kPrime, i += 1) {
size_t w = instr::popcount(x);
folly::doNotOptimizeAway(folly::select64<instr>(x, w - 1));
}
}
#if 0 #if 0
Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz (turbo off), Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz (turbo off),
using instructions::Haswell and GCC 4.9 with --bm_min_usec 100000. using instructions::Haswell and GCC 4.9 with --bm_min_usec 100000.
============================================================================ ============================================================================
folly/experimental/test/EliasFanoCodingTest.cpp relative time/iter iters/s folly/experimental/test/EliasFanoCodingTest.cpp relative time/iter iters/s
============================================================================ ============================================================================
Next 2.52ns 397.28M Next 2.59ns 386.60M
Skip_ForwardQ128(1) 3.92ns 255.28M Skip_ForwardQ128(1) 4.03ns 248.16M
Skip_ForwardQ128(2) 5.08ns 197.04M Skip_ForwardQ128(2) 5.28ns 189.39M
Skip_ForwardQ128(4_pm_1) 7.04ns 142.02M Skip_ForwardQ128(4_pm_1) 7.48ns 133.75M
Skip_ForwardQ128(16_pm_4) 19.68ns 50.82M Skip_ForwardQ128(16_pm_4) 20.28ns 49.32M
Skip_ForwardQ128(64_pm_16) 27.58ns 36.26M Skip_ForwardQ128(64_pm_16) 28.19ns 35.47M
Skip_ForwardQ128(256_pm_64) 32.49ns 30.78M Skip_ForwardQ128(256_pm_64) 31.99ns 31.26M
Skip_ForwardQ128(1024_pm_256) 33.39ns 29.95M Skip_ForwardQ128(1024_pm_256) 32.51ns 30.76M
Jump_ForwardQ128 34.05ns 29.37M Jump_ForwardQ128 33.77ns 29.61M
----------------------------------------------------------------------------
SkipTo_SkipQ128(1) 4.42ns 226.49M
SkipTo_SkipQ128(2) 8.58ns 116.55M
SkipTo_SkipQ128(4_pm_1) 11.43ns 87.50M
SkipTo_SkipQ128(16_pm_4) 31.19ns 32.06M
SkipTo_SkipQ128(64_pm_16) 43.88ns 22.79M
SkipTo_SkipQ128(256_pm_64) 49.08ns 20.37M
SkipTo_SkipQ128(1024_pm_256) 52.24ns 19.14M
JumpTo_SkipQ128 54.61ns 18.31M
---------------------------------------------------------------------------- ----------------------------------------------------------------------------
Encode_10 117.24ns 8.53M SkipTo_SkipQ128(1) 4.34ns 230.66M
Encode 5.64ms 177.15 SkipTo_SkipQ128(2) 8.90ns 112.38M
SkipTo_SkipQ128(4_pm_1) 12.12ns 82.49M
SkipTo_SkipQ128(16_pm_4) 32.52ns 30.75M
SkipTo_SkipQ128(64_pm_16) 44.82ns 22.31M
SkipTo_SkipQ128(256_pm_64) 49.52ns 20.19M
SkipTo_SkipQ128(1024_pm_256) 52.88ns 18.91M
JumpTo_SkipQ128 54.65ns 18.30M
---------------------------------------------------------------------------- ----------------------------------------------------------------------------
Select64 8.04ns 124.35M Encode_10 98.70ns 10.13M
Encode 5.48ms 182.33
============================================================================ ============================================================================
#endif #endif
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment