Commit b7eba90d authored by Giuseppe Ottaviano, committed by Facebook GitHub Bot

Kill small skip optimization in EliasFanoReader

Summary:
These optimizations were introduced in D1793554, mostly to mitigate the high cost of `select64`. We have had an efficient implementation of `select64` for Haswell for a while, and Haswell is more than 5 years old, so we can drop these optimizations, which actually harm performance on modern microarchitectures.

While the change shows significant benefits on a real workload, the microbenchmarks for very small skips are degraded:
```
============================================================================  ====================
folly/experimental/test/EliasFanoCodingTest.cpp relative  time/iter  iters/s    time/iter  iters/s
============================================================================  ====================
Next                                                         2.58ns  388.22M       2.55ns  391.93M
Skip_ForwardQ128(1)                                          4.81ns  207.72M       4.31ns  231.79M
Skip_ForwardQ128(2)                                          5.96ns  167.75M       4.67ns  214.07M
Skip_ForwardQ128(4_pm_1)                                     7.40ns  135.16M       4.87ns  205.29M
Skip_ForwardQ128(16_pm_4)                                    8.20ns  121.97M       5.38ns  185.93M
Skip_ForwardQ128(64_pm_16)                                  12.04ns   83.06M       8.97ns  111.51M
Skip_ForwardQ128(256_pm_64)                                 16.84ns   59.39M      13.19ns   75.79M
Skip_ForwardQ128(1024_pm_256)                               17.67ns   56.61M      14.19ns   70.45M
Jump_ForwardQ128                                            25.37ns   39.41M      25.47ns   39.26M
----------------------------------------------------------------------------  --------------------
SkipTo_SkipQ128(1)                                           7.27ns  137.59M      10.77ns   92.87M
SkipTo_SkipQ128(2)                                          10.99ns   91.01M      14.39ns   69.51M
SkipTo_SkipQ128(4_pm_1)                                     13.53ns   73.93M      16.15ns   61.90M
SkipTo_SkipQ128(16_pm_4)                                    20.58ns   48.59M      17.72ns   56.45M
SkipTo_SkipQ128(64_pm_16)                                   32.08ns   31.18M      31.16ns   32.09M
SkipTo_SkipQ128(256_pm_64)                                  38.66ns   25.87M      38.22ns   26.16M
SkipTo_SkipQ128(1024_pm_256)                                42.32ns   23.63M      42.07ns   23.77M
JumpTo_SkipQ128                                             47.95ns   20.86M      47.85ns   20.90M
----------------------------------------------------------------------------  --------------------
Encode_10                                                  103.99ns    9.62M     104.89ns    9.53M
Encode                                                       7.60ms   131.53       7.55ms   132.46
----------------------------------------------------------------------------  --------------------
defaultNumLowerBits                                          3.59ns  278.69M       3.61ns  276.97M
slowDefaultNumLowerBits                                     10.88ns   91.90M      10.98ns   91.06M
============================================================================  ====================
```

It is important to note, however, that these microbenchmarks have very little variability in effective skip distance for small skips, which produces almost perfect branch prediction in the linear scan loop. In a real workload, the overhead of branch misprediction ends up being significant.

Reviewed By: philippv

Differential Revision: D22139846

fbshipit-source-id: 8df17a74aa57c92413709d9e11c60a77d5462422
parent f1dd8b18
...@@ -654,13 +654,7 @@ class EliasFanoReader { ...@@ -654,13 +654,7 @@ class EliasFanoReader {
} }
if (kUnchecked || LIKELY(position() + n < size_)) { if (kUnchecked || LIKELY(position() + n < size_)) {
if (LIKELY(n < kLinearScanThreshold)) {
for (SizeType i = 0; i < n; ++i) {
upper_.next();
}
} else {
upper_.skip(n); upper_.skip(n);
}
value_ = value_ =
readLowerPart(upper_.position()) | (upper_.value() << numLowerBits_); readLowerPart(upper_.position()) | (upper_.value() << numLowerBits_);
return true; return true;
...@@ -685,18 +679,8 @@ class EliasFanoReader { ...@@ -685,18 +679,8 @@ class EliasFanoReader {
return true; return true;
} }
ValueType upperValue = (value >> numLowerBits_); ValueType upperValue = value >> numLowerBits_;
ValueType upperSkip = upperValue - upper_.value();
// The average density of ones in upper bits is 1/2.
// LIKELY here seems to make things worse, even for small skips.
if (upperSkip < 2 * kLinearScanThreshold) {
do {
upper_.next();
} while (UNLIKELY(upper_.value() < upperValue));
} else {
upper_.skipToNext(upperValue); upper_.skipToNext(upperValue);
}
iterateTo(value); iterateTo(value);
return true; return true;
} }
...@@ -829,8 +813,6 @@ class EliasFanoReader { ...@@ -829,8 +813,6 @@ class EliasFanoReader {
} }
} }
constexpr static size_t kLinearScanThreshold = 8;
detail::UpperBitsReader<Encoder, Instructions, SizeType> upper_; detail::UpperBitsReader<Encoder, Instructions, SizeType> upper_;
const uint8_t* lower_; const uint8_t* lower_;
SizeType size_; SizeType size_;
......
...@@ -310,36 +310,35 @@ BENCHMARK(slowDefaultNumLowerBits, iters) { ...@@ -310,36 +310,35 @@ BENCHMARK(slowDefaultNumLowerBits, iters) {
} }
#if 0 #if 0
// Intel(R) Xeon(R) CPU E5-2678 v3 @ 2.50GHz (turbo on), // Intel(R) Xeon(R) CPU E5-2678 v3 @ 2.50GHz, Clang 8.0.
// Using GCC 5 with --bm_min_usec 100000. // $ eliasfano_test --benchmark --bm_min_usec 200000
V1008 12:29:33.646595 87744 Instructions.h:161] Will use folly::compression::instructions::Haswell
============================================================================ ============================================================================
folly/experimental/test/EliasFanoCodingTest.cpp relative time/iter iters/s folly/experimental/test/EliasFanoCodingTest.cpp relative time/iter iters/s
============================================================================ ============================================================================
Next 2.47ns 405.58M Next 2.58ns 388.22M
Skip_ForwardQ128(1) 6.68ns 149.67M Skip_ForwardQ128(1) 4.81ns 207.72M
Skip_ForwardQ128(2) 7.67ns 130.30M Skip_ForwardQ128(2) 5.96ns 167.75M
Skip_ForwardQ128(4_pm_1) 9.12ns 109.65M Skip_ForwardQ128(4_pm_1) 7.40ns 135.16M
Skip_ForwardQ128(16_pm_4) 9.95ns 100.53M Skip_ForwardQ128(16_pm_4) 8.20ns 121.97M
Skip_ForwardQ128(64_pm_16) 12.76ns 78.40M Skip_ForwardQ128(64_pm_16) 12.04ns 83.06M
Skip_ForwardQ128(256_pm_64) 18.09ns 55.27M Skip_ForwardQ128(256_pm_64) 16.84ns 59.39M
Skip_ForwardQ128(1024_pm_256) 19.13ns 52.28M Skip_ForwardQ128(1024_pm_256) 17.67ns 56.61M
Jump_ForwardQ128 20.27ns 49.33M Jump_ForwardQ128 25.37ns 39.41M
---------------------------------------------------------------------------- ----------------------------------------------------------------------------
SkipTo_SkipQ128(1) 8.35ns 119.76M SkipTo_SkipQ128(1) 7.27ns 137.59M
SkipTo_SkipQ128(2) 12.37ns 80.85M SkipTo_SkipQ128(2) 10.99ns 91.01M
SkipTo_SkipQ128(4_pm_1) 15.05ns 66.44M SkipTo_SkipQ128(4_pm_1) 13.53ns 73.93M
SkipTo_SkipQ128(16_pm_4) 22.90ns 43.66M SkipTo_SkipQ128(16_pm_4) 20.58ns 48.59M
SkipTo_SkipQ128(64_pm_16) 34.11ns 29.31M SkipTo_SkipQ128(64_pm_16) 32.08ns 31.18M
SkipTo_SkipQ128(256_pm_64) 38.68ns 25.85M SkipTo_SkipQ128(256_pm_64) 38.66ns 25.87M
SkipTo_SkipQ128(1024_pm_256) 41.75ns 23.95M SkipTo_SkipQ128(1024_pm_256) 42.32ns 23.63M
JumpTo_SkipQ128 44.79ns 22.33M JumpTo_SkipQ128 47.95ns 20.86M
---------------------------------------------------------------------------- ----------------------------------------------------------------------------
Encode_10 120.33ns 8.31M Encode_10 103.99ns 9.62M
Encode 7.61ms 131.32 Encode 7.60ms 131.53
---------------------------------------------------------------------------- ----------------------------------------------------------------------------
defaultNumLowerBits 3.69ns 270.74M defaultNumLowerBits 3.59ns 278.69M
slowDefaultNumLowerBits 10.90ns 91.73M slowDefaultNumLowerBits 10.88ns 91.90M
============================================================================ ============================================================================
#endif #endif
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment