Commit b7eba90d, authored by Giuseppe Ottaviano, committed by Facebook GitHub Bot

Kill small skip optimization in EliasFanoReader

Summary:
These optimizations were introduced in D1793554, mostly to mitigate the high cost of `select64`. We've had an efficient implementation of `select64` for Haswell for a while, and Haswell is more than 5 years old, so we can drop the optimization, which actually harms performance on modern microarchitectures.

While this change shows significant benefits on a real workload, the microbenchmarks for very small skips are degraded:
```
============================================================================  ====================
folly/experimental/test/EliasFanoCodingTest.cpp relative  time/iter  iters/s    time/iter  iters/s
============================================================================  ====================
Next                                                         2.58ns  388.22M       2.55ns  391.93M
Skip_ForwardQ128(1)                                          4.81ns  207.72M       4.31ns  231.79M
Skip_ForwardQ128(2)                                          5.96ns  167.75M       4.67ns  214.07M
Skip_ForwardQ128(4_pm_1)                                     7.40ns  135.16M       4.87ns  205.29M
Skip_ForwardQ128(16_pm_4)                                    8.20ns  121.97M       5.38ns  185.93M
Skip_ForwardQ128(64_pm_16)                                  12.04ns   83.06M       8.97ns  111.51M
Skip_ForwardQ128(256_pm_64)                                 16.84ns   59.39M      13.19ns   75.79M
Skip_ForwardQ128(1024_pm_256)                               17.67ns   56.61M      14.19ns   70.45M
Jump_ForwardQ128                                            25.37ns   39.41M      25.47ns   39.26M
----------------------------------------------------------------------------  --------------------
SkipTo_SkipQ128(1)                                           7.27ns  137.59M      10.77ns   92.87M
SkipTo_SkipQ128(2)                                          10.99ns   91.01M      14.39ns   69.51M
SkipTo_SkipQ128(4_pm_1)                                     13.53ns   73.93M      16.15ns   61.90M
SkipTo_SkipQ128(16_pm_4)                                    20.58ns   48.59M      17.72ns   56.45M
SkipTo_SkipQ128(64_pm_16)                                   32.08ns   31.18M      31.16ns   32.09M
SkipTo_SkipQ128(256_pm_64)                                  38.66ns   25.87M      38.22ns   26.16M
SkipTo_SkipQ128(1024_pm_256)                                42.32ns   23.63M      42.07ns   23.77M
JumpTo_SkipQ128                                             47.95ns   20.86M      47.85ns   20.90M
----------------------------------------------------------------------------  --------------------
Encode_10                                                  103.99ns    9.62M     104.89ns    9.53M
Encode                                                       7.60ms   131.53       7.55ms   132.46
----------------------------------------------------------------------------  --------------------
defaultNumLowerBits                                          3.59ns  278.69M       3.61ns  276.97M
slowDefaultNumLowerBits                                     10.88ns   91.90M      10.98ns   91.06M
============================================================================  ====================
```

It is important to note, however, that these micro-benchmarks have very little variability in terms of effective skip distance for small skips, producing almost perfect branch prediction in the linear scan loop. In a real workload, the overhead of branch misprediction ends up being significant.

Reviewed By: philippv

Differential Revision: D22139846

fbshipit-source-id: 8df17a74aa57c92413709d9e11c60a77d5462422
parent f1dd8b18
......@@ -654,13 +654,7 @@ class EliasFanoReader {
}
if (kUnchecked || LIKELY(position() + n < size_)) {
if (LIKELY(n < kLinearScanThreshold)) {
for (SizeType i = 0; i < n; ++i) {
upper_.next();
}
} else {
upper_.skip(n);
}
value_ =
readLowerPart(upper_.position()) | (upper_.value() << numLowerBits_);
return true;
......@@ -685,18 +679,8 @@ class EliasFanoReader {
return true;
}
ValueType upperValue = (value >> numLowerBits_);
ValueType upperSkip = upperValue - upper_.value();
// The average density of ones in upper bits is 1/2.
// LIKELY here seems to make things worse, even for small skips.
if (upperSkip < 2 * kLinearScanThreshold) {
do {
upper_.next();
} while (UNLIKELY(upper_.value() < upperValue));
} else {
ValueType upperValue = value >> numLowerBits_;
upper_.skipToNext(upperValue);
}
iterateTo(value);
return true;
}
......@@ -829,8 +813,6 @@ class EliasFanoReader {
}
}
constexpr static size_t kLinearScanThreshold = 8;
detail::UpperBitsReader<Encoder, Instructions, SizeType> upper_;
const uint8_t* lower_;
SizeType size_;
......
......@@ -310,36 +310,35 @@ BENCHMARK(slowDefaultNumLowerBits, iters) {
}
#if 0
// Intel(R) Xeon(R) CPU E5-2678 v3 @ 2.50GHz (turbo on),
// Using GCC 5 with --bm_min_usec 100000.
V1008 12:29:33.646595 87744 Instructions.h:161] Will use folly::compression::instructions::Haswell
// Intel(R) Xeon(R) CPU E5-2678 v3 @ 2.50GHz, Clang 8.0.
// $ eliasfano_test --benchmark --bm_min_usec 200000
============================================================================
folly/experimental/test/EliasFanoCodingTest.cpp relative time/iter iters/s
============================================================================
Next 2.47ns 405.58M
Skip_ForwardQ128(1) 6.68ns 149.67M
Skip_ForwardQ128(2) 7.67ns 130.30M
Skip_ForwardQ128(4_pm_1) 9.12ns 109.65M
Skip_ForwardQ128(16_pm_4) 9.95ns 100.53M
Skip_ForwardQ128(64_pm_16) 12.76ns 78.40M
Skip_ForwardQ128(256_pm_64) 18.09ns 55.27M
Skip_ForwardQ128(1024_pm_256) 19.13ns 52.28M
Jump_ForwardQ128 20.27ns 49.33M
Next 2.58ns 388.22M
Skip_ForwardQ128(1) 4.81ns 207.72M
Skip_ForwardQ128(2) 5.96ns 167.75M
Skip_ForwardQ128(4_pm_1) 7.40ns 135.16M
Skip_ForwardQ128(16_pm_4) 8.20ns 121.97M
Skip_ForwardQ128(64_pm_16) 12.04ns 83.06M
Skip_ForwardQ128(256_pm_64) 16.84ns 59.39M
Skip_ForwardQ128(1024_pm_256) 17.67ns 56.61M
Jump_ForwardQ128 25.37ns 39.41M
----------------------------------------------------------------------------
SkipTo_SkipQ128(1) 8.35ns 119.76M
SkipTo_SkipQ128(2) 12.37ns 80.85M
SkipTo_SkipQ128(4_pm_1) 15.05ns 66.44M
SkipTo_SkipQ128(16_pm_4) 22.90ns 43.66M
SkipTo_SkipQ128(64_pm_16) 34.11ns 29.31M
SkipTo_SkipQ128(256_pm_64) 38.68ns 25.85M
SkipTo_SkipQ128(1024_pm_256) 41.75ns 23.95M
JumpTo_SkipQ128 44.79ns 22.33M
SkipTo_SkipQ128(1) 7.27ns 137.59M
SkipTo_SkipQ128(2) 10.99ns 91.01M
SkipTo_SkipQ128(4_pm_1) 13.53ns 73.93M
SkipTo_SkipQ128(16_pm_4) 20.58ns 48.59M
SkipTo_SkipQ128(64_pm_16) 32.08ns 31.18M
SkipTo_SkipQ128(256_pm_64) 38.66ns 25.87M
SkipTo_SkipQ128(1024_pm_256) 42.32ns 23.63M
JumpTo_SkipQ128 47.95ns 20.86M
----------------------------------------------------------------------------
Encode_10 120.33ns 8.31M
Encode 7.61ms 131.32
Encode_10 103.99ns 9.62M
Encode 7.60ms 131.53
----------------------------------------------------------------------------
defaultNumLowerBits 3.69ns 270.74M
slowDefaultNumLowerBits 10.90ns 91.73M
defaultNumLowerBits 3.59ns 278.69M
slowDefaultNumLowerBits 10.88ns 91.90M
============================================================================
#endif
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment