Commit 7aff1428, authored by Marc Celani, committed by Facebook Github Bot

Avoid some unnecessary floating point operations in TDigest

Summary: Further improves TDigest performance by diverging slightly from the algorithm in the paper, in a manner that does not affect the results at all.

Reviewed By: anakryiko

Differential Revision: D7687476

fbshipit-source-id: f71884b0fb21c9e78418643b82b099b01e96e4c9
parent 3593da7e
...@@ -82,7 +82,6 @@ TDigest TDigest::merge(Range<const double*> sortedValues) const { ...@@ -82,7 +82,6 @@ TDigest TDigest::merge(Range<const double*> sortedValues) const {
std::vector<Centroid> compressed; std::vector<Centroid> compressed;
compressed.reserve(maxSize_); compressed.reserve(maxSize_);
double q_0_times_count = 0.0;
double k_limit = 1; double k_limit = 1;
double q_limit_times_count = double q_limit_times_count =
detail::k_to_q(k_limit++, maxSize_) * result.count_; detail::k_to_q(k_limit++, maxSize_) * result.count_;
...@@ -98,7 +97,11 @@ TDigest TDigest::merge(Range<const double*> sortedValues) const { ...@@ -98,7 +97,11 @@ TDigest TDigest::merge(Range<const double*> sortedValues) const {
cur = Centroid(*it_sortedValues++, 1.0); cur = Centroid(*it_sortedValues++, 1.0);
} }
result.sum_ += cur.mean() * cur.weight(); double weightSoFar = cur.weight();
// Keep track of sums along the way to reduce expensive floating point operations
double sumsToMerge = 0;
double weightsToMerge = 0;
while (it_centroids != centroids_.end() || while (it_centroids != centroids_.end() ||
it_sortedValues != sortedValues.end()) { it_sortedValues != sortedValues.end()) {
...@@ -112,19 +115,22 @@ TDigest TDigest::merge(Range<const double*> sortedValues) const { ...@@ -112,19 +115,22 @@ TDigest TDigest::merge(Range<const double*> sortedValues) const {
next = Centroid(*it_sortedValues++, 1.0); next = Centroid(*it_sortedValues++, 1.0);
} }
result.sum_ += next.mean() * next.weight(); double nextSum = next.mean() * next.weight();
weightSoFar += next.weight();
double q_times_count = q_0_times_count + cur.weight() + next.weight();
if (q_times_count <= q_limit_times_count) { if (weightSoFar <= q_limit_times_count) {
cur.add(next); sumsToMerge += nextSum;
weightsToMerge += next.weight();
} else { } else {
result.sum_ += cur.add(sumsToMerge, weightsToMerge);
sumsToMerge = 0;
weightsToMerge = 0;
compressed.push_back(cur); compressed.push_back(cur);
q_0_times_count += cur.weight();
q_limit_times_count = detail::k_to_q(k_limit++, maxSize_) * result.count_; q_limit_times_count = detail::k_to_q(k_limit++, maxSize_) * result.count_;
cur = next; cur = next;
} }
} }
result.sum_ += cur.add(sumsToMerge, weightsToMerge);
compressed.push_back(cur); compressed.push_back(cur);
result.centroids_ = std::move(compressed); result.centroids_ = std::move(compressed);
return result; return result;
...@@ -144,44 +150,46 @@ TDigest TDigest::merge(Range<const TDigest*> digests) { ...@@ -144,44 +150,46 @@ TDigest TDigest::merge(Range<const TDigest*> digests) {
centroids.reserve(nCentroids); centroids.reserve(nCentroids);
double count = 0; double count = 0;
double sum = 0;
for (auto it = digests.begin(); it != digests.end(); it++) { for (auto it = digests.begin(); it != digests.end(); it++) {
count += it->count();
for (const auto& centroid : it->centroids_) { for (const auto& centroid : it->centroids_) {
centroids.push_back(centroid); centroids.push_back(centroid);
count += centroid.weight();
sum += centroid.mean() * centroid.weight();
} }
} }
std::sort(centroids.begin(), centroids.end()); std::sort(centroids.begin(), centroids.end());
size_t maxSize = digests.begin()->maxSize_; size_t maxSize = digests.begin()->maxSize_;
TDigest result(maxSize);
std::vector<Centroid> compressed; std::vector<Centroid> compressed;
compressed.reserve(maxSize); compressed.reserve(maxSize);
double q_0_times_count = 0.0;
double k_limit = 1; double k_limit = 1;
double q_limit_times_count = detail::k_to_q(k_limit, maxSize) * count; double q_limit_times_count = detail::k_to_q(k_limit, maxSize) * count;
Centroid cur = centroids.front(); Centroid cur = centroids.front();
double weightSoFar = cur.weight();
double sumsToMerge = 0;
double weightsToMerge = 0;
for (auto it = centroids.begin() + 1; it != centroids.end(); ++it) { for (auto it = centroids.begin() + 1; it != centroids.end(); ++it) {
double q_times_count = q_0_times_count + cur.weight() + it->weight(); weightSoFar += it->weight();
if (weightSoFar <= q_limit_times_count) {
if (q_times_count <= q_limit_times_count) { sumsToMerge += it->mean() * it->weight();
cur.add(*it); weightsToMerge += it->weight();
} else { } else {
result.sum_ += cur.add(sumsToMerge, weightsToMerge);
sumsToMerge = 0;
weightsToMerge = 0;
compressed.push_back(cur); compressed.push_back(cur);
q_0_times_count += cur.weight();
q_limit_times_count = detail::k_to_q(k_limit++, maxSize) * count; q_limit_times_count = detail::k_to_q(k_limit++, maxSize) * count;
cur = *it; cur = *it;
} }
} }
result.sum_ += cur.add(sumsToMerge, weightsToMerge);
compressed.push_back(cur); compressed.push_back(cur);
TDigest result(maxSize);
result.count_ = count; result.count_ = count;
result.sum_ = sum;
result.centroids_ = std::move(compressed); result.centroids_ = std::move(compressed);
return result; return result;
} }
...@@ -236,16 +244,11 @@ double TDigest::estimateQuantile(double q) const { ...@@ -236,16 +244,11 @@ double TDigest::estimateQuantile(double q) const {
((rank - t) / centroids_[pos].weight() - 0.5) * delta; ((rank - t) / centroids_[pos].weight() - 0.5) * delta;
} }
void TDigest::Centroid::add(const TDigest::Centroid& other) { double TDigest::Centroid::add(double sum, double weight) {
auto means = _mm_set_pd(mean(), other.mean()); sum += (mean_ * weight_);
auto weights = _mm_set_pd(weight(), other.weight()); weight_ += weight;
auto sums128 = _mm_mul_pd(means, weights);
double* sums = reinterpret_cast<double*>(&sums128);
double sum = sums[0] + sums[1];
weight_ += other.weight();
mean_ = sum / weight_; mean_ = sum / weight_;
return sum;
} }
} // namespace folly } // namespace folly
...@@ -95,7 +95,10 @@ class TDigest { ...@@ -95,7 +95,10 @@ class TDigest {
return weight_; return weight_;
} }
inline void add(const Centroid& other); /*
* Adds the sum/weight to this centroid, and returns the new sum.
*/
inline double add(double sum, double weight);
inline bool operator<(const Centroid& other) const { inline bool operator<(const Centroid& other) const {
return mean() < other.mean(); return mean() < other.mean();
......
...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
#include <folly/Range.h> #include <folly/Range.h>
#include <folly/portability/GFlags.h> #include <folly/portability/GFlags.h>
DEFINE_int32(digest_merge_time_ns, 13000, "Time to merge into the digest"); DEFINE_int32(digest_merge_time_ns, 5500, "Time to merge into the digest");
using namespace folly; using namespace folly;
using namespace folly::detail; using namespace folly::detail;
...@@ -110,19 +110,19 @@ BENCHMARK_RELATIVE_NAMED_PARAM_MULTI(append, 10000x32, 10000, 32) ...@@ -110,19 +110,19 @@ BENCHMARK_RELATIVE_NAMED_PARAM_MULTI(append, 10000x32, 10000, 32)
* ============================================================================ * ============================================================================
* folly/stats/test/DigestBuilderBenchmark.cpp relative time/iter iters/s * folly/stats/test/DigestBuilderBenchmark.cpp relative time/iter iters/s
* ============================================================================ * ============================================================================
* append(1000x1) 51.79ns 19.31M * append(1000x1) 45.44ns 22.01M
* append(1000x2) 97.04% 53.37ns 18.74M * append(1000x2) 99.84% 45.52ns 21.97M
* append(1000x4) 95.68% 54.12ns 18.48M * append(1000x4) 96.65% 47.02ns 21.27M
* append(1000x8) 91.95% 56.32ns 17.76M * append(1000x8) 93.49% 48.61ns 20.57M
* append(1000x16) 62.12% 83.36ns 12.00M * append(1000x16) 46.88% 96.93ns 10.32M
* append(1000x32) 38.12% 135.85ns 7.36M * append(1000x32) 33.59% 135.30ns 7.39M
* ---------------------------------------------------------------------------- * ----------------------------------------------------------------------------
* append(10000x1) 46.34ns 21.58M * append(10000x1) 46.12ns 21.68M
* append(10000x2) 97.91% 47.33ns 21.13M * append(10000x2) 96.02% 48.03ns 20.82M
* append(10000x4) 95.27% 48.64ns 20.56M * append(10000x4) 95.39% 48.35ns 20.68M
* append(10000x8) 91.39% 50.70ns 19.72M * append(10000x8) 90.52% 50.95ns 19.63M
* append(10000x16) 55.26% 83.85ns 11.93M * append(10000x16) 43.39% 106.28ns 9.41M
* append(10000x32) 35.57% 130.25ns 7.68M * append(10000x32) 34.83% 132.41ns 7.55M
* ============================================================================ * ============================================================================
*/ */
......
...@@ -136,31 +136,31 @@ BENCHMARK_RELATIVE_NAMED_PARAM(estimateQuantile, 1000_p999, 1000, 0.999) ...@@ -136,31 +136,31 @@ BENCHMARK_RELATIVE_NAMED_PARAM(estimateQuantile, 1000_p999, 1000, 0.999)
* ============================================================================ * ============================================================================
* folly/stats/test/TDigestBenchmark.cpp relative time/iter iters/s * folly/stats/test/TDigestBenchmark.cpp relative time/iter iters/s
* ============================================================================ * ============================================================================
* merge(100x1) 2.34us 427.35K * merge(100x1) 2.21us 451.95K
* merge(100x5) 37.44% 6.25us 159.99K * merge(100x5) 57.80% 3.83us 261.23K
* merge(100x10) 19.38% 12.08us 82.80K * merge(100x10) 42.26% 5.24us 191.01K
* merge(1000x1) 10.93% 21.41us 46.70K * merge(1000x1) 10.43% 21.21us 47.15K
* merge(1000x5) 4.57% 51.18us 19.54K * merge(1000x5) 6.54% 33.85us 29.54K
* merge(1000x10) 2.33% 100.28us 9.97K * merge(1000x10) 4.52% 48.97us 20.42K
* ---------------------------------------------------------------------------- * ----------------------------------------------------------------------------
* mergeDigests(100x60) 331.30us 3.02K * mergeDigests(100x60) 278.92us 3.59K
* mergeDigests(1000x60) 9.20% 3.60ms 277.74 * mergeDigests(1000x60) 8.98% 3.10ms 322.11
* ---------------------------------------------------------------------------- * ----------------------------------------------------------------------------
* estimateQuantile(100x1_p001) 8.50ns 117.65M * estimateQuantile(100x1_p001) 8.51ns 117.49M
* estimateQuantile(100_p01) 62.84% 13.53ns 73.93M * estimateQuantile(100_p01) 61.35% 13.87ns 72.08M
* estimateQuantile(100_p25) 13.56% 62.69ns 15.95M * estimateQuantile(100_p25) 13.71% 62.08ns 16.11M
* estimateQuantile(100_p50) 10.42% 81.59ns 12.26M * estimateQuantile(100_p50) 10.37% 82.09ns 12.18M
* estimateQuantile(100_p75) 14.85% 57.24ns 17.47M * estimateQuantile(100_p75) 13.92% 61.14ns 16.36M
* estimateQuantile(100_p99) 76.32% 11.14ns 89.79M * estimateQuantile(100_p99) 67.06% 12.69ns 78.79M
* estimateQuantile(100_p999) 127.14% 6.69ns 149.58M * estimateQuantile(100_p999) 110.81% 7.68ns 130.20M
* ---------------------------------------------------------------------------- * ----------------------------------------------------------------------------
* estimateQuantile(1000_p001) 26.35% 32.26ns 31.00M * estimateQuantile(1000_p001) 26.56% 32.05ns 31.20M
* estimateQuantile(1000_p01) 7.75% 109.66ns 9.12M * estimateQuantile(1000_p01) 7.72% 110.22ns 9.07M
* estimateQuantile(1000_p25) 1.74% 487.64ns 2.05M * estimateQuantile(1000_p25) 1.74% 488.18ns 2.05M
* estimateQuantile(1000_p50) 1.24% 683.61ns 1.46M * estimateQuantile(1000_p50) 1.24% 684.06ns 1.46M
* estimateQuantile(1000_p75) 1.75% 484.43ns 2.06M * estimateQuantile(1000_p75) 1.76% 483.38ns 2.07M
* estimateQuantile(1000_p99) 7.87% 107.94ns 9.26M * estimateQuantile(1000_p99) 7.98% 106.66ns 9.38M
* estimateQuantile(1000_p999) 34.58% 24.58ns 40.69M * estimateQuantile(1000_p999) 32.64% 26.08ns 38.35M
* ============================================================================ * ============================================================================
*/ */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment