Commit 7aff1428, authored by Marc Celani, committed by Facebook Github Bot

Avoid some unnecessary floating point operations in TDigest

Summary: Further improves TDigest performance by diverging slightly from the algorithm in the paper, in a manner that does not affect the results at all.

Reviewed By: anakryiko

Differential Revision: D7687476

fbshipit-source-id: f71884b0fb21c9e78418643b82b099b01e96e4c9
parent 3593da7e
......@@ -82,7 +82,6 @@ TDigest TDigest::merge(Range<const double*> sortedValues) const {
std::vector<Centroid> compressed;
compressed.reserve(maxSize_);
double q_0_times_count = 0.0;
double k_limit = 1;
double q_limit_times_count =
detail::k_to_q(k_limit++, maxSize_) * result.count_;
......@@ -98,7 +97,11 @@ TDigest TDigest::merge(Range<const double*> sortedValues) const {
cur = Centroid(*it_sortedValues++, 1.0);
}
result.sum_ += cur.mean() * cur.weight();
double weightSoFar = cur.weight();
// Keep track of sums along the way to reduce expensive floating points
double sumsToMerge = 0;
double weightsToMerge = 0;
while (it_centroids != centroids_.end() ||
it_sortedValues != sortedValues.end()) {
......@@ -112,19 +115,22 @@ TDigest TDigest::merge(Range<const double*> sortedValues) const {
next = Centroid(*it_sortedValues++, 1.0);
}
result.sum_ += next.mean() * next.weight();
double q_times_count = q_0_times_count + cur.weight() + next.weight();
double nextSum = next.mean() * next.weight();
weightSoFar += next.weight();
if (q_times_count <= q_limit_times_count) {
cur.add(next);
if (weightSoFar <= q_limit_times_count) {
sumsToMerge += nextSum;
weightsToMerge += next.weight();
} else {
result.sum_ += cur.add(sumsToMerge, weightsToMerge);
sumsToMerge = 0;
weightsToMerge = 0;
compressed.push_back(cur);
q_0_times_count += cur.weight();
q_limit_times_count = detail::k_to_q(k_limit++, maxSize_) * result.count_;
cur = next;
}
}
result.sum_ += cur.add(sumsToMerge, weightsToMerge);
compressed.push_back(cur);
result.centroids_ = std::move(compressed);
return result;
......@@ -144,44 +150,46 @@ TDigest TDigest::merge(Range<const TDigest*> digests) {
centroids.reserve(nCentroids);
double count = 0;
double sum = 0;
for (auto it = digests.begin(); it != digests.end(); it++) {
count += it->count();
for (const auto& centroid : it->centroids_) {
centroids.push_back(centroid);
count += centroid.weight();
sum += centroid.mean() * centroid.weight();
}
}
std::sort(centroids.begin(), centroids.end());
size_t maxSize = digests.begin()->maxSize_;
TDigest result(maxSize);
std::vector<Centroid> compressed;
compressed.reserve(maxSize);
double q_0_times_count = 0.0;
double k_limit = 1;
double q_limit_times_count = detail::k_to_q(k_limit, maxSize) * count;
Centroid cur = centroids.front();
double weightSoFar = cur.weight();
double sumsToMerge = 0;
double weightsToMerge = 0;
for (auto it = centroids.begin() + 1; it != centroids.end(); ++it) {
double q_times_count = q_0_times_count + cur.weight() + it->weight();
if (q_times_count <= q_limit_times_count) {
cur.add(*it);
weightSoFar += it->weight();
if (weightSoFar <= q_limit_times_count) {
sumsToMerge += it->mean() * it->weight();
weightsToMerge += it->weight();
} else {
result.sum_ += cur.add(sumsToMerge, weightsToMerge);
sumsToMerge = 0;
weightsToMerge = 0;
compressed.push_back(cur);
q_0_times_count += cur.weight();
q_limit_times_count = detail::k_to_q(k_limit++, maxSize) * count;
cur = *it;
}
}
result.sum_ += cur.add(sumsToMerge, weightsToMerge);
compressed.push_back(cur);
TDigest result(maxSize);
result.count_ = count;
result.sum_ = sum;
result.centroids_ = std::move(compressed);
return result;
}
......@@ -236,16 +244,11 @@ double TDigest::estimateQuantile(double q) const {
((rank - t) / centroids_[pos].weight() - 0.5) * delta;
}
void TDigest::Centroid::add(const TDigest::Centroid& other) {
auto means = _mm_set_pd(mean(), other.mean());
auto weights = _mm_set_pd(weight(), other.weight());
auto sums128 = _mm_mul_pd(means, weights);
double* sums = reinterpret_cast<double*>(&sums128);
double sum = sums[0] + sums[1];
weight_ += other.weight();
double TDigest::Centroid::add(double sum, double weight) {
sum += (mean_ * weight_);
weight_ += weight;
mean_ = sum / weight_;
return sum;
}
} // namespace folly
......@@ -95,7 +95,10 @@ class TDigest {
return weight_;
}
inline void add(const Centroid& other);
/*
* Adds the sum/weight to this centroid, and returns the new sum.
*/
inline double add(double sum, double weight);
inline bool operator<(const Centroid& other) const {
return mean() < other.mean();
......
......@@ -24,7 +24,7 @@
#include <folly/Range.h>
#include <folly/portability/GFlags.h>
DEFINE_int32(digest_merge_time_ns, 13000, "Time to merge into the digest");
DEFINE_int32(digest_merge_time_ns, 5500, "Time to merge into the digest");
using namespace folly;
using namespace folly::detail;
......@@ -110,19 +110,19 @@ BENCHMARK_RELATIVE_NAMED_PARAM_MULTI(append, 10000x32, 10000, 32)
* ============================================================================
* folly/stats/test/DigestBuilderBenchmark.cpp relative time/iter iters/s
* ============================================================================
* append(1000x1) 51.79ns 19.31M
* append(1000x2) 97.04% 53.37ns 18.74M
* append(1000x4) 95.68% 54.12ns 18.48M
* append(1000x8) 91.95% 56.32ns 17.76M
* append(1000x16) 62.12% 83.36ns 12.00M
* append(1000x32) 38.12% 135.85ns 7.36M
* append(1000x1) 45.44ns 22.01M
* append(1000x2) 99.84% 45.52ns 21.97M
* append(1000x4) 96.65% 47.02ns 21.27M
* append(1000x8) 93.49% 48.61ns 20.57M
* append(1000x16) 46.88% 96.93ns 10.32M
* append(1000x32) 33.59% 135.30ns 7.39M
* ----------------------------------------------------------------------------
* append(10000x1) 46.34ns 21.58M
* append(10000x2) 97.91% 47.33ns 21.13M
* append(10000x4) 95.27% 48.64ns 20.56M
* append(10000x8) 91.39% 50.70ns 19.72M
* append(10000x16) 55.26% 83.85ns 11.93M
* append(10000x32) 35.57% 130.25ns 7.68M
* append(10000x1) 46.12ns 21.68M
* append(10000x2) 96.02% 48.03ns 20.82M
* append(10000x4) 95.39% 48.35ns 20.68M
* append(10000x8) 90.52% 50.95ns 19.63M
* append(10000x16) 43.39% 106.28ns 9.41M
* append(10000x32) 34.83% 132.41ns 7.55M
* ============================================================================
*/
......
......@@ -136,31 +136,31 @@ BENCHMARK_RELATIVE_NAMED_PARAM(estimateQuantile, 1000_p999, 1000, 0.999)
* ============================================================================
* folly/stats/test/TDigestBenchmark.cpp relative time/iter iters/s
* ============================================================================
* merge(100x1) 2.34us 427.35K
* merge(100x5) 37.44% 6.25us 159.99K
* merge(100x10) 19.38% 12.08us 82.80K
* merge(1000x1) 10.93% 21.41us 46.70K
* merge(1000x5) 4.57% 51.18us 19.54K
* merge(1000x10) 2.33% 100.28us 9.97K
* merge(100x1) 2.21us 451.95K
* merge(100x5) 57.80% 3.83us 261.23K
* merge(100x10) 42.26% 5.24us 191.01K
* merge(1000x1) 10.43% 21.21us 47.15K
* merge(1000x5) 6.54% 33.85us 29.54K
* merge(1000x10) 4.52% 48.97us 20.42K
* ----------------------------------------------------------------------------
* mergeDigests(100x60) 331.30us 3.02K
* mergeDigests(1000x60) 9.20% 3.60ms 277.74
* mergeDigests(100x60) 278.92us 3.59K
* mergeDigests(1000x60) 8.98% 3.10ms 322.11
* ----------------------------------------------------------------------------
* estimateQuantile(100x1_p001) 8.50ns 117.65M
* estimateQuantile(100_p01) 62.84% 13.53ns 73.93M
* estimateQuantile(100_p25) 13.56% 62.69ns 15.95M
* estimateQuantile(100_p50) 10.42% 81.59ns 12.26M
* estimateQuantile(100_p75) 14.85% 57.24ns 17.47M
* estimateQuantile(100_p99) 76.32% 11.14ns 89.79M
* estimateQuantile(100_p999) 127.14% 6.69ns 149.58M
* estimateQuantile(100x1_p001) 8.51ns 117.49M
* estimateQuantile(100_p01) 61.35% 13.87ns 72.08M
* estimateQuantile(100_p25) 13.71% 62.08ns 16.11M
* estimateQuantile(100_p50) 10.37% 82.09ns 12.18M
* estimateQuantile(100_p75) 13.92% 61.14ns 16.36M
* estimateQuantile(100_p99) 67.06% 12.69ns 78.79M
* estimateQuantile(100_p999) 110.81% 7.68ns 130.20M
* ----------------------------------------------------------------------------
* estimateQuantile(1000_p001) 26.35% 32.26ns 31.00M
* estimateQuantile(1000_p01) 7.75% 109.66ns 9.12M
* estimateQuantile(1000_p25) 1.74% 487.64ns 2.05M
* estimateQuantile(1000_p50) 1.24% 683.61ns 1.46M
* estimateQuantile(1000_p75) 1.75% 484.43ns 2.06M
* estimateQuantile(1000_p99) 7.87% 107.94ns 9.26M
* estimateQuantile(1000_p999) 34.58% 24.58ns 40.69M
* estimateQuantile(1000_p001) 26.56% 32.05ns 31.20M
* estimateQuantile(1000_p01) 7.72% 110.22ns 9.07M
* estimateQuantile(1000_p25) 1.74% 488.18ns 2.05M
* estimateQuantile(1000_p50) 1.24% 684.06ns 1.46M
* estimateQuantile(1000_p75) 1.76% 483.38ns 2.07M
* estimateQuantile(1000_p99) 7.98% 106.66ns 9.38M
* estimateQuantile(1000_p999) 32.64% 26.08ns 38.35M
* ============================================================================
*/
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment