Commit 5e1a06ae authored by Marc Celani, committed by Facebook Github Bot

TDigest for estimating quantiles

Summary:
The existing histogram approach for estimating quantiles is inadequate for common use cases. This diff introduces TDigests for estimating quantiles of streaming values.

The current histogram approach suffers from two issues:
  - The user needs to know something about the universe of values, U, and the underlying distribution in order to get reasonably accurate values.
  - Because the histogram is not biased, estimating tail quantiles such as p99.9 requires a large amount of memory.

t-digests (https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf) are biased digests that allow for accurate estimates of tail quantiles, while using a small amount of memory and without requiring the user to know anything about U.

The scaling function in the paper is slow because it uses arcsin. This approach uses a sqrt function instead. Details of why this works are in the comments of the TDigest source file.
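For intuition: with maxSize d = 100, the inverse scaling function q = 1 - 2 * (1 - k/d)^2 (for k/d >= 0.5) maps the last bucket, k in [99, 100], to the quantile range [0.9998, 1.0], while a middle bucket such as k in [50, 51] covers roughly [0.5, 0.52]. Centroids near the tails therefore hold far less mass, which is what makes tail estimates accurate without extra memory.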

Reviewed By: simpkins

Differential Revision: D7512432

fbshipit-source-id: 01460433ec7380fed88e5e942ef15f032dd794bb
parent 0e169ccf
@@ -632,6 +632,7 @@ if (BUILD_TESTS)
DIRECTORY stats/test/
TEST histogram_test SOURCES HistogramTest.cpp
TEST tdigest_test SOURCES TDigestTest.cpp
TEST timeseries_histogram_test SOURCES TimeseriesHistogramTest.cpp
TEST timeseries_test SOURCES TimeSeriesTest.cpp
@@ -444,6 +444,7 @@ nobase_follyinclude_HEADERS = \
stats/Histogram.h \
stats/MultiLevelTimeSeries-defs.h \
stats/MultiLevelTimeSeries.h \
stats/TDigest.h \
stats/TimeseriesHistogram-defs.h \
stats/TimeseriesHistogram.h \
synchronization/AsymmetricMemoryBarrier.h \
@@ -633,6 +634,7 @@ libfolly_la_SOURCES = \
stats/BucketedTimeSeries.cpp \
stats/Histogram.cpp \
stats/MultiLevelTimeSeries.cpp \
stats/TDigest.cpp \
stats/TimeseriesHistogram.cpp \
synchronization/AsymmetricMemoryBarrier.cpp \
synchronization/LifoSem.cpp \
/*
* Copyright 2012-present Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <folly/stats/TDigest.h>
#include <algorithm>
#include <cmath>
#include <emmintrin.h>
namespace folly {
namespace detail {
/*
* A good biased scaling function has the following properties:
* - The value of the function k(0, delta) = 0, and k(1, delta) = delta.
* This is a requirement for any t-digest function.
* - The limit of the derivative of the function dk/dq at 0 is inf, and at
* 1 is inf. This provides bias to improve accuracy at the tails.
* - For any q <= 0.5, dk/dq(q) = dk/dq(1-q). This ensures that the accuracy
*   of upper and lower quantiles is equivalent.
*
* The scaling function used here is...
* k(q, d) = (IF q >= 0.5, d - d * sqrt(2 - 2q) / 2, d * sqrt(2q) / 2)
*
* k(0, d) = 0
* k(1, d) = d
*
* dk/dq = (IF q >= 0.5, d / (2 * sqrt(2 - 2q)), d / (2 * sqrt(2q)))
* limit q->1 dk/dq = inf
* limit q->0 dk/dq = inf
*
* When plotted, the derivative function is symmetric, centered at q=0.5.
*
* Note that FMA has been tested here, but benchmarks have not shown it to be a
* performance improvement.
*/
double q_to_k(double q, double d) {
if (q >= 0.5) {
return d - d * std::sqrt(0.5 - 0.5 * q);
}
return d * std::sqrt(0.5 * q);
}
double k_to_q(double k, double d) {
double k_div_d = k / d;
if (k_div_d >= 0.5) {
double base = 1 - k_div_d;
return 1 - 2 * base * base;
} else {
return 2 * k_div_d * k_div_d;
}
}
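/*
* q_to_k and k_to_q are inverses of one another. For example, with d = 100:
*   q_to_k(0.9998, 100) = 100 - 100 * sqrt(0.5 - 0.4999) = 99.0
*   k_to_q(99.0, 100)   = 1 - 2 * (1 - 0.99)^2           = 0.9998
*/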
} // namespace detail
TDigest TDigest::merge(Range<const double*> sortedValues) const {
if (sortedValues.empty()) {
return *this;
}
TDigest result(maxSize_);
result.total_ = total_ + sortedValues.size();
std::vector<Centroid> compressed;
compressed.reserve(2 * maxSize_);
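// One pass over the union of the existing centroids and the new values, both
// in ascending order: fold each next point into the current centroid until
// doing so would carry the running rank past the limit given by the scaling
// function, then flush the centroid and start a new one.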
double q_0_times_total = 0.0;
double q_limit_times_total = detail::k_to_q(1, maxSize_) * result.total_;
auto it_centroids = centroids_.begin();
auto it_sortedValues = sortedValues.begin();
Centroid cur;
if (it_centroids != centroids_.end() &&
it_centroids->mean() < *it_sortedValues) {
cur = *it_centroids++;
} else {
cur = Centroid(*it_sortedValues++, 1.0);
}
while (it_centroids != centroids_.end() ||
it_sortedValues != sortedValues.end()) {
Centroid next;
if (it_centroids != centroids_.end() &&
(it_sortedValues == sortedValues.end() ||
it_centroids->mean() < *it_sortedValues)) {
next = *it_centroids++;
} else {
next = Centroid(*it_sortedValues++, 1.0);
}
double q_times_total = q_0_times_total + cur.weight() + next.weight();
if (q_times_total <= q_limit_times_total) {
cur.add(next);
} else {
compressed.push_back(cur);
q_0_times_total += cur.weight();
double q_to_k_res =
detail::q_to_k(q_0_times_total / result.total_, maxSize_);
q_limit_times_total =
detail::k_to_q(q_to_k_res + 1, maxSize_) * result.total_;
cur = next;
}
}
compressed.push_back(cur);
result.centroids_ = std::move(compressed);
return result;
}
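/*
* Merges a set of digests by pooling all of their centroids, sorting them by
* mean, and re-compressing against the scaling function. The result uses the
* maxSize of the first digest.
*/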
TDigest TDigest::merge(Range<const TDigest*> digests) {
size_t nCentroids = 0;
for (auto it = digests.begin(); it != digests.end(); it++) {
nCentroids += it->centroids_.size();
}
if (nCentroids == 0) {
return TDigest();
}
std::vector<Centroid> centroids;
centroids.reserve(nCentroids);
double total = 0;
for (auto it = digests.begin(); it != digests.end(); it++) {
for (const auto& centroid : it->centroids_) {
centroids.push_back(centroid);
total += centroid.weight();
}
}
std::sort(centroids.begin(), centroids.end());
size_t maxSize = digests.begin()->maxSize_;
std::vector<Centroid> compressed;
compressed.reserve(2 * maxSize);
double q_0_times_total = 0.0;
double q_limit_times_total = detail::k_to_q(1, maxSize) * total;
Centroid cur = centroids.front();
for (auto it = centroids.begin() + 1; it != centroids.end(); ++it) {
double q_times_total = q_0_times_total + cur.weight() + it->weight();
if (q_times_total <= q_limit_times_total) {
cur.add(*it);
} else {
compressed.push_back(cur);
q_0_times_total += cur.weight();
double q_to_k_res = detail::q_to_k(q_0_times_total / total, maxSize);
q_limit_times_total = detail::k_to_q(q_to_k_res + 1, maxSize) * total;
cur = *it;
}
}
compressed.push_back(cur);
TDigest result(maxSize);
result.total_ = total;
result.centroids_ = std::move(compressed);
return result;
}
double TDigest::estimateQuantile(double q) const {
if (centroids_.empty()) {
return 0.0;
}
double rank = q * total_;
size_t pos;
double t;
if (q > 0.5) {
if (q >= 1.0) {
return centroids_.back().mean();
}
pos = 0;
t = total_;
for (auto rit = centroids_.rbegin(); rit != centroids_.rend(); ++rit) {
t -= rit->weight();
if (rank >= t) {
pos = std::distance(rit, centroids_.rend()) - 1;
break;
}
}
} else {
if (q <= 0.0) {
return centroids_.front().mean();
}
pos = centroids_.size() - 1;
t = 0;
for (auto it = centroids_.begin(); it != centroids_.end(); ++it) {
if (rank < t + it->weight()) {
pos = std::distance(centroids_.begin(), it);
break;
}
t += it->weight();
}
}
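// Interpolate linearly within the selected centroid, approximating the
// centroid's width by the spacing to its neighbors.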
double delta = 0;
if (centroids_.size() > 1) {
if (pos == 0) {
delta = centroids_[pos + 1].mean() - centroids_[pos].mean();
} else if (pos == centroids_.size() - 1) {
delta = centroids_[pos].mean() - centroids_[pos - 1].mean();
} else {
delta = (centroids_[pos + 1].mean() - centroids_[pos - 1].mean()) / 2;
}
}
return centroids_[pos].mean() +
((rank - t) / centroids_[pos].weight() - 0.5) * delta;
}
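// Folds another centroid into this one. The packed SSE2 multiply computes
// mean() * weight() and other.mean() * other.weight() in a single
// instruction; the new mean is the weighted average of the two means.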
void TDigest::Centroid::add(const TDigest::Centroid& other) {
auto means = _mm_set_pd(mean(), other.mean());
auto weights = _mm_set_pd(weight(), other.weight());
auto sums128 = _mm_mul_pd(means, weights);
double* sums = reinterpret_cast<double*>(&sums128);
double sum = sums[0] + sums[1];
weight_ += other.weight();
mean_ = sum / weight_;
}
} // namespace folly
/*
* Copyright 2012-present Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <vector>
#include <folly/Range.h>
namespace folly {
/*
* TDigest is a biased quantile estimator designed to estimate the values of
* the quantiles of streaming data with high accuracy and low memory,
* particularly for quantiles at the tails (p0.1, p1, p99, p99.9). See
* https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf
* for an explanation of what t-digests are for and how they work.
*
* There is a notable difference between the implementation here and the
* implementation in the paper. In the paper, the recommended scaling function
* for bucketing centroids is an arcsin function. The arcsin function provides
* high accuracy for low memory, but comes at a relatively high compute cost.
* A good choice of scaling function has the following properties:
* - The value of the function k(0, delta) = 0, and k(1, delta) = delta.
* This is a requirement for any t-digest function.
* - The limit of the derivative of the function dk/dq at 0 is inf, and at
* 1 is inf. This provides bias to improve accuracy at the tails.
* - For any q <= 0.5, dk/dq(q) = dk/dq(1-q). This ensures that the accuracy
*   of upper and lower quantiles is equivalent.
* As such, TDigest uses a sqrt function with these properties, which is
* faster than arcsin. There is a small but negligible impact on accuracy at
* the tails; in empirical tests, the accuracy of the sqrt approach has been
* adequate.
*/
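/*
* Usage sketch (illustrative only; getSamples() stands in for any source of
* unsorted values):
*
*   std::vector<double> buffer = getSamples();
*   std::sort(buffer.begin(), buffer.end());
*   folly::TDigest digest(100);
*   digest = digest.merge(buffer);
*   double p99 = digest.estimateQuantile(0.99);
*/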
class TDigest {
public:
explicit TDigest(size_t maxSize = 100) : maxSize_(maxSize), total_(0.0) {
centroids_.reserve(maxSize);
}
/*
* Returns a new TDigest constructed with values merged from the current
* digest and the given sortedValues.
*/
TDigest merge(Range<const double*> sortedValues) const;
/*
* Returns a new TDigest constructed with values merged from the given
* digests.
*/
static TDigest merge(Range<const TDigest*> digests);
/*
* Estimates the value of the given quantile.
*/
double estimateQuantile(double q) const;
private:
struct Centroid {
public:
explicit Centroid(double mean = 0.0, double weight = 1.0)
: mean_(mean), weight_(weight) {}
inline double mean() const {
return mean_;
}
inline double weight() const {
return weight_;
}
inline void add(const Centroid& other);
inline bool operator<(const Centroid& other) const {
return mean() < other.mean();
}
private:
double mean_;
double weight_;
};
std::vector<Centroid> centroids_;
size_t maxSize_;
double total_;
};
} // namespace folly
@@ -3,10 +3,14 @@ ACLOCAL_AMFLAGS = -I m4
CPPFLAGS = -I$(top_srcdir)/test/gtest/googletest/include
ldadd = $(top_builddir)/test/libfollytestmain.la
check_PROGRAMS = \
histogram_test \
tdigest_test

TESTS = $(check_PROGRAMS)

histogram_test_SOURCES = HistogramTest.cpp
histogram_test_LDADD = $(ldadd)
tdigest_test_SOURCES = TDigestTest.cpp
tdigest_test_LDADD = $(ldadd)
/*
* Copyright 2012-present Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <folly/stats/TDigest.h>
#include <algorithm>
#include <chrono>
#include <random>
#include <folly/Benchmark.h>
#include <folly/portability/GFlags.h>
using folly::TDigest;
void merge(unsigned int iters, size_t maxSize, size_t bufSize) {
TDigest digest(maxSize);
std::vector<std::vector<double>> buffers;
BENCHMARK_SUSPEND {
std::default_random_engine generator;
generator.seed(std::chrono::system_clock::now().time_since_epoch().count());
std::lognormal_distribution<double> distribution(0.0, 1.0);
for (size_t i = 0; i < iters; ++i) {
std::vector<double> buffer;
for (size_t j = 0; j < bufSize; ++j) {
buffer.push_back(distribution(generator));
}
std::sort(buffer.begin(), buffer.end());
buffers.push_back(std::move(buffer));
}
}
for (const auto& buffer : buffers) {
digest = digest.merge(buffer);
}
}
void mergeDigests(unsigned int iters, size_t maxSize, size_t nDigests) {
std::vector<TDigest> digests;
BENCHMARK_SUSPEND {
TDigest digest(maxSize);
std::default_random_engine generator;
generator.seed(std::chrono::system_clock::now().time_since_epoch().count());
std::lognormal_distribution<double> distribution(0.0, 1.0);
for (size_t i = 0; i < nDigests; ++i) {
std::vector<double> buffer;
for (size_t j = 0; j < maxSize; ++j) {
buffer.push_back(distribution(generator));
}
std::sort(buffer.begin(), buffer.end());
digests.push_back(digest.merge(buffer));
}
}
for (size_t i = 0; i < iters; ++i) {
TDigest::merge(digests);
}
}
void estimateQuantile(unsigned int iters, size_t maxSize, double quantile) {
TDigest digest(maxSize);
size_t bufSize = maxSize * 10;
BENCHMARK_SUSPEND {
std::vector<double> values;
std::default_random_engine generator;
generator.seed(std::chrono::system_clock::now().time_since_epoch().count());
std::lognormal_distribution<double> distribution(0.0, 1.0);
for (size_t i = 0; i < 50000; ++i) {
values.push_back(distribution(generator));
}
for (size_t i = 0; i < 50000 / bufSize; ++i) {
std::vector<double> buffer;
for (size_t j = 0; j < bufSize; ++j) {
buffer.push_back(values[i * bufSize + j]);
}
std::sort(buffer.begin(), buffer.end());
digest = digest.merge(buffer);
}
}
for (size_t i = 0; i < iters; ++i) {
digest.estimateQuantile(quantile);
}
}
BENCHMARK_NAMED_PARAM(merge, 100x1, 100, 100)
BENCHMARK_RELATIVE_NAMED_PARAM(merge, 100x5, 100, 500)
BENCHMARK_RELATIVE_NAMED_PARAM(merge, 100x10, 100, 1000)
BENCHMARK_RELATIVE_NAMED_PARAM(merge, 1000x1, 1000, 1000)
BENCHMARK_RELATIVE_NAMED_PARAM(merge, 1000x5, 1000, 5000)
BENCHMARK_RELATIVE_NAMED_PARAM(merge, 1000x10, 1000, 10000)
BENCHMARK_DRAW_LINE();
BENCHMARK_NAMED_PARAM(mergeDigests, 100x60, 100, 60)
BENCHMARK_RELATIVE_NAMED_PARAM(mergeDigests, 1000x60, 1000, 60)
BENCHMARK_DRAW_LINE();
BENCHMARK_NAMED_PARAM(estimateQuantile, 100x1_p001, 100, 0.001)
BENCHMARK_RELATIVE_NAMED_PARAM(estimateQuantile, 100_p01, 100, 0.01)
BENCHMARK_RELATIVE_NAMED_PARAM(estimateQuantile, 100_p25, 100, 0.25)
BENCHMARK_RELATIVE_NAMED_PARAM(estimateQuantile, 100_p50, 100, 0.5)
BENCHMARK_RELATIVE_NAMED_PARAM(estimateQuantile, 100_p75, 100, 0.75)
BENCHMARK_RELATIVE_NAMED_PARAM(estimateQuantile, 100_p99, 100, 0.99)
BENCHMARK_RELATIVE_NAMED_PARAM(estimateQuantile, 100_p999, 100, 0.999)
BENCHMARK_DRAW_LINE();
BENCHMARK_RELATIVE_NAMED_PARAM(estimateQuantile, 1000_p001, 1000, 0.001)
BENCHMARK_RELATIVE_NAMED_PARAM(estimateQuantile, 1000_p01, 1000, 0.01)
BENCHMARK_RELATIVE_NAMED_PARAM(estimateQuantile, 1000_p25, 1000, 0.25)
BENCHMARK_RELATIVE_NAMED_PARAM(estimateQuantile, 1000_p50, 1000, 0.5)
BENCHMARK_RELATIVE_NAMED_PARAM(estimateQuantile, 1000_p75, 1000, 0.75)
BENCHMARK_RELATIVE_NAMED_PARAM(estimateQuantile, 1000_p99, 1000, 0.99)
BENCHMARK_RELATIVE_NAMED_PARAM(estimateQuantile, 1000_p999, 1000, 0.999)
/*
* ./tdigest_benchmark
* ============================================================================
* folly/stats/test/TDigestBenchmark.cpp relative time/iter iters/s
* ============================================================================
* merge(100x1) 4.08us 244.93K
* merge(100x5) 52.76% 7.74us 129.23K
* merge(100x10) 30.58% 13.35us 74.91K
* merge(1000x1) 12.42% 32.88us 30.42K
* merge(1000x5) 6.48% 63.03us 15.87K
* merge(1000x10) 3.81% 107.04us 9.34K
* ----------------------------------------------------------------------------
* mergeDigests(100x60) 382.79us 2.61K
* mergeDigests(1000x60) 9.16% 4.18ms 239.40
* ----------------------------------------------------------------------------
* estimateQuantile(100x1_p001) 8.86ns 112.84M
* estimateQuantile(100_p01) 57.10% 15.52ns 64.43M
* estimateQuantile(100_p25) 12.60% 70.34ns 14.22M
* estimateQuantile(100_p50) 9.83% 90.13ns 11.10M
* estimateQuantile(100_p75) 12.57% 70.52ns 14.18M
* estimateQuantile(100_p99) 60.29% 14.70ns 68.03M
* estimateQuantile(100_p999) 102.06% 8.68ns 115.16M
* ----------------------------------------------------------------------------
* estimateQuantile(1000_p001) 23.67% 37.43ns 26.71M
* estimateQuantile(1000_p01) 6.80% 130.26ns 7.68M
* estimateQuantile(1000_p25) 1.56% 569.73ns 1.76M
* estimateQuantile(1000_p50) 1.13% 786.43ns 1.27M
* estimateQuantile(1000_p75) 1.57% 564.81ns 1.77M
* estimateQuantile(1000_p99) 6.86% 129.28ns 7.74M
* estimateQuantile(1000_p999) 27.21% 32.57ns 30.71M
* ============================================================================
*/
int main(int argc, char* argv[]) {
gflags::ParseCommandLineFlags(&argc, &argv, true);
folly::runBenchmarks();
return 0;
}
/*
* Copyright 2013-present Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <folly/stats/TDigest.h>
#include <chrono>
#include <random>
#include <folly/portability/GTest.h>
using namespace folly;
/*
* Tests run at a reasonable speed with these settings, but it is good to
* occasionally test with kNumRandomRuns = 3000.
*/
const int32_t kNumSamples = 50000;
const int32_t kNumRandomRuns = 30;
const int32_t kSeed = 0;
TEST(TDigest, Basic) {
TDigest digest(100);
std::vector<double> values;
for (int i = 1; i <= 100; ++i) {
values.push_back(i);
}
digest = digest.merge(values);
EXPECT_EQ(0.6, digest.estimateQuantile(0.001));
EXPECT_EQ(2.0 - 0.5, digest.estimateQuantile(0.01));
EXPECT_EQ(51.0 - 0.5, digest.estimateQuantile(0.5));
EXPECT_EQ(100.0 - 0.5, digest.estimateQuantile(0.99));
EXPECT_EQ(100.4, digest.estimateQuantile(0.999));
}
TEST(TDigest, Merge) {
TDigest digest(100);
std::vector<double> values;
for (int i = 1; i <= 100; ++i) {
values.push_back(i);
}
digest = digest.merge(values);
values.clear();
for (int i = 101; i <= 200; ++i) {
values.push_back(i);
}
digest = digest.merge(values);
EXPECT_EQ(0.7, digest.estimateQuantile(0.001));
EXPECT_EQ(4.0 - 1.5, digest.estimateQuantile(0.01));
EXPECT_EQ(102.0 - 1.5, digest.estimateQuantile(0.5));
EXPECT_EQ(200.0 - 1.5, digest.estimateQuantile(0.99));
EXPECT_EQ(200.3, digest.estimateQuantile(0.999));
}
TEST(TDigest, MergeSmall) {
TDigest digest(100);
std::vector<double> values;
values.push_back(1);
digest = digest.merge(values);
EXPECT_EQ(1.0, digest.estimateQuantile(0.001));
EXPECT_EQ(1.0, digest.estimateQuantile(0.01));
EXPECT_EQ(1.0, digest.estimateQuantile(0.5));
EXPECT_EQ(1.0, digest.estimateQuantile(0.99));
EXPECT_EQ(1.0, digest.estimateQuantile(0.999));
}
TEST(TDigest, MergeLarge) {
TDigest digest(100);
std::vector<double> values;
for (int i = 1; i <= 1000; ++i) {
values.push_back(i);
}
digest = digest.merge(values);
EXPECT_EQ(1.5, digest.estimateQuantile(0.001));
EXPECT_EQ(10.5, digest.estimateQuantile(0.01));
EXPECT_EQ(500.5, digest.estimateQuantile(0.5));
EXPECT_EQ(990.5, digest.estimateQuantile(0.99));
EXPECT_EQ(999.5, digest.estimateQuantile(0.999));
}
TEST(TDigest, MergeLargeAsDigests) {
std::vector<TDigest> digests;
TDigest digest(100);
for (int i = 0; i < 10; ++i) {
std::vector<double> values;
for (int j = 1; j <= 100; ++j) {
values.push_back(100 * i + j);
}
digests.push_back(digest.merge(values));
}
digest = TDigest::merge(digests);
EXPECT_EQ(1.5, digest.estimateQuantile(0.001));
EXPECT_EQ(10.5, digest.estimateQuantile(0.01));
EXPECT_EQ(500.5, digest.estimateQuantile(0.5));
EXPECT_EQ(990.5, digest.estimateQuantile(0.99));
EXPECT_EQ(999.5, digest.estimateQuantile(0.999));
}
class DistributionTest : public ::testing::TestWithParam<
std::tuple<bool, size_t, double, double>> {};
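/*
* Draws kNumSamples values from either a log-normal distribution (optionally
* shifted into several modes) or a mixture of normal distributions, merges
* them into a TDigest in sorted chunks of 1000, and checks that the standard
* deviation of the estimation error across kNumRandomRuns runs stays within
* the given bound.
*/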
TEST_P(DistributionTest, ReasonableError) {
bool logarithmic;
size_t modes;
double quantile;
double reasonableError;
std::tie(logarithmic, modes, quantile, reasonableError) = GetParam();
std::vector<double> errors;
std::default_random_engine generator;
generator.seed(kSeed);
for (size_t iter = 0; iter < kNumRandomRuns; ++iter) {
TDigest digest(100);
std::vector<double> values;
if (logarithmic) {
std::lognormal_distribution<double> distribution(0.0, 1.0);
for (size_t i = 0; i < kNumSamples; ++i) {
auto mode = (int)distribution(generator) % modes;
values.push_back(distribution(generator) + 100.0 * mode);
}
} else {
std::uniform_int_distribution<int> distributionPicker(0, modes - 1);
std::vector<std::normal_distribution<double>> distributions;
for (size_t i = 0; i < modes; ++i) {
distributions.emplace_back(100.0 * (i + 1), 25);
}
for (size_t i = 0; i < kNumSamples; ++i) {
auto distributionIdx = distributionPicker(generator);
values.push_back(distributions[distributionIdx](generator));
}
}
for (size_t i = 0; i < kNumSamples / 1000; ++i) {
auto it_l = values.begin() + (i * 1000);
auto it_r = it_l + 1000;
std::sort(it_l, it_r);
folly::Range<const double*> r(values, i * 1000, 1000);
digest = digest.merge(r);
}
std::sort(values.begin(), values.end());
double est = digest.estimateQuantile(quantile);
auto it = std::lower_bound(values.begin(), values.end(), est);
int32_t actualRank = std::distance(values.begin(), it);
double actualQuantile = ((double)actualRank) / kNumSamples;
errors.push_back(actualQuantile - quantile);
}
double sum = 0.0;
for (auto error : errors) {
sum += error;
}
double mean = sum / kNumRandomRuns;
double numerator = 0.0;
for (auto error : errors) {
numerator += pow(error - mean, 2);
}
double stddev = std::sqrt(numerator / (kNumRandomRuns - 1));
EXPECT_GE(reasonableError, stddev);
}
INSTANTIATE_TEST_CASE_P(
ReasonableErrors,
DistributionTest,
::testing::Values(
std::make_tuple(true, 1, 0.001, 0.0005),
std::make_tuple(true, 1, 0.01, 0.005),
std::make_tuple(true, 1, 0.25, 0.02),
std::make_tuple(true, 1, 0.50, 0.02),
std::make_tuple(true, 1, 0.75, 0.02),
std::make_tuple(true, 1, 0.99, 0.005),
std::make_tuple(true, 1, 0.999, 0.0005),
std::make_tuple(true, 3, 0.001, 0.0005),
std::make_tuple(true, 3, 0.01, 0.005),
std::make_tuple(true, 3, 0.25, 0.02),
std::make_tuple(true, 3, 0.50, 0.02),
std::make_tuple(true, 3, 0.75, 0.02),
std::make_tuple(true, 3, 0.99, 0.005),
std::make_tuple(true, 3, 0.999, 0.0005),
std::make_tuple(false, 1, 0.001, 0.0005),
std::make_tuple(false, 1, 0.01, 0.005),
std::make_tuple(false, 1, 0.25, 0.02),
std::make_tuple(false, 1, 0.50, 0.02),
std::make_tuple(false, 1, 0.75, 0.02),
std::make_tuple(false, 1, 0.99, 0.005),
std::make_tuple(false, 1, 0.999, 0.0005),
std::make_tuple(false, 10, 0.001, 0.0005),
std::make_tuple(false, 10, 0.01, 0.005),
std::make_tuple(false, 10, 0.25, 0.02),
std::make_tuple(false, 10, 0.50, 0.02),
std::make_tuple(false, 10, 0.75, 0.02),
std::make_tuple(false, 10, 0.99, 0.005),
std::make_tuple(false, 10, 0.999, 0.0005)));