Commit 5c02ae45 authored by Marc Celani, committed by Facebook Github Bot

TDigest handles crazy outliers

Summary: The algorithm in the TDigest paper does not handle extreme outliers very well. This diff changes the clamp logic to clamp the interpolated estimate between the previous and next centroid means rather than between min_ and max_. This gives much better results.
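
A minimal, hypothetical sketch of the clamping idea described above (the helper name clampEstimate and the parameters means/globalMin/globalMax are illustrative only, not the folly code): clamping to the neighboring centroid means keeps the interpolated value inside the local interval even when a single outlier has stretched the global min/max.

#include <algorithm>
#include <vector>

// Sketch only: `means` holds centroid means in sorted order, `pos` is the
// centroid the requested rank falls into, `value` is the interpolated
// estimate, and `globalMin`/`globalMax` stand in for min_/max_.
double clampEstimate(
    const std::vector<double>& means,
    size_t pos,
    double value,
    double globalMin,
    double globalMax) {
  // Old behavior: clamp(value, globalMin, globalMax). A single extreme
  // sample widens that range, so interpolation can overshoot badly.
  // New behavior: clamp to the neighboring centroid means, falling back to
  // the global bounds only at the ends.
  double lo = (pos == 0) ? globalMin : means[pos - 1];
  double hi = (pos + 1 < means.size()) ? means[pos + 1] : globalMax;
  return std::clamp(value, lo, hi);
}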

Reviewed By: yfeldblum

Differential Revision: D9192686

fbshipit-source-id: 171a7b7db5552802f91e6e95bb541a1b03c8821d
parent 7659f9ba
@@ -317,18 +317,24 @@ double TDigest::estimateQuantile(double q) const {
   }
   double delta = 0;
+  double min = min_;
+  double max = max_;
   if (centroids_.size() > 1) {
     if (pos == 0) {
       delta = centroids_[pos + 1].mean() - centroids_[pos].mean();
+      max = centroids_[pos + 1].mean();
     } else if (pos == centroids_.size() - 1) {
       delta = centroids_[pos].mean() - centroids_[pos - 1].mean();
+      min = centroids_[pos - 1].mean();
     } else {
       delta = (centroids_[pos + 1].mean() - centroids_[pos - 1].mean()) / 2;
+      min = centroids_[pos - 1].mean();
+      max = centroids_[pos + 1].mean();
     }
   }
   auto value = centroids_[pos].mean() +
       ((rank - t) / centroids_[pos].weight() - 0.5) * delta;
-  return clamp(value, min_, max_);
+  return clamp(value, min, max);
 }
 
 double TDigest::Centroid::add(double sum, double weight) {
@@ -259,6 +259,22 @@ TEST(TDigest, ConstructFromCentroids) {
   EXPECT_NE(digest1.getCentroids().size(), digest3.getCentroids().size());
 }
 
+TEST(TDigest, LargeOutlierTest) {
+  folly::TDigest digest(100);
+
+  std::vector<double> values;
+  for (double i = 0; i < 19; ++i) {
+    values.push_back(i);
+  }
+  values.push_back(1000000);
+  std::sort(values.begin(), values.end());
+  digest = digest.merge(values);
+
+  EXPECT_LT(
+      (int64_t)digest.estimateQuantile(0.5),
+      (int64_t)digest.estimateQuantile(0.90));
+}
+
 class DistributionTest
     : public ::testing::TestWithParam<
           std::tuple<std::pair<bool, size_t>, double, bool>> {};