Commit 5c02ae45 authored by Marc Celani, committed by Facebook Github Bot

TDigest handles crazy outliers

Summary: The algorithm in the TDigest paper does not handle extreme outliers very well. This diff changes the clamp logic to clamp the interpolated estimate between the previous and next centroid means rather than between min_ and max_. This gives much better results.
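
A minimal, hypothetical sketch of the clamping idea described above (the helper name clampEstimate and the parameters means/globalMin/globalMax are illustrative only, not the folly code): clamping to the neighboring centroid means keeps the interpolated value inside the local interval even when a single outlier has stretched the global min/max.

#include <algorithm>
#include <vector>

// Sketch only: `means` holds centroid means in sorted order, `pos` is the
// centroid the requested rank falls into, `value` is the interpolated
// estimate, and `globalMin`/`globalMax` stand in for min_/max_.
double clampEstimate(
    const std::vector<double>& means,
    size_t pos,
    double value,
    double globalMin,
    double globalMax) {
  // Old behavior: clamp(value, globalMin, globalMax). A single extreme
  // sample widens that range, so interpolation can overshoot badly.
  // New behavior: clamp to the neighboring centroid means, falling back to
  // the global bounds only at the ends.
  double lo = (pos == 0) ? globalMin : means[pos - 1];
  double hi = (pos + 1 < means.size()) ? means[pos + 1] : globalMax;
  return std::clamp(value, lo, hi);
}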

Reviewed By: yfeldblum

Differential Revision: D9192686

fbshipit-source-id: 171a7b7db5552802f91e6e95bb541a1b03c8821d
parent 7659f9ba
@@ -317,18 +317,24 @@ double TDigest::estimateQuantile(double q) const {
   }
   double delta = 0;
+  double min = min_;
+  double max = max_;
   if (centroids_.size() > 1) {
     if (pos == 0) {
       delta = centroids_[pos + 1].mean() - centroids_[pos].mean();
+      max = centroids_[pos + 1].mean();
     } else if (pos == centroids_.size() - 1) {
       delta = centroids_[pos].mean() - centroids_[pos - 1].mean();
+      min = centroids_[pos - 1].mean();
     } else {
       delta = (centroids_[pos + 1].mean() - centroids_[pos - 1].mean()) / 2;
+      min = centroids_[pos - 1].mean();
+      max = centroids_[pos + 1].mean();
     }
   }
   auto value = centroids_[pos].mean() +
       ((rank - t) / centroids_[pos].weight() - 0.5) * delta;
-  return clamp(value, min_, max_);
+  return clamp(value, min, max);
 }
 
 double TDigest::Centroid::add(double sum, double weight) {
@@ -259,6 +259,22 @@ TEST(TDigest, ConstructFromCentroids) {
   EXPECT_NE(digest1.getCentroids().size(), digest3.getCentroids().size());
 }
 
+TEST(TDigest, LargeOutlierTest) {
+  folly::TDigest digest(100);
+
+  std::vector<double> values;
+  for (double i = 0; i < 19; ++i) {
+    values.push_back(i);
+  }
+  values.push_back(1000000);
+  std::sort(values.begin(), values.end());
+  digest = digest.merge(values);
+
+  EXPECT_LT(
+      (int64_t)digest.estimateQuantile(0.5),
+      (int64_t)digest.estimateQuantile(0.90));
+}
+
 class DistributionTest
     : public ::testing::TestWithParam<
           std::tuple<std::pair<bool, size_t>, double, bool>> {};