Commit 856466e2, authored by Alfredo Altamirano, committed by Facebook Github Bot

Support constructing TDigest from centroids and getting centroids from TDigest

Summary:
To implement tdigest aggregation in scuba, we need to be able to deserialize and serialize the digests.
I added a getter for the centroids so that we can serialize the tdigest, and added a constructor to create one from the list of centroids.

Last diff was reverted because of a build error :(
We were seeing an error only in opt mode because it's an error in gcc but not in clang.

Differential Revision: D8381617

fbshipit-source-id: 768a12795aeb02737eb9b060f80d01608685c91d
parent f2c8ffe5
......@@ -75,6 +75,32 @@ static double clamp(double v, double lo, double hi) {
return v;
}
/*
 * Builds a digest directly from a list of centroids plus the summary
 * statistics (sum, count, max, min) that accompany them.
 *
 * A centroid list that fits within maxSize is adopted unchanged; a larger
 * list is compressed by merging it into an empty digest of size maxSize.
 */
TDigest::TDigest(
    std::vector<Centroid> centroids,
    double sum,
    double count,
    double max_val,
    double min_val,
    size_t maxSize)
    : maxSize_(maxSize),
      sum_(sum),
      count_(count),
      max_(max_val),
      min_(min_val) {
  const bool fits = centroids.size() <= maxSize_;
  if (fits) {
    centroids_ = std::move(centroids);
    return;
  }

  // Too many centroids for this digest, so they have to be compressed.
  // A merge result takes the maxSize of the first digest in the list, hence
  // the empty digest with the requested maxSize goes first. The oversized
  // input goes second, wrapped in a digest whose maxSize equals its own
  // length so that it is taken as-is by the branch above.
  const auto inputSize = centroids.size();
  std::array<TDigest, 2> digests{{
      TDigest(maxSize_),
      TDigest(std::move(centroids), sum_, count_, max_, min_, inputSize),
  }};
  *this = merge(digests);
}
TDigest TDigest::merge(Range<const double*> sortedValues) const {
if (sortedValues.empty()) {
return *this;
......
......@@ -48,9 +48,44 @@ namespace folly {
*/
class TDigest {
public:
/*
 * One centroid of the digest: a mean paired with a weight.
 */
class Centroid {
 public:
  explicit Centroid(double mean = 0.0, double weight = 1.0)
      : mean_{mean}, weight_{weight} {}

  /* Mean of this centroid. */
  double mean() const {
    return mean_;
  }

  /* Weight of this centroid. */
  double weight() const {
    return weight_;
  }

  /*
   * Adds the sum/weight to this centroid, and returns the new sum.
   * Only declared here; the definition is not part of this class body.
   */
  inline double add(double sum, double weight);

  /* Orders centroids by mean only; the weight takes no part. */
  bool operator<(const Centroid& other) const {
    return mean_ < other.mean_;
  }

 private:
  double mean_;
  double weight_;
};
/*
 * Creates an empty digest with the given maxSize. Sum and count start at
 * zero; max and min start as NaN.
 */
explicit TDigest(size_t maxSize = 100)
    : maxSize_{maxSize}, sum_{0.0}, count_{0.0}, max_{NAN}, min_{NAN} {}
/*
 * Constructs a digest from an existing list of centroids together with its
 * sum, count, max and min. If more than maxSize centroids are supplied they
 * are compressed by merging; otherwise they are used as given.
 * NOTE(review): presumably the centroids must already be ordered by mean and
 * the statistics must be consistent with them - neither is checked; confirm.
 */
explicit TDigest(
std::vector<Centroid> centroids,
double sum,
double count,
double max_val,
double min_val,
size_t maxSize = 100);
/*
* Returns a new TDigest constructed with values merged from the current
* digest and the given sortedValues.
......@@ -92,34 +127,11 @@ class TDigest {
return centroids_.empty();
}
private:
/*
 * Mean/weight pair representing one centroid of the digest.
 */
struct Centroid {
  explicit Centroid(double mean = 0.0, double weight = 1.0)
      : mean_(mean), weight_(weight) {}

  double mean() const { return mean_; }
  double weight() const { return weight_; }

  /*
   * Adds the sum/weight to this centroid, and returns the new sum.
   */
  inline double add(double sum, double weight);

  // Centroids compare by mean alone; the weight is not consulted.
  bool operator<(const Centroid& other) const {
    return mean_ < other.mean_;
  }

 private:
  double mean_;
  double weight_;
};
/*
 * Read-only access to the centroids currently held by this digest. The
 * returned reference refers to the digest's own vector.
 */
const std::vector<Centroid>& getCentroids() const {
return centroids_;
}
private:
std::vector<Centroid> centroids_;
size_t maxSize_;
double sum_;
......
......@@ -219,6 +219,46 @@ TEST(TDigest, NegativeValuesMergeDigests) {
EXPECT_EQ(100, digest.estimateQuantile(1.0));
}
/*
 * Rebuilds digests from the centroids and summary statistics of an existing
 * digest and checks that:
 *  - with a maxSize that fits, sum/count/min/max and the centroid count all
 *    carry over unchanged;
 *  - with a maxSize one below the centroid count, sum/count/min/max still
 *    carry over but the centroid count changes (the input was compressed).
 */
TEST(TDigest, ConstructFromCentroids) {
  TDigest digest(100);

  std::vector<double> values;
  for (int i = 1; i <= 100; ++i) {
    values.push_back(i);
  }
  auto digest1 = digest.merge(values);

  size_t centroid_count = digest1.getCentroids().size();
  // digest3 below uses centroid_count - 1 as its maxSize; stop here rather
  // than let that unsigned subtraction wrap around.
  ASSERT_GT(centroid_count, 0u);

  // Same maxSize as the source digest: the centroids fit and are kept as-is.
  TDigest digest2(
      digest1.getCentroids(),
      digest1.sum(),
      digest1.count(),
      digest1.max(),
      digest1.min(),
      100);

  EXPECT_EQ(digest1.sum(), digest2.sum());
  EXPECT_EQ(digest1.count(), digest2.count());
  EXPECT_EQ(digest1.min(), digest2.min());
  EXPECT_EQ(digest1.max(), digest2.max());
  EXPECT_EQ(digest1.getCentroids().size(), digest2.getCentroids().size());

  // One slot too few: the constructor has to compress the centroids.
  TDigest digest3(
      digest1.getCentroids(),
      digest1.sum(),
      digest1.count(),
      digest1.max(),
      digest1.min(),
      centroid_count - 1);

  EXPECT_EQ(digest1.sum(), digest3.sum());
  EXPECT_EQ(digest1.count(), digest3.count());
  EXPECT_EQ(digest1.min(), digest3.min());
  EXPECT_EQ(digest1.max(), digest3.max());
  EXPECT_NE(digest1.getCentroids().size(), digest3.getCentroids().size());
}
// Value-parameterized test fixture. Each parameter is a
// std::tuple<std::pair<bool, size_t>, double, bool>; what the individual
// fields mean is decided by the tests that use the fixture (not visible here).
class DistributionTest
: public ::testing::TestWithParam<
std::tuple<std::pair<bool, size_t>, double, bool>> {};
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment