Commit 0fbe9c5b authored by Tom Kwong's avatar Tom Kwong Committed by Facebook GitHub Bot

Add StreamingStats functionality

Summary:
This diff adds a new capability to calculate statistics efficiently in a single pass. In particular, it uses Welford's algorithm to maintain an internal state from which the variance and standard deviation can be obtained in O(1) time.

The following statistics are available:
- count
- minimum
- maximum
- mean
- sample and population variance
- sample and population standard deviation

Reviewed By: yfeldblum

Differential Revision: D26667517

fbshipit-source-id: c4eb31e93646b9cff2f6516039170c8d88a5bf33
parent 438afd7d
......@@ -213,8 +213,9 @@ Collections similar to `std::map` but implemented as sorted vectors.
#### `stats/`
A collection of efficient utilities for collecting statistics (often of
time series data).
A collection of efficient utilities for collecting statistics:
* time series counters, gauges, histograms, and quantiles;
* single-pass mean and variance.
#### `StlAllocator.h`
......
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <functional>
#include <limits>
#include <stdexcept>
#include <tuple>
#include <type_traits>
#include <folly/lang/Exception.h>
namespace folly {
// Robust and efficient online (single-pass) computation of statistics,
// using Welford's method for variance.
// https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm
//
// The accumulator keeps the sample count, the running mean, the running sum
// of squared deviations from the mean (m2_), and the smallest/largest sample
// seen. Variance and standard deviation are derived from m2_ on demand, so
// samples never have to be stored or revisited.
//
// SampleDataType is the type of the incoming samples; StatsType is the
// floating point type in which mean/variance/standard deviation are computed.
template <typename SampleDataType, typename StatsType = double>
class StreamingStats final {
  // Calculated statistic results have to be of a floating point type.
  static_assert(std::is_floating_point_v<StatsType>);

 public:
  /// Construct an accumulator holding every sample in [first, last).
  template <class Iterator>
  StreamingStats(Iterator first, Iterator last) noexcept {
    add(first, last);
  }

  StreamingStats() = default;
  ~StreamingStats() = default;

  /// Add sample data via iteration over [first, last).
  template <class Iterator>
  void add(Iterator first, Iterator last) noexcept {
    for (auto it = first; it != last; ++it) {
      add(*it);
    }
  }

  /// Add a single sample.
  void add(SampleDataType value) noexcept {
    max_ = std::max(max_, value);
    min_ = std::min(min_, value);
    ++count_;
    // Welford update: delta is taken against the old mean, delta2 against
    // the updated mean; their product is the increment of m2_.
    StatsType const delta = value - mean_;
    mean_ += delta / count_;
    StatsType const delta2 = value - mean_;
    m2_ += delta * delta2;
  }

  /// Merge with an existing StreamingStats object, so that this object
  /// describes the union of both sample sets. Merging an empty object is a
  /// no-op.
  void merge(StreamingStats const& other) {
    if (other.count_ == 0) {
      return;
    }
    max_ = std::max(max_, other.max_);
    min_ = std::min(min_, other.min_);

    if (count_ == 0) {
      // Nothing accumulated here yet: adopt the other state exactly rather
      // than recomputing it through a weighted mean, which could round.
      mean_ = other.mean_;
      m2_ = other.m2_;
      count_ = other.count_;
      return;
    }

    size_t const new_size = count_ + other.count_;
    StatsType const new_mean =
        (mean_ * count_ + other.mean_ * other.count_) / new_size;

    // Each cumulant must be corrected.
    //  * from: sum((x_i - mean_)²)
    //  * to:   sum((x_i - new_mean)²)
    // The difference is count * (new_mean - mean)². It is evaluated as an
    // explicit square, not as an expanded polynomial, so that it cannot
    // become negative through cancellation.
    auto correction = [&](StreamingStats const& stats) {
      StatsType const shift = new_mean - stats.mean_;
      return stats.count_ * shift * shift;
    };

    m2_ = m2_ + correction(*this) + other.m2_ + correction(other);
    mean_ = new_mean;
    count_ = new_size;
  }

  /// Number of samples added so far (directly or through merge).
  size_t count() const noexcept { return count_; }

  /// Smallest sample seen. Reports std::logic_error if there are no samples.
  SampleDataType minimum() const {
    checkMinimumDataSize(1);
    return min_;
  }

  /// Largest sample seen. Reports std::logic_error if there are no samples.
  SampleDataType maximum() const {
    checkMinimumDataSize(1);
    return max_;
  }

  /// Arithmetic mean. Reports std::logic_error if there are no samples.
  StatsType mean() const {
    checkMinimumDataSize(1);
    return mean_;
  }

  /// Variance with denominator N.
  /// Reports std::logic_error with fewer than two samples.
  StatsType populationVariance() const {
    checkMinimumDataSize(2);
    return var_(0);
  }

  /// Variance with denominator N - 1.
  /// Reports std::logic_error with fewer than two samples.
  StatsType sampleVariance() const {
    checkMinimumDataSize(2);
    return var_(1);
  }

  /// Square root of populationVariance().
  /// Reports std::logic_error with fewer than two samples.
  StatsType populationStandardDeviation() const {
    checkMinimumDataSize(2);
    return std_(0);
  }

  /// Square root of sampleVariance().
  /// Reports std::logic_error with fewer than two samples.
  StatsType sampleStandardDeviation() const {
    checkMinimumDataSize(2);
    return std_(1);
  }

 private:
  // Reports std::logic_error (through throw_exception) when fewer than
  // minElements samples have been accumulated.
  void checkMinimumDataSize(size_t const minElements) const {
    if (count_ < minElements) {
      throw_exception<std::logic_error>(
          "stats: unavailable with too few samples");
    }
  }

  // m2_ divided by (count_ - bias): bias 0 gives the population variance,
  // bias 1 the sample variance. Callers check count_ > bias beforehand.
  StatsType var_(size_t bias) const noexcept { return m2_ / (count_ - bias); }
  StatsType std_(size_t bias) const noexcept { return std::sqrt(var_(bias)); }

  size_t count_ = 0;
  StatsType mean_ = 0;
  // Running sum of squared deviations from the current mean.
  StatsType m2_ = 0;
  // Extremes start at the opposite ends of the range so that the first
  // sample always replaces them.
  SampleDataType min_ = std::numeric_limits<SampleDataType>::max();
  SampleDataType max_ = std::numeric_limits<SampleDataType>::lowest();
};
} // namespace folly
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <folly/stats/StreamingStats.h>
#include <vector>
#include <folly/portability/GTest.h>
using folly::StreamingStats;
using std::vector;
// An accumulator that never received a sample reports a count of zero and
// refuses to produce any statistic.
TEST(StreamingStatsTest, EmptyDataSet) {
  StreamingStats<int, double> emptyStats;

  EXPECT_EQ(emptyStats.count(), 0);

  // Extremes and mean.
  EXPECT_THROW(emptyStats.minimum(), std::logic_error);
  EXPECT_THROW(emptyStats.maximum(), std::logic_error);
  EXPECT_THROW(emptyStats.mean(), std::logic_error);

  // Population flavour of the dispersion statistics.
  EXPECT_THROW(emptyStats.populationVariance(), std::logic_error);
  EXPECT_THROW(emptyStats.populationStandardDeviation(), std::logic_error);

  // Sample flavour of the dispersion statistics.
  EXPECT_THROW(emptyStats.sampleVariance(), std::logic_error);
  EXPECT_THROW(emptyStats.sampleStandardDeviation(), std::logic_error);
}
// The iterator-pair constructor consumes the whole range.
TEST(StreamingStatsTest, Constructor) {
  std::vector<int> const samples = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};

  StreamingStats<int, double> fromRange(samples.begin(), samples.end());

  EXPECT_EQ(fromRange.count(), 10);
  EXPECT_DOUBLE_EQ(fromRange.mean(), 5.5);
  EXPECT_DOUBLE_EQ(fromRange.minimum(), 1.0);
  EXPECT_DOUBLE_EQ(fromRange.maximum(), 10.0);
}
// Typed fixture: before each test, `stats` is filled with the samples
// 1, 2, ..., 10 expressed in SampleDataType.
template <typename SampleDataType>
class StreamingStatsTest : public testing::Test {
 public:
  void SetUp() override {
    SampleDataType sample = 1.0;
    while (sample < 11.0) {
      stats.add(sample);
      sample += 1.0;
    }
  }

  StreamingStats<SampleDataType, double> stats;
};
// Sample types the typed fixture is instantiated with: both floating point
// and integral inputs, always computing results as double.
using InputDataTypes = ::testing::Types<double, float, long, int, short>;
TYPED_TEST_CASE(StreamingStatsTest, InputDataTypes);
// Checks every statistic against the known values for the samples 1..10
// loaded by the fixture.
TYPED_TEST(StreamingStatsTest, StatsCalculations) {
  auto const& accumulated = this->stats;

  EXPECT_EQ(accumulated.count(), 10);
  EXPECT_DOUBLE_EQ(accumulated.mean(), 5.5);
  EXPECT_DOUBLE_EQ(accumulated.minimum(), 1.0);
  EXPECT_DOUBLE_EQ(accumulated.maximum(), 10.0);

  // Denominator N.
  EXPECT_DOUBLE_EQ(accumulated.populationVariance(), 8.25);
  EXPECT_DOUBLE_EQ(
      accumulated.populationStandardDeviation(), 2.8722813232690143);

  // Denominator N - 1.
  EXPECT_DOUBLE_EQ(accumulated.sampleVariance(), 9.166666666666666);
  EXPECT_DOUBLE_EQ(accumulated.sampleStandardDeviation(), 3.0276503540974917);
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment