Use radix sort in TDigest

Summary: Update the TDigest interface to accept unsorted or sorted data. If unsorted, use a custom radix sort explicitly aimed at this use case: doubles only, assume small data, so we don't use in-place and everything still fits in cache (1000-10000 items). Previously D8875766. This version does not require newer boost versions, and puts everything in the heap, so there should be no fiber overflows. Reviewed By: yfeldblum Differential Revision: D9197722 fbshipit-source-id: 10088cf14d1ff88e1072c00084e09129271e97ec

Use radix sort in TDigest
Summary: Update the TDigest interface to accept unsorted or sorted data. If unsorted, use a custom radix sort explicitly aimed at this use case: doubles only, assume small data, so we don't use in-place and everything still fits in cache (1000-10000 items). Previously D8875766. This version does not require newer boost versions, and puts everything in the heap, so there should be no fiber overflows. Reviewed By: yfeldblum Differential Revision: D9197722 fbshipit-source-id: 10088cf14d1ff88e1072c00084e09129271e97ec
7fed85c2 · Dave Watson · Facebook Github Bot · 8f45e92c · 7fed85c2 · 7fed85c2
Commit 7fed85c2 authored Aug 15, 2018 by Dave Watson Committed by Facebook Github Bot Aug 15, 2018
9 changed files
--- a/folly/stats/TDigest.cpp
+++ b/folly/stats/TDigest.cpp
@@ -15,6 +15,7 @@
 */

 #include <folly/stats/TDigest.h>
+#include <folly/stats/detail/DoubleRadixSort.h>

 #include <algorithm>
 #include <limits>
@@ -101,7 +102,28 @@ TDigest::TDigest(
  }
 }

-TDigest TDigest::merge(Range<const double*> sortedValues) const {
+// Merge unsorted values by first sorting them.  Use radix sort if
+// possible.  This implementation puts all additional memory in the
+// heap, so that if called from fiber context we do not smash the
+// stack.  Otherwise it is very similar to boost::spreadsort.
+TDigest TDigest::merge(Range<const double*> unsortedValues) const {
+  auto n = unsortedValues.size();
+
+  // We require 256 buckets per byte level, plus one count array we can reuse.
+  std::unique_ptr<uint64_t[]> buckets{new uint64_t[256 * 9]};
+  // Allocate input and tmp array
+  std::unique_ptr<double[]> tmp{new double[n * 2]};
+  auto out = tmp.get() + n;
+  auto in = tmp.get();
+  std::copy(unsortedValues.begin(), unsortedValues.end(), in);
+
+  detail::double_radix_sort(n, buckets.get(), in, out);
+  DCHECK(std::is_sorted(in, in + n));
+
+  return merge(presorted, Range<const double*>(in, in + n));
+}
+
+TDigest TDigest::merge(presorted_t, Range<const double*> sortedValues) const {
  if (sortedValues.empty()) {
    return *this;
  }

--- a/folly/stats/TDigest.h
+++ b/folly/stats/TDigest.h
@@ -20,6 +20,7 @@
 #include <vector>

 #include <folly/Range.h>
+#include <folly/Utility.h>

 namespace folly {

@@ -92,7 +93,8 @@ class TDigest {
   * Returns a new TDigest constructed with values merged from the current
   * digest and the given sortedValues.
   */
-  TDigest merge(Range<const double*> sortedValues) const;
+  TDigest merge(presorted_t, Range<const double*> sortedValues) const;
+  TDigest merge(Range<const double*> unsortedValues) const;

  /*
   * Returns a new TDigest constructed with values merged from the given

--- a/folly/stats/detail/DigestBuilder-defs.h
+++ b/folly/stats/detail/DigestBuilder-defs.h
@@ -64,7 +64,6 @@ DigestT DigestBuilder<DigestT>::build() {
    for (const auto& vec : valuesVec) {
      values.insert(values.end(), vec.begin(), vec.end());
    }
-    std::sort(values.begin(), values.end());
    DigestT digest(digestSize_);
    digests.push_back(digest.merge(values));
  }
@@ -83,7 +82,6 @@ void DigestBuilder<DigestT>::append(double value) {
  }
  cpuLocalBuf->buffer.push_back(value);
  if (cpuLocalBuf->buffer.size() == bufferSize_) {
-    std::sort(cpuLocalBuf->buffer.begin(), cpuLocalBuf->buffer.end());
    if (!cpuLocalBuf->digest) {
      cpuLocalBuf->digest = std::make_unique<DigestT>(digestSize_);
    }

--- a/folly/stats/detail/DoubleRadixSort.cpp
+++ b/folly/stats/detail/DoubleRadixSort.cpp
+/*
+ * Copyright 2018-present Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <folly/stats/detail/DoubleRadixSort.h>
+
+#include <algorithm>
+#include <cstring>
+
+namespace folly {
+namespace detail {
+
+// Convert floats to something comparable via radix sort.
+// See http://stereopsis.com/radix.html for details.
+static uint8_t getRadixBucket(double* f, uint8_t shift) {
+  uint64_t val;
+  memcpy(&val, f, sizeof(double));
+  uint64_t mask = -int64_t(val >> 63) | 0x8000000000000000;
+  auto adjusted = val ^ mask;
+  return (adjusted >> (64 - 8 - shift)) & 0xFF;
+}
+
+// MSB radix sort for doubles.
+static void double_radix_sort_rec(
+    uint64_t n,
+    uint64_t* buckets,
+    uint8_t shift,
+    bool inout,
+    double* in,
+    double* out) {
+  // First pass: calculate bucket counts.
+  memset(buckets, 0, 256 * sizeof(uint64_t));
+  for (uint64_t i = 0; i < n; i++) {
+    buckets[getRadixBucket(&in[i], shift)]++;
+  }
+
+  // Second pass: calculate bucket start positions.
+  uint64_t tot = 0;
+  for (uint64_t i = 0; i < 256; i++) {
+    auto prev = tot;
+    tot += buckets[i];
+    buckets[i + 256] = prev;
+  }
+
+  // Third pass: Move based on radix counts.
+  for (uint64_t i = 0; i < n; i++) {
+    auto pos = buckets[getRadixBucket(&in[i], shift) + 256]++;
+    out[pos] = in[i];
+  }
+
+  // If we haven't used up all input bytes, recurse and sort.  if the
+  // bucket is too small, use std::sort instead, and copy output to
+  // correct array.
+  if (shift < 56) {
+    tot = 0;
+    for (int i = 0; i < 256; i++) {
+      if (buckets[i] < 256) {
+        std::sort(out + tot, out + tot + buckets[i]);
+        if (!inout) {
+          memcpy(in + tot, out + tot, buckets[i] * sizeof(double));
+        }
+      } else {
+        double_radix_sort_rec(
+            buckets[i], buckets + 256, shift + 8, !inout, out + tot, in + tot);
+      }
+      tot += buckets[i];
+    }
+  }
+}
+
+void double_radix_sort(uint64_t n, uint64_t* buckets, double* in, double* tmp) {
+  // If array is too small, use std::sort directly.
+  if (n < 700) {
+    std::sort(in, in + n);
+  } else {
+    detail::double_radix_sort_rec(n, buckets, 0, false, in, tmp);
+  }
+}
+
+} // namespace detail
+} // namespace folly
--- a/folly/stats/detail/DoubleRadixSort.h
+++ b/folly/stats/detail/DoubleRadixSort.h
+/*
+ * Copyright 2018-present Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cstdint>
+
+namespace folly {
+namespace detail {
+
+/*
+ * Sorts a double[] array using radix sort (falling back to std::sort
+ * for small arrays).
+ *
+ * n - size of array
+ * buckets - must be array of uint64_t of size 256*9.
+ * in & out - must be double arrays of size n.  in contains input data.
+ *
+ * output - in array is sorted.
+ */
+void double_radix_sort(uint64_t n, uint64_t* buckets, double* in, double* tmp);
+
+} // namespace detail
+} // namespace folly
--- a/folly/stats/detail/test/RadixSortTest.cpp
+++ b/folly/stats/detail/test/RadixSortTest.cpp
+/*
+ * Copyright 2018-present Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <folly/portability/GTest.h>
+
+#include <folly/Random.h>
+#include <folly/stats/detail/DoubleRadixSort.h>
+
+using namespace folly;
+using namespace folly::detail;
+
+TEST(DoubleRadixSort, Basic) {
+  std::unique_ptr<uint64_t[]> buckets(new uint64_t[256 * 9]);
+  for (int i = 0; i < 100; i++) {
+    size_t sz = folly::Random::rand32(0, 100000);
+    std::unique_ptr<double[]> in(new double[sz]);
+    std::unique_ptr<double[]> out(new double[sz]);
+    for (size_t j = 0; j < sz; j++) {
+      in[j] = folly::Random::randDouble(-100.0, 100.0);
+    }
+    double_radix_sort(sz, buckets.get(), in.get(), out.get());
+    EXPECT_TRUE(std::is_sorted(in.get(), in.get() + sz));
+  }
+}
--- a/folly/stats/test/DigestBuilderBenchmark.cpp
+++ b/folly/stats/test/DigestBuilderBenchmark.cpp
@@ -93,19 +93,19 @@ BENCHMARK_RELATIVE_NAMED_PARAM_MULTI(append, 10000x32, 10000, 32)
 * ============================================================================
 * folly/stats/test/DigestBuilderBenchmark.cpp     relative  time/iter  iters/s
 * ============================================================================
- * append(1000x1)                                              25.70ns   38.91M
- * append(1000x2)                                    99.12%    25.93ns   38.57M
- * append(1000x4)                                    98.62%    26.06ns   38.37M
- * append(1000x8)                                    96.23%    26.70ns   37.45M
- * append(1000x16)                                   88.75%    28.96ns   34.53M
- * append(1000x32)                                   66.81%    38.46ns   26.00M
+ * append(1000x1)                                              16.32ns   61.26M
+ * append(1000x2)                                    96.42%    16.93ns   59.07M
+ * append(1000x4)                                    93.57%    17.44ns   57.33M
+ * append(1000x8)                                    93.43%    17.47ns   57.24M
+ * append(1000x16)                                   92.96%    17.56ns   56.95M
+ * append(1000x32)                                   29.23%    55.84ns   17.91M
 * ----------------------------------------------------------------------------
- * append(10000x1)                                             25.39ns   39.38M
- * append(10000x2)                                   98.81%    25.70ns   38.91M
- * append(10000x4)                                   98.76%    25.71ns   38.90M
- * append(10000x8)                                   98.95%    25.66ns   38.97M
- * append(10000x16)                                  72.90%    34.83ns   28.71M
- * append(10000x32)                                  85.15%    29.82ns   33.54M
+ * append(10000x1)                                             11.58ns   86.33M
+ * append(10000x2)                                   95.82%    12.09ns   82.72M
+ * append(10000x4)                                   89.00%    13.01ns   76.84M
+ * append(10000x8)                                   88.63%    13.07ns   76.51M
+ * append(10000x16)                                  88.22%    13.13ns   76.16M
+ * append(10000x32)                                  24.89%    46.54ns   21.49M
 * ============================================================================
 */


--- a/folly/stats/test/TDigestBenchmark.cpp
+++ b/folly/stats/test/TDigestBenchmark.cpp
@@ -46,7 +46,7 @@ void merge(unsigned int iters, size_t maxSize, size_t bufSize) {
  }

  for (const auto& buffer : buffers) {
-    digest = digest.merge(buffer);
+    digest = digest.merge(folly::presorted, buffer);
  }
 }

@@ -64,7 +64,6 @@ void mergeDigests(unsigned int iters, size_t maxSize, size_t nDigests) {
      for (size_t j = 0; j < maxSize; ++j) {
        buffer.push_back(distribution(generator));
      }
-      std::sort(buffer.begin(), buffer.end());
      digests.push_back(digest.merge(buffer));
    }
  }
@@ -95,7 +94,6 @@ void estimateQuantile(unsigned int iters, size_t maxSize, double quantile) {
      for (size_t j = 0; j < bufSize; ++j) {
        buffer.push_back(values[i * bufSize + j]);
      }
-      std::sort(buffer.begin(), buffer.end());
      digest = digest.merge(buffer);
    }
  }

--- a/folly/stats/test/TDigestTest.cpp
+++ b/folly/stats/test/TDigestTest.cpp
@@ -136,10 +136,9 @@ TEST(TDigest, MergeLargeAsDigests) {
  // Ensure that the values do not monotonically increase across digests.
  std::random_shuffle(values.begin(), values.end());
  for (int i = 0; i < 10; ++i) {
-    std::vector<double> sorted(
+    std::vector<double> unsorted_values(
        values.begin() + (i * 100), values.begin() + (i + 1) * 100);
-    std::sort(sorted.begin(), sorted.end());
-    digests.push_back(digest.merge(sorted));
+    digests.push_back(digest.merge(unsorted_values));
  }

  digest = TDigest::merge(digests);
@@ -166,8 +165,6 @@ TEST(TDigest, NegativeValues) {
    values.push_back(-i);
  }

-  std::sort(values.begin(), values.end());
-
  digest = digest.merge(values);

  EXPECT_EQ(200, digest.count());
@@ -195,9 +192,6 @@ TEST(TDigest, NegativeValuesMergeDigests) {
    negativeValues.push_back(-i);
  }

-  std::sort(values.begin(), values.end());
-  std::sort(negativeValues.begin(), negativeValues.end());
-
  auto digest1 = digest.merge(values);
  auto digest2 = digest.merge(negativeValues);

@@ -365,9 +359,6 @@ TEST_P(DistributionTest, ReasonableError) {

    std::vector<TDigest> digests;
    for (size_t i = 0; i < kNumSamples / 1000; ++i) {
-      auto it_l = values.begin() + (i * 1000);
-      auto it_r = it_l + 1000;
-      std::sort(it_l, it_r);
      folly::Range<const double*> r(values, i * 1000, 1000);
      if (digestMerge) {
        digests.push_back(digest.merge(r));