Do not read the end of the list on EliasFanoReader construction

Summary: Move size and upper bound accounting to `UpperBitsReader`, so we don't need to read the last value on construction, which can be expensive when opening a large number of small lists. This also makes the maximum `ValueType` representable, and lifts the requirement that the provided upper bound at construction time must be equal to the last element in the list, allowing multiple lists to share the encoding of the upper bound. Based on initial work by swaroopnm. Reviewed By: philippv Differential Revision: D32130075 fbshipit-source-id: e98a053b46b10c435ac0d402ff94a56fcfe095ad

Do not read the end of the list on EliasFanoReader construction
Summary: Move size and upper bound accounting to `UpperBitsReader`, so we don't need to read the last value on construction, which can be expensive when opening a large number of small lists. This also makes the maximum `ValueType` representable, and lifts the requirement that the provided upper bound at construction time must be equal to the last element in the list, allowing multiple lists to share the encoding of the upper bound. Based on initial work by swaroopnm. Reviewed By: philippv Differential Revision: D32130075 fbshipit-source-id: e98a053b46b10c435ac0d402ff94a56fcfe095ad
928aa5de · Giuseppe Ottaviano · Facebook GitHub Bot · 446ac5fa · 928aa5de · 928aa5de
Commit 928aa5de authored Nov 04, 2021 by Giuseppe Ottaviano Committed by Facebook GitHub Bot Nov 04, 2021
3 changed files
--- a/folly/experimental/EliasFanoCoding.h
+++ b/folly/experimental/EliasFanoCoding.h
--- a/folly/experimental/test/CodingTestUtils.h
+++ b/folly/experimental/test/CodingTestUtils.h
@@ -249,22 +249,30 @@ void testSkipTo(const std::vector<uint64_t>& data, const List& list) {
    EXPECT_EQ(reader.value(), data[0]);
    EXPECT_EQ(reader.position(), 0);
  }
-  {
-    // Skip past the last element.
-    Reader reader(list);
-    EXPECT_FALSE(reader.skipTo(data.back() + 1));
-    EXPECT_FALSE(reader.valid());
-    EXPECT_EQ(reader.position(), reader.size());
-    EXPECT_FALSE(reader.next());
+
+  // Skip past the last element, when possible. Make sure to probe values far
+  // from the last element, as the reader implementation may keep an internal
+  // upper bound larger than that, and we need to make sure we exercise skipping
+  // both before and after that.
+  using ValueType = typename Reader::ValueType;
+  std::vector<ValueType> valuesPastTheEnd;
+  const auto lastValue = data.back();
+  const auto kMaxValue = std::numeric_limits<ValueType>::max();
+  // Keep doubling the distance from the last value until we overflow.
+  for (ValueType value = lastValue + 1; value > lastValue;
+       value += value - lastValue) {
+    valuesPastTheEnd.push_back(value);
  }
-  {
-    // Skip to maximum integer.
+  if (kMaxValue != lastValue) {
+    valuesPastTheEnd.push_back(kMaxValue);
+  }
+
+  for (auto value : valuesPastTheEnd) {
    Reader reader(list);
-    using ValueType = typename Reader::ValueType;
-    EXPECT_FALSE(reader.skipTo(std::numeric_limits<ValueType>::max()));
-    EXPECT_FALSE(reader.valid());
-    EXPECT_EQ(reader.position(), reader.size());
-    EXPECT_FALSE(reader.next());
+    EXPECT_FALSE(reader.skipTo(value)) << value << " " << lastValue;
+    EXPECT_FALSE(reader.valid()) << value << " " << lastValue;
+    EXPECT_EQ(reader.position(), reader.size()) << value << " " << lastValue;
+    EXPECT_FALSE(reader.next()) << value << " " << lastValue;
  }
 }

@@ -284,9 +292,9 @@ void testJump(const std::vector<uint64_t>& data, const List& list) {
  for (auto i : is) {
    // Also test idempotency.
    for (size_t round = 0; round < 2; ++round) {
-      EXPECT_TRUE(reader.jump(i));
-      EXPECT_EQ(reader.value(), data[i]);
-      EXPECT_EQ(reader.position(), i);
+      EXPECT_TRUE(reader.jump(i)) << i << " " << data.size();
+      EXPECT_EQ(reader.value(), data[i]) << i << " " << data.size();
+      EXPECT_EQ(reader.position(), i) << i << " " << data.size();
    }
    maybeTestPreviousValue(data, reader, i);
    maybeTestPrevious(data, reader, i);
@@ -332,9 +340,11 @@ void testJumpTo(const std::vector<uint64_t>& data, const List& list) {
    EXPECT_EQ(reader.position(), std::distance(data.begin(), it));
  }

+  if (data.back() != std::numeric_limits<ValueType>::max()) {
    EXPECT_FALSE(reader.jumpTo(data.back() + 1));
    EXPECT_FALSE(reader.valid());
    EXPECT_EQ(reader.position(), reader.size());
+  }
 }

 template <class Reader, class Encoder>
@@ -360,11 +370,20 @@ void testEmpty() {
  }
 }

+// `upperBoundExtension` is required to inject additional 0-blocks
+// at the end of the list. This allows us to test lists with a large gap between
+// last element and universe upper bound, to exercise bounds-checking when
+// skipping past the last element
 template <class Reader, class Encoder>
-void testAll(const std::vector<uint64_t>& data) {
+void testAll(
+    const std::vector<uint64_t>& data, uint64_t upperBoundExtension = 0) {
  SCOPED_TRACE(__PRETTY_FUNCTION__);

-  auto list = Encoder::encode(data.begin(), data.end());
+  Encoder encoder(data.size(), data.back() + upperBoundExtension);
+  for (const auto value : data) {
+    encoder.add(value);
+  }
+  auto list = encoder.finish();
  testNext<Reader>(data, list);
  testSkip<Reader>(data, list);
  testSkipTo<Reader>(data, list);

--- a/folly/experimental/test/EliasFanoCodingTest.cpp
+++ b/folly/experimental/test/EliasFanoCodingTest.cpp
@@ -21,6 +21,7 @@
 #include <vector>

 #include <folly/Benchmark.h>
+#include <folly/Random.h>
 #include <folly/experimental/EliasFanoCoding.h>
 #include <folly/experimental/Select64.h>
 #include <folly/experimental/test/CodingTestUtils.h>
@@ -106,12 +107,23 @@ class EliasFanoCodingTest : public ::testing::Test {
    using Reader = EliasFanoReader<Encoder, instructions::Default, false>;
    testAll<Reader, Encoder>({0});
    testAll<Reader, Encoder>(generateRandomList(100 * 1000, 10 * 1000 * 1000));
+    // Test a list with size multiple of kForwardQuantum and universe multiple
+    // of kSkipQuantum, to exercise corner cases in the construction of forward
+    // and skip lists.
+    testAll<Reader, Encoder>(generateRandomList(
+        std::max<size_t>(8 * kForwardQuantum, 1024),
+        std::max<size_t>(16 * kSkipQuantum, 2048)));
    testAll<Reader, Encoder>(generateRandomList(
        100 * 1000, 10 * 1000 * 1000, /* withDuplicates */ true));
    testAll<Reader, Encoder>(generateSeqList(1, 100000, 100));
-    // max() cannot be read, as it is assumed an invalid value.
-    // TODO(ott): It should be possible to lift this constraint.
-    testAll<Reader, Encoder>({0, 1, std::numeric_limits<uint32_t>::max() - 1});
+    testAll<Reader, Encoder>({0, 1, std::numeric_limits<uint32_t>::max()});
+    // Test data with additional trailing 0s in the upperBits by extending
+    // the upper bound.
+    constexpr uint64_t minUpperBoundExtension = 2;
+    constexpr uint64_t maxUpperBoundExtension = 1024;
+    testAll<Reader, Encoder>(
+        generateRandomList(100 * 1000, 10 * 1000 * 1000),
+        folly::Random::rand32(minUpperBoundExtension, maxUpperBoundExtension));
  }

  template <size_t kSkipQuantum, size_t kForwardQuantum, typename ValueType>
@@ -135,12 +147,12 @@ class EliasFanoCodingTest : public ::testing::Test {
    using Reader = EliasFanoReader<Encoder, instructions::Default, false>;
    constexpr size_t kMaxU16 = std::numeric_limits<uint16_t>::max();

-    // kMaxU16 is reserved for both value and size.
+    // Max SizeType value is reserved.
    testAll<Reader, Encoder>(generateSeqList(1, kMaxU16 - 1));
    // Test various sizes close to the limit.
    for (size_t i = 1; i <= 16; ++i) {
-      testAll<Reader, Encoder>(generateRandomList(
-          kMaxU16 - i, kMaxU16 - 1, /* withDuplicates */ true));
+      testAll<Reader, Encoder>(
+          generateRandomList(kMaxU16 - i, kMaxU16, /* withDuplicates */ true));
    }
  }

@@ -155,27 +167,43 @@ TEST_F(EliasFanoCodingTest, Empty) {
  doTestEmpty();
 }

-TEST_F(EliasFanoCodingTest, Simple) {
+TEST_F(EliasFanoCodingTest, Simple32Bit) {
  doTestAll<0, 0, uint32_t>();
+}
+TEST_F(EliasFanoCodingTest, Simple64Bit) {
  doTestAll<0, 0, uint64_t>();
+}
+TEST_F(EliasFanoCodingTest, SimpleDense) {
  doTestDenseAll<0, 0>();
 }

-TEST_F(EliasFanoCodingTest, SkipPointers) {
+TEST_F(EliasFanoCodingTest, SkipPointers32Bit) {
  doTestAll<128, 0, uint32_t>();
+}
+TEST_F(EliasFanoCodingTest, SkipPointers64Bit) {
  doTestAll<128, 0, uint64_t>();
+}
+TEST_F(EliasFanoCodingTest, SkipPointersDense) {
  doTestDenseAll<128, 0>();
 }

-TEST_F(EliasFanoCodingTest, ForwardPointers) {
+TEST_F(EliasFanoCodingTest, ForwardPointers32Bit) {
  doTestAll<0, 128, uint32_t>();
+}
+TEST_F(EliasFanoCodingTest, ForwardPointers64Bit) {
  doTestAll<0, 128, uint64_t>();
+}
+TEST_F(EliasFanoCodingTest, ForwardPointersDense) {
  doTestDenseAll<0, 128>();
 }

-TEST_F(EliasFanoCodingTest, SkipForwardPointers) {
+TEST_F(EliasFanoCodingTest, SkipForwardPointers32Bit) {
  doTestAll<128, 128, uint32_t>();
+}
+TEST_F(EliasFanoCodingTest, SkipForwardPointers64Bit) {
  doTestAll<128, 128, uint64_t>();
+}
+TEST_F(EliasFanoCodingTest, SkipForwardPointersDense) {
  doTestDenseAll<128, 128>();
 }