Commit 53e6886f authored by Nathan Bronson, committed by Sara Golemon

Move AtomicUnorderedInsertMap to folly.

Summary: AtomicUnorderedInsertMap is a concurrent hash table that sits firmly
at the performance end of the generality <-> performance spectrum.
If you don't need updates (or can use your own concurrency control when
overwriting values), you never need to delete, and you can predict your
capacity perfectly, then you will get wait-free reads, lock-free inserts,
safe concurrent iteration, and excellent cache and performance outlier
behavior.  Arbitrary key and value types are supported.
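
A minimal usage sketch (based on the unit test added in this diff; the
header is folly/AtomicUnorderedMap.h, and example() is just an illustrative
wrapper, not part of the change):

  #include <folly/AtomicUnorderedMap.h>
  #include <string>

  void example() {
    // Capacity is chosen up front and never grows, per "predict your
    // capacity perfectly" above.
    folly::AtomicUnorderedInsertMap<std::string, std::string> m(100);
    m.emplace("abc", "ABC");   // lock-free insert
    auto it = m.find("abc");   // wait-free read
    if (it != m.cend()) {
      // it->first == "abc", it->second == "ABC"
    }
  }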

Reviewed By: @yfeldblum

Differential Revision: D2145281
parent fe6e73a6
(The diff for folly/AtomicUnorderedMap.h itself is collapsed and not shown here.)
@@ -28,6 +28,7 @@ nobase_follyinclude_HEADERS = \
 	AtomicHashMap-inl.h \
 	AtomicLinkedList.h \
 	AtomicStruct.h \
+	AtomicUnorderedMap.h \
 	Baton.h \
 	Benchmark.h \
 	Bits.h \
@@ -39,6 +40,7 @@ nobase_follyinclude_HEADERS = \
 	CpuId.h \
 	CPortability.h \
 	detail/AtomicHashUtils.h \
+	detail/AtomicUnorderedMapUtils.h \
 	detail/BitIteratorDetail.h \
 	detail/BitsDetail.h \
 	detail/CacheLocality.h \
...
#pragma once

#include <atomic>
#include <assert.h>
#include <errno.h>
#include <stdint.h>
#include <sys/mman.h>
#include <unistd.h>
#include <system_error>
#include <type_traits>

namespace folly { namespace detail {

class MMapAlloc {
 private:
  // Round the requested size up to a whole number of pages.
  size_t computeSize(size_t size) {
    long pagesize = sysconf(_SC_PAGESIZE);
    size_t mmapLength = ((size - 1) & ~(pagesize - 1)) + pagesize;
    assert(size <= mmapLength && mmapLength < size + pagesize);
    assert((mmapLength % pagesize) == 0);
    return mmapLength;
  }

 public:
  void* allocate(size_t size) {
    auto len = computeSize(size);

    // MAP_HUGETLB is a perf win, but requires cooperation from the
    // deployment environment (and a change to computeSize()).
    void* mem = static_cast<void*>(mmap(
        nullptr,
        len,
        PROT_READ | PROT_WRITE,
        MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE,
        -1,
        0));
    if (mem == reinterpret_cast<void*>(-1)) {
      throw std::system_error(errno, std::system_category());
    }
    return mem;
  }

  void deallocate(void* p, size_t size) {
    auto len = computeSize(size);
    munmap(p, len);
  }
};

template<typename Allocator>
struct GivesZeroFilledMemory : public std::false_type {};

// Anonymous mmap-ed pages are zero-filled by the kernel, so memory obtained
// from MMapAlloc does not need to be zeroed again by its consumer.
template<>
struct GivesZeroFilledMemory<MMapAlloc> : public std::true_type {};

}}
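
For reference, MMapAlloc implements the minimal allocate/deallocate-with-size
protocol shown above, and the GivesZeroFilledMemory specialization records that
anonymous mmap pages arrive already zero-filled, which presumably lets the map
skip zero-initializing new storage. A small hedged exercise of the allocator
(demo() is illustrative only, not part of this diff):

  #include <folly/detail/AtomicUnorderedMapUtils.h>

  void demo() {
    folly::detail::MMapAlloc alloc;
    void* p = alloc.allocate(1 << 20);   // length is rounded up to whole pages
    alloc.deallocate(p, 1 << 20);        // pass the same size so the munmap
                                         // length matches the mmap length
  }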
/*
* Copyright 2015 Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <folly/AtomicUnorderedMap.h>

#include <semaphore.h>

#include <thread>
#include <unordered_map>

#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>

#include <folly/Benchmark.h>
#include <folly/Random.h>
#include <folly/test/DeterministicSchedule.h>
using namespace folly;
using namespace folly::test;
// A drop-in stand-in for std::atomic<T> that performs the same operations
// non-atomically.  The single-threaded benchmarks below use it to measure
// how much the atomic operations cost in the uncontended case.
template<class T>
struct non_atomic {
  T value;

  non_atomic() = default;
  non_atomic(const non_atomic&) = delete;
  constexpr /* implicit */ non_atomic(T desired) : value(desired) {}

  T operator+=(T arg) { value += arg; return load(); }

  T load(std::memory_order order = std::memory_order_seq_cst) const {
    return value;
  }

  /* implicit */
  operator T() const { return load(); }

  void store(T desired, std::memory_order order = std::memory_order_seq_cst) {
    value = desired;
  }

  T exchange(T desired, std::memory_order order = std::memory_order_seq_cst) {
    T old = load();
    store(desired);
    return old;
  }

  bool compare_exchange_weak(
      T& expected, T desired,
      std::memory_order success = std::memory_order_seq_cst,
      std::memory_order failure = std::memory_order_seq_cst) {
    if (value == expected) {
      value = desired;
      return true;
    }
    expected = value;
    return false;
  }

  bool compare_exchange_strong(
      T& expected, T desired,
      std::memory_order success = std::memory_order_seq_cst,
      std::memory_order failure = std::memory_order_seq_cst) {
    if (value == expected) {
      value = desired;
      return true;
    }
    expected = value;
    return false;
  }

  bool is_lock_free() const { return true; }
};
// Alias that defaults Atom to non_atomic, so the same map can be instantiated
// with plain operations (fast_map) or std::atomic operations (atomic_fast_map).
template<
    typename Key, typename Value, template<typename> class Atom = non_atomic>
using UnorderedInsertMap = AtomicUnorderedInsertMap<
    Key,
    Value,
    std::hash<Key>,
    std::equal_to<Key>,
    (boost::has_trivial_destructor<Key>::value &&
     boost::has_trivial_destructor<Value>::value),
    Atom,
    std::allocator<char>>;
TEST(AtomicUnorderedInsertMap, basic) {
AtomicUnorderedInsertMap<std::string,std::string> m(100);
m.emplace("abc", "ABC");
EXPECT_TRUE(m.find("abc") != m.cend());
EXPECT_EQ(m.find("abc")->first, "abc");
EXPECT_EQ(m.find("abc")->second, "ABC");
EXPECT_TRUE(m.find("def") == m.cend());
auto iter = m.cbegin();
EXPECT_TRUE(iter != m.cend());
EXPECT_TRUE(iter == m.find("abc"));
auto a = iter;
EXPECT_TRUE(a == iter);
auto b = iter;
++iter;
EXPECT_TRUE(iter == m.cend());
EXPECT_TRUE(a == b);
EXPECT_TRUE(a != iter);
a++;
EXPECT_TRUE(a == iter);
EXPECT_TRUE(a != b);
}
TEST(AtomicUnorderedInsertMap, value_mutation) {
AtomicUnorderedInsertMap<int, MutableAtom<int>> m(100);
for (int i = 0; i < 50; ++i) {
m.emplace(i, i);
}
m.find(1)->second.data++;
}
TEST(UnorderedInsertMap, value_mutation) {
UnorderedInsertMap<int, MutableData<int>> m(100);
for (int i = 0; i < 50; ++i) {
m.emplace(i, i);
}
m.find(1)->second.data++;
EXPECT_EQ(m.find(1)->second.data, 2);
}
BENCHMARK(lookup_int_int_hit, iters) {
std::unique_ptr<AtomicUnorderedInsertMap<int,size_t>> ptr = {};
size_t capacity = 100000;
BENCHMARK_SUSPEND {
ptr.reset(new AtomicUnorderedInsertMap<int,size_t>(capacity));
for (size_t i = 0; i < capacity; ++i) {
auto k = 3 * ((5641 * i) % capacity);
ptr->emplace(k, k + 1);
EXPECT_EQ(ptr->find(k)->second, k + 1);
}
}
for (size_t i = 0; i < iters; ++i) {
size_t k = 3 * (((i * 7919) ^ (i * 4001)) % capacity);
auto iter = ptr->find(k);
if (iter == ptr->cend() ||
iter->second != k + 1) {
auto jter = ptr->find(k);
EXPECT_TRUE(iter == jter);
}
EXPECT_EQ(iter->second, k + 1);
}
BENCHMARK_SUSPEND {
ptr.reset(nullptr);
}
}
struct PairHash {
size_t operator()(const std::pair<uint64_t,uint64_t>& pr) const {
return pr.first ^ pr.second;
}
};
void contendedRW(size_t itersPerThread,
size_t capacity,
size_t numThreads,
size_t readsPerWrite) {
typedef std::pair<uint64_t,uint64_t> Key;
typedef AtomicUnorderedInsertMap<Key,MutableAtom<uint32_t>,PairHash> Map;
std::unique_ptr<Map> ptr = {};
std::atomic<bool> go(false);
std::vector<std::thread> threads;
BENCHMARK_SUSPEND {
ptr.reset(new Map(capacity));
while (threads.size() < numThreads) {
threads.emplace_back([&](){
while (!go) {
std::this_thread::yield();
}
size_t reads = 0;
size_t writes = 0;
while (reads + writes < itersPerThread) {
auto r = Random::rand32();
Key key(reads + writes, r);
if (reads < writes * readsPerWrite ||
writes >= capacity / numThreads) {
// read needed
++reads;
auto iter = ptr->find(key);
EXPECT_TRUE(
iter == ptr->cend() ||
iter->second.data.load(std::memory_order_acquire) >= key.first);
} else {
++writes;
try {
auto pr = ptr->emplace(key, key.first);
if (!pr.second) {
pr.first->second.data++;
}
} catch (std::bad_alloc& x) {
LOG(INFO) << "bad alloc";
}
}
}
});
}
}
go = true;
for (auto& thr : threads) {
thr.join();
}
BENCHMARK_SUSPEND {
ptr.reset(nullptr);
}
}
// sudo nice -n -20 ~/fbcode/_bin/common/concurrency/experimental/atomic_unordered_map --benchmark --bm_min_iters=1000000
//
// without MAP_HUGETLB (default)
//
// ============================================================================
// common/concurrency/experimental/AtomicUnorderedMapTest.cpp   relative  time/iter  iters/s
// ============================================================================
// lookup_int_int_hit 20.05ns 49.89M
// contendedRW(small_32thr_99pct) 70.36ns 14.21M
// contendedRW(large_32thr_99pct) 164.23ns 6.09M
// contendedRW(large_32thr_99_9pct) 158.81ns 6.30M
// ============================================================================
//
// with MAP_HUGETLB hacked in
// ============================================================================
// lookup_int_int_hit 19.67ns 50.84M
// contendedRW(small_32thr_99pct) 62.46ns 16.01M
// contendedRW(large_32thr_99pct) 119.41ns 8.37M
// contendedRW(large_32thr_99_9pct) 111.23ns 8.99M
// ============================================================================
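// A hedged sketch (not part of this diff) of what "MAP_HUGETLB hacked in"
// could look like in MMapAlloc::allocate(): round the length up to the huge
// page size and add MAP_HUGETLB to the mmap flags.  The 2 MiB kHugePageSize
// is an assumption (the common x86_64 default), and the call fails unless
// the environment has hugetlb pages reserved:
//
//   size_t computeHugeSize(size_t size) {
//     constexpr size_t kHugePageSize = 2 * 1024 * 1024;
//     return ((size - 1) & ~(kHugePageSize - 1)) + kHugePageSize;
//   }
//   void* mem = mmap(nullptr, computeHugeSize(size), PROT_READ | PROT_WRITE,
//                    MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE | MAP_HUGETLB,
//                    -1, 0);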
BENCHMARK_NAMED_PARAM(contendedRW, small_32thr_99pct, 100000, 32, 99)
BENCHMARK_NAMED_PARAM(contendedRW, large_32thr_99pct, 100000000, 32, 99)
BENCHMARK_NAMED_PARAM(contendedRW, large_32thr_99_9pct, 100000000, 32, 999)
BENCHMARK_DRAW_LINE();
// sudo nice -n -20 ~/fbcode/_build/opt/site_integrity/quasar/experimental/atomic_unordered_map_test --benchmark --bm_min_iters=10000
// Single-threaded benchmarks to test how much better we are than
// std::unordered_map, and what the cost of using atomic operations is
// in the uncontended use case.
// ============================================================================
// std_map 1.20ms 832.58
// atomic_fast_map 511.35us 1.96K
// fast_map 196.28us 5.09K
// ============================================================================
BENCHMARK(std_map) {
std::unordered_map<long, long> m;
m.reserve(10000);
for (int i=0; i<10000; ++i) {
m.emplace(i,i);
}
for (int i=0; i<10000; ++i) {
auto a = m.find(i);
folly::doNotOptimizeAway(&*a);
}
}
BENCHMARK(atomic_fast_map) {
UnorderedInsertMap<long, long, std::atomic> m(10000);
for (int i=0; i<10000; ++i) {
m.emplace(i,i);
}
for (int i=0; i<10000; ++i) {
auto a = m.find(i);
folly::doNotOptimizeAway(&*a);
}
}
BENCHMARK(fast_map) {
UnorderedInsertMap<long, long> m(10000);
for (int i=0; i<10000; ++i) {
m.emplace(i,i);
}
for (int i=0; i<10000; ++i) {
auto a = m.find(i);
folly::doNotOptimizeAway(&*a);
}
}
int main(int argc, char ** argv) {
testing::InitGoogleTest(&argc, argv);
google::ParseCommandLineFlags(&argc, &argv, true);
int rv = RUN_ALL_TESTS();
folly::runBenchmarksOnFlag();
return rv;
}