Commit 65180b25 authored by Giuseppe Ottaviano, committed by Facebook GitHub Bot

Reduce memory usage of CoreCachedSharedPtr

Summary:
We only need as many slots as the number of L1 caches.

Also avoid allocating control blocks when the passed pointer has no managed object.

Reviewed By: philippv, luciang

Differential Revision: D29872059

fbshipit-source-id: 8c221b0523494c44a5c6828bafd26eeb00e573c4
parent 5fbc8492
......@@ -484,8 +484,9 @@ class CoreRawAllocator {
};
template <typename T, size_t Stripes>
CxxAllocatorAdaptor<T, typename CoreRawAllocator<Stripes>::Allocator>
getCoreAllocator(size_t stripe) {
FOLLY_EXPORT
CxxAllocatorAdaptor<T, typename CoreRawAllocator<Stripes>::Allocator>
getCoreAllocator(size_t stripe) {
// We cannot make sure that the allocator will be destroyed after
// all the objects allocated with it, so we leak it.
static Indestructible<CoreRawAllocator<Stripes>> allocator;
......
......@@ -17,18 +17,56 @@
#pragma once
#include <array>
#include <atomic>
#include <memory>
#include <folly/CppAttributes.h>
#include <folly/Portability.h>
#include <folly/Unit.h>
#include <folly/concurrency/CacheLocality.h>
#include <folly/container/Enumerate.h>
#include <folly/synchronization/Hazptr.h>
namespace folly {
// On mobile we do not expect high concurrency, and memory is more important, so
// use more conservative caching.
constexpr size_t kCoreCachedSharedPtrDefaultMaxSlots = kIsMobile ? 4 : 64;
// Backward-compatible alias for the pre-rename constant name, so any stale
// references to the old spelling keep compiling.
constexpr size_t kCoreCachedSharedPtrDefaultNumSlots =
    kCoreCachedSharedPtrDefaultMaxSlots;
namespace core_cached_shared_ptr_detail {
// Computes and caches the effective number of slots to use: at most kMaxSlots,
// clamped down to the number of L1 caches so that no memory is wasted on slots
// that could never reduce contention.
template <size_t kMaxSlots>
class SlotsConfig {
 public:
  // Lazily computes the slot count, exactly once per kMaxSlots instantiation
  // (guarded by the function-local static). FOLLY_EXPORT gives the local
  // static a single definition across shared objects. Safe to call repeatedly
  // and concurrently.
  FOLLY_EXPORT static void initialize() {
    FOLLY_MAYBE_UNUSED static const Unit _ = [] {
      // We need at most as many slots as the number of L1 caches, so we can
      // avoid wasting memory if more slots are requested.
      const auto l1Caches = CacheLocality::system().numCachesByLevel.front();
      // Clamp to [1, kMaxSlots]; the max() guards against a reported cache
      // count of 0.
      num_ = std::min(std::max<size_t>(1, l1Caches), kMaxSlots);
      return unit;
    }();
  }

  // Returns the current slot count. A relaxed load suffices: num_ only ever
  // transitions once, from its initial 1 to the computed value, and any
  // value it returns indexes a valid slot.
  static size_t num() { return num_.load(std::memory_order_relaxed); }

 private:
  static std::atomic<size_t> num_;
};

// Initialize with a valid num so that get() always returns a valid stripe, even
// if initialize() has not been called yet.
template <size_t kMaxSlots>
std::atomic<size_t> SlotsConfig<kMaxSlots>::num_{1};
// Check whether a shared_ptr is equivalent to a default-constructed one.
// Aliasing constructors make this subtle: a shared_ptr can store nullptr while
// still owning a managed object, or store a non-null pointer while owning
// nothing, so both the stored pointer and the ownership must be inspected.
template <class T>
bool isDefault(const std::shared_ptr<T>& p) {
  const bool storedIsNull = !p;
  const bool ownsNothing = p.use_count() == 0;
  return storedIsNull && ownsNothing;
}
} // namespace core_cached_shared_ptr_detail
/**
* This class creates core-local caches for a given shared_ptr, to
 *
*
* It has the same thread-safety guarantees as shared_ptr: it is safe
* to concurrently call get(), but reset()s must be synchronized with
reads and other reset()s.
*
* @author Giuseppe Ottaviano <ott@fb.com>
*/
template <class T, size_t kNumSlots = kCoreCachedSharedPtrDefaultNumSlots>
template <class T, size_t kMaxSlots = kCoreCachedSharedPtrDefaultMaxSlots>
class CoreCachedSharedPtr {
using SlotsConfig = core_cached_shared_ptr_detail::SlotsConfig<kMaxSlots>;
public:
CoreCachedSharedPtr() = default;
explicit CoreCachedSharedPtr(const std::shared_ptr<T>& p) { reset(p); }
void reset(const std::shared_ptr<T>& p = nullptr) {
SlotsConfig::initialize();
// Allocate each Holder in a different CoreRawAllocator stripe to
// prevent false sharing. Their control blocks will be adjacent
// thanks to allocate_shared().
for (auto slot : folly::enumerate(slots_)) {
auto alloc = getCoreAllocator<Holder, kNumSlots>(slot.index);
for (size_t i = 0; i < SlotsConfig::num(); ++i) {
// Try freeing the control block before allocating a new one.
slots_[i] = {};
if (!core_cached_shared_ptr_detail::isDefault(p)) {
auto alloc = getCoreAllocator<Holder, kMaxSlots>(i);
auto holder = std::allocate_shared<Holder>(alloc, p);
*slot = std::shared_ptr<T>(holder, p.get());
slots_[i] = std::shared_ptr<T>(holder, p.get());
}
}
}
std::shared_ptr<T> get() const {
return slots_[AccessSpreader<>::cachedCurrent(kNumSlots)];
return slots_[AccessSpreader<>::cachedCurrent(SlotsConfig::num())];
}
private:
......@@ -67,35 +112,38 @@ class CoreCachedSharedPtr {
template <class, size_t>
friend class CoreCachedWeakPtr;
std::array<std::shared_ptr<T>, kNumSlots> slots_;
std::array<std::shared_ptr<T>, kMaxSlots> slots_;
};
template <class T, size_t kNumSlots = kCoreCachedSharedPtrDefaultNumSlots>
template <class T, size_t kMaxSlots = kCoreCachedSharedPtrDefaultMaxSlots>
class CoreCachedWeakPtr {
using SlotsConfig = core_cached_shared_ptr_detail::SlotsConfig<kMaxSlots>;
public:
CoreCachedWeakPtr() = default;
explicit CoreCachedWeakPtr(const CoreCachedSharedPtr<T, kNumSlots>& p) {
explicit CoreCachedWeakPtr(const CoreCachedSharedPtr<T, kMaxSlots>& p) {
reset(p);
}
void reset() { *this = {}; }
void reset(const CoreCachedSharedPtr<T, kNumSlots>& p) {
for (auto slot : folly::enumerate(slots_)) {
*slot = p.slots_[slot.index];
void reset(const CoreCachedSharedPtr<T, kMaxSlots>& p) {
SlotsConfig::initialize();
for (size_t i = 0; i < SlotsConfig::num(); ++i) {
slots_[i] = p.slots_[i];
}
}
std::weak_ptr<T> get() const {
return slots_[AccessSpreader<>::cachedCurrent(kNumSlots)];
return slots_[AccessSpreader<>::cachedCurrent(SlotsConfig::num())];
}
// Faster than get().lock(), as it avoid one weak count cycle.
std::shared_ptr<T> lock() const {
return slots_[AccessSpreader<>::cachedCurrent(kNumSlots)].lock();
return slots_[AccessSpreader<>::cachedCurrent(SlotsConfig::num())].lock();
}
private:
std::array<std::weak_ptr<T>, kNumSlots> slots_;
std::array<std::weak_ptr<T>, kMaxSlots> slots_;
};
/**
 *
* get()s will never see a newer pointer on one core, and an older
* pointer on another after a subsequent thread migration.
*/
template <class T, size_t kNumSlots = kCoreCachedSharedPtrDefaultNumSlots>
template <class T, size_t kMaxSlots = kCoreCachedSharedPtrDefaultMaxSlots>
class AtomicCoreCachedSharedPtr {
using SlotsConfig = core_cached_shared_ptr_detail::SlotsConfig<kMaxSlots>;
public:
explicit AtomicCoreCachedSharedPtr(const std::shared_ptr<T>& p = nullptr) {
reset(p);
}
AtomicCoreCachedSharedPtr() = default;
explicit AtomicCoreCachedSharedPtr(const std::shared_ptr<T>& p) { reset(p); }
~AtomicCoreCachedSharedPtr() {
auto slots = slots_.load(std::memory_order_acquire);
// Delete of AtomicCoreCachedSharedPtr must be synchronized, no
// need for slots->retire().
if (slots) {
delete slots;
}
delete slots_.load(std::memory_order_acquire);
}
void reset(const std::shared_ptr<T>& p = nullptr) {
auto newslots = std::make_unique<Slots>();
SlotsConfig::initialize();
std::unique_ptr<Slots> newslots;
if (!core_cached_shared_ptr_detail::isDefault(p)) {
newslots = std::make_unique<Slots>();
// Allocate each Holder in a different CoreRawAllocator stripe to
// prevent false sharing. Their control blocks will be adjacent
// thanks to allocate_shared().
for (auto slot : folly::enumerate(newslots->slots_)) {
auto alloc = getCoreAllocator<Holder, kNumSlots>(slot.index);
for (size_t i = 0; i < SlotsConfig::num(); ++i) {
auto alloc = getCoreAllocator<Holder, kMaxSlots>(i);
auto holder = std::allocate_shared<Holder>(alloc, p);
*slot = std::shared_ptr<T>(holder, p.get());
newslots->slots[i] = std::shared_ptr<T>(holder, p.get());
}
}
auto oldslots = slots_.exchange(newslots.release());
if (oldslots) {
if (auto oldslots = slots_.exchange(newslots.release())) {
oldslots->retire();
}
}
std::shared_ptr<T> get() const {
folly::hazptr_local<1> hazptr;
auto slots = hazptr[0].protect(slots_);
if (!slots) {
if (auto slots = hazptr[0].protect(slots_)) {
return slots->slots[AccessSpreader<>::cachedCurrent(SlotsConfig::num())];
} else {
return nullptr;
}
return (slots->slots_)[AccessSpreader<>::cachedCurrent(kNumSlots)];
}
private:
using Holder = std::shared_ptr<T>;
struct Slots : folly::hazptr_obj_base<Slots> {
std::array<std::shared_ptr<T>, kNumSlots> slots_;
std::array<std::shared_ptr<T>, kMaxSlots> slots;
};
std::atomic<Slots*> slots_{nullptr};
};
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment