Commit 937fc980 authored by Giuseppe Ottaviano, committed by Facebook GitHub Bot

Use CoreCachedSharedPtr in Singleton

Summary: `CoreCachedSharedPtr` is almost as fast as `ReadMostlySharedPtr`, so we can use it to have a better default that does not have pathological behavior under heavy contention. `try_get_fast()` is still useful if we need to squeeze out the last cycle.
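For context, a minimal sketch of the idea behind `CoreCachedSharedPtr` (illustrative only, not this commit's code; folly's real implementation selects slots with `AccessSpreader` and allocates the holders in core-local stripes): each slot holds an aliasing `shared_ptr` with its own control block, so copies taken from different cores bump different refcounts instead of contending on a single cache line.

```cpp
#include <array>
#include <cstddef>
#include <functional>
#include <memory>
#include <thread>

// Hypothetical illustration; the class name and the slot-selection scheme are
// stand-ins, and p is assumed to be non-null.
template <class T, std::size_t kNumSlots = 64>
class NaiveCoreCachedSharedPtr {
 public:
  explicit NaiveCoreCachedSharedPtr(const std::shared_ptr<T>& p) {
    for (auto& slot : slots_) {
      // Each slot gets its own control block (the outer make_shared), which
      // keeps a copy of p alive; the aliasing constructor makes the slot
      // point at the same managed object.
      slot = std::shared_ptr<T>(std::make_shared<std::shared_ptr<T>>(p), p.get());
    }
  }

  std::shared_ptr<T> get() const {
    // Stand-in for folly::AccessSpreader<>::cachedCurrent(kNumSlots), which
    // picks a slot based on the caller's current core.
    auto idx = std::hash<std::thread::id>{}(std::this_thread::get_id()) % kNumSlots;
    return slots_[idx];
  }

 private:
  std::array<std::shared_ptr<T>, kNumSlots> slots_;
};
```

The trade-off is that construction and `reset()` touch all `kNumSlots` control blocks, which is also why this diff reduces the slot count on mobile, where memory matters more than concurrency.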

Reviewed By: philippv, luciang

Differential Revision: D29812053

fbshipit-source-id: 49e9e53444f8099dbfe13e36c3c07c1b57bb89fb
parent 4baba282
@@ -117,7 +117,7 @@ std::weak_ptr<T> SingletonHolder<T>::get_weak() {
     createInstance();
   }
-  return instance_weak_;
+  return instance_weak_core_cached_.get();
 }

 template <typename T>
@@ -128,7 +128,7 @@ std::shared_ptr<T> SingletonHolder<T>::try_get() {
     createInstance();
   }
-  return instance_weak_.lock();
+  return instance_weak_core_cached_.lock();
 }

 template <typename T>
@@ -182,6 +182,7 @@ void SingletonHolder<T>::destroyInstance() {
     }
   }
   state_ = SingletonHolderState::Dead;
+  instance_core_cached_.reset();
   instance_.reset();
   instance_copy_.reset();
   if (destroy_baton_) {
@@ -310,8 +311,10 @@ void SingletonHolder<T>::createInstance() {
   instance_weak_ = instance;
   instance_ptr_ = instance.get();
+  instance_core_cached_.reset(instance);
   instance_.reset(std::move(instance));
   instance_weak_fast_ = instance_;
+  instance_weak_core_cached_.reset(instance_core_cached_);
   destroy_baton_ = std::move(destroy_baton);
   print_destructor_stack_trace_ = std::move(print_destructor_stack_trace);
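Illustrative caller-side view of the `SingletonHolder` changes above (not part of the diff; `MyService` and `doWork()` are hypothetical): the accessor signatures are unchanged, `try_get()` and `get_weak()` simply go through the core-cached pointers now, while `try_get_fast()` still returns the `ReadMostly` variant.

```cpp
#include <memory>
#include <folly/Singleton.h>

struct MyService {
  void doWork() {}
};

// Typical registration at namespace scope.
static folly::Singleton<MyService> theServiceSingleton;

void handleRequest() {
  // After this change, resolved via instance_weak_core_cached_.lock().
  std::shared_ptr<MyService> s = folly::Singleton<MyService>::try_get();
  if (s) {
    s->doWork();
  }
  // Still available when the last cycle matters.
  folly::ReadMostlySharedPtr<MyService> fast =
      folly::Singleton<MyService>::try_get_fast();
  if (fast) {
    fast->doWork();
  }
}
```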
@@ -126,6 +126,7 @@
 #include <folly/Executor.h>
 #include <folly/Memory.h>
 #include <folly/Synchronized.h>
+#include <folly/concurrency/CoreCachedSharedPtr.h>
 #include <folly/detail/Singleton.h>
 #include <folly/detail/StaticSingletonManager.h>
 #include <folly/experimental/ReadMostlySharedPtr.h>
@@ -357,13 +358,18 @@ struct SingletonHolder : public SingletonHolderBase {
   folly::ReadMostlyMainPtr<T> instance_;
   // used to release all ReadMostlyMainPtrs at once
   folly::ReadMostlySharedPtr<T> instance_copy_;
-  // weak_ptr to the singleton instance, set when state is changed from Dead
-  // to Living. We never write to this object after initialization, so it is
-  // safe to read it from different threads w/o synchronization if we know
-  // that state is set to Living
+  // per-core shared_ptrs that in turn hold the instance shared_ptr, to avoid
+  // contention in acquiring them.
+  folly::CoreCachedSharedPtr<T> instance_core_cached_;
+  // weak references to the previous pointers. These are never written to after
+  // initialization, so they're safe to read without synchronization once the
+  // state has transitioned to Living.
+  // instance_weak_ is a reference to the main instance, so it is authoritative
+  // on whether the instance is expired.
   std::weak_ptr<T> instance_weak_;
   // Fast equivalent of instance_weak_
   folly::ReadMostlyWeakPtr<T> instance_weak_fast_;
+  folly::CoreCachedWeakPtr<T> instance_weak_core_cached_;
   // Time we wait on destroy_baton after releasing Singleton shared_ptr.
   std::shared_ptr<folly::Baton<>> destroy_baton_;
   T* instance_ptr_ = nullptr;
@@ -19,13 +19,16 @@
 #include <array>
 #include <memory>

+#include <folly/Portability.h>
 #include <folly/concurrency/CacheLocality.h>
 #include <folly/container/Enumerate.h>
 #include <folly/synchronization/Hazptr.h>

 namespace folly {

-constexpr size_t kCoreCachedSharedPtrDefaultNumSlots = 64;
+// On mobile we do not expect high concurrency, and memory is more important, so
+// use more conservative caching.
+constexpr size_t kCoreCachedSharedPtrDefaultNumSlots = kIsMobile ? 4 : 64;

 /**
  * This class creates core-local caches for a given shared_ptr, to
@@ -40,9 +43,8 @@ constexpr size_t kCoreCachedSharedPtrDefaultNumSlots = 64;
 template <class T, size_t kNumSlots = kCoreCachedSharedPtrDefaultNumSlots>
 class CoreCachedSharedPtr {
  public:
-  explicit CoreCachedSharedPtr(const std::shared_ptr<T>& p = nullptr) {
-    reset(p);
-  }
+  CoreCachedSharedPtr() = default;
+  explicit CoreCachedSharedPtr(const std::shared_ptr<T>& p) { reset(p); }

   void reset(const std::shared_ptr<T>& p = nullptr) {
     // Allocate each Holder in a different CoreRawAllocator stripe to
@@ -71,7 +73,13 @@ class CoreCachedSharedPtr {
 template <class T, size_t kNumSlots = kCoreCachedSharedPtrDefaultNumSlots>
 class CoreCachedWeakPtr {
  public:
+  CoreCachedWeakPtr() = default;
   explicit CoreCachedWeakPtr(const CoreCachedSharedPtr<T, kNumSlots>& p) {
+    reset(p);
+  }
+  void reset() { *this = {}; }
+  void reset(const CoreCachedSharedPtr<T, kNumSlots>& p) {
     for (auto slot : folly::enumerate(slots_)) {
       *slot = p.slots_[slot.index];
     }
@@ -81,6 +89,7 @@ class CoreCachedWeakPtr {
     return slots_[AccessSpreader<>::cachedCurrent(kNumSlots)];
   }

+  // Faster than get().lock(), as it avoid one weak count cycle.
   std::shared_ptr<T> lock() const {
     return slots_[AccessSpreader<>::cachedCurrent(kNumSlots)].lock();
   }
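A small usage sketch of the `CoreCachedSharedPtr`/`CoreCachedWeakPtr` API shape after the changes above (illustrative; `Widget` is a placeholder type): both classes are now default-constructible and resettable, which is what lets `SingletonHolder` keep them as members and rebind them in `createInstance()`/`destroyInstance()`.

```cpp
#include <memory>

#include <folly/concurrency/CoreCachedSharedPtr.h>

struct Widget {};

void example() {
  folly::CoreCachedSharedPtr<Widget> cached;    // default-constructible (new)
  folly::CoreCachedWeakPtr<Widget> weakCached;  // default-constructible (new)

  cached.reset(std::make_shared<Widget>());  // bind to an instance
  weakCached.reset(cached);                  // rebindable weak view (new)

  std::shared_ptr<Widget> a = cached.get();       // copy from a per-core slot
  std::shared_ptr<Widget> b = weakCached.lock();  // cheaper than get().lock()
  std::weak_ptr<Widget> w = weakCached.get();

  cached.reset();      // drop the per-core copies
  weakCached.reset();  // clear all weak slots
}
```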
@@ -315,7 +315,6 @@ TEST(Singleton, SharedPtrUsage) {
   auto shared_s1 = weak_s1.lock();
   EXPECT_EQ(shared_s1.get(), s1);
-  EXPECT_EQ(shared_s1.use_count(), 2);
   auto old_serial = shared_s1->serial_number;