Commit 65180b25 authored by Giuseppe Ottaviano, committed by Facebook GitHub Bot

Reduce memory usage of CoreCachedSharedPtr

Summary:
We only need as many slots as the number of L1 caches.

Also avoid allocating control blocks when the passed pointer has no managed object.

Reviewed By: philippv, luciang

Differential Revision: D29872059

fbshipit-source-id: 8c221b0523494c44a5c6828bafd26eeb00e573c4
parent 5fbc8492
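
The second summary point is subtler than it looks: with std::shared_ptr, "is null" and "has no managed object" are independent properties, because the aliasing constructor can pair any stored pointer with any (or no) ownership group. The diff below introduces an isDefault() helper that therefore has to check both. A minimal standalone sketch (not part of the commit) of the cases it distinguishes:

#include <cassert>
#include <memory>

int main() {
  int x = 42;

  // Non-null stored pointer, but no managed object: use_count() == 0.
  std::shared_ptr<int> unmanaged(std::shared_ptr<int>(), &x);
  assert(unmanaged != nullptr && unmanaged.use_count() == 0);

  // Null stored pointer, but a live managed object: use_count() > 0.
  auto owner = std::make_shared<int>(0);
  std::shared_ptr<int> nullAlias(owner, nullptr);
  assert(nullAlias == nullptr && nullAlias.use_count() == 2);

  // Only a pointer equivalent to default-constructed satisfies both checks.
  std::shared_ptr<int> empty;
  assert(empty == nullptr && empty.use_count() == 0);
}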
@@ -484,8 +484,9 @@ class CoreRawAllocator {
 };
 
 template <typename T, size_t Stripes>
-CxxAllocatorAdaptor<T, typename CoreRawAllocator<Stripes>::Allocator>
-getCoreAllocator(size_t stripe) {
+FOLLY_EXPORT
+CxxAllocatorAdaptor<T, typename CoreRawAllocator<Stripes>::Allocator>
+getCoreAllocator(size_t stripe) {
   // We cannot make sure that the allocator will be destroyed after
   // all the objects allocated with it, so we leak it.
   static Indestructible<CoreRawAllocator<Stripes>> allocator;
@@ -17,18 +17,56 @@
 #pragma once
 
 #include <array>
+#include <atomic>
 #include <memory>
 
+#include <folly/CppAttributes.h>
 #include <folly/Portability.h>
+#include <folly/Unit.h>
 #include <folly/concurrency/CacheLocality.h>
-#include <folly/container/Enumerate.h>
 #include <folly/synchronization/Hazptr.h>
 
 namespace folly {
 
 // On mobile we do not expect high concurrency, and memory is more important, so
 // use more conservative caching.
-constexpr size_t kCoreCachedSharedPtrDefaultNumSlots = kIsMobile ? 4 : 64;
+constexpr size_t kCoreCachedSharedPtrDefaultMaxSlots = kIsMobile ? 4 : 64;
+
+namespace core_cached_shared_ptr_detail {
+
+template <size_t kMaxSlots>
+class SlotsConfig {
+ public:
+  FOLLY_EXPORT static void initialize() {
+    FOLLY_MAYBE_UNUSED static const Unit _ = [] {
+      // We need at most as many slots as the number of L1 caches, so we can
+      // avoid wasting memory if more slots are requested.
+      const auto l1Caches = CacheLocality::system().numCachesByLevel.front();
+      num_ = std::min(std::max<size_t>(1, l1Caches), kMaxSlots);
+      return unit;
+    }();
+  }
+
+  static size_t num() { return num_.load(std::memory_order_relaxed); }
+
+ private:
+  static std::atomic<size_t> num_;
+};
+
+// Initialize with a valid num so that get() always returns a valid stripe, even
+// if initialize() has not been called yet.
+template <size_t kMaxSlots>
+std::atomic<size_t> SlotsConfig<kMaxSlots>::num_{1};
+
+// Check whether a shared_ptr is equivalent to default-constructed. Because of
+// aliasing constructors, there can be both nullptr with a managed object, and
+// non-nullptr with no managed object, so we need to check both.
+template <class T>
+bool isDefault(const std::shared_ptr<T>& p) {
+  return p == nullptr && p.use_count() == 0;
+}
+
+} // namespace core_cached_shared_ptr_detail
 
 /**
  * This class creates core-local caches for a given shared_ptr, to
@@ -36,29 +74,36 @@ constexpr size_t kCoreCachedSharedPtrDefaultNumSlots = kIsMobile ? 4 : 64;
  *
  * It has the same thread-safety guarantees as shared_ptr: it is safe
  * to concurrently call get(), but reset()s must be synchronized with
- * reads and other resets().
+ * reads and other reset()s.
  *
  * @author Giuseppe Ottaviano <ott@fb.com>
  */
-template <class T, size_t kNumSlots = kCoreCachedSharedPtrDefaultNumSlots>
+template <class T, size_t kMaxSlots = kCoreCachedSharedPtrDefaultMaxSlots>
 class CoreCachedSharedPtr {
+  using SlotsConfig = core_cached_shared_ptr_detail::SlotsConfig<kMaxSlots>;
+
  public:
   CoreCachedSharedPtr() = default;
   explicit CoreCachedSharedPtr(const std::shared_ptr<T>& p) { reset(p); }
 
   void reset(const std::shared_ptr<T>& p = nullptr) {
+    SlotsConfig::initialize();
     // Allocate each Holder in a different CoreRawAllocator stripe to
     // prevent false sharing. Their control blocks will be adjacent
     // thanks to allocate_shared().
-    for (auto slot : folly::enumerate(slots_)) {
-      auto alloc = getCoreAllocator<Holder, kNumSlots>(slot.index);
-      auto holder = std::allocate_shared<Holder>(alloc, p);
-      *slot = std::shared_ptr<T>(holder, p.get());
+    for (size_t i = 0; i < SlotsConfig::num(); ++i) {
+      // Try freeing the control block before allocating a new one.
+      slots_[i] = {};
+      if (!core_cached_shared_ptr_detail::isDefault(p)) {
+        auto alloc = getCoreAllocator<Holder, kMaxSlots>(i);
+        auto holder = std::allocate_shared<Holder>(alloc, p);
+        slots_[i] = std::shared_ptr<T>(holder, p.get());
+      }
     }
   }
 
   std::shared_ptr<T> get() const {
-    return slots_[AccessSpreader<>::cachedCurrent(kNumSlots)];
+    return slots_[AccessSpreader<>::cachedCurrent(SlotsConfig::num())];
   }
 
  private:
@@ -67,35 +112,38 @@ class CoreCachedSharedPtr {
   template <class, size_t>
   friend class CoreCachedWeakPtr;
 
-  std::array<std::shared_ptr<T>, kNumSlots> slots_;
+  std::array<std::shared_ptr<T>, kMaxSlots> slots_;
 };
 
-template <class T, size_t kNumSlots = kCoreCachedSharedPtrDefaultNumSlots>
+template <class T, size_t kMaxSlots = kCoreCachedSharedPtrDefaultMaxSlots>
 class CoreCachedWeakPtr {
+  using SlotsConfig = core_cached_shared_ptr_detail::SlotsConfig<kMaxSlots>;
+
  public:
   CoreCachedWeakPtr() = default;
 
-  explicit CoreCachedWeakPtr(const CoreCachedSharedPtr<T, kNumSlots>& p) {
+  explicit CoreCachedWeakPtr(const CoreCachedSharedPtr<T, kMaxSlots>& p) {
     reset(p);
   }
 
   void reset() { *this = {}; }
-  void reset(const CoreCachedSharedPtr<T, kNumSlots>& p) {
-    for (auto slot : folly::enumerate(slots_)) {
-      *slot = p.slots_[slot.index];
+  void reset(const CoreCachedSharedPtr<T, kMaxSlots>& p) {
+    SlotsConfig::initialize();
+    for (size_t i = 0; i < SlotsConfig::num(); ++i) {
+      slots_[i] = p.slots_[i];
     }
   }
 
   std::weak_ptr<T> get() const {
-    return slots_[AccessSpreader<>::cachedCurrent(kNumSlots)];
+    return slots_[AccessSpreader<>::cachedCurrent(SlotsConfig::num())];
   }
 
   // Faster than get().lock(), as it avoid one weak count cycle.
   std::shared_ptr<T> lock() const {
-    return slots_[AccessSpreader<>::cachedCurrent(kNumSlots)].lock();
+    return slots_[AccessSpreader<>::cachedCurrent(SlotsConfig::num())].lock();
   }
 
  private:
-  std::array<std::weak_ptr<T>, kNumSlots> slots_;
+  std::array<std::weak_ptr<T>, kMaxSlots> slots_;
 };
 
 /**
@@ -110,52 +158,53 @@ class CoreCachedWeakPtr {
  * get()s will never see a newer pointer on one core, and an older
  * pointer on another after a subsequent thread migration.
  */
-template <class T, size_t kNumSlots = kCoreCachedSharedPtrDefaultNumSlots>
+template <class T, size_t kMaxSlots = kCoreCachedSharedPtrDefaultMaxSlots>
 class AtomicCoreCachedSharedPtr {
+  using SlotsConfig = core_cached_shared_ptr_detail::SlotsConfig<kMaxSlots>;
+
  public:
-  explicit AtomicCoreCachedSharedPtr(const std::shared_ptr<T>& p = nullptr) {
-    reset(p);
-  }
+  AtomicCoreCachedSharedPtr() = default;
+  explicit AtomicCoreCachedSharedPtr(const std::shared_ptr<T>& p) { reset(p); }
 
   ~AtomicCoreCachedSharedPtr() {
-    auto slots = slots_.load(std::memory_order_acquire);
     // Delete of AtomicCoreCachedSharedPtr must be synchronized, no
     // need for slots->retire().
-    if (slots) {
-      delete slots;
-    }
+    delete slots_.load(std::memory_order_acquire);
   }
 
   void reset(const std::shared_ptr<T>& p = nullptr) {
-    auto newslots = std::make_unique<Slots>();
-    // Allocate each Holder in a different CoreRawAllocator stripe to
-    // prevent false sharing. Their control blocks will be adjacent
-    // thanks to allocate_shared().
-    for (auto slot : folly::enumerate(newslots->slots_)) {
-      auto alloc = getCoreAllocator<Holder, kNumSlots>(slot.index);
-      auto holder = std::allocate_shared<Holder>(alloc, p);
-      *slot = std::shared_ptr<T>(holder, p.get());
+    SlotsConfig::initialize();
+    std::unique_ptr<Slots> newslots;
+    if (!core_cached_shared_ptr_detail::isDefault(p)) {
+      newslots = std::make_unique<Slots>();
+      // Allocate each Holder in a different CoreRawAllocator stripe to
+      // prevent false sharing. Their control blocks will be adjacent
+      // thanks to allocate_shared().
+      for (size_t i = 0; i < SlotsConfig::num(); ++i) {
+        auto alloc = getCoreAllocator<Holder, kMaxSlots>(i);
+        auto holder = std::allocate_shared<Holder>(alloc, p);
+        newslots->slots[i] = std::shared_ptr<T>(holder, p.get());
+      }
     }
-    auto oldslots = slots_.exchange(newslots.release());
-    if (oldslots) {
+    if (auto oldslots = slots_.exchange(newslots.release())) {
       oldslots->retire();
     }
   }
 
   std::shared_ptr<T> get() const {
     folly::hazptr_local<1> hazptr;
-    auto slots = hazptr[0].protect(slots_);
-    if (!slots) {
+    if (auto slots = hazptr[0].protect(slots_)) {
+      return slots->slots[AccessSpreader<>::cachedCurrent(SlotsConfig::num())];
+    } else {
       return nullptr;
     }
-    return (slots->slots_)[AccessSpreader<>::cachedCurrent(kNumSlots)];
   }
 
  private:
   using Holder = std::shared_ptr<T>;
 
   struct Slots : folly::hazptr_obj_base<Slots> {
-    std::array<std::shared_ptr<T>, kNumSlots> slots_;
+    std::array<std::shared_ptr<T>, kMaxSlots> slots;
   };
   std::atomic<Slots*> slots_{nullptr};
 };
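
For context, a hedged usage sketch of the resulting API. The class and member names are as in the diff; the include path and surrounding setup are assumptions based on folly's usual layout. After this change, only min(number of L1 caches, kMaxSlots) slots are populated, and reset() with an empty pointer frees the per-core control blocks instead of allocating fresh ones that hold nullptr:

#include <folly/concurrency/CoreCachedSharedPtr.h>
#include <memory>

void example() {
  // The slot array is sized for kCoreCachedSharedPtrDefaultMaxSlots, but only
  // as many slots as there are L1 caches are actually filled.
  folly::CoreCachedSharedPtr<int> cached(std::make_shared<int>(42));

  // Readers on different cores touch different, cache-local control blocks.
  std::shared_ptr<int> p = cached.get();

  // Resetting to an empty pointer allocates no control blocks at all.
  cached.reset();
}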