Commit ff9d6fee, authored by Nathan Bronson, committed by Facebook Github Bot

use a single malloc/free call for F14VectorMap and Set

Summary:
Before this diff the vector storage policy for F14 allocated the
chunk array and the value_type array via separate calls to the allocator.
After this diff they are performed via a single allocation using a byte
allocator rebound from the original value allocator.  This will halve
the number of calls to malloc and free from a majority of F14FastMap-s
and F14FastSet-s (and from all F14VectorMap and F14VectorSet).
The optimization is likely to be most important for small sets and maps.

In a microbenchmark that just creates and destroys 1-element
F14VectorMap-s, this diff was a 20% CPU win.

This diff is not a pure win.  The unified allocation of
`(64+10*sizeof(value_type))*N` causes more internal fragmentation in
jemalloc for most value_type sizes than separate allocations of `64*N`
and `10*sizeof(value_type)*N`.  48-byte value_type is probably the most
affected in practice; this diff will increase the memory footprint
of vector-policy tables with 48-byte keys by 13% once internal memory
fragmentation is taken into account.

Taking into account the default jemalloc allocation classes, the relative
memory footprint (after this diff / before this diff) for each value_type
size is:

    sizeof(value_type)  |  footprint change
    --------------------|--------------------
                   24   |   1.03797
                   32   |   1.
                   40   |   1.07462
                   48   |   1.1273
                   56   |   1.0185
                   64   |   1.06558
                   72   |   1.02742
                   80   |   1.02672
                   88   |   1.
                   96   |   1.
                  104   |   1.05961
                  112   |   1.05849
                  120   |   0.965071
                  128   |   1.10305
                  136   |   0.942188
                  144   |   0.942984
                  152   |   1.05624
                  160   |   1.05549
                  168   |   0.927757
                  176   |   1.0238
                  184   |   1.02353
                  192   |   1.02327
                  200   |   1.08255
                  208   |   1.08173
                  216   |   1.08093
                  224   |   1.08015
                  232   |   0.981946
                  240   |   0.982102
                  248   |   0.982255
                  256   |   1.12316

Reviewed By: yfeldblum

Differential Revision: D8485933

fbshipit-source-id: 1a7df390e11e71e1f56f23527aebec4806eb03d1
parent 3c509ce1
......@@ -91,6 +91,10 @@ struct BasePolicy
using Alloc = Defaulted<AllocOrVoid, DefaultAlloc<Value>>;
using AllocTraits = std::allocator_traits<Alloc>;
using ByteAlloc = typename AllocTraits::template rebind_alloc<uint8_t>;
using ByteAllocTraits = typename std::allocator_traits<ByteAlloc>;
using BytePtr = typename ByteAllocTraits::pointer;
//////// info about user-supplied types
static_assert(
......@@ -268,10 +272,25 @@ struct BasePolicy
std::size_t /*capacity*/,
P&& /*rhs*/) {}
// Returns chunkAllocSize padded up to the next multiple of 16 when
// sizeof(Item) is not a multiple of 8, which is the case in which a
// sub-Chunk allocation could have a size that is not a multiple of 16.
// Otherwise chunkAllocSize is returned unchanged.  Either way the result
// is DCHECKed to be a multiple of 16.
static std::size_t alignedChunkAllocSize(std::size_t chunkAllocSize) {
  const bool mayBeUnaligned = (sizeof(Item) % 8) != 0;
  const std::size_t padded = mayBeUnaligned
      ? ((chunkAllocSize + 15) & ~std::size_t{15})
      : chunkAllocSize;
  FOLLY_SAFE_DCHECK((padded % 16) == 0, "");
  return padded;
}
bool beforeRehash(
std::size_t /*size*/,
std::size_t /*oldCapacity*/,
std::size_t /*newCapacity*/) {
std::size_t /*newCapacity*/,
std::size_t chunkAllocSize,
BytePtr& outChunkAllocation) {
ByteAlloc a{alloc()};
outChunkAllocation =
ByteAllocTraits::allocate(a, alignedChunkAllocSize(chunkAllocSize));
return false;
}
......@@ -280,15 +299,33 @@ struct BasePolicy
bool /*success*/,
std::size_t /*size*/,
std::size_t /*oldCapacity*/,
std::size_t /*newCapacity*/) {}
std::size_t /*newCapacity*/,
BytePtr chunkAllocation,
std::size_t chunkAllocSize) {
// on success, this will be the old allocation, on failure the new one
if (chunkAllocation != nullptr) {
ByteAlloc a{alloc()};
ByteAllocTraits::deallocate(
a, chunkAllocation, alignedChunkAllocSize(chunkAllocSize));
}
}
void beforeClear(std::size_t /*size*/, std::size_t) {}
void beforeClear(std::size_t /*size*/, std::size_t /*capacity*/) {}
void afterClear(std::size_t /*capacity*/) {}
void afterClear(std::size_t /*size*/, std::size_t /*capacity*/) {}
void beforeReset(std::size_t /*size*/, std::size_t) {}
void beforeReset(std::size_t /*size*/, std::size_t /*capacity*/) {}
void afterReset() {}
void afterReset(
std::size_t /*size*/,
std::size_t /*capacity*/,
BytePtr chunkAllocation,
std::size_t chunkAllocSize) {
FOLLY_SAFE_DCHECK(chunkAllocation != nullptr, "");
ByteAlloc a{alloc()};
ByteAllocTraits::deallocate(
a, chunkAllocation, alignedChunkAllocSize(chunkAllocSize));
}
void prefetchValue(Item const&) const {
// Subclass should disable with prefetchBeforeRehash(),
......@@ -529,9 +566,14 @@ class ValueContainerPolicy : public BasePolicy<
template <typename V>
void visitPolicyAllocationClasses(
std::size_t chunkAllocSize,
std::size_t /*size*/,
std::size_t /*capacity*/,
V&& /*visitor*/) const {}
V&& visitor) const {
if (chunkAllocSize > 0) {
visitor(Super::alignedChunkAllocSize(chunkAllocSize), 1);
}
}
//////// F14BasicMap/Set policy
......@@ -749,10 +791,16 @@ class NodeContainerPolicy
template <typename V>
void visitPolicyAllocationClasses(
std::size_t chunkAllocSize,
std::size_t size,
std::size_t /*capacity*/,
V&& visitor) const {
visitor(sizeof(Value), size);
if (chunkAllocSize > 0) {
visitor(Super::alignedChunkAllocSize(chunkAllocSize), 1);
}
if (size > 0) {
visitor(sizeof(Value), size);
}
}
//////// F14BasicMap/Set policy
......@@ -874,6 +922,9 @@ class VectorContainerPolicy : public BasePolicy<
uint32_t>;
using typename Super::Alloc;
using typename Super::AllocTraits;
using typename Super::ByteAlloc;
using typename Super::ByteAllocTraits;
using typename Super::BytePtr;
using typename Super::Hasher;
using typename Super::Item;
using typename Super::ItemIter;
......@@ -1142,21 +1193,61 @@ class VectorContainerPolicy : public BasePolicy<
FOLLY_SAFE_DCHECK(success, "");
}
private:
// Computes where the first Value lives inside a unified allocation whose
// leading prefixBytes bytes hold Chunk storage.  prefixBytes must be a
// multiple of 8 (DCHECKed); that holds because sub-Chunk allocations
// always have an even capacity and sizeof(Item) == 4.  When Value needs
// more than 8-byte alignment the offset is rounded up to the next
// multiple of alignof(Value); otherwise prefixBytes is used as-is.
static std::size_t valuesOffset(std::size_t prefixBytes) {
  FOLLY_SAFE_DCHECK((prefixBytes % 8) == 0, "");
  const std::size_t valueAlign = alignof(Value);
  std::size_t offset = prefixBytes;
  if (valueAlign > 8) {
    const std::size_t mask = valueAlign - 1;
    offset = (offset + mask) & ~mask;
  }
  FOLLY_SAFE_DCHECK((offset % valueAlign) == 0, "");
  return offset;
}
// Computes how many bytes a unified allocation needs in order to hold
// prefixBytes of Chunk storage followed by valueCapacity Values.  The
// values start at valuesOffset(prefixBytes).  When alignof(Value) <= 8
// the total is padded up to a multiple of 16, as protection against
// allocators that don't always align to 16.  The result is DCHECKed to
// be a multiple of 16 in every case.
static std::size_t allocSize(
    std::size_t prefixBytes,
    std::size_t valueCapacity) {
  const std::size_t valuesStart = valuesOffset(prefixBytes);
  const std::size_t valuesBytes = sizeof(Value) * valueCapacity;
  std::size_t total = valuesStart + valuesBytes;
  if (alignof(Value) <= 8) {
    total = (total + 15) & ~std::size_t{15};
  }
  FOLLY_SAFE_DCHECK((total % 16) == 0, "");
  return total;
}
public:
ValuePtr beforeRehash(
std::size_t size,
std::size_t oldCapacity,
std::size_t newCapacity) {
std::size_t newCapacity,
std::size_t chunkAllocSize,
BytePtr& outChunkAllocation) {
FOLLY_SAFE_DCHECK(
size <= oldCapacity && ((values_ == nullptr) == (oldCapacity == 0)) &&
newCapacity > 0 &&
newCapacity <= (std::numeric_limits<Item>::max)(),
"");
Alloc& a = this->alloc();
{
ByteAlloc a{this->alloc()};
outChunkAllocation =
ByteAllocTraits::allocate(a, allocSize(chunkAllocSize, newCapacity));
}
ValuePtr before = values_;
ValuePtr after = AllocTraits::allocate(a, newCapacity);
ValuePtr after = std::pointer_traits<ValuePtr>::pointer_to(
*static_cast<Value*>(static_cast<void*>(
&*outChunkAllocation + valuesOffset(chunkAllocSize))));
if (size > 0) {
Alloc& a{this->alloc()};
transfer(a, std::addressof(before[0]), std::addressof(after[0]), size);
}
......@@ -1164,14 +1255,12 @@ class VectorContainerPolicy : public BasePolicy<
return before;
}
FOLLY_NOINLINE void
afterFailedRehash(ValuePtr state, std::size_t size, std::size_t newCapacity) {
FOLLY_NOINLINE void afterFailedRehash(ValuePtr state, std::size_t size) {
// state holds the old storage
Alloc& a = this->alloc();
if (size > 0) {
transfer(a, std::addressof(values_[0]), std::addressof(state[0]), size);
}
AllocTraits::deallocate(a, values_, newCapacity);
values_ = state;
}
......@@ -1180,12 +1269,21 @@ class VectorContainerPolicy : public BasePolicy<
bool success,
std::size_t size,
std::size_t oldCapacity,
std::size_t newCapacity) {
std::size_t newCapacity,
BytePtr chunkAllocation,
std::size_t chunkAllocSize) {
if (!success) {
afterFailedRehash(state, size, newCapacity);
} else if (state != nullptr) {
Alloc& a = this->alloc();
AllocTraits::deallocate(a, state, oldCapacity);
afterFailedRehash(state, size);
}
// on success, chunkAllocation is the old allocation, on failure it is the
// new one
if (chunkAllocation != nullptr) {
ByteAlloc a{this->alloc()};
ByteAllocTraits::deallocate(
a,
chunkAllocation,
allocSize(chunkAllocSize, (success ? oldCapacity : newCapacity)));
}
}
......@@ -1199,23 +1297,31 @@ class VectorContainerPolicy : public BasePolicy<
}
void beforeReset(std::size_t size, std::size_t capacity) {
FOLLY_SAFE_DCHECK(
size <= capacity && ((values_ == nullptr) == (capacity == 0)), "");
if (capacity > 0) {
beforeClear(size, capacity);
Alloc& a = this->alloc();
AllocTraits::deallocate(a, values_, capacity);
beforeClear(size, capacity);
}
void afterReset(
std::size_t /*size*/,
std::size_t capacity,
BytePtr chunkAllocation,
std::size_t chunkAllocSize) {
if (chunkAllocation != nullptr) {
ByteAlloc a{this->alloc()};
ByteAllocTraits::deallocate(
a, chunkAllocation, allocSize(chunkAllocSize, capacity));
values_ = nullptr;
}
}
template <typename V>
void visitPolicyAllocationClasses(
std::size_t chunkAllocSize,
std::size_t /*size*/,
std::size_t capacity,
V&& visitor) const {
if (capacity > 0) {
visitor(sizeof(Value) * capacity, 1);
FOLLY_SAFE_DCHECK((chunkAllocSize == 0) == (capacity == 0), "");
if (chunkAllocSize > 0) {
visitor(allocSize(chunkAllocSize, capacity), 1);
}
}
......
......@@ -1180,28 +1180,24 @@ class F14Table : public Policy {
//////// memory management helpers
static std::size_t allocSize(
static std::size_t chunkAllocSize(
std::size_t chunkCount,
std::size_t maxSizeWithoutRehash) {
if (chunkCount == 1) {
auto n = offsetof(Chunk, rawItems_) + maxSizeWithoutRehash * sizeof(Item);
FOLLY_SAFE_DCHECK((maxSizeWithoutRehash % 2) == 0, "");
if ((sizeof(Item) % 8) != 0) {
n = ((n - 1) | 15) + 1;
}
FOLLY_SAFE_DCHECK((n % 16) == 0, "");
return n;
static_assert(offsetof(Chunk, rawItems_) == 16, "");
return 16 + sizeof(Item) * maxSizeWithoutRehash;
} else {
return sizeof(Chunk) * chunkCount;
}
}
ChunkPtr newChunks(std::size_t chunkCount, std::size_t maxSizeWithoutRehash) {
ByteAlloc a{this->alloc()};
uint8_t* raw = &*std::allocator_traits<ByteAlloc>::allocate(
a, allocSize(chunkCount, maxSizeWithoutRehash));
ChunkPtr initializeChunks(
BytePtr raw,
std::size_t chunkCount,
std::size_t maxSizeWithoutRehash) {
static_assert(std::is_trivial<Chunk>::value, "F14Chunk should be POD");
auto chunks = static_cast<Chunk*>(static_cast<void*>(raw));
auto chunks = static_cast<Chunk*>(static_cast<void*>(&*raw));
for (std::size_t i = 0; i < chunkCount; ++i) {
chunks[i].clear();
}
......@@ -1209,17 +1205,6 @@ class F14Table : public Policy {
return std::pointer_traits<ChunkPtr>::pointer_to(*chunks);
}
void deleteChunks(
ChunkPtr chunks,
std::size_t chunkCount,
std::size_t maxSizeWithoutRehash) {
ByteAlloc a{this->alloc()};
BytePtr bp = std::pointer_traits<BytePtr>::pointer_to(
*static_cast<uint8_t*>(static_cast<void*>(&*chunks)));
std::allocator_traits<ByteAlloc>::deallocate(
a, bp, allocSize(chunkCount, maxSizeWithoutRehash));
}
public:
ItemIter begin() const noexcept {
FOLLY_SAFE_DCHECK(Policy::kEnableItemIteration, "");
......@@ -1509,7 +1494,7 @@ class F14Table : public Policy {
if (folly::is_trivially_copyable<Item>::value &&
!this->destroyItemOnClear() && bucket_count() == src.bucket_count()) {
// most happy path
auto n = allocSize(chunkMask_ + 1, bucket_count());
auto n = chunkAllocSize(chunkMask_ + 1, bucket_count());
std::memcpy(&chunks_[0], &src.chunks_[0], n);
sizeAndPackedBegin_.size_ = src.size();
if (Policy::kEnableItemIteration) {
......@@ -1688,21 +1673,47 @@ class F14Table : public Policy {
const auto origChunkCount = chunkMask_ + 1;
const auto origMaxSizeWithoutRehash = bucket_count();
BytePtr rawAllocation;
auto undoState = this->beforeRehash(
size(), origMaxSizeWithoutRehash, newMaxSizeWithoutRehash);
size(),
origMaxSizeWithoutRehash,
newMaxSizeWithoutRehash,
chunkAllocSize(newChunkCount, newMaxSizeWithoutRehash),
rawAllocation);
chunks_ =
initializeChunks(rawAllocation, newChunkCount, newMaxSizeWithoutRehash);
chunkMask_ = newChunkCount - 1;
bool success = false;
SCOPE_EXIT {
// this SCOPE_EXIT reverts chunks_ and chunkMask_ if necessary
BytePtr finishedRawAllocation = nullptr;
std::size_t finishedAllocSize = 0;
if (LIKELY(success)) {
if (origMaxSizeWithoutRehash > 0) {
finishedRawAllocation = std::pointer_traits<BytePtr>::pointer_to(
*static_cast<uint8_t*>(static_cast<void*>(&*origChunks)));
finishedAllocSize =
chunkAllocSize(origChunkCount, origMaxSizeWithoutRehash);
}
} else {
finishedRawAllocation = rawAllocation;
finishedAllocSize =
chunkAllocSize(newChunkCount, newMaxSizeWithoutRehash);
chunks_ = origChunks;
chunkMask_ = origChunkCount - 1;
}
this->afterRehash(
std::move(undoState),
success,
size(),
origMaxSizeWithoutRehash,
newMaxSizeWithoutRehash);
newMaxSizeWithoutRehash,
finishedRawAllocation,
finishedAllocSize);
};
chunks_ = newChunks(newChunkCount, newMaxSizeWithoutRehash);
chunkMask_ = newChunkCount - 1;
if (size() == 0) {
// nothing to do
} else if (origChunkCount == 1 && newChunkCount == 1) {
......@@ -1730,16 +1741,10 @@ class F14Table : public Policy {
if (newChunkCount <= stackBuf.size()) {
fullness = stackBuf.data();
} else {
try {
ByteAlloc a{this->alloc()};
fullness =
&*std::allocator_traits<ByteAlloc>::allocate(a, newChunkCount);
} catch (...) {
deleteChunks(chunks_, newChunkCount, newMaxSizeWithoutRehash);
chunks_ = origChunks;
chunkMask_ = origChunkCount - 1;
throw;
}
ByteAlloc a{this->alloc()};
// may throw
fullness =
&*std::allocator_traits<ByteAlloc>::allocate(a, newChunkCount);
}
std::memset(fullness, '\0', newChunkCount);
SCOPE_EXIT {
......@@ -1787,9 +1792,6 @@ class F14Table : public Policy {
}
}
if (origMaxSizeWithoutRehash != 0) {
deleteChunks(origChunks, origChunkCount, origMaxSizeWithoutRehash);
}
success = true;
}
......@@ -1897,10 +1899,12 @@ class F14Table : public Policy {
// we don't get too low a load factor
bool willReset = Reset || chunkMask_ + 1 >= 16;
auto origSize = size();
auto origCapacity = bucket_count();
if (willReset) {
this->beforeReset(size(), bucket_count());
this->beforeReset(origSize, origCapacity);
} else {
this->beforeClear(size(), bucket_count());
this->beforeClear(origSize, origCapacity);
}
if (!empty()) {
......@@ -1935,13 +1939,16 @@ class F14Table : public Policy {
}
if (willReset) {
deleteChunks(chunks_, chunkMask_ + 1, bucket_count());
BytePtr rawAllocation = std::pointer_traits<BytePtr>::pointer_to(
*static_cast<uint8_t*>(static_cast<void*>(&*chunks_)));
std::size_t rawSize = chunkAllocSize(chunkMask_ + 1, bucket_count());
chunks_ = Chunk::emptyInstance();
chunkMask_ = 0;
this->afterReset();
this->afterReset(origSize, origCapacity, rawAllocation, rawSize);
} else {
this->afterClear(bucket_count());
this->afterClear(origSize, origCapacity);
}
}
......@@ -2023,10 +2030,11 @@ class F14Table : public Policy {
template <typename V>
void visitAllocationClasses(V&& visitor) const {
auto bc = bucket_count();
if (bc != 0) {
visitor(allocSize(chunkMask_ + 1, bc), 1);
}
this->visitPolicyAllocationClasses(size(), bc, visitor);
this->visitPolicyAllocationClasses(
(bc == 0 ? 0 : chunkAllocSize(chunkMask_ + 1, bc)),
size(),
bc,
visitor);
}
// visitor should take an Item const&
......
......@@ -96,6 +96,25 @@ TEST(F14Set, getAllocatedMemorySize) {
runAllocatedMemorySizeTests<long double>();
runAllocatedMemorySizeTests<std::string>();
runAllocatedMemorySizeTests<folly::fbstring>();
{
folly::F14ValueSet<int> set;
set.insert(10);
EXPECT_EQ(sizeof(set), 32);
EXPECT_EQ(set.getAllocatedMemorySize(), 32);
}
{
folly::F14NodeSet<int> set;
set.insert(10);
EXPECT_EQ(sizeof(set), 32);
EXPECT_EQ(set.getAllocatedMemorySize(), 36);
}
{
folly::F14VectorSet<int> set;
set.insert(10);
EXPECT_EQ(sizeof(set), 24);
EXPECT_EQ(set.getAllocatedMemorySize(), 32);
}
}
///////////////////////////////////
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment