Commit c55ca702 authored by Jordan DeLong's avatar Jordan DeLong Committed by Sara Golemon

Fix a ThreadLocal bug: hold the meta lock when resizing the element vector

Summary:
There appears to be a race here.  leizha reported issues with
a heavily recycled AtomicHashMap (ThreadCachedInt inside).  It looks
like what's happening is this:

- Thread A: ~ThreadCachedInt from an AHM
- meta lock is taken, and the ThreadElement list is iterated
- all entries are zeroed, and the id is marked free
- then releases the lock
- Thread B: someone is calling get() on an unrelated id
- hit reserve: rallocm on the pointer or unsynchronized memcpy from
the element vector
- waits on the lock
- when it gets the lock, it stores back the value that it read that
was zero'd by A.

Later, someone reuses the id from the freelist along with the
previously freed pointer, eventually double-freeing it.  (nullptr
is the signifier for "this thread doesn't have an instance of the
threadlocal yet".)

Test Plan:
leizha's test case doesn't segv after this diff---it was
reliably breaking with corruption in malloc before it.  I'm working on
making that test case into a unit test to add to this diff, but I'm
putting it up early in case there's something wrong with the theory
above or in case someone has an idea for a better fix.

Reviewed By: tudorb@fb.com

FB internal diff: D928534
parent 21effa23
...@@ -245,14 +245,20 @@ struct StaticMeta { ...@@ -245,14 +245,20 @@ struct StaticMeta {
if (id < e->elementsCapacity && e->elements[id].ptr) { if (id < e->elementsCapacity && e->elements[id].ptr) {
elements.push_back(e->elements[id]); elements.push_back(e->elements[id]);
// Writing another thread's ThreadEntry from here is fine; /*
// the only other potential reader is the owning thread -- * Writing another thread's ThreadEntry from here is fine;
// from onThreadExit (which grabs the lock, so is properly * the only other potential reader is the owning thread --
// synchronized with us) or from get() -- but using get() on a * from onThreadExit (which grabs the lock, so is properly
// ThreadLocalPtr object that's being destroyed is a bug, so * synchronized with us) or from get(), which also grabs
// undefined behavior is fair game. * the lock if it needs to resize the elements vector.
e->elements[id].ptr = NULL; *
e->elements[id].deleter = NULL; * We can't conflict with reads for a get(id), because
* it's illegal to call get on a thread local that's
* destructing.
*/
e->elements[id].ptr = nullptr;
e->elements[id].deleter = nullptr;
e->elements[id].ownsDeleter = false;
} }
} }
meta.freeIds_.push_back(id); meta.freeIds_.push_back(id);
...@@ -275,13 +281,14 @@ struct StaticMeta { ...@@ -275,13 +281,14 @@ struct StaticMeta {
size_t newSize = static_cast<size_t>((id + 5) * 1.7); size_t newSize = static_cast<size_t>((id + 5) * 1.7);
auto& meta = instance(); auto& meta = instance();
ElementWrapper* ptr = nullptr; ElementWrapper* ptr = nullptr;
// Rely on jemalloc to zero the memory if possible -- maybe it knows // Rely on jemalloc to zero the memory if possible -- maybe it knows
// it's already zeroed and saves us some work. // it's already zeroed and saves us some work.
if (!usingJEMalloc() || if (!usingJEMalloc() ||
prevSize < jemallocMinInPlaceExpandable || prevSize < jemallocMinInPlaceExpandable ||
(rallocm( (rallocm(
static_cast<void**>(static_cast<void*>(&threadEntry_.elements)), static_cast<void**>(static_cast<void*>(&threadEntry_.elements)),
NULL, newSize * sizeof(ElementWrapper), 0, nullptr, newSize * sizeof(ElementWrapper), 0,
ALLOCM_NO_MOVE | ALLOCM_ZERO) != ALLOCM_SUCCESS)) { ALLOCM_NO_MOVE | ALLOCM_ZERO) != ALLOCM_SUCCESS)) {
// Sigh, must realloc, but we can't call realloc here, as elements is // Sigh, must realloc, but we can't call realloc here, as elements is
// still linked in meta, so another thread might access invalid memory // still linked in meta, so another thread might access invalid memory
...@@ -295,25 +302,32 @@ struct StaticMeta { ...@@ -295,25 +302,32 @@ struct StaticMeta {
// is zeroed. calloc() is simpler than malloc() followed by memset(), // is zeroed. calloc() is simpler than malloc() followed by memset(),
// and potentially faster when dealing with a lot of memory, as // and potentially faster when dealing with a lot of memory, as
// it can get already-zeroed pages from the kernel. // it can get already-zeroed pages from the kernel.
if ((ptr = static_cast<ElementWrapper*>( ptr = static_cast<ElementWrapper*>(
calloc(newSize, sizeof(ElementWrapper)))) != nullptr) { calloc(newSize, sizeof(ElementWrapper))
memcpy(ptr, threadEntry_.elements, sizeof(ElementWrapper) * prevSize); );
} else { if (!ptr) throw std::bad_alloc();
throw std::bad_alloc();
}
} }
// Success, update the entry // Success, update the entry
{ {
boost::lock_guard<boost::mutex> g(meta.lock_); boost::lock_guard<boost::mutex> g(meta.lock_);
if (prevSize == 0) { if (prevSize == 0) {
meta.push_back(&threadEntry_); meta.push_back(&threadEntry_);
} }
if (ptr) { if (ptr) {
/*
* Note: we need to hold the meta lock when copying data out of
* the old vector, because some other thread might be
* destructing a ThreadLocal and writing to the elements vector
* of this thread.
*/
memcpy(ptr, threadEntry_.elements, sizeof(ElementWrapper) * prevSize);
using std::swap; using std::swap;
swap(ptr, threadEntry_.elements); swap(ptr, threadEntry_.elements);
threadEntry_.elementsCapacity = newSize;
} }
threadEntry_.elementsCapacity = newSize;
} }
free(ptr); free(ptr);
......
/*
* Copyright 2013 Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <gtest/gtest.h>
#include <thread>
#include <memory>
#include <mutex>
#include "folly/AtomicHashMap.h"
#include "folly/ScopeGuard.h"
#include "folly/Memory.h"
namespace {

// Trivial value type stored in the maps below.
struct MyObject {
  explicit MyObject(int i) : i(i) {}
  int i;
};

using MyMap = folly::AtomicHashMap<int,std::shared_ptr<MyObject>>;
using Guard = std::lock_guard<std::mutex>;

// Build a fresh map sized for this test's small key space.
std::unique_ptr<MyMap> newMap() {
  return folly::make_unique<MyMap>(100);
}

/*
 * A two-generation cache of MyObjects keyed by int.  Lookups consult
 * the current generation first and then the previous one; archive()
 * retires the current generation.  The mutex only guards the two
 * shared_ptr slots; the maps themselves are concurrency-safe.
 */
struct MyObjectDirectory {
  MyObjectDirectory()
    : cur_(newMap())
    , prev_(newMap())
  {}

  // Return the object for `key`, creating it in the current map if
  // neither generation already holds it.
  std::shared_ptr<MyObject> get(int key) {
    if (auto hit = tryGet(key)) {
      return hit;
    }

    std::shared_ptr<MyMap> current;
    {
      Guard g(lock_);
      current = cur_;
    }

    auto inserted = current->insert(key, std::make_shared<MyObject>(key));
    return inserted.first->second;
  }

  // Look up `key` in the current generation, then the previous one.
  // A previous-generation hit is promoted into the current map.
  // Returns nullptr on a miss in both generations.
  std::shared_ptr<MyObject> tryGet(int key) {
    std::shared_ptr<MyMap> current;
    std::shared_ptr<MyMap> older;
    {
      Guard g(lock_);
      current = cur_;
      older = prev_;
    }

    auto hit = current->find(key);
    if (hit != current->end()) {
      return hit->second;
    }

    hit = older->find(key);
    if (hit == older->end()) {
      return nullptr;
    }

    auto promoted = current->insert(key, hit->second);
    return promoted.first->second;
  }

  // Age the generations: prev_ takes over cur_, and cur_ starts empty.
  void archive() {
    std::shared_ptr<MyMap> fresh(newMap());

    Guard g(lock_);
    prev_ = cur_;
    cur_ = fresh;
  }

  std::mutex lock_;
  std::shared_ptr<MyMap> cur_;
  std::shared_ptr<MyMap> prev_;
};

}
//////////////////////////////////////////////////////////////////////
/*
 * This test case stresses ThreadLocal allocation/deallocation heavily
 * via ThreadCachedInt and AtomicHashMap, and a bunch of other
 * mallocing.
 */
TEST(AHMIntStressTest, Test) {
  auto const objs = new MyObjectDirectory();
  SCOPE_EXIT { delete objs; };

  constexpr int kNumThreads = 64;
  constexpr int kNumRecycles = 500;
  constexpr int kNumKeys = 10;

  std::vector<std::thread> workers;
  for (int t = 0; t < kNumThreads; ++t) {
    // Note: the thread index isn't needed inside the worker, so only
    // the directory pointer is captured.
    workers.emplace_back([objs] {
      for (int cycle = 0; cycle < kNumRecycles; ++cycle) {
        for (int key = 0; key < kNumKeys; ++key) {
          auto obj = objs->get(key);
        }
        objs->archive();
      }
    });
  }

  for (auto& w : workers) {
    w.join();
  }
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment