Commit c55ca702 authored by Jordan DeLong's avatar Jordan DeLong Committed by Sara Golemon

Fix a ThreadLocal bug: hold the meta lock when resizing the element vector

Summary:
There appears to be a race here.  leizha reported issues with
a heavily recycled AtomicHashMap (ThreadCachedInt inside).  It looks
like what's happening is this:

- Thread A: ~ThreadCachedInt from an AHM
- meta lock is taken, and the ThreadElement list is iterated
- all entries are zeroed, and the id is marked free
- then releases the lock
- Thread B: someone is calling get() on an unrelated id
- hit reserve: rallocm on the pointer or unsynchronized memcpy from
the element vector
- waits on the lock
- when it gets the lock, it stores back the value that it read that
was zero'd by A.

Later, someone reuses the id from the freelist along with the
previously freed pointer, eventually double-freeing it.  (nullptr
is the signifier for "this thread doesn't have an instance of the
threadlocal yet".)

Test Plan:
leizha's test case doesn't segv after this diff---it was
reliably breaking with corruption in malloc before it.  I'm working on
making that test case into a unit test to add to this diff, but I'm
putting it up early in case there's something wrong with the theory
above or in case someone has an idea for a better fix.

Reviewed By: tudorb@fb.com

FB internal diff: D928534
parent 21effa23
...@@ -245,14 +245,20 @@ struct StaticMeta { ...@@ -245,14 +245,20 @@ struct StaticMeta {
if (id < e->elementsCapacity && e->elements[id].ptr) { if (id < e->elementsCapacity && e->elements[id].ptr) {
elements.push_back(e->elements[id]); elements.push_back(e->elements[id]);
// Writing another thread's ThreadEntry from here is fine; /*
// the only other potential reader is the owning thread -- * Writing another thread's ThreadEntry from here is fine;
// from onThreadExit (which grabs the lock, so is properly * the only other potential reader is the owning thread --
// synchronized with us) or from get() -- but using get() on a * from onThreadExit (which grabs the lock, so is properly
// ThreadLocalPtr object that's being destroyed is a bug, so * synchronized with us) or from get(), which also grabs
// undefined behavior is fair game. * the lock if it needs to resize the elements vector.
e->elements[id].ptr = NULL; *
e->elements[id].deleter = NULL; * We can't conflict with reads for a get(id), because
* it's illegal to call get on a thread local that's
* destructing.
*/
e->elements[id].ptr = nullptr;
e->elements[id].deleter = nullptr;
e->elements[id].ownsDeleter = false;
} }
} }
meta.freeIds_.push_back(id); meta.freeIds_.push_back(id);
...@@ -275,13 +281,14 @@ struct StaticMeta { ...@@ -275,13 +281,14 @@ struct StaticMeta {
size_t newSize = static_cast<size_t>((id + 5) * 1.7); size_t newSize = static_cast<size_t>((id + 5) * 1.7);
auto& meta = instance(); auto& meta = instance();
ElementWrapper* ptr = nullptr; ElementWrapper* ptr = nullptr;
// Rely on jemalloc to zero the memory if possible -- maybe it knows // Rely on jemalloc to zero the memory if possible -- maybe it knows
// it's already zeroed and saves us some work. // it's already zeroed and saves us some work.
if (!usingJEMalloc() || if (!usingJEMalloc() ||
prevSize < jemallocMinInPlaceExpandable || prevSize < jemallocMinInPlaceExpandable ||
(rallocm( (rallocm(
static_cast<void**>(static_cast<void*>(&threadEntry_.elements)), static_cast<void**>(static_cast<void*>(&threadEntry_.elements)),
NULL, newSize * sizeof(ElementWrapper), 0, nullptr, newSize * sizeof(ElementWrapper), 0,
ALLOCM_NO_MOVE | ALLOCM_ZERO) != ALLOCM_SUCCESS)) { ALLOCM_NO_MOVE | ALLOCM_ZERO) != ALLOCM_SUCCESS)) {
// Sigh, must realloc, but we can't call realloc here, as elements is // Sigh, must realloc, but we can't call realloc here, as elements is
// still linked in meta, so another thread might access invalid memory // still linked in meta, so another thread might access invalid memory
...@@ -295,25 +302,32 @@ struct StaticMeta { ...@@ -295,25 +302,32 @@ struct StaticMeta {
// is zeroed. calloc() is simpler than malloc() followed by memset(), // is zeroed. calloc() is simpler than malloc() followed by memset(),
// and potentially faster when dealing with a lot of memory, as // and potentially faster when dealing with a lot of memory, as
// it can get already-zeroed pages from the kernel. // it can get already-zeroed pages from the kernel.
if ((ptr = static_cast<ElementWrapper*>( ptr = static_cast<ElementWrapper*>(
calloc(newSize, sizeof(ElementWrapper)))) != nullptr) { calloc(newSize, sizeof(ElementWrapper))
memcpy(ptr, threadEntry_.elements, sizeof(ElementWrapper) * prevSize); );
} else { if (!ptr) throw std::bad_alloc();
throw std::bad_alloc();
}
} }
// Success, update the entry // Success, update the entry
{ {
boost::lock_guard<boost::mutex> g(meta.lock_); boost::lock_guard<boost::mutex> g(meta.lock_);
if (prevSize == 0) { if (prevSize == 0) {
meta.push_back(&threadEntry_); meta.push_back(&threadEntry_);
} }
if (ptr) { if (ptr) {
/*
* Note: we need to hold the meta lock when copying data out of
* the old vector, because some other thread might be
* destructing a ThreadLocal and writing to the elements vector
* of this thread.
*/
memcpy(ptr, threadEntry_.elements, sizeof(ElementWrapper) * prevSize);
using std::swap; using std::swap;
swap(ptr, threadEntry_.elements); swap(ptr, threadEntry_.elements);
threadEntry_.elementsCapacity = newSize;
} }
threadEntry_.elementsCapacity = newSize;
} }
free(ptr); free(ptr);
......
/*
* Copyright 2013 Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <gtest/gtest.h>
#include <thread>
#include <memory>
#include <mutex>
#include "folly/AtomicHashMap.h"
#include "folly/ScopeGuard.h"
#include "folly/Memory.h"
namespace {

// Trivial value type stored in the maps below.
struct MyObject {
  explicit MyObject(int i) : i(i) {}
  int i;
};

using MyMap = folly::AtomicHashMap<int,std::shared_ptr<MyObject>>;
using Guard = std::lock_guard<std::mutex>;

// Build a fresh map sized for this test's small key space.
std::unique_ptr<MyMap> newMap() {
  return folly::make_unique<MyMap>(100);
}

/*
 * A two-generation cache of MyObjects keyed by int.  Lookups consult
 * the current generation first and then the previous one; archive()
 * retires the current generation.  The mutex only guards the two
 * shared_ptr slots; the maps themselves are concurrency-safe.
 */
struct MyObjectDirectory {
  MyObjectDirectory()
    : cur_(newMap())
    , prev_(newMap())
  {}

  // Return the object for `key`, creating it in the current map if
  // neither generation already holds it.
  std::shared_ptr<MyObject> get(int key) {
    if (auto hit = tryGet(key)) {
      return hit;
    }

    std::shared_ptr<MyMap> current;
    {
      Guard g(lock_);
      current = cur_;
    }

    auto inserted = current->insert(key, std::make_shared<MyObject>(key));
    return inserted.first->second;
  }

  // Look up `key` in the current generation, then the previous one.
  // A previous-generation hit is promoted into the current map.
  // Returns nullptr on a miss in both generations.
  std::shared_ptr<MyObject> tryGet(int key) {
    std::shared_ptr<MyMap> current;
    std::shared_ptr<MyMap> older;
    {
      Guard g(lock_);
      current = cur_;
      older = prev_;
    }

    auto hit = current->find(key);
    if (hit != current->end()) {
      return hit->second;
    }

    hit = older->find(key);
    if (hit == older->end()) {
      return nullptr;
    }

    auto promoted = current->insert(key, hit->second);
    return promoted.first->second;
  }

  // Age the generations: prev_ takes over cur_, and cur_ starts empty.
  void archive() {
    std::shared_ptr<MyMap> fresh(newMap());

    Guard g(lock_);
    prev_ = cur_;
    cur_ = fresh;
  }

  std::mutex lock_;
  std::shared_ptr<MyMap> cur_;
  std::shared_ptr<MyMap> prev_;
};

}
//////////////////////////////////////////////////////////////////////
/*
 * This test case stresses ThreadLocal allocation/deallocation heavily
 * via ThreadCachedInt and AtomicHashMap, and a bunch of other
 * mallocing.
 */
TEST(AHMIntStressTest, Test) {
  auto const objs = new MyObjectDirectory();
  SCOPE_EXIT { delete objs; };

  constexpr int kNumThreads = 64;
  constexpr int kNumRecycles = 500;
  constexpr int kNumKeys = 10;

  std::vector<std::thread> workers;
  for (int t = 0; t < kNumThreads; ++t) {
    // Note: the thread index isn't needed inside the worker, so only
    // the directory pointer is captured.
    workers.emplace_back([objs] {
      for (int cycle = 0; cycle < kNumRecycles; ++cycle) {
        for (int key = 0; key < kNumKeys; ++key) {
          auto obj = objs->get(key);
        }
        objs->archive();
      }
    });
  }

  for (auto& w : workers) {
    w.join();
  }
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment