abstract thread_local support

Summary: Change from using __thread to using the FOLLY_TLS macro. This allows abstraction over the gcc and msvc implementations of thread-local storage (__thread and __declspec(thread)), which have the same semantics, and will also allow drop-in replacement with thread_local when compiler support for the feature is complete. This doesn't do anything about Apple, however, which still has broken __thread support. This doesn't actually change any implementation for now; it simply allows for correct compilation. Test Plan: fbmake runtests. Reviewed By: delong.j@fb.com. FB internal diff: D1278726

abstract thread_local support
Summary: Change from using __thread to using the FOLLY_TLS macro. This allows abstraction over the gcc and msvc implementations of thread-local storage (__thread and __declspec(thread)), which have the same semantics, and will also allow drop-in replacement with thread_local when compiler support for the feature is complete. This doesn't do anything about Apple, however, which still has broken __thread support. This doesn't actually change any implementation for now; it simply allows for correct compilation. Test Plan: fbmake runtests. Reviewed By: delong.j@fb.com. FB internal diff: D1278726
ec06f66c · Elizabeth Smith · Sara Golemon · f585e98a · ec06f66c · ec06f66c
Commit ec06f66c authored Apr 17, 2014 by Elizabeth Smith Committed by Sara Golemon Apr 18, 2014
12 changed files
--- a/folly/Portability.h
+++ b/folly/Portability.h
@@ -95,6 +95,18 @@ struct MaxAlign { char c; } __attribute__((aligned));
 # endif
 #endif

+/* Platform-specific thread-local storage (TLS) specifier.
+ * GCC and Clang spell it __thread; MSVC spells it __declspec(thread).
+ * FOLLY_TLS expands to whichever the compiler supports; the semantics are
+ * the same (but remember that __thread is broken on Apple platforms).
+ */
+#if defined(_MSC_VER)
+# define FOLLY_TLS __declspec(thread)
+#elif defined(__GNUC__) || defined(__clang__)
+# define FOLLY_TLS __thread
+#else
+# error cannot define platform specific thread local storage
+#endif

 // Define to 1 if you have the `preadv' and `pwritev' functions, respectively
 #if !defined(FOLLY_HAVE_PREADV) && !defined(FOLLY_HAVE_PWRITEV)

--- a/folly/ThreadLocal.h
+++ b/folly/ThreadLocal.h
@@ -128,7 +128,8 @@ class ThreadLocal {
 * NOTE: Apple platforms don't support the same semantics for __thread that
 *       Linux does (and it's only supported at all on i386). For these, use
 *       pthread_setspecific()/pthread_getspecific() for the per-thread
- *       storage.
+ *       storage.  Windows (MSVC and GCC) does support the same semantics
+ *       with __declspec(thread)
 */

 template<class T, class Tag=void>

--- a/folly/detail/CacheLocality.cpp
+++ b/folly/detail/CacheLocality.cpp
@@ -230,7 +230,7 @@ template<>
 std::atomic<size_t> SequentialThreadId<std::atomic>::prevId(0);

 template<>
-__thread size_t SequentialThreadId<std::atomic>::currentId(0);
+FOLLY_TLS size_t SequentialThreadId<std::atomic>::currentId(0);

 /////////////// AccessSpreader


--- a/folly/detail/CacheLocality.h
+++ b/folly/detail/CacheLocality.h
@@ -26,6 +26,7 @@
 #include <type_traits>
 #include <vector>
 #include "folly/Likely.h"
+#include "folly/Portability.h"

 namespace folly { namespace detail {

@@ -172,8 +173,7 @@ struct SequentialThreadId {
 private:
  static Atom<size_t> prevId;

-  // TODO: switch to thread_local
-  static __thread size_t currentId;
+  static FOLLY_TLS size_t currentId;
 };

 template <template<typename> class Atom, size_t kMaxCpus>

--- a/folly/detail/MemoryIdler.cpp
+++ b/folly/detail/MemoryIdler.cpp
@@ -90,8 +90,8 @@ void MemoryIdler::flushLocalMallocCaches() {
 #ifdef __x86_64__

 static const size_t s_pageSize = sysconf(_SC_PAGESIZE);
-static __thread uintptr_t tls_stackLimit;
-static __thread size_t tls_stackSize;
+static FOLLY_TLS uintptr_t tls_stackLimit;
+static FOLLY_TLS size_t tls_stackSize;

 static void fetchStackLimits() {
  pthread_attr_t attr;

--- a/folly/detail/ThreadLocalDetail.h
+++ b/folly/detail/ThreadLocalDetail.h
@@ -169,7 +169,7 @@ struct StaticMeta {
  }

 #if !__APPLE__
-  static __thread ThreadEntry threadEntry_;
+  static FOLLY_TLS ThreadEntry threadEntry_;
 #endif
  static StaticMeta<Tag>* inst_;

@@ -412,7 +412,8 @@ struct StaticMeta {
 };

 #if !__APPLE__
-template <class Tag> __thread ThreadEntry StaticMeta<Tag>::threadEntry_ = {0};
+template <class Tag>
+FOLLY_TLS ThreadEntry StaticMeta<Tag>::threadEntry_ = {0};
 #endif
 template <class Tag> StaticMeta<Tag>* StaticMeta<Tag>::inst_ = nullptr;


--- a/folly/experimental/exception_tracer/ExceptionTracerLib.cpp
+++ b/folly/experimental/exception_tracer/ExceptionTracerLib.cpp
@@ -42,9 +42,9 @@ using namespace folly::exception_tracer;

 namespace {

-__thread bool invalid;
-__thread StackTraceStack activeExceptions;
-__thread StackTraceStack caughtExceptions;
+FOLLY_TLS bool invalid;
+FOLLY_TLS StackTraceStack activeExceptions;
+FOLLY_TLS StackTraceStack caughtExceptions;
 pthread_once_t initialized = PTHREAD_ONCE_INIT;

 extern "C" {

--- a/folly/test/CacheLocalityTest.cpp
+++ b/folly/test/CacheLocalityTest.cpp
@@ -327,7 +327,7 @@ TEST(SequentialThreadId, Simple) {
  EXPECT_EQ(cpu, again);
 }

-static __thread unsigned testingCpu = 0;
+static FOLLY_TLS unsigned testingCpu = 0;

 static int testingGetcpu(unsigned* cpu, unsigned* node, void* unused) {
  if (cpu != nullptr) {

--- a/folly/test/DeterministicSchedule.cpp
+++ b/folly/test/DeterministicSchedule.cpp
@@ -25,8 +25,8 @@

 namespace folly { namespace test {

-__thread sem_t* DeterministicSchedule::tls_sem;
-__thread DeterministicSchedule* DeterministicSchedule::tls_sched;
+FOLLY_TLS sem_t* DeterministicSchedule::tls_sem;
+FOLLY_TLS DeterministicSchedule* DeterministicSchedule::tls_sched;

 // access is protected by futexLock
 static std::unordered_map<detail::Futex<DeterministicAtomic>*,
@@ -335,7 +335,8 @@ test::DeterministicAtomic<size_t>
    SequentialThreadId<test::DeterministicAtomic>::prevId(0);

 template<>
-__thread size_t SequentialThreadId<test::DeterministicAtomic>::currentId(0);
+FOLLY_TLS size_t
+    SequentialThreadId<test::DeterministicAtomic>::currentId(0);

 template<>
 const AccessSpreader<test::DeterministicAtomic>

--- a/folly/test/DeterministicSchedule.h
+++ b/folly/test/DeterministicSchedule.h
@@ -129,8 +129,8 @@ class DeterministicSchedule : boost::noncopyable {
  static int getRandNumber(int n);

 private:
-  static __thread sem_t* tls_sem;
-  static __thread DeterministicSchedule* tls_sched;
+  static FOLLY_TLS sem_t* tls_sem;
+  static FOLLY_TLS DeterministicSchedule* tls_sched;

  std::function<int(int)> scheduler_;
  std::vector<sem_t*> sems_;

--- a/folly/test/MPMCQueueTest.cpp
+++ b/folly/test/MPMCQueueTest.cpp
@@ -418,8 +418,8 @@ enum LifecycleEvent {
  MAX_LIFECYCLE_EVENT
 };

-static __thread int lc_counts[MAX_LIFECYCLE_EVENT];
-static __thread int lc_prev[MAX_LIFECYCLE_EVENT];
+static FOLLY_TLS int lc_counts[MAX_LIFECYCLE_EVENT];
+static FOLLY_TLS int lc_prev[MAX_LIFECYCLE_EVENT];

 static int lc_outstanding() {
  return lc_counts[DEFAULT_CONSTRUCTOR] + lc_counts[COPY_CONSTRUCTOR] +

--- a/folly/test/ThreadCachedIntTest.cpp
+++ b/folly/test/ThreadCachedIntTest.cpp
@@ -152,8 +152,8 @@ ThreadLocal<int64_t> globalTL64Baseline;
 ThreadLocal<int32_t> globalTL32Baseline;
 std::atomic<int64_t> globalInt64Baseline(0);
 std::atomic<int32_t> globalInt32Baseline(0);
-__thread int64_t global__thread64;
-__thread int32_t global__thread32;
+FOLLY_TLS int64_t global__thread64;
+FOLLY_TLS int32_t global__thread32;

 // Alternate lock-free implementation.  Achieves about the same performance,
 // but uses about 20x more memory than ThreadCachedInt with 24 threads.