Commit 05ce5228 authored by Giuseppe Ottaviano, committed by Facebook Github Bot

Move CacheLocality out of detail/ and into concurrency/

Summary: There's no reason these utilities should only be used by folly.

Reviewed By: mzlee

Differential Revision: D5317894

fbshipit-source-id: 5a9bdf4c5efaa5bcbe78e6723a03a468f2fe5e32
parent 04cf6b8f
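For orientation (not part of the commit), here is a minimal sketch of what a call site looks like after this move: the header is now `folly/concurrency/CacheLocality.h`, and `AccessSpreader` / `CacheLocality` live directly in namespace `folly` rather than `folly::detail`. The `StripedCounter` type and the stripe count of 8 below are illustrative inventions, not folly APIs.

```cpp
#include <folly/concurrency/CacheLocality.h> // was <folly/detail/CacheLocality.h>

#include <array>
#include <atomic>
#include <cstddef>
#include <cstdint>

// Illustrative striped counter: each thread bumps the counter in the stripe
// that AccessSpreader maps its current CPU to, so hot writes stay on cache
// lines that are mostly local to one core.
struct StripedCounter {
  static constexpr std::size_t kStripes = 8; // arbitrary example value

  void bump() {
    // Now folly::AccessSpreader, not folly::detail::AccessSpreader.
    auto stripe = folly::AccessSpreader<>::current(kStripes);
    counts_[stripe].count.fetch_add(1, std::memory_order_relaxed);
  }

  uint64_t read() const {
    uint64_t sum = 0;
    for (const auto& c : counts_) {
      sum += c.count.load(std::memory_order_relaxed);
    }
    return sum;
  }

 private:
  // kFalseSharingRange is likewise folly::CacheLocality::kFalseSharingRange
  // after this change; pad each stripe out to its own false-sharing range.
  struct alignas(folly::CacheLocality::kFalseSharingRange) PaddedCount {
    std::atomic<uint64_t> count{0};
  };
  std::array<PaddedCount, kStripes> counts_{};
};
```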
@@ -292,6 +292,8 @@ if (BUILD_TESTS)
apply_folly_compile_options_to_target(folly_test_support)
folly_define_tests(
DIRECTORY concurrency/
TEST cache_locality_test SOURCES CacheLocalityTest.cpp
DIRECTORY experimental/test/
TEST autotimer_test SOURCES AutoTimerTest.cpp
TEST bits_test_2 SOURCES BitsTest.cpp
@@ -467,7 +469,6 @@ if (BUILD_TESTS)
TEST baton_test SOURCES BatonTest.cpp
TEST bit_iterator_test SOURCES BitIteratorTest.cpp
TEST bits_test SOURCES BitsTest.cpp
TEST cache_locality_test SOURCES CacheLocalityTest.cpp
TEST cacheline_padded_test SOURCES CachelinePaddedTest.cpp
TEST call_once_test SOURCES CallOnceTest.cpp
TEST checksum_test SOURCES ChecksumTest.cpp
@@ -16,14 +16,16 @@
#pragma once
#include <type_traits>
#include <assert.h>
#include <errno.h>
#include <stdint.h>
#include <type_traits>
#include <boost/noncopyable.hpp>
#include <folly/AtomicStruct.h>
#include <folly/Portability.h>
#include <folly/detail/CacheLocality.h>
#include <folly/concurrency/CacheLocality.h>
#include <folly/portability/SysMman.h>
#include <folly/portability/Unistd.h>
@@ -497,7 +499,7 @@ struct IndexedMemPool : boost::noncopyable {
}
AtomicStruct<TaggedPtr,Atom>& localHead() {
auto stripe = detail::AccessSpreader<Atom>::current(NumLocalLists);
auto stripe = AccessSpreader<Atom>::current(NumLocalLists);
return local_[stripe].head;
}
@@ -27,7 +27,7 @@
#include <folly/Baton.h>
#include <folly/IndexedMemPool.h>
#include <folly/Likely.h>
#include <folly/detail/CacheLocality.h>
#include <folly/concurrency/CacheLocality.h>
namespace folly {
@@ -515,9 +515,7 @@ struct LifoSemBase {
FOLLY_ALIGN_TO_AVOID_FALSE_SHARING
folly::AtomicStruct<LifoSemHead,Atom> head_;
char padding_[folly::detail::CacheLocality::kFalseSharingRange -
sizeof(LifoSemHead)];
char padding_[folly::CacheLocality::kFalseSharingRange - sizeof(LifoSemHead)];
static LifoSemNode<Handoff, Atom>& idxToNode(uint32_t idx) {
auto raw = &LifoSemRawNode<Atom>::pool()[idx];
@@ -25,7 +25,7 @@
#include <type_traits>
#include <folly/Traits.h>
#include <folly/detail/CacheLocality.h>
#include <folly/concurrency/CacheLocality.h>
#include <folly/detail/TurnSequencer.h>
#include <folly/portability/Unistd.h>
@@ -647,11 +647,11 @@ class MPMCQueueBase<Derived<T, Atom, Dynamic>> : boost::noncopyable {
}
// ideally this would be a static assert, but g++ doesn't allow it
assert(alignof(MPMCQueue<T,Atom>)
>= detail::CacheLocality::kFalseSharingRange);
assert(static_cast<uint8_t*>(static_cast<void*>(&popTicket_))
- static_cast<uint8_t*>(static_cast<void*>(&pushTicket_))
>= detail::CacheLocality::kFalseSharingRange);
assert(alignof(MPMCQueue<T, Atom>) >= CacheLocality::kFalseSharingRange);
assert(
static_cast<uint8_t*>(static_cast<void*>(&popTicket_)) -
static_cast<uint8_t*>(static_cast<void*>(&pushTicket_)) >=
CacheLocality::kFalseSharingRange);
}
/// A default-constructed queue is useful because a usable (non-zero
@@ -971,8 +971,7 @@ class MPMCQueueBase<Derived<T, Atom, Dynamic>> : boost::noncopyable {
/// To avoid false sharing in slots_ with neighboring memory
/// allocations, we pad it with this many SingleElementQueue-s at
/// each end
kSlotPadding = (detail::CacheLocality::kFalseSharingRange - 1)
/ sizeof(Slot) + 1
kSlotPadding = (CacheLocality::kFalseSharingRange - 1) / sizeof(Slot) + 1
};
/// The maximum number of items in the queue at once
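As a worked example of the kSlotPadding expression above (not part of the diff), assume kFalseSharingRange is 128 bytes (folly's usual value) and a hypothetical Slot is 40 bytes: (128 - 1) / 40 + 1 = 4, i.e. the expression rounds 128 / 40 up, so four dummy slots at each end cover a full false-sharing range.

```cpp
#include <cstddef>

// Hypothetical sizes, only to make the rounding-up arithmetic concrete.
constexpr std::size_t kFalseSharingRange = 128; // assumed padding range
constexpr std::size_t kSlotSize = 40;           // assumed sizeof(Slot)
static_assert(
    (kFalseSharingRange - 1) / kSlotSize + 1 == 4,
    "padding rounds 128/40 up to 4 slots");
```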
@@ -1024,8 +1023,7 @@ class MPMCQueueBase<Derived<T, Atom, Dynamic>> : boost::noncopyable {
/// Alignment doesn't prevent false sharing at the end of the struct,
/// so fill out the last cache line
char padding_[detail::CacheLocality::kFalseSharingRange -
sizeof(Atom<uint32_t>)];
char padding_[CacheLocality::kFalseSharingRange - sizeof(Atom<uint32_t>)];
/// We assign tickets in increasing order, but we don't want to
/// access neighboring elements of slots_ because that will lead to
@@ -56,12 +56,12 @@ nobase_follyinclude_HEADERS = \
CppAttributes.h \
CpuId.h \
CPortability.h \
concurrency/CacheLocality.h \
concurrency/CoreCachedSharedPtr.h \
detail/AtomicHashUtils.h \
detail/AtomicUnorderedMapUtils.h \
detail/AtomicUtils.h \
detail/BitIteratorDetail.h \
detail/CacheLocality.h \
detail/CachelinePaddedImpl.h \
detail/ChecksumDetail.h \
detail/DiscriminatedPtrDetail.h \
@@ -459,7 +459,7 @@ libfolly_la_SOURCES = \
Assume.cpp \
Checksum.cpp \
ClockGettimeWrappers.cpp \
detail/CacheLocality.cpp \
concurrency/CacheLocality.cpp \
detail/IPAddress.cpp \
dynamic.cpp \
ExceptionWrapper.cpp \
@@ -27,7 +27,7 @@
#include <type_traits>
#include <utility>
#include <folly/detail/CacheLocality.h>
#include <folly/concurrency/CacheLocality.h>
namespace folly {
@@ -168,14 +168,14 @@ struct ProducerConsumerQueue {
}
private:
char pad0_[detail::CacheLocality::kFalseSharingRange];
char pad0_[CacheLocality::kFalseSharingRange];
const uint32_t size_;
T* const records_;
FOLLY_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<unsigned int> readIndex_;
FOLLY_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<unsigned int> writeIndex_;
char pad1_[detail::CacheLocality::kFalseSharingRange - sizeof(writeIndex_)];
char pad1_[CacheLocality::kFalseSharingRange - sizeof(writeIndex_)];
};
}
@@ -19,11 +19,13 @@
#pragma once
#include <stdint.h>
#include <atomic>
#include <thread>
#include <type_traits>
#include <folly/Likely.h>
#include <folly/detail/CacheLocality.h>
#include <folly/concurrency/CacheLocality.h>
#include <folly/detail/Futex.h>
#include <folly/portability/Asm.h>
#include <folly/portability/SysResource.h>
@@ -1417,8 +1419,7 @@ bool SharedMutexImpl<ReaderPriority, Tag_, Atom, BlockImmediately>::
// starting point for our empty-slot search, can change after
// calling waitForZeroBits
uint32_t bestSlot =
(uint32_t)folly::detail::AccessSpreader<Atom>::current(
kMaxDeferredReaders);
(uint32_t)folly::AccessSpreader<Atom>::current(kMaxDeferredReaders);
// deferred readers are already enabled, or it is time to
// enable them if we can find a slot
@@ -21,7 +21,7 @@
#include <chrono>
#include <folly/Likely.h>
#include <folly/detail/CacheLocality.h>
#include <folly/concurrency/CacheLocality.h>
namespace folly {
@@ -14,7 +14,7 @@
* limitations under the License.
*/
#include <folly/detail/CacheLocality.h>
#include <folly/concurrency/CacheLocality.h>
#ifndef _MSC_VER
#define _GNU_SOURCE 1 // for RTLD_NOLOAD
@@ -29,7 +29,6 @@
#include <folly/ScopeGuard.h>
namespace folly {
namespace detail {
///////////// CacheLocality
@@ -244,13 +243,13 @@ SimpleAllocator::SimpleAllocator(size_t allocSize, size_t sz)
SimpleAllocator::~SimpleAllocator() {
std::lock_guard<std::mutex> g(m_);
for (auto& block : blocks_) {
aligned_free(block);
detail::aligned_free(block);
}
}
void* SimpleAllocator::allocateHard() {
// Allocate a new slab.
mem_ = static_cast<uint8_t*>(aligned_malloc(allocSize_, allocSize_));
mem_ = static_cast<uint8_t*>(detail::aligned_malloc(allocSize_, allocSize_));
if (!mem_) {
std::__throw_bad_alloc();
}
@@ -271,5 +270,4 @@ void* SimpleAllocator::allocateHard() {
return mem;
}
} // namespace detail
} // namespace folly
@@ -38,7 +38,6 @@
#include <folly/portability/Memory.h>
namespace folly {
namespace detail {
// This file contains several classes that might be useful if you are
// trying to dynamically optimize cache locality: CacheLocality reads
@@ -458,7 +457,8 @@ class CoreAllocator {
// Align to a cacheline
size = size + (CacheLocality::kFalseSharingRange - 1);
size &= ~size_t(CacheLocality::kFalseSharingRange - 1);
void* mem = aligned_malloc(size, CacheLocality::kFalseSharingRange);
void* mem =
detail::aligned_malloc(size, CacheLocality::kFalseSharingRange);
if (!mem) {
std::__throw_bad_alloc();
}
@@ -478,7 +478,7 @@ class CoreAllocator {
auto allocator = *static_cast<SimpleAllocator**>(addr);
allocator->deallocate(mem);
} else {
aligned_free(mem);
detail::aligned_free(mem);
}
}
};
@@ -507,5 +507,4 @@ StlAllocator<typename CoreAllocator<Stripes>::Allocator, T> getCoreAllocatorStl(
return StlAllocator<typename CoreAllocator<Stripes>::Allocator, T>(alloc);
}
} // namespace detail
} // namespace folly
@@ -20,7 +20,7 @@
#include <memory>
#include <folly/Enumerate.h>
#include <folly/detail/CacheLocality.h>
#include <folly/concurrency/CacheLocality.h>
namespace folly {
@@ -46,14 +46,14 @@ class CoreCachedSharedPtr {
// prevent false sharing. Their control blocks will be adjacent
// thanks to allocate_shared().
for (auto slot : folly::enumerate(slots_)) {
auto alloc = detail::getCoreAllocatorStl<Holder, kNumSlots>(slot.index);
auto alloc = getCoreAllocatorStl<Holder, kNumSlots>(slot.index);
auto holder = std::allocate_shared<Holder>(alloc, p);
*slot = std::shared_ptr<T>(holder, p.get());
}
}
std::shared_ptr<T> get() const {
return slots_[detail::AccessSpreader<>::current(kNumSlots)];
return slots_[AccessSpreader<>::current(kNumSlots)];
}
private:
@@ -75,7 +75,7 @@ class CoreCachedWeakPtr {
}
std::weak_ptr<T> get() const {
return slots_[detail::AccessSpreader<>::current(kNumSlots)];
return slots_[AccessSpreader<>::current(kNumSlots)];
}
private:
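For readers unfamiliar with the class touched here, a hedged usage sketch of CoreCachedSharedPtr (not part of the diff). It assumes the constructor accepts a std::shared_ptr, as the constructor body shown above suggests; get() returns a copy of the slot that AccessSpreader picks for the calling core, so refcount traffic mostly stays on core-local cache lines. The Config type and gConfig name are invented for illustration.

```cpp
#include <folly/concurrency/CoreCachedSharedPtr.h>

#include <memory>

struct Config {
  int verbosity = 0;
};

// One writer publishes a Config; many reader threads grab core-local copies.
folly::CoreCachedSharedPtr<Config> gConfig{std::make_shared<Config>()};

int readVerbosity() {
  // Copying the per-core slot bumps a refcount in a control block that was
  // allocated for this core, rather than one shared by every reader.
  std::shared_ptr<Config> local = gConfig.get();
  return local->verbosity;
}
```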
@@ -14,7 +14,7 @@
* limitations under the License.
*/
#include <folly/detail/CacheLocality.h>
#include <folly/concurrency/CacheLocality.h>
#include <memory>
#include <thread>
@@ -24,7 +24,7 @@
#include <folly/Benchmark.h>
using namespace folly::detail;
using namespace folly;
#define DECLARE_SPREADER_TAG(tag, locality, func) \
namespace { \
@@ -32,7 +32,6 @@ using namespace folly::detail;
struct tag {}; \
} \
namespace folly { \
namespace detail { \
template <> \
const CacheLocality& CacheLocality::system<tag>() { \
static auto* inst = new CacheLocality(locality); \
@@ -42,16 +41,16 @@ using namespace folly::detail;
Getcpu::Func AccessSpreader<tag>::pickGetcpuFunc() { \
return func; \
} \
} \
}
DECLARE_SPREADER_TAG(
ThreadLocalTag,
CacheLocality::system<>(),
folly::detail::FallbackGetcpu<SequentialThreadId<std::atomic>>::getcpu)
DECLARE_SPREADER_TAG(PthreadSelfTag,
folly::FallbackGetcpu<SequentialThreadId<std::atomic>>::getcpu)
DECLARE_SPREADER_TAG(
PthreadSelfTag,
CacheLocality::system<>(),
folly::detail::FallbackGetcpu<HashingThreadId>::getcpu)
folly::FallbackGetcpu<HashingThreadId>::getcpu)
BENCHMARK(AccessSpreaderUse, iters) {
for (unsigned long i = 0; i < iters; ++i) {
@@ -14,7 +14,7 @@
* limitations under the License.
*/
#include <folly/detail/CacheLocality.h>
#include <folly/concurrency/CacheLocality.h>
#include <folly/portability/GTest.h>
@@ -24,7 +24,7 @@
#include <unordered_map>
#include <glog/logging.h>
using namespace folly::detail;
using namespace folly;
/// This is the relevant nodes from a production box's sysfs tree. If you
/// think this map is ugly you should see the version of this test that
@@ -363,13 +363,12 @@ TEST(Getcpu, VdsoGetcpu) {
#ifdef FOLLY_TLS
TEST(ThreadId, SimpleTls) {
unsigned cpu = 0;
auto rv =
folly::detail::FallbackGetcpu<SequentialThreadId<std::atomic>>::getcpu(
auto rv = folly::FallbackGetcpu<SequentialThreadId<std::atomic>>::getcpu(
&cpu, nullptr, nullptr);
EXPECT_EQ(rv, 0);
EXPECT_TRUE(cpu > 0);
unsigned again;
folly::detail::FallbackGetcpu<SequentialThreadId<std::atomic>>::getcpu(
folly::FallbackGetcpu<SequentialThreadId<std::atomic>>::getcpu(
&again, nullptr, nullptr);
EXPECT_EQ(cpu, again);
}
@@ -377,13 +376,12 @@ TEST(ThreadId, SimpleTls) {
TEST(ThreadId, SimplePthread) {
unsigned cpu = 0;
auto rv = folly::detail::FallbackGetcpu<HashingThreadId>::getcpu(
&cpu, nullptr, nullptr);
auto rv =
folly::FallbackGetcpu<HashingThreadId>::getcpu(&cpu, nullptr, nullptr);
EXPECT_EQ(rv, 0);
EXPECT_TRUE(cpu > 0);
unsigned again;
folly::detail::FallbackGetcpu<HashingThreadId>::getcpu(
&again, nullptr, nullptr);
folly::FallbackGetcpu<HashingThreadId>::getcpu(&again, nullptr, nullptr);
EXPECT_EQ(cpu, again);
}
@@ -414,7 +412,6 @@ TEST(AccessSpreader, Simple) {
struct tag {}; \
} \
namespace folly { \
namespace detail { \
template <> \
const CacheLocality& CacheLocality::system<tag>() { \
static auto* inst = new CacheLocality(locality); \
......@@ -424,7 +421,6 @@ TEST(AccessSpreader, Simple) {
Getcpu::Func AccessSpreader<tag>::pickGetcpuFunc() { \
return func; \
} \
} \
}
DECLARE_SPREADER_TAG(ManualTag, CacheLocality::uniform(16), testingGetcpu)
@@ -16,7 +16,7 @@
#pragma once
#include <folly/detail/CacheLocality.h>
#include <folly/concurrency/CacheLocality.h>
namespace folly {
@@ -33,7 +33,7 @@ struct CachelinePaddedImpl;
// We need alignas(T) alignas(kFalseSharingRange) for the case where alignof(T)
// > alignof(kFalseSharingRange).
template <typename T>
struct alignas(T) alignas(detail::CacheLocality::kFalseSharingRange)
struct alignas(T) alignas(CacheLocality::kFalseSharingRange)
CachelinePaddedImpl<T, /* needsPadding = */ false> {
template <typename... Args>
explicit CachelinePaddedImpl(Args&&... args)
@@ -42,7 +42,7 @@ struct alignas(T) alignas(detail::CacheLocality::kFalseSharingRange)
};
template <typename T>
struct alignas(T) alignas(detail::CacheLocality::kFalseSharingRange)
struct alignas(T) alignas(CacheLocality::kFalseSharingRange)
CachelinePaddedImpl<T, /* needsPadding = */ true> {
template <typename... Args>
explicit CachelinePaddedImpl(Args&&... args)
@@ -21,7 +21,7 @@
#include <folly/Malloc.h>
#include <folly/Portability.h>
#include <folly/ScopeGuard.h>
#include <folly/detail/CacheLocality.h>
#include <folly/concurrency/CacheLocality.h>
#include <folly/portability/PThread.h>
#include <folly/portability/SysMman.h>
#include <folly/portability/Unistd.h>
@@ -20,7 +20,7 @@
#include <folly/Function.h>
#include <folly/IndexedMemPool.h>
#include <folly/Portability.h>
#include <folly/detail/CacheLocality.h>
#include <folly/concurrency/CacheLocality.h>
#include <atomic>
#include <cassert>
@@ -26,7 +26,7 @@ static_assert(
std::is_standard_layout<CachelinePadded<int>>::value,
"CachelinePadded<T> must be standard-layout if T is.");
const int kCachelineSize = folly::detail::CacheLocality::kFalseSharingRange;
const int kCachelineSize = folly::CacheLocality::kFalseSharingRange;
template <int dataSize>
struct SizedData {
@@ -382,6 +382,7 @@ int Futex<DeterministicAtomic>::futexWake(int count, uint32_t wakeMask) {
DeterministicSchedule::afterSharedAccess();
return rv;
}
}
template <>
CacheLocality const& CacheLocality::system<test::DeterministicAtomic>() {
@@ -391,7 +392,6 @@ CacheLocality const& CacheLocality::system<test::DeterministicAtomic>() {
template <>
Getcpu::Func AccessSpreader<test::DeterministicAtomic>::pickGetcpuFunc() {
return &DeterministicSchedule::getcpu;
}
return &detail::DeterministicSchedule::getcpu;
}
}
@@ -28,8 +28,8 @@
#include <vector>
#include <folly/ScopeGuard.h>
#include <folly/concurrency/CacheLocality.h>
#include <folly/detail/AtomicUtils.h>
#include <folly/detail/CacheLocality.h>
#include <folly/detail/Futex.h>
#include <folly/portability/Semaphore.h>
@@ -499,8 +499,9 @@ FutexResult Futex<test::DeterministicAtomic>::futexWaitImpl(
std::chrono::time_point<std::chrono::system_clock>* absSystemTime,
std::chrono::time_point<std::chrono::steady_clock>* absSteadyTime,
uint32_t waitMask);
}
template <>
Getcpu::Func AccessSpreader<test::DeterministicAtomic>::pickGetcpuFunc();
}
} // namespace folly::detail