fix SIOF in CacheLocality.h's AccessSpreader

Summary:This diff moves all data accessed during AccessSpreader<>::current(x) into the .data segment, avoiding SIOF without adding indirection or dynamic gating as would be the case for normal singleton-like constructs. The diff also trims the AccessSpreader API to include only those methods that people actually seem to use. Reviewed By: djwatson Differential Revision: D2945205 fb-gh-sync-id: 847e31adc4450217f4ed0575686be261fb504d7c shipit-source-id: 847e31adc4450217f4ed0575686be261fb504d7c

fix SIOF in CacheLocality.h's AccessSpreader
Summary:This diff moves all data accessed during AccessSpreader<>::current(x) into the .data segment, avoiding SIOF without adding indirection or dynamic gating as would be the case for normal singleton-like constructs. The diff also trims the AccessSpreader API to include only those methods that people actually seem to use. Reviewed By: djwatson Differential Revision: D2945205 fb-gh-sync-id: 847e31adc4450217f4ed0575686be261fb504d7c shipit-source-id: 847e31adc4450217f4ed0575686be261fb504d7c
bf385e23 · Nathan Bronson · facebook-github-bot-4 · d31eb712 · bf385e23 · bf385e23
Commit bf385e23 authored Feb 22, 2016 by Nathan Bronson Committed by facebook-github-bot-4 Feb 22, 2016
5 changed files
--- a/folly/detail/CacheLocality.cpp
+++ b/folly/detail/CacheLocality.cpp
@@ -28,6 +28,8 @@
 #include <folly/Format.h>
 #include <folly/ScopeGuard.h>

+DECLARE_ACCESS_SPREADER_TYPE(std::atomic)
+
 namespace folly {
 namespace detail {

@@ -60,8 +62,8 @@ static CacheLocality getSystemLocalityInfo() {

 template <>
 const CacheLocality& CacheLocality::system<std::atomic>() {
-  static CacheLocality cache(getSystemLocalityInfo());
-  return cache;
+  static auto* cache = new CacheLocality(getSystemLocalityInfo());
+  return *cache;
 }

 // Each level of cache has sharing sets, which are the set of cpus
@@ -110,8 +112,7 @@ CacheLocality CacheLocality::readFromSysfsTree(
    std::vector<size_t> levels;
    for (size_t index = 0;; ++index) {
      auto dir =
-          format("/sys/devices/system/cpu/cpu{}/cache/index{}/", cpu, index)
-              .str();
+          sformat("/sys/devices/system/cpu/cpu{}/cache/index{}/", cpu, index);
      auto cacheType = mapping(dir + "type");
      auto equivStr = mapping(dir + "shared_cpu_list");
      if (cacheType.size() == 0 || equivStr.size() == 0) {
@@ -208,9 +209,7 @@ CacheLocality CacheLocality::uniform(size_t numCpus) {

 ////////////// Getcpu

-/// Resolves the dynamically loaded symbol __vdso_getcpu, returning null
-/// on failure
-static Getcpu::Func loadVdsoGetcpu() {
+Getcpu::Func Getcpu::resolveVdsoFunc() {
 #if defined(_MSC_VER) || defined(__BIONIC__)
  return nullptr;
 #else
@@ -232,11 +231,6 @@ static Getcpu::Func loadVdsoGetcpu() {
 #endif
 }

-Getcpu::Func Getcpu::vdsoFunc() {
-  static Func func = loadVdsoGetcpu();
-  return func;
-}
-
 #ifdef FOLLY_TLS
 /////////////// SequentialThreadId

@@ -250,40 +244,10 @@ FOLLY_TLS size_t SequentialThreadId<std::atomic>::currentId(0);
 /////////////// AccessSpreader

 template <>
-const AccessSpreader<std::atomic> AccessSpreader<std::atomic>::stripeByCore(
-    CacheLocality::system<>().numCachesByLevel.front());
-
-template <>
-const AccessSpreader<std::atomic> AccessSpreader<std::atomic>::stripeByChip(
-    CacheLocality::system<>().numCachesByLevel.back());
-
-template <>
-AccessSpreaderArray<std::atomic, 128>
-    AccessSpreaderArray<std::atomic, 128>::sharedInstance = {};
-
-/// Always claims to be on CPU zero, node zero
-static int degenerateGetcpu(unsigned* cpu, unsigned* node, void* /* unused */) {
-  if (cpu != nullptr) {
-    *cpu = 0;
-  }
-  if (node != nullptr) {
-    *node = 0;
-  }
-  return 0;
+Getcpu::Func AccessSpreader<std::atomic>::pickGetcpuFunc() {
+  auto best = Getcpu::resolveVdsoFunc();
+  return best ? best : &FallbackGetcpuType::getcpu;
 }

-template <>
-Getcpu::Func AccessSpreader<std::atomic>::pickGetcpuFunc(size_t numStripes) {
-  if (numStripes == 1) {
-    // there's no need to call getcpu if there is only one stripe.
-    // This should not be common, so we don't want to waste a test and
-    // branch in the main code path, but we might as well use a faster
-    // function pointer
-    return &degenerateGetcpu;
-  } else {
-    auto best = Getcpu::vdsoFunc();
-    return best ? best : &FallbackGetcpuType::getcpu;
-  }
-}
-}
-} // namespace folly::detail
+} // namespace detail
+} // namespace folly
--- a/folly/detail/CacheLocality.h
+++ b/folly/detail/CacheLocality.h
--- a/folly/test/CacheLocalityTest.cpp
+++ b/folly/test/CacheLocalityTest.cpp
--- a/folly/test/DeterministicSchedule.cpp
+++ b/folly/test/DeterministicSchedule.cpp
@@ -23,6 +23,8 @@
 #include <unordered_map>
 #include <assert.h>

+DECLARE_ACCESS_SPREADER_TYPE(folly::test::DeterministicAtomic)
+
 namespace folly {
 namespace test {

@@ -352,22 +354,7 @@ CacheLocality const& CacheLocality::system<test::DeterministicAtomic>() {
 }

 template <>
-const AccessSpreader<test::DeterministicAtomic>
-    AccessSpreader<test::DeterministicAtomic>::stripeByCore(
-        CacheLocality::system<>().numCachesByLevel.front());
-
-template <>
-const AccessSpreader<test::DeterministicAtomic>
-    AccessSpreader<test::DeterministicAtomic>::stripeByChip(
-        CacheLocality::system<>().numCachesByLevel.back());
-
-template <>
-AccessSpreaderArray<test::DeterministicAtomic, 128>
-    AccessSpreaderArray<test::DeterministicAtomic, 128>::sharedInstance = {};
-
-template <>
-Getcpu::Func AccessSpreader<test::DeterministicAtomic>::pickGetcpuFunc(
-    size_t /* numStripes */) {
+Getcpu::Func AccessSpreader<test::DeterministicAtomic>::pickGetcpuFunc() {
  return &DeterministicSchedule::getcpu;
 }
 }

--- a/folly/test/DeterministicSchedule.h
+++ b/folly/test/DeterministicSchedule.h
@@ -400,7 +400,6 @@ FutexResult Futex<test::DeterministicAtomic>::futexWaitImpl(
    uint32_t waitMask);

 template <>
-Getcpu::Func AccessSpreader<test::DeterministicAtomic>::pickGetcpuFunc(
-    size_t numStripes);
+Getcpu::Func AccessSpreader<test::DeterministicAtomic>::pickGetcpuFunc();
 }
 } // namespace folly::detail