extract locality info from /proc/cpuinfo instead of sysfs

Summary: Cache locality information under /sys is dispersed across a very large number of files. This is a problem for short-lived processes due to direct overheads and lock contention in the kernel. This diff switches to a heuristic strategy that infers the interference pattern from /proc/cpuinfo instead of computing it exactly. This doesn't necessarily produce exactly the correct cache hierarchy info, but it yields the correct topological sort for machines that have only core-local and socket-local cache locality. Differential Revision: D16459331 fbshipit-source-id: a322c126d1a4775d015bfb81451dbc6ad6fcc0fd

extract locality info from /proc/cpuinfo instead of sysfs
Summary: Cache locality information under /sys is dispersed across a very large number of files. This is a problem for short-lived processes due to direct overheads and lock contention in the kernel. This diff switches to a heuristic strategy that infers the interference pattern from /proc/cpuinfo instead of computing it exactly. This doesn't necessarily produce exactly the correct cache hierarchy info, but it yields the correct topological sort for machines that have only core-local and socket-local cache locality. Differential Revision: D16459331 fbshipit-source-id: a322c126d1a4775d015bfb81451dbc6ad6fcc0fd
60be5ec6 · Nathan Bronson · Facebook Github Bot · 2af6f0c7 · 60be5ec6 · 60be5ec6
Commit 60be5ec6 authored Jul 27, 2019 by Nathan Bronson Committed by Facebook Github Bot Jul 27, 2019
3 changed files
--- a/folly/concurrency/CacheLocality.cpp
+++ b/folly/concurrency/CacheLocality.cpp
@@ -32,11 +32,11 @@ namespace folly {
 ///////////// CacheLocality
-/// Returns the best real CacheLocality information available
+/// Returns the CacheLocality information best for this machine
 static CacheLocality getSystemLocalityInfo() {
  if (kIsLinux) {
    try {
-      return CacheLocality::readFromSysfs();
+      return CacheLocality::readFromProcCpuinfo();
    } catch (...) {
      // keep trying
    }
@@ -187,6 +187,90 @@ CacheLocality CacheLocality::readFromSysfs() {
  });
 }
+/// Cheap pre-filter for /proc/cpuinfo lines: accepts only lines longer
+/// than four characters whose first character is 'p' or 'c'.  The keys
+/// matched by readFromProcCpuinfoLines ("processor", "physical id" and
+/// "core id") all begin with one of those two letters.
+static bool procCpuinfoLineRelevant(std::string const& line) {
+  if (line.size() <= 4) {
+    return false;
+  }
+  char const first = line.front();
+  return first == 'p' || first == 'c';
+}
+/// Estimates the cache hierarchy from the lines of /proc/cpuinfo.
+///
+/// Only the "processor", "physical id" and "core id" keys are consulted.
+/// The result assumes three cache levels: L1 and L2 private to each core
+/// and one L3 per socket.  The number of hardware threads per core is
+/// taken from the first core in sorted order.
+///
+/// Throws std::runtime_error if no "processor" entry is found, or if the
+/// processor numbers are not exactly 0..n-1 (for example because some
+/// CPUs are missing from the listing), since they are used as indexes
+/// into the per-cpu locality table.  Exceptions raised by
+/// parseLeadingNumber, if any, propagate to the caller.
+CacheLocality CacheLocality::readFromProcCpuinfoLines(
+    std::vector<std::string> const& lines) {
+  // rfind(prefix, 0) can only match at offset 0, so this is a prefix
+  // test that never scans the rest of the line
+  auto const startsWith = [](std::string const& text, char const* prefix) {
+    return text.rfind(prefix, 0) == 0;
+  };
+  size_t physicalId = 0;
+  size_t coreId = 0;
+  // (physical id, core id, processor) so that sorting groups the
+  // hyperthreads of a core together, and the cores of a socket together
+  std::vector<std::tuple<size_t, size_t, size_t>> cpus;
+  for (auto iter = lines.rbegin(); iter != lines.rend(); ++iter) {
+    auto const& line = *iter;
+    if (!procCpuinfoLineRelevant(line)) {
+      continue;
+    }
+    auto const sepIndex = line.find(':');
+    if (sepIndex == std::string::npos || sepIndex + 2 > line.size()) {
+      continue;
+    }
+    // the value starts two characters after the ':'
+    auto const arg = line.substr(sepIndex + 2);
+    // "physical id" is socket, which is the most important locality
+    // context.  "core id" is a real core, so two "processor" entries with
+    // the same physical id and core id are hyperthreads of each other.
+    // "processor" is the top line of each record, so when we hit it in
+    // the reverse order then we can emit a record.
+    if (startsWith(line, "physical id")) {
+      physicalId = parseLeadingNumber(arg);
+    } else if (startsWith(line, "core id")) {
+      coreId = parseLeadingNumber(arg);
+    } else if (startsWith(line, "processor")) {
+      auto const cpu = parseLeadingNumber(arg);
+      cpus.emplace_back(physicalId, coreId, cpu);
+    }
+  }
+  if (cpus.empty()) {
+    throw std::runtime_error("no CPUs parsed from /proc/cpuinfo");
+  }
+  std::sort(cpus.begin(), cpus.end());
+  // count the leading entries that share the first entry's socket and core
+  size_t cpusPerCore = 1;
+  while (cpusPerCore < cpus.size() &&
+         std::get<0>(cpus[cpusPerCore]) == std::get<0>(cpus[0]) &&
+         std::get<1>(cpus[cpusPerCore]) == std::get<1>(cpus[0])) {
+    ++cpusPerCore;
+  }
+  // we can't tell the real cache hierarchy from /proc/cpuinfo, but it
+  // works well enough to assume there are 3 levels, L1 and L2 per-core
+  // and L3 per socket
+  std::vector<size_t> numCachesByLevel;
+  numCachesByLevel.push_back(cpus.size() / cpusPerCore);
+  numCachesByLevel.push_back(cpus.size() / cpusPerCore);
+  numCachesByLevel.push_back(std::get<0>(cpus.back()) + 1);
+  // The processor numbers come from the input and are used as indexes
+  // below, so they must be a permutation of 0..n-1.  A number that is
+  // too large would write past the end of the table, and a repeated
+  // number would leave some other slot unassigned.
+  auto const unassigned = cpus.size();
+  std::vector<size_t> indexes(cpus.size(), unassigned);
+  for (size_t i = 0; i < cpus.size(); ++i) {
+    auto const cpu = std::get<2>(cpus[i]);
+    if (cpu >= cpus.size() || indexes[cpu] != unassigned) {
+      throw std::runtime_error(
+          "processor numbers in /proc/cpuinfo are not exactly 0..n-1");
+    }
+    indexes[cpu] = i;
+  }
+  return CacheLocality{
+      cpus.size(), std::move(numCachesByLevel), std::move(indexes)};
+}
+/// Reads /proc/cpuinfo, keeps the lines accepted by
+/// procCpuinfoLineRelevant, and hands them to readFromProcCpuinfoLines.
+///
+/// Throws std::runtime_error if the file cannot be opened; exceptions
+/// from readFromProcCpuinfoLines propagate.  At most 20000 relevant
+/// lines are retained.
+CacheLocality CacheLocality::readFromProcCpuinfo() {
+  std::vector<std::string> lines;
+  {
+    std::ifstream xi("/proc/cpuinfo");
+    if (xi.fail()) {
+      throw std::runtime_error("unable to open /proc/cpuinfo");
+    }
+    // std::getline grows the string as needed.  Reading into a fixed-size
+    // char buffer with istream::getline sets failbit on an over-long line,
+    // which would end the loop and silently drop the rest of the file.
+    std::string str;
+    while (lines.size() < 20000 && std::getline(xi, str)) {
+      if (procCpuinfoLineRelevant(str)) {
+        // str is left in a moved-from state; std::getline erases it
+        // before reading the next line, so reusing it is safe
+        lines.emplace_back(std::move(str));
+      }
+    }
+  }
+  return readFromProcCpuinfoLines(lines);
+}
 CacheLocality CacheLocality::uniform(size_t numCpus) {
  CacheLocality rv;

--- a/folly/concurrency/CacheLocality.h
+++ b/folly/concurrency/CacheLocality.h
@@ -111,6 +111,18 @@ struct CacheLocality {
  /// Throws an exception if no cache information can be loaded.
  static CacheLocality readFromSysfs();
+  /// Same as readFromProcCpuinfo(), except that the lines of /proc/cpuinfo
+  /// are supplied by the caller instead of being read from the file system.
+  static CacheLocality readFromProcCpuinfoLines(
+      std::vector<std::string> const& lines);
+  /// Returns an estimate of the CacheLocality information by reading
+  /// /proc/cpuinfo.  This isn't as accurate as readFromSysfs() (it
+  /// assumes L1 and L2 caches per core and an L3 per socket), but is a
+  /// lot faster because the info isn't scattered across hundreds of
+  /// files.  Throws an exception if no cache information can be loaded.
+  static CacheLocality readFromProcCpuinfo();
  /// Returns a usable (but probably not reflective of reality)
  /// CacheLocality structure with the specified number of cpus and a
  /// single cache level that associates one cpu per cache.

--- a/folly/concurrency/test/CacheLocalityTest.cpp
+++ b/folly/concurrency/test/CacheLocalityTest.cpp