Fix ParkingLot memory ordering bug

Summary:
```
auto x = std::atomic<std::uint64_t>{0};
auto y = std::atomic<std::uint64_t>{0};

// thread 1
x.store(1, std::memory_order_release);
auto one = y.load(std::memory_order_seq_cst);

// thread 2
y.fetch_add(1, std::memory_order_seq_cst);
auto two = x.load(std::memory_order_seq_cst);
```
Here it is possible for both `one` and `two` to end up with the value `0`. The code in ParkingLot assumed that this would not be possible, so the counter used to track the number of waiters could get reordered with respect to the loads around it. This diff adds a seq_cst fence to ensure that an unparking thread's stores made before unparking are always globally ordered _before_ its load of the counter.

Reviewed By: yfeldblum, ot
Differential Revision: D28972810
fbshipit-source-id: 06eb6a2e6df6b00bf07ac8454a79257a5276e154

Fix ParkingLot memory ordering bug
Summary:
```
auto x = std::atomic<std::uint64_t>{0};
auto y = std::atomic<std::uint64_t>{0};

// thread 1
x.store(1, std::memory_order_release);
auto one = y.load(std::memory_order_seq_cst);

// thread 2
y.fetch_add(1, std::memory_order_seq_cst);
auto two = x.load(std::memory_order_seq_cst);
```
Here it is possible for both `one` and `two` to end up with the value `0`. The code in ParkingLot assumed that this would not be possible, so the counter used to track the number of waiters could get reordered with respect to the loads around it. This diff adds a seq_cst fence to ensure that an unparking thread's stores made before unparking are always globally ordered _before_ its load of the counter.

Reviewed By: yfeldblum, ot
Differential Revision: D28972810
fbshipit-source-id: 06eb6a2e6df6b00bf07ac8454a79257a5276e154
ff7ab9db · Aaryaman Sagar · Facebook GitHub Bot · d418b5ee · ff7ab9db · ff7ab9db
Commit ff7ab9db authored Jun 12, 2021 by Aaryaman Sagar Committed by Facebook GitHub Bot Jun 27, 2021
Hide whitespace changes
Inline Side-by-side

Showing with 49 additions and 0 deletions

folly/synchronization/ParkingLot.h folly/synchronization/ParkingLot.h +1 -0

folly/synchronization/test/ParkingLotTest.cpp folly/synchronization/test/ParkingLotTest.cpp +48 -0

No files found.
--- a/folly/synchronization/ParkingLot.h
+++ b/folly/synchronization/ParkingLot.h
@@ -300,6 +300,7 @@ void ParkingLot<Data>::unpark(const Key bits, Func&& func) {
  // B: Must be seq_cst.  Matches A.  If true, A *must* see in seq_cst
  // order any atomic updates in toPark() (and matching updates that
  // happen before unpark is called)
+  std::atomic_thread_fence(std::memory_order_seq_cst);
  if (bucket.count_.load(std::memory_order_seq_cst) == 0) {
    return;
  }

--- a/folly/synchronization/test/ParkingLotTest.cpp
+++ b/folly/synchronization/test/ParkingLotTest.cpp
@@ -61,6 +61,54 @@ TEST(ParkingLot, multilot) {
  large.join();
 }
+TEST(ParkingLot, StressTestPingPong) {  // two threads hand a counter back and forth via park/unpark
+  auto lot = ParkingLot<std::uint32_t>{};
+  auto one = std::atomic<std::uint64_t>{0};  // written by threadTwo, waited on by threadOne
+  auto two = std::atomic<std::uint64_t>{0};  // written by threadOne, polled by threadTwo
+  auto testDone = std::atomic<bool>{false};  // set by the main thread once the sleep below ends
+  auto threadOneDone = std::atomic<bool>{false};  // lets threadTwo leave its spin loop
+  auto threadOne = std::thread{[&]() {
+    auto local = std::uint64_t{0};  // last value of `one` this thread has observed
+    while (!testDone.load(std::memory_order_relaxed)) {
+      // wait while `one` is still equal to `local`; the other thread unblocks
+      // us because it stores to `one` and unparks before spinning itself
+      lot.park(
+          &one, -1, [&]() { return one.load() == local; }, []() {});
+      local = one.load(std::memory_order_acquire);
+      two.store(local, std::memory_order_release);  // echo the value back to threadTwo
+    }
+    threadOneDone.store(true, std::memory_order_release);
+  }};
+  auto threadTwo = std::thread{[&]() {
+    for (auto i = std::uint64_t{1}; true; ++i) {
+      auto local = two.load(std::memory_order_acquire);
+      assert(local < i);  // `two` only ever holds values stored to `one` on earlier iterations
+      // publish the new value, then unblock the other thread
+      one.store(i, std::memory_order_release);
+      lot.unpark(&one, [&](auto&&) { return UnparkControl::RemoveBreak; });
+      // spinning (vs sleeping with ParkingLot::park) happens to expose the bug
+      // more frequently in practice
+      while (two.load(std::memory_order_acquire) == local) {
+        if (threadOneDone.load(std::memory_order_acquire)) {
+          return;  // threadOne has left its loop, so `two` will not change again
+        }
+      }
+    }
+  }};
+  /* sleep override */
+  std::this_thread::sleep_for(std::chrono::seconds{10});  // let the threads ping-pong for 10s
+  testDone.store(true);
+  threadOne.join();
+  threadTwo.join();
+}
 // This is not possible to implement with Futex, because futex
 // and the native linux syscall are 32-bit only.
 TEST(ParkingLot, LargeWord) {