Commit efc07505 authored by Dave Watson, committed by Facebook Github Bot

update small locks benchmark

Summary: Add fairness benchmark, contended benchmark

Reviewed By: davidtgoldblatt

Differential Revision: D3706680

fbshipit-source-id: 01814fcd6b0cc3dd12815a2b2c248abc6a3771ec
parent eab8d3c3
@@ -16,6 +16,225 @@
#include <folly/Benchmark.h>
#include <folly/SmallLocks.h>
#include <algorithm>
#include <atomic>
#include <chrono>
#include <cmath>
#include <condition_variable>
#include <cstdio>
#include <cstdlib>
#include <mutex>
#include <numeric>
#include <thread>
#include <vector>
/* "Work cycle" is just an additional nop loop iteration.
* A smaller number of work cyles will result in more contention,
* which is what we're trying to measure. The relative ratio of
* locked to unlocked cycles will simulate how big critical sections
* are in production code
*/
DEFINE_int32(work, 100, "Number of work cycles");
DEFINE_int32(unlocked_work, 1000, "Number of unlocked work cycles");
DEFINE_int32(
threads,
std::thread::hardware_concurrency(),
"Number of threads for fairness test");
static void burn(size_t n) {
for (size_t i = 0; i < n; ++i) {
folly::doNotOptimizeAway(i);
}
}
namespace {
struct SimpleBarrier {
explicit SimpleBarrier(size_t count) : lock_(), cv_(), count_(count) {}
  void wait() {
    std::unique_lock<std::mutex> lockHeld(lock_);
    if (++num_ == count_) {
      cv_.notify_all();
    } else {
      // Wait on a predicate so a spurious wakeup cannot release a
      // thread before all count_ participants have arrived.
      cv_.wait(lockHeld, [&] { return num_ >= count_; });
    }
  }
private:
std::mutex lock_;
std::condition_variable cv_;
size_t num_{0};
size_t count_;
};
}
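
// Illustrative sketch, not used by the benchmarks themselves: SimpleBarrier
// releases every waiter at once after `count` arrivals, so measurement
// starts only when all workers are actually running. The worker count and
// loop body here are hypothetical.
namespace barrier_example {
void example() {
  SimpleBarrier gate(4 + 1); // 4 workers plus the coordinating thread
  std::vector<std::thread> workers;
  for (int i = 0; i < 4; ++i) {
    workers.emplace_back([&] {
      gate.wait(); // each worker blocks here...
      // ...timed work would start here, with all workers released together
    });
  }
  gate.wait(); // ...until the coordinator arrives and everyone proceeds
  for (auto& w : workers) {
    w.join();
  }
}
} // namespace barrier_example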
template <typename Lock>
class InitLock {
Lock lock_;
public:
InitLock() {
lock_.init();
}
void lock() {
lock_.lock();
}
void unlock() {
lock_.unlock();
}
};
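
// Context for the wrapper above: folly's small locks are POD types (so they
// can be embedded byte-sized inside larger structs) and have no constructor;
// they must be init()ed before first use. InitLock makes them
// default-constructible so the templated benchmarks can treat every Lock
// like std::mutex. An illustrative comparison, not used below:
namespace initlock_example {
void example() {
  folly::MicroSpinLock raw;
  raw.init(); // required: the POD lock starts uninitialized
  raw.lock();
  raw.unlock();

  InitLock<folly::MicroSpinLock> wrapped; // init() ran in the constructor
  wrapped.lock();
  wrapped.unlock();
}
} // namespace initlock_example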
template <typename Lock>
static void runContended(size_t numOps, size_t numThreads) {
folly::BenchmarkSuspender braces;
size_t totalthreads = std::thread::hardware_concurrency();
if (totalthreads < numThreads) {
totalthreads = numThreads;
}
size_t threadgroups = totalthreads / numThreads;
  // Pad each lock (and its counter) out to separate cache lines so
  // neighboring lock structs don't false-share.
  struct lockstruct {
    char padding1[128];
    Lock lock;
    char padding2[128];
    long value = 1;
  };
  // Note: calloc never runs constructors; this relies on zero-filled
  // memory being the unlocked state for all of these lock types (and
  // `value` therefore actually starts at 0, not 1).
  auto locks =
      (struct lockstruct*)calloc(threadgroups, sizeof(struct lockstruct));
  char padding3[128];
  (void)padding3;
std::vector<std::thread> threads(totalthreads);
SimpleBarrier runbarrier(totalthreads + 1);
for (size_t t = 0; t < totalthreads; ++t) {
threads[t] = std::thread([&, t, totalthreads] {
lockstruct* lock = &locks[t % threadgroups];
runbarrier.wait();
for (size_t op = 0; op < numOps; op += 1) {
lock->lock.lock();
burn(FLAGS_work);
lock->value++;
lock->lock.unlock();
burn(FLAGS_unlocked_work);
}
});
}
runbarrier.wait();
braces.dismissing([&] {
for (auto& thr : threads) {
thr.join();
}
});
}
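
// How the contended benchmark maps threads to locks, with hypothetical
// numbers: 16 hardware threads and numThreads = 4 give threadgroups = 4,
// and thread t uses locks[t % 4], so threads {0,4,8,12} share lock 0;
// every lock sees exactly numThreads contenders. A quick standalone check:
namespace mapping_example {
void check() {
  const size_t totalthreads = 16; // assumed hardware_concurrency()
  const size_t numThreads = 4;    // desired contenders per lock
  const size_t threadgroups = totalthreads / numThreads;
  std::vector<size_t> contenders(threadgroups, 0);
  for (size_t t = 0; t < totalthreads; ++t) {
    ++contenders[t % threadgroups];
  }
  for (size_t g = 0; g < threadgroups; ++g) {
    printf("lock %zu: %zu contenders\n", g, contenders[g]); // 4 each
  }
}
} // namespace mapping_example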
template <typename Lock>
static void runFairness() {
size_t numThreads = FLAGS_threads;
size_t totalthreads = std::thread::hardware_concurrency();
if (totalthreads < numThreads) {
totalthreads = numThreads;
}
  size_t threadgroups = totalthreads / numThreads;
  // Same cache-line padding and calloc caveats as in runContended().
  struct lockstruct {
    char padding1[128];
    Lock lock;
  };
  auto locks =
      (struct lockstruct*)calloc(threadgroups, sizeof(struct lockstruct));
  char padding3[64];
  (void)padding3;
std::vector<std::thread> threads(totalthreads);
std::atomic<bool> stop{false};
std::mutex rlock;
std::vector<long> results;
std::vector<std::chrono::microseconds> maxes;
std::vector<std::chrono::microseconds> aqTime;
std::vector<unsigned long> aqTimeSq;
SimpleBarrier runbarrier(totalthreads + 1);
for (size_t t = 0; t < totalthreads; ++t) {
threads[t] = std::thread([&, t, totalthreads] {
lockstruct* lock = &locks[t % threadgroups];
long value = 0;
std::chrono::microseconds max(0);
std::chrono::microseconds time(0);
unsigned long timeSq(0);
runbarrier.wait();
while (!stop) {
std::chrono::steady_clock::time_point prelock =
std::chrono::steady_clock::now();
lock->lock.lock();
std::chrono::steady_clock::time_point postlock =
std::chrono::steady_clock::now();
auto diff = std::chrono::duration_cast<std::chrono::microseconds>(
postlock - prelock);
time += diff;
timeSq += diff.count() * diff.count();
if (diff > max) {
max = diff;
}
burn(FLAGS_work);
value++;
lock->lock.unlock();
burn(FLAGS_unlocked_work);
}
{
std::lock_guard<std::mutex> g(rlock);
results.push_back(value);
maxes.push_back(max);
aqTime.push_back(time);
aqTimeSq.push_back(timeSq);
}
});
}
runbarrier.wait();
/* sleep override */
std::this_thread::sleep_for(std::chrono::seconds(2));
stop = true;
for (auto& thr : threads) {
thr.join();
}
  // Calculate some stats
  unsigned long sum = std::accumulate(results.begin(), results.end(), 0UL);
  double m = double(sum) / results.size();
  double accum = 0.0;
  std::for_each(results.begin(), results.end(), [&](const double d) {
    accum += (d - m) * (d - m);
  });
  double stdev = sqrt(accum / (results.size() - 1));
  std::chrono::microseconds mx = *std::max_element(maxes.begin(), maxes.end());
  std::chrono::microseconds agAqTime = std::accumulate(
      aqTime.begin(), aqTime.end(), std::chrono::microseconds(0));
  // Accumulate into unsigned long (0UL, not int 0) so the sum of squared
  // wait times can't overflow the accumulator type.
  unsigned long agAqTimeSq =
      std::accumulate(aqTimeSq.begin(), aqTimeSq.end(), 0UL);
  std::chrono::microseconds mean = agAqTime / sum;
  // Sample variance from running sums: (n*sum(x^2) - (sum x)^2) / (n*(n-1)),
  // computed in double to avoid integer overflow and truncation.
  double variance =
      (double(sum) * agAqTimeSq -
       double(agAqTime.count()) * agAqTime.count()) /
      sum / (sum - 1);
  double stddev2 = sqrt(variance);
  printf("Sum: %lu Mean: %.0f stddev: %.0f\n", sum, m, stdev);
  printf(
      "Lock time stats in us: mean %li stddev %.0f max %li\n",
      mean.count(),
      stddev2,
      mx.count());
}
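
// The lock-time stddev printed above comes from the one-pass shortcut
// Var = (n*sum(x^2) - (sum x)^2) / (n*(n-1)), which is why each thread
// only records aqTime (sum of x) and aqTimeSq (sum of x^2) instead of the
// full sample. A tiny numeric check of that identity on assumed values:
namespace variance_example {
void check() {
  const double x[] = {2, 4, 4, 4, 5, 5, 7, 9};
  const double n = 8;
  double sum = 0, sumSq = 0;
  for (double v : x) {
    sum += v;
    sumSq += v * v;
  }
  double shortcut = (n * sumSq - sum * sum) / (n * (n - 1)); // one pass
  double mean = sum / n, accum = 0;
  for (double v : x) {
    accum += (v - mean) * (v - mean);
  }
  double twoPass = accum / (n - 1); // textbook two-pass definition
  printf("%.4f == %.4f\n", shortcut, twoPass); // both print 4.5714
}
} // namespace variance_example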
BENCHMARK(StdMutexUncontendedBenchmark, iters) {
std::mutex lock;
while (iters--) {
lock.lock();
lock.unlock();
}
}
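
// folly's BENCHMARK macro hands the body an iteration count chosen by the
// framework (scaled until timings are stable), so the uncontended cost of
// one lock/unlock pair is the total time divided by iters. The general
// shape, simplified:
//
//   BENCHMARK(Name, iters) {
//     // loop `iters` times over the operation being measured
//   }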
BENCHMARK(MicroSpinLockUncontendedBenchmark, iters) {
folly::MicroSpinLock lock;
@@ -45,14 +264,6 @@ BENCHMARK(MicroLockUncontendedBenchmark, iters) {
}
}
struct VirtualBase {
virtual void foo() = 0;
virtual ~VirtualBase() {}
@@ -78,8 +289,137 @@ BENCHMARK(VirtualFunctionCall, iters) {
}
#endif
BENCHMARK_DRAW_LINE()
#define BENCH_BASE(...) FB_VA_GLUE(BENCHMARK_NAMED_PARAM, (__VA_ARGS__))
#define BENCH_REL(...) FB_VA_GLUE(BENCHMARK_RELATIVE_NAMED_PARAM, (__VA_ARGS__))
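
// Rough sketch of what these wrappers expand to (simplified; the real folly
// macros add registration plumbing): BENCH_BASE(std_mutex, 1thread, 1)
// becomes BENCHMARK_NAMED_PARAM(std_mutex, 1thread, 1), which registers a
// result row named "std_mutex(1thread)" whose body runs
//
//   std_mutex(iters, /*numThreads=*/1);
//
// with the framework-chosen iteration count. BENCH_REL uses
// BENCHMARK_RELATIVE_NAMED_PARAM, whose row is additionally reported as a
// percentage of the preceding BENCH_BASE row.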
static void std_mutex(size_t numOps, size_t numThreads) {
runContended<std::mutex>(numOps, numThreads);
}
static void folly_microspin(size_t numOps, size_t numThreads) {
runContended<InitLock<folly::MicroSpinLock>>(numOps, numThreads);
}
static void folly_picospin(size_t numOps, size_t numThreads) {
runContended<InitLock<folly::PicoSpinLock<uint16_t>>>(numOps, numThreads);
}
static void folly_microlock(size_t numOps, size_t numThreads) {
runContended<folly::MicroLock>(numOps, numThreads);
}
BENCHMARK_DRAW_LINE()
BENCH_BASE(std_mutex, 1thread, 1)
BENCH_REL(folly_microspin, 1thread, 1)
BENCH_REL(folly_picospin, 1thread, 1)
BENCH_REL(folly_microlock, 1thread, 1)
BENCHMARK_DRAW_LINE()
BENCH_BASE(std_mutex, 2thread, 2)
BENCH_REL(folly_microspin, 2thread, 2)
BENCH_REL(folly_picospin, 2thread, 2)
BENCH_REL(folly_microlock, 2thread, 2)
BENCHMARK_DRAW_LINE()
BENCH_BASE(std_mutex, 4thread, 4)
BENCH_REL(folly_microspin, 4thread, 4)
BENCH_REL(folly_picospin, 4thread, 4)
BENCH_REL(folly_microlock, 4thread, 4)
BENCHMARK_DRAW_LINE()
BENCH_BASE(std_mutex, 8thread, 8)
BENCH_REL(folly_microspin, 8thread, 8)
BENCH_REL(folly_picospin, 8thread, 8)
BENCH_REL(folly_microlock, 8thread, 8)
BENCHMARK_DRAW_LINE()
BENCH_BASE(std_mutex, 16thread, 16)
BENCH_REL(folly_microspin, 16thread, 16)
BENCH_REL(folly_picospin, 16thread, 16)
BENCH_REL(folly_microlock, 16thread, 16)
BENCHMARK_DRAW_LINE()
BENCH_BASE(std_mutex, 32thread, 32)
BENCH_REL(folly_microspin, 32thread, 32)
BENCH_REL(folly_picospin, 32thread, 32)
BENCH_REL(folly_microlock, 32thread, 32)
BENCHMARK_DRAW_LINE()
BENCH_BASE(std_mutex, 64thread, 64)
BENCH_REL(folly_microspin, 64thread, 64)
BENCH_REL(folly_picospin, 64thread, 64)
BENCH_REL(folly_microlock, 64thread, 64)
#define FairnessTest(type) \
{ \
printf(#type ": \n"); \
runFairness<type>(); \
}
int main(int argc, char** argv) {
gflags::ParseCommandLineFlags(&argc, &argv, true);
FairnessTest(std::mutex);
FairnessTest(InitLock<folly::MicroSpinLock>);
FairnessTest(InitLock<folly::PicoSpinLock<uint16_t>>);
FairnessTest(folly::MicroLock);
folly::runBenchmarks();
return 0;
}
/*
locks_benchmark --bm_min_iters=1000000
std::mutex:
Sum: 2895849 Mean: 90495 stddev: 3147
Lock time stats in us: mean 11 stddev 1483 max 17823
InitLock<folly::MicroSpinLock>:
Sum: 3114365 Mean: 97323 stddev: 92604
Lock time stats in us: mean 19 stddev 1379 max 235710
InitLock<folly::PicoSpinLock<uint16_t>>:
Sum: 1189713 Mean: 37178 stddev: 23295
Lock time stats in us: mean 51 stddev 3610 max 414093
folly::MicroLock:
Sum: 1890150 Mean: 59067 stddev: 26795
Lock time stats in us: mean 31 stddev 2272 max 70996
============================================================================
folly/test/SmallLocksBenchmark.cpp relative time/iter iters/s
============================================================================
StdMutexUncontendedBenchmark 25.97ns 38.51M
MicroSpinLockUncontendedBenchmark 16.26ns 61.49M
PicoSpinLockUncontendedBenchmark 15.92ns 62.82M
MicroLockUncontendedBenchmark 28.54ns 35.03M
VirtualFunctionCall 1.73ns 576.51M
----------------------------------------------------------------------------
----------------------------------------------------------------------------
std_mutex(1thread) 898.49ns 1.11M
folly_microspin(1thread) 87.68% 1.02us 975.85K
folly_picospin(1thread) 95.06% 945.15ns 1.06M
folly_microlock(1thread) 87.34% 1.03us 972.05K
----------------------------------------------------------------------------
std_mutex(2thread) 1.29us 773.03K
folly_microspin(2thread) 90.35% 1.43us 698.40K
folly_picospin(2thread) 126.11% 1.03us 974.90K
folly_microlock(2thread) 109.99% 1.18us 850.24K
----------------------------------------------------------------------------
std_mutex(4thread) 3.03us 330.05K
folly_microspin(4thread) 94.03% 3.22us 310.33K
folly_picospin(4thread) 127.16% 2.38us 419.70K
folly_microlock(4thread) 64.39% 4.71us 212.51K
----------------------------------------------------------------------------
std_mutex(8thread) 4.79us 208.81K
folly_microspin(8thread) 71.50% 6.70us 149.30K
folly_picospin(8thread) 54.58% 8.77us 113.96K
folly_microlock(8thread) 55.58% 8.62us 116.05K
----------------------------------------------------------------------------
std_mutex(16thread) 12.14us 82.39K
folly_microspin(16thread) 102.02% 11.90us 84.05K
folly_picospin(16thread) 62.30% 19.48us 51.33K
folly_microlock(16thread) 64.54% 18.81us 53.17K
----------------------------------------------------------------------------
std_mutex(32thread) 22.52us 44.41K
folly_microspin(32thread) 88.25% 25.52us 39.19K
folly_picospin(32thread) 46.04% 48.91us 20.45K
folly_microlock(32thread) 67.08% 33.57us 29.79K
----------------------------------------------------------------------------
std_mutex(64thread) 43.29us 23.10K
folly_microspin(64thread) 98.01% 44.17us 22.64K
folly_picospin(64thread) 48.62% 89.04us 11.23K
folly_microlock(64thread) 62.68% 69.07us 14.48K
============================================================================
*/