Commit e2d713fc authored by Tingzhe Zhou, committed by Facebook Github Bot

Change file name and add benchmark

Summary:
Make the file name consistent with its class name
Add benchmark code
Attach performance data

Reviewed By: magedm

Differential Revision: D9250954

fbshipit-source-id: b9f9b90cb83ca319d91fe65667d0072511be5e26
parent c2e51a8e
@@ -18,7 +18,8 @@
#include <boost/thread.hpp>
#include <folly/Random.h>
#include <folly/SpinLock.h>
#include <folly/experimental/ConcurrentPriorityQueue.h>
#include <folly/experimental/FlatCombiningPriorityQueue.h>
#include <folly/experimental/RelaxedConcurrentPriorityQueue.h>
#include <folly/portability/GFlags.h>
#include <folly/portability/GTest.h>
#include <folly/test/DeterministicSchedule.h>
@@ -26,6 +27,7 @@
using namespace folly;
DEFINE_bool(bench, false, "run benchmark");
DEFINE_int32(reps, 1, "number of reps");
DEFINE_int64(ops, 32, "number of operations per rep");
DEFINE_int64(elems, 64, "number of elements");
@@ -89,7 +91,7 @@ TEST(CPQ, BasicOpsTest) {
/// Execute the function on nthreads threads; returns the elapsed time in nanoseconds.
template <typename Func>
static void run_once(const Func& fn) {
static uint64_t run_once(const Func& fn) {
boost::barrier barrier_start{nthreads + 1};
std::vector<std::thread> threads(nthreads);
for (uint32_t tid = 0; tid < nthreads; ++tid) {
@@ -100,9 +102,17 @@ static void run_once(const Func& fn) {
}
barrier_start.wait(); // start the execution
auto tbegin = std::chrono::steady_clock::now();
for (auto& t : threads) {
t.join();
}
// end time measurement
uint64_t duration = 0;
auto tend = std::chrono::steady_clock::now();
duration = std::chrono::duration_cast<std::chrono::nanoseconds>(tend - tbegin)
.count();
return duration;
}
template <class PriorityQueue>
@@ -816,3 +826,746 @@ TEST(CPQ, DSchedMixedRelaxedTest) {
DeterministicAtomic>,
DeterministicAtomic>();
}
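// Baseline container for the accuracy benchmark below: a plain FIFO queue
// (std::queue) exposing the same push/pop interface as the priority queues.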
template <typename T>
class Queue {
std::queue<T> q_;
public:
void push(const T& val) {
q_.push(val);
}
void pop(T& val) {
val = q_.front();
q_.pop();
}
};
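// Baseline priority queue for the benchmarks ("GL" in the results):
// std::priority_queue protected by a single global mutex. pop() re-acquires
// the lock in a loop until an element is available.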
template <typename T>
class GlobalLockPQ {
std::priority_queue<T> q_;
std::mutex m_;
public:
void push(const T& val) {
std::lock_guard<std::mutex> g(m_);
q_.push(val);
}
void pop(T& val) {
while (true) {
std::lock_guard<std::mutex> g(m_);
if (q_.empty()) {
continue;
}
val = q_.top();
q_.pop();
return;
}
}
};
template <class PriorityQueue>
static uint64_t producer_consumer_test(
std::string name,
uint32_t PushThr,
uint32_t PopThr,
uint64_t initial_size) {
int ops = 1 << 18;
int reps = 15;
if (name.find("RCPQ") != std::string::npos) {
ops <<= 3;
}
uint64_t min = UINTMAX_MAX;
uint64_t max = 0;
uint64_t sum = 0;
uint32_t total_threads = PushThr + PopThr;
for (int r = 0; r < reps; ++r) {
uint64_t dur;
PriorityQueue pq;
folly::Random::DefaultGenerator rng;
rng.seed(initial_size);
// initialize the queue according to initial_size
for (uint64_t i = 0; i < initial_size; i++) {
int val = folly::Random::rand32(rng) % ops;
pq.push(val);
}
auto fn_popthr = [&](uint32_t tid) {
for (int i = tid; i < ops; i += PopThr) {
int val;
pq.pop(val);
}
};
auto fn_pushthr = [&](uint32_t tid) {
folly::Random::DefaultGenerator rng_t;
rng_t.seed(tid);
for (int i = tid; i < ops; i += PushThr) {
int val = folly::Random::rand32(rng_t) % ops;
pq.push(val);
}
};
boost::barrier barrier_start{total_threads + 1};
std::vector<std::thread> threads_push(PushThr);
for (uint32_t tid = 0; tid < PushThr; ++tid) {
threads_push[tid] = std::thread([&, tid] {
barrier_start.wait();
fn_pushthr(tid);
});
}
std::vector<std::thread> threads_pop(PopThr);
for (uint32_t tid = 0; tid < PopThr; ++tid) {
threads_pop[tid] = std::thread([&, tid] {
barrier_start.wait();
fn_popthr(tid);
});
}
barrier_start.wait(); // start the execution
// begin time measurement
auto tbegin = std::chrono::steady_clock::now();
for (auto& t : threads_push) {
t.join();
}
for (auto& t : threads_pop) {
t.join();
}
// end time measurement
auto tend = std::chrono::steady_clock::now();
dur = std::chrono::duration_cast<std::chrono::nanoseconds>(tend - tbegin)
.count();
sum += dur;
min = std::min(min, dur);
max = std::max(max, dur);
}
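// Report per-pair latency: each rep performs `ops` pushes and `ops` pops, so
// dividing a rep's duration by `ops` gives the time for one push+pop pair.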
uint64_t avg = sum / reps;
std::cout << std::setw(12) << name;
std::cout << " " << std::setw(8) << max / ops << " ns";
std::cout << " " << std::setw(8) << avg / ops << " ns";
std::cout << " " << std::setw(8) << min / ops << " ns";
std::cout << std::endl;
return min;
}
template <class PriorityQueue>
static uint64_t throughput_test(std::string name, uint64_t initial_size) {
int ops = 1 << 18;
int reps = 15;
uint64_t min = UINTMAX_MAX;
uint64_t max = 0;
uint64_t sum = 0;
for (int r = 0; r < reps; ++r) {
uint64_t dur;
PriorityQueue pq;
folly::Random::DefaultGenerator rng;
rng.seed(initial_size);
// initialize the queue according to initial_size
for (uint64_t i = 0; i < initial_size; i++) {
int val = folly::Random::rand32(rng) % (ops + 1);
pq.push(val);
}
auto fn = [&](uint32_t tid) {
folly::Random::DefaultGenerator rng_tl;
rng_tl.seed(tid);
uint32_t counter = 0;
for (int i = tid; i < ops; i += nthreads) {
int val;
counter++;
if (counter % 2) {
val = folly::Random::rand32(rng_tl) % (ops + 1);
pq.push(val);
} else {
pq.pop(val);
}
}
};
dur = run_once(fn);
sum += dur;
min = std::min(min, dur);
max = std::max(max, dur);
}
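// Report per-operation latency: each rep performs `ops` operations in total
// (split across nthreads, alternating push and pop), so dividing a rep's
// duration by `ops` gives the time for a single push or pop.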
uint64_t avg = sum / reps;
std::cout << std::setw(12) << name;
std::cout << " " << std::setw(8) << max / ops << " ns";
std::cout << " " << std::setw(8) << avg / ops << " ns";
std::cout << " " << std::setw(8) << min / ops << " ns";
std::cout << std::endl;
return min;
}
template <class PriorityQueue>
static void
accuracy_test(std::string name, uint64_t initial_size, uint32_t top_percent) {
int avg = 0;
int reps = 15;
int valid = initial_size / top_percent;
if (valid < 1) {
return;
}
int target = initial_size - valid;
for (int r = 0; r < reps; ++r) {
PriorityQueue pq;
std::unordered_set<int> filter;
folly::Random::DefaultGenerator rng;
rng.seed(initial_size + r);
// initialize the queue according to initial_size
// eliminate repeated priorities
for (uint64_t i = 0; i < initial_size; i++) {
int val;
do {
val = folly::Random::rand32(rng) % initial_size;
} while (filter.find(val) != filter.end());
filter.insert(val);
pq.push(val);
}
int counter = 0;
int stop = valid;
for (uint64_t i = 0; i < initial_size; i++) {
int val;
pq.pop(val);
if (val >= target) {
stop--;
}
if (stop > 0 && val < target) {
counter++;
}
if (stop == 0) {
break;
}
}
avg += counter;
}
avg /= reps;
std::cout << std::setw(16) << name << " ";
std::cout << "Lower priority popped: " << avg;
std::cout << std::endl;
}
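// Worked example of the accuracy metric (numbers match the results below):
// with initial_size = 65536 and top_percent = 100, valid = 65536 / 100 = 655
// and target = 65536 - 655 = 64881. The test keeps popping until all 655
// values >= target have been popped, counting how many values < target were
// popped along the way; that count ("Lower priority popped") is averaged over
// reps.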
using FCPQ = folly::FlatCombiningPriorityQueue<int>;
TEST(CPQ, ThroughputBench) {
if (!FLAGS_bench) {
return;
}
std::vector<int> test_sizes = {64, 512, 65536};
std::vector<int> nthrs = {1, 2, 4, 8, 12, 14, 16, 28, 32, 56};
std::cout << "Threads have equal chance to push and pop. \n"
<< "The bench caculates the avg execution time for\n"
<< "one operation (push OR pop).\n"
<< "GL : std::priority_queue protected by global lock\n"
<< "FL : flatcombinning priority queue\n"
<< "RCPQ: the relaxed concurrent priority queue\n"
<< std::endl;
std::cout << "\nTest_name, Max time, Avg time, Min time" << std::endl;
for (auto s : test_sizes) {
std::cout << "\n ------ Initial size: " << s << " ------" << std::endl;
for (int i : nthrs) {
nthreads = i;
std::cout << "Thread number: " << i << std::endl;
throughput_test<GlobalLockPQ<int>>("GL", s);
throughput_test<FCPQ>("FC", s);
throughput_test<RelaxedConcurrentPriorityQueue<int>>("RCPQ", s);
}
}
}
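// The benchmarks in this file only run when --bench is passed. Assuming the
// usual gtest/gflags main, an invocation might look like the following (the
// binary name is only an example):
//   ./relaxed_concurrent_priority_queue_test --bench --gtest_filter=CPQ.ThroughputBench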
TEST(CPQ, ProducerConsumerBench) {
if (!FLAGS_bench) {
return;
}
std::vector<int> test_sizes = {0, 512, 65536};
std::vector<int> nthrs = {1, 2, 4, 8, 12, 16, 24};
std::cout << "<Producer, Consumer> pattern \n"
<< "The bench caculates the avg execution time for\n"
<< "push AND pop pair(two operations).\n"
<< "GL : std::priority_queue protected by global lock\n"
<< "FL : flatcombinning priority queue\n"
<< "RCPQ SPN: RCPQ spinning\n"
<< "RCPQ BLK: RCPQ blocking\n"
<< std::endl;
for (int s : test_sizes) {
std::cout << "\n ------ Scalability ------" << std::endl;
for (int m : nthrs) {
for (int n : nthrs) {
if (m != n) {
continue;
}
std::cout << "<" << m << " , " << n << "> , size = " << s << ":"
<< std::endl;
producer_consumer_test<GlobalLockPQ<int>>("GL", m, n, s);
producer_consumer_test<FCPQ>("FC", m, n, s);
producer_consumer_test<
RelaxedConcurrentPriorityQueue<int, false, false>>(
"RCPQ SPN", m, n, s);
producer_consumer_test<
RelaxedConcurrentPriorityQueue<int, true, false>>(
"RCPQ BLK", m, n, s);
}
}
std::cout << "\n ------ Unbalanced(Producer<Consumer) ------" << std::endl;
for (int m : nthrs) {
for (int n : nthrs) {
if (m > 4 || n - 4 <= m) {
continue;
}
std::cout << "<" << m << " , " << n << "> , size = " << s << ":"
<< std::endl;
producer_consumer_test<GlobalLockPQ<int>>("GL", m, n, s);
producer_consumer_test<FCPQ>("FC", m, n, s);
producer_consumer_test<
RelaxedConcurrentPriorityQueue<int, false, false>>(
"RCPQ SPN", m, n, s);
producer_consumer_test<
RelaxedConcurrentPriorityQueue<int, true, false>>(
"RCPQ BLK", m, n, s);
}
}
std::cout << "\n ------ Unbalanced(Producer>Consumer) ------" << std::endl;
for (int m : nthrs) {
for (int n : nthrs) {
if (m <= 8 || n > m - 4 || n % 4 != 0) {
continue;
}
std::cout << "<" << m << " , " << n << "> , size = " << s << ":"
<< std::endl;
producer_consumer_test<GlobalLockPQ<int>>("GL", m, n, s);
producer_consumer_test<FCPQ>("FC", m, n, s);
producer_consumer_test<
RelaxedConcurrentPriorityQueue<int, false, false>>(
"RCPQ SPN", m, n, s);
producer_consumer_test<
RelaxedConcurrentPriorityQueue<int, true, false>>(
"RCPQ BLK", m, n, s);
}
}
}
}
TEST(CPQ, Accuracy) {
if (!FLAGS_bench) {
return;
}
std::vector<int> test_sizes = {512, 65536, 1 << 20};
std::vector<int> rates = {1000, 100, 10};
for (auto s : test_sizes) {
for (auto p : rates) {
std::cout << "\n------ Size: " << s << " Get top: " << 100. / p << "%"
<< " (Num: " << s / p << ")"
<< " ------" << std::endl;
accuracy_test<Queue<int>>("FIFO Q", s, p);
accuracy_test<RelaxedConcurrentPriorityQueue<int, false, false, 0>>(
"RCPQ(strict)", s, p);
accuracy_test<RelaxedConcurrentPriorityQueue<int, false, false, 2>>(
"RCPQ(batch=2)", s, p);
accuracy_test<RelaxedConcurrentPriorityQueue<int, false, false, 8>>(
"RCPQ(batch=8)", s, p);
accuracy_test<RelaxedConcurrentPriorityQueue<int, false, false, 16>>(
"RCPQ(batch=16)", s, p);
accuracy_test<RelaxedConcurrentPriorityQueue<int, false, false, 50>>(
"RCPQ(batch=50)", s, p);
}
}
}
/*
 * folly::SpinLock uses CAS directly for try_lock, which was not efficient in
 * this experiment. The lock used in the experiment is based on a
 * test-and-test-and-set lock (a plain read of the lock word is added before
 * attempting the CAS); a minimal sketch of such a lock appears after these
 * results.
 *
Threads have equal chance to push and pop.
The bench calculates the avg execution time for
one operation (push OR pop).
GL : std::priority_queue protected by global lock
FC : flat combining priority queue
RCPQ: the relaxed concurrent priority queue
Test_name, Max time, Avg time, Min time
------ Initial size: 64 ------
Thread number: 1
GL 30 ns 29 ns 27 ns
FC 47 ns 42 ns 40 ns
RCPQ 85 ns 81 ns 77 ns
Thread number: 2
GL 377 ns 274 ns 154 ns
FC 227 ns 187 ns 139 ns
RCPQ 108 ns 106 ns 102 ns
Thread number: 4
GL 244 ns 214 ns 191 ns
FC 212 ns 191 ns 173 ns
RCPQ 98 ns 95 ns 92 ns
Thread number: 8
GL 252 ns 221 ns 197 ns
FC 127 ns 112 ns 102 ns
RCPQ 78 ns 78 ns 76 ns
Thread number: 12
GL 251 ns 227 ns 217 ns
FC 104 ns 96 ns 88 ns
RCPQ 81 ns 79 ns 77 ns
Thread number: 14
GL 243 ns 232 ns 224 ns
FC 103 ns 96 ns 90 ns
RCPQ 84 ns 82 ns 81 ns
Thread number: 16
GL 254 ns 239 ns 229 ns
FC 105 ns 98 ns 92 ns
RCPQ 88 ns 85 ns 83 ns
Thread number: 28
GL 265 ns 261 ns 258 ns
FC 106 ns 100 ns 96 ns
RCPQ 93 ns 87 ns 68 ns
Thread number: 32
GL 274 ns 267 ns 261 ns
FC 110 ns 98 ns 37 ns
RCPQ 93 ns 80 ns 47 ns
Thread number: 56
GL 274 ns 263 ns 257 ns
FC 78 ns 50 ns 24 ns
RCPQ 85 ns 71 ns 45 ns
------ Initial size: 512 ------
Thread number: 1
GL 36 ns 35 ns 33 ns
FC 54 ns 49 ns 47 ns
RCPQ 79 ns 76 ns 72 ns
Thread number: 2
GL 248 ns 187 ns 151 ns
FC 228 ns 179 ns 147 ns
RCPQ 95 ns 92 ns 90 ns
Thread number: 4
GL 282 ns 260 ns 236 ns
FC 218 ns 199 ns 174 ns
RCPQ 85 ns 81 ns 79 ns
Thread number: 8
GL 306 ns 288 ns 270 ns
FC 188 ns 114 ns 104 ns
RCPQ 64 ns 62 ns 59 ns
Thread number: 12
GL 317 ns 296 ns 280 ns
FC 105 ns 99 ns 91 ns
RCPQ 59 ns 57 ns 52 ns
Thread number: 14
GL 331 ns 305 ns 293 ns
FC 109 ns 99 ns 92 ns
RCPQ 64 ns 57 ns 53 ns
Thread number: 16
GL 316 ns 308 ns 291 ns
FC 110 ns 99 ns 92 ns
RCPQ 58 ns 54 ns 52 ns
Thread number: 28
GL 348 ns 339 ns 333 ns
FC 109 ns 105 ns 100 ns
RCPQ 64 ns 62 ns 56 ns
Thread number: 32
GL 353 ns 347 ns 341 ns
FC 116 ns 102 ns 39 ns
RCPQ 62 ns 32 ns 3 ns
Thread number: 56
GL 360 ns 352 ns 342 ns
FC 101 ns 58 ns 41 ns
RCPQ 59 ns 43 ns 26 ns
------ Initial size: 65536 ------
Thread number: 1
GL 64 ns 60 ns 56 ns
FC 93 ns 72 ns 67 ns
RCPQ 293 ns 286 ns 281 ns
Thread number: 2
GL 262 ns 248 ns 231 ns
FC 318 ns 301 ns 288 ns
RCPQ 230 ns 216 ns 206 ns
Thread number: 4
GL 463 ns 452 ns 408 ns
FC 273 ns 265 ns 257 ns
RCPQ 141 ns 131 ns 126 ns
Thread number: 8
GL 582 ns 574 ns 569 ns
FC 152 ns 139 ns 131 ns
RCPQ 98 ns 81 ns 72 ns
Thread number: 12
GL 593 ns 586 ns 576 ns
FC 126 ns 123 ns 119 ns
RCPQ 85 ns 72 ns 62 ns
Thread number: 14
GL 599 ns 595 ns 588 ns
FC 138 ns 123 ns 119 ns
RCPQ 79 ns 70 ns 62 ns
Thread number: 16
GL 599 ns 592 ns 587 ns
FC 138 ns 123 ns 117 ns
RCPQ 75 ns 65 ns 56 ns
Thread number: 28
GL 611 ns 609 ns 608 ns
FC 147 ns 144 ns 137 ns
RCPQ 74 ns 70 ns 66 ns
Thread number: 32
GL 635 ns 630 ns 627 ns
FC 151 ns 143 ns 76 ns
RCPQ 199 ns 94 ns 59 ns
Thread number: 56
GL 637 ns 633 ns 627 ns
FC 176 ns 103 ns 41 ns
RCPQ 561 ns 132 ns 46 ns
<Producer, Consumer> pattern
The bench calculates the avg execution time for
push AND pop pair (two operations).
GL : std::priority_queue protected by global lock
FC : flat combining priority queue
RCPQ SPN: RCPQ spinning
RCPQ BLK: RCPQ blocking
------ Scalability ------
<1 , 1> , size = 0:
GL 781 ns 735 ns 652 ns
FC 599 ns 535 ns 462 ns
RCPQ SPN 178 ns 166 ns 148 ns
RCPQ BLK 217 ns 201 ns 182 ns
<2 , 2> , size = 0:
GL 686 ns 665 ns 619 ns
FC 487 ns 430 ns 398 ns
RCPQ SPN 281 ns 239 ns 139 ns
RCPQ BLK 405 ns 367 ns 181 ns
<4 , 4> , size = 0:
GL 1106 ns 1082 ns 1050 ns
FC 278 ns 242 ns 208 ns
RCPQ SPN 114 ns 107 ns 103 ns
RCPQ BLK 169 ns 158 ns 148 ns
<8 , 8> , size = 0:
GL 1169 ns 1156 ns 1144 ns
FC 236 ns 214 ns 197 ns
RCPQ SPN 121 ns 114 ns 110 ns
RCPQ BLK 154 ns 150 ns 141 ns
<12 , 12> , size = 0:
GL 1191 ns 1185 ns 1178 ns
FC 232 ns 221 ns 201 ns
RCPQ SPN 802 ns 205 ns 123 ns
RCPQ BLK 218 ns 161 ns 147 ns
<16 , 16> , size = 0:
GL 1236 ns 1227 ns 1221 ns
FC 269 ns 258 ns 243 ns
RCPQ SPN 826 ns 733 ns 655 ns
RCPQ BLK 172 ns 149 ns 137 ns
<24 , 24> , size = 0:
GL 1269 ns 1262 ns 1255 ns
FC 280 ns 225 ns 171 ns
RCPQ SPN 931 ns 891 ns 836 ns
RCPQ BLK 611 ns 445 ns 362 ns
------ Unbalanced(Producer<Consumer) ------
<1 , 8> , size = 0:
GL 1454 ns 1225 ns 1144 ns
FC 2141 ns 1974 ns 1811 ns
RCPQ SPN 597 ns 586 ns 573 ns
RCPQ BLK 663 ns 649 ns 636 ns
<1 , 12> , size = 0:
GL 1763 ns 1658 ns 1591 ns
FC 3396 ns 3261 ns 3107 ns
RCPQ SPN 735 ns 714 ns 651 ns
RCPQ BLK 773 ns 761 ns 744 ns
<1 , 16> , size = 0:
GL 2231 ns 2070 ns 1963 ns
FC 6305 ns 5771 ns 5603 ns
RCPQ SPN 787 ns 756 ns 694 ns
RCPQ BLK 828 ns 806 ns 775 ns
<1 , 24> , size = 0:
GL 3802 ns 3545 ns 3229 ns
FC 10625 ns 10311 ns 10119 ns
RCPQ SPN 781 ns 756 ns 739 ns
RCPQ BLK 892 ns 882 ns 870 ns
<2 , 8> , size = 0:
GL 873 ns 750 ns 718 ns
FC 815 ns 712 ns 659 ns
RCPQ SPN 720 ns 691 ns 673 ns
RCPQ BLK 738 ns 707 ns 694 ns
<2 , 12> , size = 0:
GL 1061 ns 968 ns 904 ns
FC 1410 ns 1227 ns 1190 ns
RCPQ SPN 862 ns 829 ns 767 ns
RCPQ BLK 825 ns 804 ns 771 ns
<2 , 16> , size = 0:
GL 1438 ns 1283 ns 1162 ns
FC 2095 ns 2012 ns 1909 ns
RCPQ SPN 763 ns 706 ns 628 ns
RCPQ BLK 833 ns 804 ns 777 ns
<2 , 24> , size = 0:
GL 2031 ns 1972 ns 1872 ns
FC 4298 ns 4191 ns 4107 ns
RCPQ SPN 762 ns 709 ns 680 ns
RCPQ BLK 876 ns 859 ns 825 ns
<4 , 12> , size = 0:
GL 696 ns 649 ns 606 ns
FC 561 ns 517 ns 480 ns
RCPQ SPN 759 ns 698 ns 498 ns
RCPQ BLK 823 ns 803 ns 786 ns
<4 , 16> , size = 0:
GL 862 ns 800 ns 749 ns
FC 857 ns 824 ns 781 ns
RCPQ SPN 730 ns 679 ns 589 ns
RCPQ BLK 863 ns 824 ns 803 ns
<4 , 24> , size = 0:
GL 1138 ns 1125 ns 1105 ns
FC 1635 ns 1576 ns 1540 ns
RCPQ SPN 756 ns 717 ns 668 ns
RCPQ BLK 865 ns 839 ns 812 ns
------ Unbalanced(Producer>Consumer) ------
<12 , 4> , size = 0:
GL 1115 ns 1087 ns 1053 ns
FC 373 ns 355 ns 333 ns
RCPQ SPN 155 ns 147 ns 142 ns
RCPQ BLK 202 ns 190 ns 182 ns
<12 , 8> , size = 0:
GL 1167 ns 1157 ns 1148 ns
FC 281 ns 256 ns 227 ns
RCPQ SPN 132 ns 126 ns 120 ns
RCPQ BLK 175 ns 164 ns 161 ns
<16 , 4> , size = 0:
GL 1103 ns 1088 ns 1074 ns
FC 442 ns 380 ns 327 ns
RCPQ SPN 178 ns 162 ns 150 ns
RCPQ BLK 217 ns 200 ns 188 ns
<16 , 8> , size = 0:
GL 1164 ns 1153 ns 1143 ns
FC 290 ns 268 ns 243 ns
RCPQ SPN 146 ns 138 ns 134 ns
RCPQ BLK 184 ns 175 ns 161 ns
<16 , 12> , size = 0:
GL 1196 ns 1189 ns 1185 ns
FC 269 ns 260 ns 245 ns
RCPQ SPN 405 ns 172 ns 129 ns
RCPQ BLK 172 ns 165 ns 152 ns
<24 , 4> , size = 0:
GL 1097 ns 1081 ns 1030 ns
FC 407 ns 369 ns 301 ns
RCPQ SPN 184 ns 176 ns 164 ns
RCPQ BLK 220 ns 211 ns 201 ns
<24 , 8> , size = 0:
GL 1177 ns 1158 ns 1148 ns
FC 321 ns 297 ns 233 ns
RCPQ SPN 155 ns 148 ns 139 ns
RCPQ BLK 204 ns 188 ns 173 ns
<24 , 12> , size = 0:
GL 1224 ns 1215 ns 1205 ns
FC 320 ns 287 ns 218 ns
RCPQ SPN 145 ns 141 ns 135 ns
RCPQ BLK 176 ns 167 ns 160 ns
<24 , 16> , size = 0:
GL 1250 ns 1244 ns 1238 ns
FC 339 ns 257 ns 209 ns
RCPQ SPN 615 ns 480 ns 359 ns
RCPQ BLK 185 ns 151 ns 137 ns
[ RUN ] CPQ.Accuracy
The Accuracy test checks how many pops return a lower
priority when popping the top X% priorities.
The default batch size is 16.
------ Size: 512 Get top: 1% (Num: 5) ------
FIFO Q Lower priority popped: 439
RCPQ(strict) Lower priority popped: 0
RCPQ(batch=2) Lower priority popped: 1
RCPQ(batch=8) Lower priority popped: 10
RCPQ(batch=16) Lower priority popped: 13
RCPQ(batch=50) Lower priority popped: 11
------ Size: 512 Get top: 10% (Num: 51) ------
FIFO Q Lower priority popped: 451
RCPQ(strict) Lower priority popped: 0
RCPQ(batch=2) Lower priority popped: 15
RCPQ(batch=8) Lower priority popped: 73
RCPQ(batch=16) Lower priority popped: 147
RCPQ(batch=50) Lower priority popped: 201
------ Size: 65536 Get top: 0.1% (Num: 65) ------
FIFO Q Lower priority popped: 64917
RCPQ(strict) Lower priority popped: 0
RCPQ(batch=2) Lower priority popped: 35
RCPQ(batch=8) Lower priority popped: 190
RCPQ(batch=16) Lower priority popped: 387
RCPQ(batch=50) Lower priority popped: 655
------ Size: 65536 Get top: 1% (Num: 655) ------
FIFO Q Lower priority popped: 64793
RCPQ(strict) Lower priority popped: 0
RCPQ(batch=2) Lower priority popped: 122
RCPQ(batch=8) Lower priority popped: 516
RCPQ(batch=16) Lower priority popped: 1450
RCPQ(batch=50) Lower priority popped: 3219
------ Size: 65536 Get top: 10% (Num: 6553) ------
FIFO Q Lower priority popped: 58977
RCPQ(strict) Lower priority popped: 0
RCPQ(batch=2) Lower priority popped: 174
RCPQ(batch=8) Lower priority popped: 753
RCPQ(batch=16) Lower priority popped: 1436
RCPQ(batch=50) Lower priority popped: 3297
------ Size: 1048576 Get top: 0.1% (Num: 1048) ------
FIFO Q Lower priority popped: 1046345
RCPQ(strict) Lower priority popped: 0
RCPQ(batch=2) Lower priority popped: 124
RCPQ(batch=8) Lower priority popped: 449
RCPQ(batch=16) Lower priority popped: 1111
RCPQ(batch=50) Lower priority popped: 3648
------ Size: 1048576 Get top: 1% (Num: 10485) ------
FIFO Q Lower priority popped: 1038012
RCPQ(strict) Lower priority popped: 0
RCPQ(batch=2) Lower priority popped: 297
RCPQ(batch=8) Lower priority popped: 1241
RCPQ(batch=16) Lower priority popped: 2489
RCPQ(batch=50) Lower priority popped: 7764
------ Size: 1048576 Get top: 10% (Num: 104857) ------
FIFO Q Lower priority popped: 943706
RCPQ(strict) Lower priority popped: 0
RCPQ(batch=2) Lower priority popped: 1984
RCPQ(batch=8) Lower priority popped: 8150
RCPQ(batch=16) Lower priority popped: 15787
RCPQ(batch=50) Lower priority popped: 42778
The experiment was run on 1 NUMA node (14 cores).
Architecture: x86_64
CPU op-mode(s): 32-bit, 64-bit
Byte Order: Little Endian
CPU(s): 56
On-line CPU(s) list: 0-55
Thread(s) per core: 2
Core(s) per socket: 14
Socket(s): 2
NUMA node(s): 2
Vendor ID: GenuineIntel
CPU family: 6
Model: 79
Model name: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz
Stepping: 1
CPU MHz: 2401.000
CPU max MHz: 2401.0000
CPU min MHz: 1200.0000
BogoMIPS: 4788.91
Virtualization: VT-x
L1d cache: 32K
L1i cache: 32K
L2 cache: 256K
L3 cache: 35840K
NUMA node0 CPU(s): 0-13,28-41
NUMA node1 CPU(s): 14-27,42-55
*/
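// For reference, a minimal test-and-test-and-set (TTAS) spinlock sketch in the
// spirit of the note at the top of the results: the lock word is read before
// attempting the CAS. This is only an illustration (requires <atomic>), not
// the exact lock that was used in the experiment.
class TTASSpinLock {
  std::atomic<bool> locked_{false};

 public:
  void lock() {
    while (true) {
      // Spin on plain loads first to avoid hammering the cache line with CAS.
      while (locked_.load(std::memory_order_relaxed)) {
      }
      if (try_lock()) {
        return;
      }
    }
  }

  bool try_lock() {
    // Test before test-and-set: attempt the CAS only if the lock looks free.
    if (locked_.load(std::memory_order_relaxed)) {
      return false;
    }
    bool expected = false;
    return locked_.compare_exchange_strong(
        expected, true, std::memory_order_acquire);
  }

  void unlock() {
    locked_.store(false, std::memory_order_release);
  }
};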