Commit 3c1306e1 authored by Philip Pronin's avatar Philip Pronin Committed by Pavlo Kushnir

BMI1 support in EliasFanoCoding

Summary:
This diff updates `folly::CpuId` with support of extended features (EAX =
7, ECX = 0) to provide detection logic for BMI1 introduced in Haswell, and
provides support for `BLSR` instruction in `EliasFanoReader`.

Test Plan:
I used clang to compile the logic and run unittests

Reviewed By: lucian@fb.com

Subscribers: fbcode-common-diffs@, trunkagent, chaoyc, search-fbcode-diffs@, unicorn-diffs@, njormrod, folly-diffs@

FB internal diff: D1658100

Signature: t1:1658100:1415126635:d1820b8eb41c9e9786b5c8062b801cf1e2049a97
parent c4035d06
......@@ -24,7 +24,7 @@ namespace folly {
/**
* Identification of an Intel CPU.
* Supports CPUID (EAX=1) feature flags.
* Supports CPUID feature flags (EAX=1) and extended features (EAX=7, ECX=0).
* Values from http://www.intel.com/content/www/us/en/processors/processor-identification-cpuid-instruction-note.html
*/
class CpuId {
......@@ -32,23 +32,34 @@ class CpuId {
CpuId() {
#ifdef _MSC_VER
int reg[4];
__cpuid((int *)reg, 1);
c_ = reg[2];
d_ = reg[3];
__cpuid(static_cast<int*>(reg), 0);
const int n = reg[0];
if (n >= 1) {
__cpuid(static_cast<int*>(reg), 1);
f1c_ = reg[2];
f1d_ = reg[3];
}
if (n >= 7) {
__cpuidex(static_cast<int*>(reg), 7, 0);
f7b_ = reg[1];
f7c_ = reg[2];
}
#elif FOLLY_X64 || defined(__i386__)
__asm__("cpuid" : "=c"(c_), "=d"(d_) : "a"(1) : "ebx");
#else
// On non-Intel, none of these features exist; at least not in the same form
// as they do on Intel
c_ = 0;
d_ = 0;
uint32_t n;
__asm__("cpuid" : "=a"(n) : "a"(0) : "ebx", "edx", "ecx");
if (n >= 1) {
__asm__("cpuid" : "=c"(f1c_), "=d"(f1d_) : "a"(1) : "ebx");
}
if (n >= 7) {
__asm__("cpuid" : "=b"(f7b_), "=c"(f7c_) : "a"(7), "c"(0) : "edx");
}
#endif
}
#define X(name, r, bit) bool name() const { return r & (1U << bit); }
#define C(name, bit) X(name, c_, bit)
#define D(name, bit) X(name, d_, bit)
#define X(name, r, bit) bool name() const { return (r) & (1U << bit); }
// cpuid(1): Processor Info and Feature Bits.
#define C(name, bit) X(name, f1c_, bit)
C(sse3, 0)
C(pclmuldq, 1)
C(dtes64, 2)
......@@ -60,12 +71,10 @@ class CpuId {
C(tm2, 8)
C(ssse3, 9)
C(cnxtid, 10)
// 11 is reserved
C(fma, 12)
C(cx16, 13)
C(xtpr, 14)
C(pdcm, 15)
// 16 is reserved
C(pcid, 17)
C(dca, 18)
C(sse41, 19)
......@@ -80,7 +89,8 @@ class CpuId {
C(avx, 28)
C(f16c, 29)
C(rdrand, 30)
// 31 is not used
#undef C
#define D(name, bit) X(name, f1d_, bit)
D(fpu, 0)
D(vme, 1)
D(de, 2)
......@@ -91,7 +101,6 @@ class CpuId {
D(mce, 7)
D(cx8, 8)
D(apic, 9)
// 10 is reserved
D(sep, 11)
D(mtrr, 12)
D(pge, 13)
......@@ -101,7 +110,6 @@ class CpuId {
D(pse36, 17)
D(psn, 18)
D(clfsh, 19)
// 20 is reserved
D(ds, 21)
D(acpi, 22)
D(mmx, 23)
......@@ -111,14 +119,48 @@ class CpuId {
D(ss, 27)
D(htt, 28)
D(tm, 29)
// 30 is reserved
D(pbe, 31)
#undef D
// cpuid(7): Extended Features.
#define B(name, bit) X(name, f7b_, bit)
B(bmi1, 3)
B(hle, 4)
B(avx2, 5)
B(smep, 7)
B(bmi2, 8)
B(erms, 9)
B(invpcid, 10)
B(rtm, 11)
B(mpx, 14)
B(avx512f, 16)
B(avx512dq, 17)
B(rdseed, 18)
B(adx, 19)
B(smap, 20)
B(avx512ifma, 21)
B(pcommit, 22)
B(clflushopt, 23)
B(clwb, 24)
B(avx512pf, 26)
B(avx512er, 27)
B(avx512cd, 28)
B(sha, 29)
B(avx512bw, 30)
B(avx512vl, 31)
#undef B
#define C(name, bit) X(name, f7c_, bit)
C(prefetchwt1, 0)
C(avx512vbmi, 1)
#undef C
#undef X
private:
uint32_t c_; // ECX
uint32_t d_; // EDX
uint32_t f1c_ = 0;
uint32_t f1d_ = 0;
uint32_t f7b_ = 0;
uint32_t f7c_ = 0;
};
} // namespace folly
......
......@@ -291,7 +291,7 @@ struct EliasFanoEncoder {
namespace instructions {
struct Default {
static bool supported() {
static bool supported(const folly::CpuId& cpuId = {}) {
return true;
}
static inline uint64_t popcount(uint64_t value) {
......@@ -301,20 +301,36 @@ struct Default {
DCHECK_GT(value, 0);
return __builtin_ctzll(value);
}
static inline uint64_t blsr(uint64_t value) {
return value & (value - 1);
}
};
struct Fast : public Default {
static bool supported() {
folly::CpuId cpuId;
struct Nehalem : public Default {
static bool supported(const folly::CpuId& cpuId = {}) {
return cpuId.popcnt();
}
static inline uint64_t popcount(uint64_t value) {
// POPCNT is supported starting with Intel Nehalem, AMD K10.
uint64_t result;
asm ("popcntq %1, %0" : "=r" (result) : "r" (value));
return result;
}
};
struct Haswell : public Nehalem {
static bool supported(const folly::CpuId& cpuId = {}) {
return Nehalem::supported(cpuId) && cpuId.bmi1();
}
static inline uint64_t blsr(uint64_t value) {
// BMI1 is supported starting with Intel Haswell, AMD Piledriver.
// BLSR combines two instuctions into one and reduces register pressure.
uint64_t result;
asm ("blsrq %1, %0" : "=r" (result) : "r" (value));
return result;
}
};
} // namespace instructions
namespace detail {
......@@ -352,7 +368,7 @@ class UpperBitsReader {
++position_;
inner_ = Instructions::ctz(block_);
block_ &= block_ - 1;
block_ = Instructions::blsr(block_);
return setValue();
}
......@@ -396,11 +412,11 @@ class UpperBitsReader {
// Kill n - 1 least significant 1-bits.
for (size_t i = 0; i < n - 1; ++i) {
block_ &= block_ - 1;
block_ = Instructions::blsr(block_);
}
inner_ = Instructions::ctz(block_);
block_ &= block_ - 1;
block_ = Instructions::blsr(block_);
return setValue();
}
......
......@@ -172,7 +172,7 @@ BENCHMARK(Encode_1M) {
}
#if 0
Intel(R) Xeon(R) CPU E5-2660 @ 2.7GHz (turbo on), using instructions::Fast.
Intel(R) Xeon(R) CPU E5-2660 @ 2.7GHz (turbo on), using instructions::Nehalem.
============================================================================
folly/experimental/test/EliasFanoCodingTest.cpp relative time/iter iters/s
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment