Commit 7fd87e7e authored by Tudor Bosman's avatar Tudor Bosman

Detect popcnt instruction at runtime, use it if available.

Summary:
If compiled for a popcnt-supporting target (-march=corei7, for example),
use __builtin_popcount, as it's presumably inlined.  Otherwise, detect
on startup (in the same way as glibc dispatches to one of the many
flavors of memcpy): GCC allows us to add a resolver function which the
dynamic loader will call on startup to resolve a function to one of
various alternatives; we check (using the cpuid instruction) whether
popcnt is supported, and use it if available.

Test Plan: tests added

Reviewed By: soren@fb.com

FB internal diff: D542977
parent 192cff5d
/*
* Copyright 2012 Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "folly/Bits.h"
#include "folly/CpuId.h"
// None of this is necessary if we're compiling for a target that supports
// popcnt
#ifndef __POPCNT__
namespace {
int popcount_inst(unsigned int x) {
asm ("popcntl %0, %0" : "=r" (x) : "0" (x));
return x;
}
int popcount_builtin(unsigned int x) {
return __builtin_popcount(x);
}
int popcountll_inst(unsigned long long x) {
asm ("popcntq %0, %0" : "=r" (x) : "0" (x));
return x;
}
int popcountll_builtin(unsigned long long x) {
return __builtin_popcountll(x);
}
typedef decltype(popcount_builtin) Type_popcount;
typedef decltype(popcountll_builtin) Type_popcountll;
} // namespace
// This function is called on startup to resolve folly::detail::popcount
extern "C" Type_popcount* folly_popcount_ifunc() {
return folly::CpuId().popcnt() ? popcount_inst : popcount_builtin;
}
// This function is called on startup to resolve folly::detail::popcountll
extern "C" Type_popcountll* folly_popcountll_ifunc() {
return folly::CpuId().popcnt() ? popcountll_inst : popcountll_builtin;
}
namespace folly {
namespace detail {
// Call folly_popcount_ifunc on startup to resolve to either popcount_inst
// or popcount_builtin
int popcount(unsigned int x)
__attribute__((ifunc("folly_popcount_ifunc")));
// Call folly_popcount_ifunc on startup to resolve to either popcountll_inst
// or popcountll_builtin
int popcountll(unsigned long long x)
__attribute__((ifunc("folly_popcountll_ifunc")));
} // namespace detail
} // namespace folly
#endif /* !__POPCNT__ */
......@@ -26,6 +26,9 @@
* 1-based. 0 = no bits are set (x == 0)
* for x != 0, findLastSet(x) == 1 + floor(log2(x))
*
* popcount(x)
* return the number of 1 bits in x
*
* nextPowTwo(x)
* Finds the next power of two >= x.
*
......@@ -55,6 +58,11 @@
#define _GNU_SOURCE 1
#endif
#ifndef __GNUC__
#error GCC required
#endif
#include "folly/detail/BitsDetail.h"
#include "folly/detail/BitIteratorDetail.h"
#include "folly/Likely.h"
......@@ -125,21 +133,6 @@ typename std::enable_if<
return findFirstSet(static_cast<typename std::make_signed<T>::type>(x));
}
namespace detail {
// Portable, but likely slow...
inline unsigned int findLastSetPortable(uint64_t x) {
unsigned int r = (x != 0); // 1-based index, except for x==0
while (x >>= 1) {
++r;
}
return r;
}
} // namespace detail
#ifdef __GNUC__
// findLastSet: return the 1-based index of the highest bit set
// for x > 0, findLastSet(x) == 1 + floor(log2(x))
template <class T>
......@@ -179,19 +172,6 @@ typename std::enable_if<
return x ? 8 * sizeof(unsigned long long) - __builtin_clzll(x) : 0;
}
#else /* !__GNUC__ */
template <class T>
typename std::enable_if<
(std::is_integral<T>::value &&
std::is_unsigned<T>::value),
unsigned int>::type
findLastSet(T x) {
return detail:findLastSetPortable(x);
}
#endif
template <class T>
typename std::enable_if<
(std::is_integral<T>::value &&
......@@ -201,36 +181,6 @@ typename std::enable_if<
return findLastSet(static_cast<typename std::make_unsigned<T>::type>(x));
}
namespace detail {
template <class T>
inline
typename std::enable_if<
std::is_integral<T>::value && std::is_unsigned<T>::value,
T>::type
nextPowTwoPortable(T v) {
if (UNLIKELY(v == 0)) {
return 1;
}
--v;
for (uint32_t i = 1; i < sizeof(T) * 8; i <<= 8) {
v |= (v >> i);
v |= (v >> (i << 1));
v |= (v >> (i << 2));
v |= (v >> (i << 3));
v |= (v >> (i << 4));
v |= (v >> (i << 5));
v |= (v >> (i << 6));
v |= (v >> (i << 7));
}
return v + 1;
}
} // namespace detail
#ifdef __GNUC__
template <class T>
inline
typename std::enable_if<
......@@ -243,20 +193,29 @@ nextPowTwo(T v) {
return 1ul << findLastSet(v - 1);
}
#else /* __GNUC__ */
/**
* Population count
*/
template <class T>
inline
typename std::enable_if<
std::is_integral<T>::value && std::is_unsigned<T>::value,
T>::type
nextPowTwo(T v) {
return detail::nextPowTwoPortable(v);
inline typename std::enable_if<
(std::is_integral<T>::value &&
std::is_unsigned<T>::value &&
sizeof(T) <= sizeof(unsigned int)),
size_t>::type
popcount(T x) {
return detail::popcount(x);
}
#endif /* __GNUC__ */
template <class T>
inline typename std::enable_if<
(std::is_integral<T>::value &&
std::is_unsigned<T>::value &&
sizeof(T) > sizeof(unsigned int) &&
sizeof(T) <= sizeof(unsigned long long)),
size_t>::type
popcount(T x) {
return detail::popcountll(x);
}
/**
* Endianness detection and manipulation primitives.
......
/*
* Copyright 2012 Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FOLLY_CPUID_H_
#define FOLLY_CPUID_H_
#include <cstdint>
namespace folly {
/**
* Identification of an Intel CPU.
* Supports CPUID (EAX=1) feature flags.
* Values from http://www.intel.com/content/www/us/en/processors/processor-identification-cpuid-instruction-note.html
*/
class CpuId {
public:
CpuId() {
__asm__("cpuid" : "=c"(c_), "=d"(d_) : "a"(1) : "ebx");
}
#define X(name, r, bit) bool name() const { return r & (1U << bit); }
#define C(name, bit) X(name, c_, bit)
#define D(name, bit) X(name, d_, bit)
C(sse3, 0)
C(pclmuldq, 1)
C(dtes64, 2)
C(monitor, 3)
C(dscpl, 4)
C(vmx, 5)
C(smx, 6)
C(eist, 7)
C(tm2, 8)
C(ssse3, 9)
C(cnxtid, 10)
// 11 is reserved
C(fma, 12)
C(cx16, 13)
C(xtpr, 14)
C(pdcm, 15)
// 16 is reserved
C(pcid, 17)
C(dca, 18)
C(sse41, 19)
C(sse42, 20)
C(x2apic, 21)
C(movbe, 22)
C(popcnt, 23)
C(tscdeadline, 24)
C(aes, 25)
C(xsave, 26)
C(osxsave, 27)
C(avx, 28)
C(f16c, 29)
C(rdrand, 30)
// 31 is not used
D(fpu, 0)
D(vme, 1)
D(de, 2)
D(pse, 3)
D(tsc, 4)
D(msr, 5)
D(pae, 6)
D(mce, 7)
D(cx8, 8)
D(apic, 9)
// 10 is reserved
D(sep, 11)
D(mtrr, 12)
D(pge, 13)
D(mca, 14)
D(cmov, 15)
D(pat, 16)
D(pse36, 17)
D(psn, 18)
D(clfsh, 19)
// 20 is reserved
D(ds, 21)
D(acpi, 22)
D(mmx, 23)
D(fxsr, 24)
D(sse, 25)
D(sse2, 26)
D(ss, 27)
D(htt, 28)
D(tm, 29)
// 30 is reserved
D(pbe, 31)
#undef D
#undef C
#undef X
private:
uint32_t c_; // ECX
uint32_t d_; // EDX
};
} // namespace folly
#endif /* FOLLY_CPUID_H_ */
/*
* Copyright 2012 Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FOLLY_DETAIL_BITSDETAIL_H_
#define FOLLY_DETAIL_BITSDETAIL_H_
namespace folly {
namespace detail {
// If we're targeting an architecture with popcnt support, use
// __builtin_popcount directly, as it's presumably inlined.
// If not, use runtime detection using __attribute__((ifunc))
// (see Bits.cpp)
#ifdef __POPCNT__
inline int popcount(unsigned int x) {
return __builtin_popcount(x);
}
inline int popcountll(unsigned long long x) {
return __builtin_popcountll(x);
}
#else /* !__POPCNT__ */
int popcount(unsigned int x);
int popcountll(unsigned long long x);
#endif /* !__POPCNT__ */
} // namespace detail
} // namespace folly
#endif /* FOLLY_DETAIL_BITSDETAIL_H_ */
......@@ -30,31 +30,6 @@ namespace folly {
// right-shift is arithmetic for signed values, and that can lead to
// unpleasant bugs.
/**
* Population count (number of bits set), using __builtin_popcount or
* __builtin_popcountll, depending on size.
*/
template <class T>
inline typename std::enable_if<
(std::is_integral<T>::value &&
std::is_unsigned<T>::value &&
sizeof(T) <= sizeof(unsigned int)),
size_t>::type
popcount(T x) {
return __builtin_popcount(x);
}
template <class T>
inline typename std::enable_if<
(std::is_integral<T>::value &&
std::is_unsigned<T>::value &&
sizeof(T) > sizeof(unsigned int) &&
sizeof(T) <= sizeof(unsigned long long)),
size_t>::type
popcount(T x) {
return __builtin_popcountll(x);
}
namespace detail {
/**
......
......@@ -37,12 +37,6 @@ void testFFS() {
}
}
template <class INT>
unsigned int findLastSetPortable(INT x) {
return detail::findLastSetPortable(
static_cast<typename std::make_unsigned<INT>::type>(x));
}
template <class INT>
void testFLS() {
typedef typename std::make_unsigned<INT>::type UINT;
......@@ -51,11 +45,9 @@ void testFLS() {
for (size_t i = 0; i < bits; i++) {
INT v1 = static_cast<UINT>(1) << i;
EXPECT_EQ(i + 1, findLastSet(v1));
EXPECT_EQ(i + 1, findLastSetPortable(v1));
INT v2 = (static_cast<UINT>(1) << i) - 1;
EXPECT_EQ(i, findLastSet(v2));
EXPECT_EQ(i, findLastSetPortable(v2));
}
}
......@@ -113,8 +105,6 @@ TEST(Bits, FindLastSet) {
}
#ifdef __GNUC__
TEST(Bits, nextPowTwoClz) {
testPowTwo(nextPowTwo);
}
......@@ -124,18 +114,13 @@ BENCHMARK(nextPowTwoClz, iters) {
x = folly::nextPowTwo(iters);
}
#endif
TEST(Bits, nextPowTwoPortable) {
testPowTwo(detail::nextPowTwoPortable);
}
BENCHMARK(nextPowTwoPortable, iters) {
x = detail::nextPowTwoPortable(iters);
TEST(Bits, popcount) {
EXPECT_EQ(0, popcount(0U));
EXPECT_EQ(1, popcount(1U));
EXPECT_EQ(32, popcount(uint32_t(-1)));
EXPECT_EQ(64, popcount(uint64_t(-1)));
}
BENCHMARK_DRAW_LINE();
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
google::ParseCommandLineFlags(&argc, &argv, true);
......@@ -153,5 +138,4 @@ Benchmarks run on dual Xeon X5650's @ 2.67GHz w/hyperthreading enabled
Benchmark Iters Total t t/iter iter/sec
------------------------------------------------------------------------------
* nextPowTwoClz 1000000 1.659 ms 1.659 ns 574.8 M
+66.8% nextPowTwoPortable 1000000 2.767 ms 2.767 ns 344.7 M
*/
/*
* Copyright 2012 Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "folly/CpuId.h"
#include <glog/logging.h>
#include <gtest/gtest.h>
using namespace folly;
TEST(CpuId, Simple) {
// All CPUs should support MMX
CpuId id;
EXPECT_TRUE(id.mmx());
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment