Commit 10001685 authored by Christopher Dykes's avatar Christopher Dykes Committed by Facebook Github Bot 8

Support SSE 4.2 qfind under MSVC

Summary: MSVC has compiler support for the required intrinsics, but it both refuses to tell us that (it does not define `__SSE4_2__`) and gives them proper names (`_mm_cmpestri` rather than GCC's `__builtin_ia32_pcmpestri128`).
The code already checks for runtime support; this just enables compiling the SSE 4.2 version in the first place.

Reviewed By: yfeldblum

Differential Revision: D3104296

fb-gh-sync-id: 9143240bede9b756817691fdd86818001267dac1
fbshipit-source-id: 9143240bede9b756817691fdd86818001267dac1
parent 06fe0874
...@@ -285,10 +285,9 @@ namespace std { typedef ::max_align_t max_align_t; } ...@@ -285,10 +285,9 @@ namespace std { typedef ::max_align_t max_align_t; }
// Hide a GCC specific thing that breaks MSVC if left alone. // Hide a GCC specific thing that breaks MSVC if left alone.
# define __extension__ # define __extension__
#ifdef _M_IX86_FP // We have compiler support for the newest of the new, but
# define FOLLY_SSE _M_IX86_FP // MSVC doesn't tell us that.
# define FOLLY_SSE_MINOR 0 #define __SSE4_2__ 1
#endif
#endif #endif
......
...@@ -14,49 +14,35 @@ ...@@ -14,49 +14,35 @@
* limitations under the License. * limitations under the License.
*/ */
#include "RangeSse42.h" #include "RangeSse42.h"
#include <glog/logging.h> #include <glog/logging.h>
#include <folly/Portability.h> #include <folly/Portability.h>
// Essentially, two versions of this file: one with an SSE42 implementation // Essentially, two versions of this file: one with an SSE42 implementation
// and one with a fallback implementation. We determine which version to use by // and one with a fallback implementation. We determine which version to use by
// testing for the presence of the required headers. // testing for the presence of the required headers.
// //
// TODO: Maybe this should be done by the build system.... // TODO: Maybe this should be done by the build system....
#if !FOLLY_SSE_PREREQ(4, 2) #if !FOLLY_SSE_PREREQ(4, 2)
namespace folly { namespace folly {
namespace detail { namespace detail {
size_t qfind_first_byte_of_sse42(const StringPieceLite haystack, size_t qfind_first_byte_of_sse42(const StringPieceLite haystack,
const StringPieceLite needles) { const StringPieceLite needles) {
CHECK(false) << "Function " << __func__ << " only works with SSE42!"; CHECK(false) << "Function " << __func__ << " only works with SSE42!";
return qfind_first_byte_of_nosse(haystack, needles); return qfind_first_byte_of_nosse(haystack, needles);
} }
} }
} }
# else # else
#include <cstdint> #include <cstdint>
#include <limits> #include <limits>
#include <string> #include <string>
#include <emmintrin.h> #include <emmintrin.h>
#include <nmmintrin.h>
#include <smmintrin.h> #include <smmintrin.h>
#include <folly/Likely.h> #include <folly/Likely.h>
// GCC 4.9 with ASAN has a problem: a function with no_sanitize_address calling // GCC 4.9 with ASAN has a problem: a function with no_sanitize_address calling
...@@ -68,10 +54,14 @@ size_t qfind_first_byte_of_sse42(const StringPieceLite haystack, ...@@ -68,10 +54,14 @@ size_t qfind_first_byte_of_sse42(const StringPieceLite haystack,
__GNUC_PREREQ(4, 9) __GNUC_PREREQ(4, 9)
# define _mm_load_si128(p) (*(p)) # define _mm_load_si128(p) (*(p))
# define _mm_loadu_si128(p) ((__m128i)__builtin_ia32_loaddqu((const char*)(p))) # define _mm_loadu_si128(p) ((__m128i)__builtin_ia32_loaddqu((const char*)(p)))
# ifdef _mm_cmpestri
# undef _mm_cmpestri
# endif
# define _mm_cmpestri(a, b, c, d, e) \
__builtin_ia32_pcmpestri128((__v16qi)(a), b, (__v16qi)(c), d, e)
#endif #endif
namespace folly { namespace folly {
namespace detail { namespace detail {
// It's okay if pages are bigger than this (as powers of two), but they should // It's okay if pages are bigger than this (as powers of two), but they should
...@@ -116,8 +106,8 @@ size_t qfind_first_byte_of_needles16(const StringPieceLite haystack, ...@@ -116,8 +106,8 @@ size_t qfind_first_byte_of_needles16(const StringPieceLite haystack,
// do an unaligned load for first block of haystack // do an unaligned load for first block of haystack
auto arr1 = _mm_loadu_si128( auto arr1 = _mm_loadu_si128(
reinterpret_cast<const __m128i*>(haystack.data())); reinterpret_cast<const __m128i*>(haystack.data()));
auto index = __builtin_ia32_pcmpestri128((__v16qi)arr2, needles.size(), auto index = _mm_cmpestri(arr2, needles.size(),
(__v16qi)arr1, haystack.size(), 0); arr1, haystack.size(), 0);
if (index < 16) { if (index < 16) {
return index; return index;
} }
...@@ -127,9 +117,9 @@ size_t qfind_first_byte_of_needles16(const StringPieceLite haystack, ...@@ -127,9 +117,9 @@ size_t qfind_first_byte_of_needles16(const StringPieceLite haystack,
for (; i < haystack.size(); i+= 16) { for (; i < haystack.size(); i+= 16) {
auto arr1 = _mm_load_si128( auto arr1 = _mm_load_si128(
reinterpret_cast<const __m128i*>(haystack.data() + i)); reinterpret_cast<const __m128i*>(haystack.data() + i));
auto index = __builtin_ia32_pcmpestri128( auto index = _mm_cmpestri(
(__v16qi)arr2, needles.size(), arr2, needles.size(),
(__v16qi)arr1, haystack.size() - i, 0); arr1, haystack.size() - i, 0);
if (index < 16) { if (index < 16) {
return i + index; return i + index;
} }
...@@ -172,17 +162,17 @@ size_t scanHaystackBlock(const StringPieceLite haystack, ...@@ -172,17 +162,17 @@ size_t scanHaystackBlock(const StringPieceLite haystack,
// This load is safe because needles.size() >= 16 // This load is safe because needles.size() >= 16
auto arr2 = _mm_loadu_si128( auto arr2 = _mm_loadu_si128(
reinterpret_cast<const __m128i*>(needles.data())); reinterpret_cast<const __m128i*>(needles.data()));
size_t b = __builtin_ia32_pcmpestri128( size_t b = _mm_cmpestri(
(__v16qi)arr2, 16, (__v16qi)arr1, haystack.size() - blockStartIdx, 0); arr2, 16, arr1, haystack.size() - blockStartIdx, 0);
size_t j = nextAlignedIndex(needles.data()); size_t j = nextAlignedIndex(needles.data());
for (; j < needles.size(); j += 16) { for (; j < needles.size(); j += 16) {
arr2 = _mm_load_si128( arr2 = _mm_load_si128(
reinterpret_cast<const __m128i*>(needles.data() + j)); reinterpret_cast<const __m128i*>(needles.data() + j));
auto index = __builtin_ia32_pcmpestri128( auto index = _mm_cmpestri(
(__v16qi)arr2, needles.size() - j, arr2, needles.size() - j,
(__v16qi)arr1, haystack.size() - blockStartIdx, 0); arr1, haystack.size() - blockStartIdx, 0);
b = std::min<size_t>(index, b); b = std::min<size_t>(index, b);
} }
...@@ -229,11 +219,6 @@ size_t qfind_first_byte_of_sse42(const StringPieceLite haystack, ...@@ -229,11 +219,6 @@ size_t qfind_first_byte_of_sse42(const StringPieceLite haystack,
return std::string::npos; return std::string::npos;
} }
} }
} }
#endif #endif
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment