Commit a0e3a745 authored by Ilya Maykov, committed by Facebook Github Bot

Implemented LtHash in folly/experimental/crypto

Summary:
Added LtHash, a cryptographic homomorphic hash, to folly/experimental/crypto.
This has a soft dependency on libsodium; the code will not be compiled if libsodium is not detected by CMake.

Reviewed By: djwatson

Differential Revision: D13390825

fbshipit-source-id: f7597ced7bcc7b403e8bbaa733837b795128f1b3
parent f42dd787
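
A minimal usage sketch of the new API, based on LtHash.h from this diff (the object contents are illustrative; as in the benchmark below, sodium_init() must succeed before hashing):

#include <stdexcept>
#include <folly/Range.h>
#include <folly/experimental/crypto/LtHash.h>
#include <glog/logging.h>
#include <sodium.h>

int main() {
  if (sodium_init() < 0) {
    throw std::runtime_error("Failed to initialize libsodium");
  }
  using folly::crypto::LtHash16_1024;
  // addObject() is commutative: the same set of objects yields the same
  // checksum regardless of insertion order.
  LtHash16_1024 h1;
  h1.addObject(folly::range("apple"));
  h1.addObject(folly::range("banana"));
  LtHash16_1024 h2;
  h2.addObject(folly::range("banana"));
  h2.addObject(folly::range("apple"));
  CHECK(h1 == h2);
  // removeObject() is the inverse of addObject(): removing "banana" from h1
  // leaves the checksum of "apple" alone.
  LtHash16_1024 h3;
  h3.addObject(folly::range("apple"));
  h1.removeObject(folly::range("banana"));
  CHECK(h1 == h3);
  return 0;
}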
@@ -170,12 +170,52 @@ if (NOT ${LIBAIO_FOUND})
     ${FOLLY_DIR}/experimental/io/AsyncIO.h
   )
 endif()
-if (NOT ${LIBSODIUM_FOUND})
+if (${LIBSODIUM_FOUND})
+  string(FIND ${CMAKE_LIBRARY_ARCHITECTURE} "x86_64" IS_X86_64_ARCH)
+  if (${IS_X86_64_ARCH} STREQUAL "-1")
+    message(
+      STATUS
+      "arch ${CMAKE_LIBRARY_ARCHITECTURE} does not match x86_64, "
+      "skipping setting SSE2/AVX2 compile flags for LtHash SIMD code"
+    )
+  else()
+    message(
+      STATUS
+      "arch ${CMAKE_LIBRARY_ARCHITECTURE} matches x86_64, "
+      "setting SSE2/AVX2 compile flags for LtHash SIMD code"
+    )
+    set_source_files_properties(
+      ${FOLLY_DIR}/experimental/crypto/detail/MathOperation_AVX2.cpp
+      PROPERTIES
+        COMPILE_FLAGS
+        -mavx -mavx2 -msse2
+    )
+    set_source_files_properties(
+      ${FOLLY_DIR}/experimental/crypto/detail/MathOperation_Simple.cpp
+      PROPERTIES
+        COMPILE_FLAGS
+        -mno-avx -mno-avx2 -mno-sse2
+    )
+    set_source_files_properties(
+      ${FOLLY_DIR}/experimental/crypto/detail/MathOperation_SSE2.cpp
+      PROPERTIES
+        COMPILE_FLAGS
+        -mno-avx -mno-avx2 -msse2
+    )
+  endif()
+else()
   list(REMOVE_ITEM files
     ${FOLLY_DIR}/experimental/crypto/Blake2xb.cpp
+    ${FOLLY_DIR}/experimental/crypto/detail/MathOperation_AVX2.cpp
+    ${FOLLY_DIR}/experimental/crypto/detail/MathOperation_Simple.cpp
+    ${FOLLY_DIR}/experimental/crypto/detail/MathOperation_SSE2.cpp
+    ${FOLLY_DIR}/experimental/crypto/LtHash.cpp
   )
   list(REMOVE_ITEM hfiles
     ${FOLLY_DIR}/experimental/crypto/Blake2xb.h
+    ${FOLLY_DIR}/experimental/crypto/detail/LtHashInternal.h
+    ${FOLLY_DIR}/experimental/crypto/LtHash-inl.h
+    ${FOLLY_DIR}/experimental/crypto/LtHash.h
   )
 endif()
 if(CMAKE_SYSTEM_NAME STREQUAL "Windows")
This diff is collapsed.
/*
* Copyright 2017-present Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <folly/experimental/crypto/LtHash.h>
#include <folly/CpuId.h>
#ifdef __SSE2__
#include <emmintrin.h>
#endif
#ifdef __AVX2__
#include <immintrin.h>
#endif
#include <folly/Memory.h>
namespace folly {
namespace crypto {
namespace detail {
folly::IOBuf allocateCacheAlignedIOBuf(size_t size) {
void* ptr = folly::aligned_malloc(size, kCacheLineSize);
if (ptr == nullptr) {
throw std::bad_alloc();
}
return folly::IOBuf(
folly::IOBuf::TAKE_OWNERSHIP,
ptr,
static_cast<uint64_t>(size), // capacity
0ULL, // initial size
[](void* addr, void* /* userData*/) { folly::aligned_free(addr); });
}
std::unique_ptr<folly::IOBuf> allocateCacheAlignedIOBufUnique(size_t size) {
return std::make_unique<folly::IOBuf>(allocateCacheAlignedIOBuf(size));
}
bool isCacheAlignedAddress(const void* addr) {
size_t addrValue = reinterpret_cast<size_t>(addr);
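// kCacheLineSize is a power of 2 (static_assert'd in LtHashInternal.h), so
// an address is cache-line-aligned iff its low-order bits are all zero.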
return (addrValue & (kCacheLineSize - 1)) == 0;
}
// static
template <>
bool MathOperation<MathEngine::SIMPLE>::isAvailable() {
return true;
}
// static
template <>
bool MathOperation<MathEngine::SSE2>::isAvailable() {
static const bool kIsAvailable =
CpuId().sse2() && MathOperation<MathEngine::SSE2>::isImplemented();
return kIsAvailable;
}
// static
template <>
bool MathOperation<MathEngine::AVX2>::isAvailable() {
static const bool kIsAvailable =
CpuId().avx2() && MathOperation<MathEngine::AVX2>::isImplemented();
return kIsAvailable;
}
// static
template <>
bool MathOperation<MathEngine::AUTO>::isAvailable() {
return true;
}
// static
template <>
bool MathOperation<MathEngine::AUTO>::isImplemented() {
return true;
}
// static
template <>
void MathOperation<MathEngine::AUTO>::add(
uint64_t dataMask,
size_t bitsPerElement,
folly::ByteRange b1,
folly::ByteRange b2,
folly::MutableByteRange out) {
// Note: implementation is a function pointer that is initialized to point
// at the fastest available implementation the first time this function is
// called.
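// (C++11 guarantees thread-safe initialization of function-local statics,
// so concurrent first calls are safe. The same pattern is used by the
// other MathEngine::AUTO operations below.)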
static auto implementation = []() {
if (MathOperation<MathEngine::AVX2>::isAvailable()) {
LOG(INFO) << "Selected AVX2 MathEngine for add() operation";
return MathOperation<MathEngine::AVX2>::add;
} else if (MathOperation<MathEngine::SSE2>::isAvailable()) {
LOG(INFO) << "Selected SSE2 MathEngine for add() operation";
return MathOperation<MathEngine::SSE2>::add;
} else {
LOG(INFO) << "Selected SIMPLE MathEngine for add() operation";
return MathOperation<MathEngine::SIMPLE>::add;
}
}();
implementation(dataMask, bitsPerElement, b1, b2, out);
}
// static
template <>
void MathOperation<MathEngine::AUTO>::sub(
uint64_t dataMask,
size_t bitsPerElement,
folly::ByteRange b1,
folly::ByteRange b2,
folly::MutableByteRange out) {
// Note: implementation is a function pointer that is initialized to point
// at the fastest available implementation the first time this function is
// called.
static auto implementation = []() {
if (MathOperation<MathEngine::AVX2>::isAvailable()) {
LOG(INFO) << "Selected AVX2 MathEngine for sub() operation";
return MathOperation<MathEngine::AVX2>::sub;
} else if (MathOperation<MathEngine::SSE2>::isAvailable()) {
LOG(INFO) << "Selected SSE2 MathEngine for sub() operation";
return MathOperation<MathEngine::SSE2>::sub;
} else {
LOG(INFO) << "Selected SIMPLE MathEngine for sub() operation";
return MathOperation<MathEngine::SIMPLE>::sub;
}
}();
implementation(dataMask, bitsPerElement, b1, b2, out);
}
// static
template <>
void MathOperation<MathEngine::AUTO>::clearPaddingBits(
uint64_t dataMask,
folly::MutableByteRange buf) {
// Note: implementation is a function pointer that is initialized to point
// at the fastest available implementation the first time this function is
// called.
static auto implementation = []() {
if (MathOperation<MathEngine::AVX2>::isAvailable()) {
LOG(INFO) << "Selected AVX2 MathEngine for clearPaddingBits() operation";
return MathOperation<MathEngine::AVX2>::clearPaddingBits;
} else if (MathOperation<MathEngine::SSE2>::isAvailable()) {
LOG(INFO) << "Selected SSE2 MathEngine for clearPaddingBits() operation";
return MathOperation<MathEngine::SSE2>::clearPaddingBits;
} else {
LOG(INFO)
<< "Selected SIMPLE MathEngine for clearPaddingBits() operation";
return MathOperation<MathEngine::SIMPLE>::clearPaddingBits;
}
}();
implementation(dataMask, buf);
}
// static
template <>
bool MathOperation<MathEngine::AUTO>::checkPaddingBits(
uint64_t dataMask,
folly::ByteRange buf) {
// Note: implementation is a function pointer that is initialized to point
// at the fastest available implementation the first time this function is
// called.
static auto implementation = []() {
if (MathOperation<MathEngine::AVX2>::isAvailable()) {
LOG(INFO) << "Selected AVX2 MathEngine for checkPaddingBits() operation";
return MathOperation<MathEngine::AVX2>::checkPaddingBits;
} else if (MathOperation<MathEngine::SSE2>::isAvailable()) {
LOG(INFO) << "Selected SSE2 MathEngine for checkPaddingBits() operation";
return MathOperation<MathEngine::SSE2>::checkPaddingBits;
} else {
LOG(INFO)
<< "Selected SIMPLE MathEngine for checkPaddingBits() operation";
return MathOperation<MathEngine::SIMPLE>::checkPaddingBits;
}
}();
return implementation(dataMask, buf);
}
template struct MathOperation<MathEngine::AUTO>;
} // namespace detail
} // namespace crypto
} // namespace folly
/*
* Copyright 2017-present Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <cstddef>
#include <memory>
#include <folly/Range.h>
#include <folly/experimental/crypto/Blake2xb.h>
#include <folly/io/IOBuf.h>
namespace folly {
namespace crypto {
namespace detail {
/**
* Allocates an IOBuf of the given size, aligned on a cache line boundary.
* Similar to folly::IOBuf::create(), the returned IOBuf has an initial
* capacity == size and an initial length == 0.
*/
folly::IOBuf allocateCacheAlignedIOBuf(size_t size);
/**
* Similar to allocateCacheAlignedIOBuf(), but returns a unique_ptr to an IOBuf
* instead of an IOBuf.
*/
std::unique_ptr<folly::IOBuf> allocateCacheAlignedIOBufUnique(size_t size);
/**
* Returns true if the given memory address is aligned on a cache line boundary
* and false if it isn't.
*/
bool isCacheAlignedAddress(const void* addr);
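// Example (sketch; the 2048-byte size is illustrative): allocate an aligned
// buffer and verify its alignment:
//   auto buf = allocateCacheAlignedIOBufUnique(2048);
//   DCHECK(isCacheAlignedAddress(buf->data()));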
} // namespace detail
/**
* Templated homomorphic hash, using LtHash (lattice-based crypto).
* Template parameters: B = element size in bits, N = number of elements.
*
* Current constraints (checked at compile time with static asserts):
* (1) B must be 16, 20 or 32.
* (2) N must be > 999.
* (3) when B is 16, N must be divisible by 32.
* (4) when B is 20, N must be divisible by 24.
* (5) when B is 32, N must be divisible by 16.
*/
template <std::size_t B, std::size_t N>
class LtHash {
public:
explicit LtHash(const folly::IOBuf& initialChecksum = {});
/**
* Like the above constructor but takes ownership of the checksum buffer,
* avoiding a copy if these conditions about the input buffer are met:
* - initialChecksum->isChained() is false
* - initialChecksum->isShared() is false
* - detail::isCacheAlignedAddress(initialChecksum->data()) is true
*
* If you want to take advantage of this and need to make sure your IOBuf
* address is aligned on a cache line boundary, you can use the
* function detail::allocateCacheAlignedIOBufUnique() to do it.
*/
explicit LtHash(std::unique_ptr<folly::IOBuf> initialChecksum);
// Note: we explicitly implement copy constructor and copy assignment
// operator to make sure the checksum_ IOBuf is deep-copied.
LtHash(const LtHash<B, N>& that);
LtHash<B, N>& operator=(const LtHash<B, N>& that);
LtHash(LtHash<B, N>&& that) noexcept = default;
LtHash<B, N>& operator=(LtHash<B, N>&& that) noexcept = default;
~LtHash() = default;
/**
* Resets the checksum in this LtHash. This puts the hash into the same
* state as if it was just constructed with the zero-argument constructor.
*/
void reset();
/**
* IMPORTANT: Unlike a regular hash, the incremental hash functions operate
* on individual objects, not a stream of data. For example, the following
* two calls lead to different checksum values:
* (1) addObject("Hello"); addObject(" World");
* (2) addObject("Hello World");
* because in (1) addObject() hashes the two words separately and
* aggregates the hashes to update the checksum.
*
* addObject() is commutative. LtHash generates the same checksum over a
* given set of objects regardless of the order they were added.
* Example: H(a + b + c) = H(b + c + a)
*
* addObject() can be called with multiple ByteRange parameters, in which
* case it will behave as if it was called with a single ByteRange which
* contained the concatenation of all the input ByteRanges. This allows
* adding an object whose hash is computed from several non-contiguous
* ranges of data, without having to copy the data to a contiguous
* piece of memory.
*
* Example: addObject(r1, r2, r3) is equivalent to
* addObject(r4) where r4 contains the concatenation of r1 + r2 + r3.
*/
template <typename... Args>
LtHash<B, N>& addObject(folly::ByteRange firstRange, Args&&... moreRanges);
/**
* removeObject() is the inverse of addObject(). Note that it does NOT
* check whether the object was actually added to the LtHash. The caller
* should ensure that the object was previously added.
*
* Example: H(a - a + b - b + c - c) = H(a + b + c - a - b - c) = H()
*
* Similar to addObject(), removeObject() can be called with more than one
* ByteRange parameter.
*/
template <typename... Args>
LtHash<B, N>& removeObject(folly::ByteRange firstRange, Args&&... moreRanges);
/**
* Because the addObject() operation in LtHash is commutative and
* associative, it's possible to break down a large LtHash computation
* (e.g. adding 100k objects) into several parallel steps, each of which
* computes the LtHash of a subset of the objects, and then add the
* partial LtHash objects together.
* Pseudocode:
*
* std::vector<std::string> objects = ...;
* Future<LtHash<20, 1008>> h1 = computeInBackgroundThread(
* &objects[0], &objects[10000]);
* Future<LtHash<20, 1008>> h2 = computeInBackgroundThread(
* &objects[10000], &objects[20000]);
* LtHash<20, 1008> result = h1.get() + h2.get();
*/
LtHash<B, N>& operator+=(const LtHash<B, N>& rhs);
friend LtHash<B, N> operator+(
const LtHash<B, N>& lhs,
const LtHash<B, N>& rhs) {
LtHash<B, N> result = lhs;
result += rhs;
return result;
}
friend LtHash<B, N> operator+(LtHash<B, N>&& lhs, const LtHash<B, N>& rhs) {
LtHash<B, N> result = std::move(lhs);
result += rhs;
return result;
}
friend LtHash<B, N> operator+(const LtHash<B, N>& lhs, LtHash<B, N>&& rhs) {
// addition is commutative so we can just swap the two arguments
return std::move(rhs) + lhs;
}
friend LtHash<B, N> operator+(LtHash<B, N>&& lhs, LtHash<B, N>&& rhs) {
LtHash<B, N> result = std::move(lhs);
result += rhs;
return result;
}
/**
* The subtraction operator is provided for symmetry, but I'm not sure if
* anyone will ever actually use it outside of tests.
*/
LtHash<B, N>& operator-=(const LtHash<B, N>& rhs);
friend LtHash<B, N> operator-(
const LtHash<B, N>& lhs,
const LtHash<B, N>& rhs) {
LtHash<B, N> result = lhs;
result -= rhs;
return result;
}
friend LtHash<B, N> operator-(LtHash<B, N>&& lhs, const LtHash<B, N>& rhs) {
LtHash<B, N> result = std::move(lhs);
result -= rhs;
return result;
}
/**
* Equality comparison operator, implemented in a data-independent way to
* guard against timing attacks. Always use this to check if two LtHash
* values are equal instead of manually comparing checksum buffers.
*/
bool operator==(const LtHash<B, N>& that) const;
/**
* Equality comparison operator for checksum in ByteRange, implemented in a
* data-independent way to guard against timing attacks.
*/
bool checksumEquals(folly::ByteRange otherChecksum) const;
/**
* Inequality comparison operator.
*/
bool operator!=(const LtHash<B, N>& that) const;
/**
* Sets the initial checksum value to use for processing objects in the
* xxxObject() calls.
*/
void setChecksum(const folly::IOBuf& checksum);
/**
* Like the above method but takes ownership of the checksum buffer,
* avoiding a copy if these conditions about the input buffer are met:
* - checksum->isChained() is false
* - checksum->isShared() is false
* - detail::isCacheAlignedAddress(checksum->data()) is true
*
* If you want to take advantage of this and need to make sure your IOBuf
* address is aligned on a cache line boundary, you can use the
* function detail::allocateCacheAlignedIOBufUnique() to do it.
*/
void setChecksum(std::unique_ptr<folly::IOBuf> checksum);
/**
* Returns the total length of the checksum (element_count * element_length)
*/
static constexpr size_t getChecksumSizeBytes();
/**
* Returns the template parameter B.
*/
static constexpr size_t getElementSizeInBits();
/**
* Returns the number of elements that get packed into a single uint64_t.
*/
static constexpr size_t getElementsPerUint64();
/**
* Returns the template parameter N.
*/
static constexpr size_t getElementCount();
/**
* Returns true if the internal checksum uses padding bits between elements.
*/
static constexpr bool hasPaddingBits();
/**
* Returns a copy of the current checksum value
*/
std::unique_ptr<folly::IOBuf> getChecksum() const;
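// Example (sketch): persist a checksum and restore it later.
//   std::unique_ptr<folly::IOBuf> saved = hash.getChecksum();
//   LtHash<B, N> restored;
//   restored.setChecksum(*saved);
//   // restored == hash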
private:
template <typename... Args>
void hashObject(
folly::MutableByteRange out,
folly::ByteRange firstRange,
Args&&... moreRanges);
template <typename... Args>
void
updateDigest(Blake2xb& digest, folly::ByteRange range, Args&&... moreRanges);
void updateDigest(Blake2xb& digest);
// current checksum
folly::IOBuf checksum_;
};
} // namespace crypto
} // namespace folly
#include <folly/experimental/crypto/LtHash-inl.h>
namespace folly {
namespace crypto {
// This is the fastest and smallest specialization and should be
// preferred in most cases. It provides over 200 bits of security,
// which should be sufficient for most applications.
using LtHash16_1024 = LtHash<16, 1024>;
// These specializations are available to users who want a higher
// level of cryptographic security. They are slower and larger than
// the one above.
using LtHash20_1008 = LtHash<20, 1008>;
using LtHash32_1024 = LtHash<32, 1024>;
} // namespace crypto
} // namespace folly
/*
* Copyright 2017-present Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <folly/Range.h>
namespace folly {
namespace crypto {
namespace detail {
// As of 2019, most (or all?) modern Intel CPUs have 64-byte L1 cache lines,
// and aligning data buffers on cache line boundaries on such CPUs
// noticeably benefits performance (up to 10% difference).
//
// If you change this, code that depends on it in MathOperation_*.cpp may
// break and could need fixing.
constexpr size_t kCacheLineSize = 64;
// Invariants about kCacheLineSize that other logic depends on: it must be
// a power of 2 and cannot be zero.
static_assert(kCacheLineSize > 0, "kCacheLineSize cannot be 0");
static_assert(
(kCacheLineSize & (kCacheLineSize - 1)) == 0,
"kCacheLineSize must be a power of 2");
/**
* Defines available math engines that we can use to perform element-wise
* modular addition or subtraction of element vectors.
* - AUTO: pick the best available, from best to worst: AVX2, SSE2, SIMPLE
* - SIMPLE: perform addition/subtraction using uint64_t values
* - SSE2: perform addition/subtraction using 128-bit __m128i values.
* Intel only, requires SSE2 instruction support.
* - AVX2: perform addition/subtraction using 256-bit __m256i values.
* Intel only, requires AVX2 instruction support.
*/
enum class MathEngine { AUTO, SIMPLE, SSE2, AVX2 };
/**
* This actually implements the bulk addition/subtraction operations.
*/
template <MathEngine E>
struct MathOperation {
/**
* Returns true if the math engine E is supported by the CPU and OS and is
* implemented.
*/
static bool isAvailable();
/**
* Returns true if the math engine E is implemented.
*/
static bool isImplemented();
/**
* Performs element-wise modular addition of 2 vectors of elements packed
* into the buffers b1 and b2. Writes the output into the buffer out. The
* output buffer may be the same as one of the input buffers. The dataMask
* parameter should be Bits<B>::kDataMask() where B is the element size
* in bits.
*/
static void add(
uint64_t dataMask,
size_t bitsPerElement,
ByteRange b1,
ByteRange b2,
MutableByteRange out);
/**
* Performs element-wise modular subtraction of 2 groups of elements packed
* into the buffers b1 and b2. Note that (a - b) % M == (a + (M - b)) % M,
* which is how we actually implement it to avoid underflow issues. The
* dataMask parameter should be Bits<B>::kDataMask() where B is the element
* size in bits.
*/
static void sub(
uint64_t dataMask,
size_t bitsPerElement,
ByteRange b1,
ByteRange b2,
MutableByteRange out);
/**
* Clears the padding bits of the given buffer according to the given
* data mask: for each uint64_t in the input buffer, all 0 bits in the
* data mask are cleared, and all 1 bits in the data mask are preserved.
*/
static void clearPaddingBits(uint64_t dataMask, MutableByteRange buf);
/**
* Returns true if the given checksum buffer contains 0 bits at the padding
* bit positions, according to the given data mask.
*/
static bool checkPaddingBits(uint64_t dataMask, ByteRange buf);
};
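// Usage sketch (b1, b2, out and dataMask are caller-supplied; the mask must
// be Bits<B>::kDataMask() for the element size in use):
//   MathOperation<MathEngine::AUTO>::add(dataMask, 20, b1, b2, out);
// For AUTO, the first call selects AVX2, SSE2 or SIMPLE based on CpuId
// (see LtHash.cpp) and caches the choice in a function pointer.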
// These forward declarations of explicit template instantiations seem to be
// required to get things to compile. I tried to get things to work without
// them, but the compiler complained when I had any AVX2 types in this header,
// so I think they need to be hidden in the .cpp file for some reason.
#define FORWARD_DECLARE_EXTERN_TEMPLATE(E) \
template <> \
bool MathOperation<E>::isAvailable(); \
template <> \
bool MathOperation<E>::isImplemented(); \
template <> \
void MathOperation<E>::add( \
uint64_t dataMask, \
size_t bitsPerElement, \
ByteRange b1, \
ByteRange b2, \
MutableByteRange out); \
template <> \
void MathOperation<E>::sub( \
uint64_t dataMask, \
size_t bitsPerElement, \
ByteRange b1, \
ByteRange b2, \
MutableByteRange out); \
template <> \
void MathOperation<E>::clearPaddingBits( \
uint64_t dataMask, MutableByteRange buf); \
template <> \
bool MathOperation<E>::checkPaddingBits(uint64_t dataMask, ByteRange buf); \
extern template struct MathOperation<E>
FORWARD_DECLARE_EXTERN_TEMPLATE(MathEngine::AUTO);
FORWARD_DECLARE_EXTERN_TEMPLATE(MathEngine::SIMPLE);
FORWARD_DECLARE_EXTERN_TEMPLATE(MathEngine::SSE2);
FORWARD_DECLARE_EXTERN_TEMPLATE(MathEngine::AVX2);
#undef FORWARD_DECLARE_EXTERN_TEMPLATE
} // namespace detail
} // namespace crypto
} // namespace folly
/*
* Copyright 2017-present Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Implementation of the MathOperation<MathEngine::AVX2> template
// specializations.
#include <folly/experimental/crypto/detail/LtHashInternal.h>
#ifdef __AVX2__
#include <immintrin.h>
#include <sodium.h>
#include <folly/lang/Bits.h>
#endif // __AVX2__
#include <folly/Memory.h>
namespace folly {
namespace crypto {
namespace detail {
#ifdef __AVX2__
// static
template <>
bool MathOperation<MathEngine::AVX2>::isImplemented() {
return true;
}
// static
template <>
void MathOperation<MathEngine::AVX2>::add(
uint64_t dataMask,
size_t bitsPerElement,
ByteRange b1,
ByteRange b2,
MutableByteRange out) {
DCHECK_EQ(b1.size(), b2.size());
DCHECK_EQ(b1.size(), out.size());
DCHECK_EQ(0, b1.size() % kCacheLineSize);
static_assert(
kCacheLineSize % sizeof(__m256i) == 0,
"kCacheLineSize must be a multiple of sizeof(__m256i)");
static constexpr size_t kValsPerCacheLine = kCacheLineSize / sizeof(__m256i);
static_assert(
kValsPerCacheLine > 0, "kCacheLineSize must be >= sizeof(__m256i)");
// gcc issues 'ignoring attributes on template argument' warning if
// __m256i is used below, so have to type explicitly
alignas(kCacheLineSize) std::array<
long long __attribute__((__vector_size__(sizeof(__m256i)))),
kValsPerCacheLine>
results;
// Note: AVX2 is Intel x86_64 only which is little-endian, so we don't need
// the Endian::little() conversions when loading or storing data.
if (bitsPerElement == 16 || bitsPerElement == 32) {
for (size_t pos = 0; pos < b1.size(); pos += kCacheLineSize) {
const __m256i* v1p = reinterpret_cast<const __m256i*>(b1.data() + pos);
const __m256i* v2p = reinterpret_cast<const __m256i*>(b2.data() + pos);
for (size_t i = 0; i < kValsPerCacheLine; ++i) {
__m256i v1 = _mm256_load_si256(v1p + i);
__m256i v2 = _mm256_load_si256(v2p + i);
if (bitsPerElement == 16) {
results[i] = _mm256_add_epi16(v1, v2);
} else { // bitsPerElement == 32
results[i] = _mm256_add_epi32(v1, v2);
}
}
std::memcpy(out.data() + pos, results.data(), sizeof(results));
}
} else {
__m256i mask = _mm256_set1_epi64x(dataMask);
for (size_t pos = 0; pos < b1.size(); pos += kCacheLineSize) {
const __m256i* v1p = reinterpret_cast<const __m256i*>(b1.data() + pos);
const __m256i* v2p = reinterpret_cast<const __m256i*>(b2.data() + pos);
for (size_t i = 0; i < kValsPerCacheLine; ++i) {
__m256i v1 = _mm256_load_si256(v1p + i);
__m256i v2 = _mm256_load_si256(v2p + i);
results[i] = _mm256_and_si256(_mm256_add_epi64(v1, v2), mask);
}
std::memcpy(out.data() + pos, results.data(), sizeof(results));
}
}
}
// static
template <>
void MathOperation<MathEngine::AVX2>::sub(
uint64_t dataMask,
size_t bitsPerElement,
ByteRange b1,
ByteRange b2,
MutableByteRange out) {
DCHECK_EQ(b1.size(), b2.size());
DCHECK_EQ(b1.size(), out.size());
DCHECK_EQ(0, b1.size() % kCacheLineSize);
static_assert(
kCacheLineSize % sizeof(__m256i) == 0,
"kCacheLineSize must be a multiple of sizeof(__m256i)");
static constexpr size_t kValsPerCacheLine = kCacheLineSize / sizeof(__m256i);
static_assert(
kValsPerCacheLine > 0, "kCacheLineSize must be >= sizeof(__m256i)");
// gcc issues 'ignoring attributes on template argument' warning if
// __m256i is used below, so have to type explicitly
alignas(kCacheLineSize) std::array<
long long __attribute__((__vector_size__(sizeof(__m256i)))),
kValsPerCacheLine>
results;
// Note: AVX2 is Intel x86_64 only which is little-endian, so we don't need
// the Endian::little() conversions when loading or storing data.
if (bitsPerElement == 16 || bitsPerElement == 32) {
for (size_t pos = 0; pos < b1.size(); pos += kCacheLineSize) {
const __m256i* v1p = reinterpret_cast<const __m256i*>(b1.data() + pos);
const __m256i* v2p = reinterpret_cast<const __m256i*>(b2.data() + pos);
for (size_t i = 0; i < kValsPerCacheLine; ++i) {
__m256i v1 = _mm256_load_si256(v1p + i);
__m256i v2 = _mm256_load_si256(v2p + i);
if (bitsPerElement == 16) {
results[i] = _mm256_sub_epi16(v1, v2);
} else { // bitsPerElement == 32
results[i] = _mm256_sub_epi32(v1, v2);
}
}
std::memcpy(out.data() + pos, results.data(), sizeof(results));
}
} else {
__m256i mask = _mm256_set1_epi64x(dataMask);
__m256i paddingMask = _mm256_set1_epi64x(~dataMask);
for (size_t pos = 0; pos < b1.size(); pos += kCacheLineSize) {
const __m256i* v1p = reinterpret_cast<const __m256i*>(b1.data() + pos);
const __m256i* v2p = reinterpret_cast<const __m256i*>(b2.data() + pos);
for (size_t i = 0; i < kValsPerCacheLine; ++i) {
__m256i v1 = _mm256_load_si256(v1p + i);
__m256i v2 = _mm256_load_si256(v2p + i);
__m256i negV2 =
_mm256_and_si256(_mm256_sub_epi64(paddingMask, v2), mask);
results[i] = _mm256_and_si256(_mm256_add_epi64(v1, negV2), mask);
}
std::memcpy(out.data() + pos, results.data(), sizeof(results));
}
}
}
template <>
void MathOperation<MathEngine::AVX2>::clearPaddingBits(
uint64_t dataMask,
MutableByteRange buf) {
if (dataMask == 0xffffffffffffffffULL) {
return;
}
DCHECK_EQ(0, buf.size() % kCacheLineSize);
static_assert(
kCacheLineSize % sizeof(__m256i) == 0,
"kCacheLineSize must be a multiple of sizeof(__m256i)");
static constexpr size_t kValsPerCacheLine = kCacheLineSize / sizeof(__m256i);
static_assert(
kValsPerCacheLine > 0, "kCacheLineSize must be >= sizeof(__m256i)");
// gcc issues 'ignoring attributes on template argument' warning if
// __m256i is used below, so have to type explicitly
alignas(kCacheLineSize) std::array<
long long __attribute__((__vector_size__(sizeof(__m256i)))),
kValsPerCacheLine>
results;
__m256i mask = _mm256_set1_epi64x(dataMask);
for (size_t pos = 0; pos < buf.size(); pos += kCacheLineSize) {
const __m256i* p = reinterpret_cast<const __m256i*>(buf.data() + pos);
for (size_t i = 0; i < kValsPerCacheLine; ++i) {
results[i] = _mm256_and_si256(_mm256_load_si256(p + i), mask);
}
std::memcpy(buf.data() + pos, results.data(), sizeof(results));
}
}
template <>
bool MathOperation<MathEngine::AVX2>::checkPaddingBits(
uint64_t dataMask,
ByteRange buf) {
if (dataMask == 0xffffffffffffffffULL) {
return true;
}
DCHECK_EQ(0, buf.size() % sizeof(__m256i));
__m256i paddingMask = _mm256_set1_epi64x(~dataMask);
static const __m256i kZero = _mm256_setzero_si256();
for (size_t pos = 0; pos < buf.size(); pos += sizeof(__m256i)) {
__m256i val =
_mm256_load_si256(reinterpret_cast<const __m256i*>(buf.data() + pos));
__m256i paddingBits = _mm256_and_si256(val, paddingMask);
if (sodium_memcmp(&paddingBits, &kZero, sizeof(kZero)) != 0) {
return false;
}
}
return true;
}
#else // !__AVX2__
// static
template <>
bool MathOperation<MathEngine::AVX2>::isImplemented() {
return false;
}
// static
template <>
void MathOperation<MathEngine::AVX2>::add(
uint64_t /* dataMask */,
size_t bitsPerElement,
ByteRange /* b1 */,
ByteRange /* b2 */,
MutableByteRange /* out */) {
if (bitsPerElement != 0) { // hack to defeat [[noreturn]] compiler warning
LOG(FATAL) << "Unimplemented function MathOperation<MathEngine::AVX2>::"
<< "add() called";
}
}
// static
template <>
void MathOperation<MathEngine::AVX2>::sub(
uint64_t /* dataMask */,
size_t bitsPerElement,
ByteRange /* b1 */,
ByteRange /* b2 */,
MutableByteRange /* out */) {
if (bitsPerElement != 0) { // hack to defeat [[noreturn]] compiler warning
LOG(FATAL) << "Unimplemented function MathOperation<MathEngine::AVX2>::"
<< "sub() called";
}
}
template <>
void MathOperation<MathEngine::AVX2>::clearPaddingBits(
uint64_t /* dataMask */,
MutableByteRange buf) {
if (buf.data() != nullptr) { // hack to defeat [[noreturn]] compiler warning
LOG(FATAL) << "Unimplemented function MathOperation<MathEngine::AVX2>::"
<< "clearPaddingBits() called";
}
}
template <>
bool MathOperation<MathEngine::AVX2>::checkPaddingBits(
uint64_t /* dataMask */,
ByteRange buf) {
if (buf.data() != nullptr) { // hack to defeat [[noreturn]] compiler warning
LOG(FATAL) << "Unimplemented function MathOperation<MathEngine::AVX2>::"
<< "checkPaddingBits() called";
}
return false;
}
#endif // __AVX2__
template struct MathOperation<MathEngine::AVX2>;
} // namespace detail
} // namespace crypto
} // namespace folly
/*
* Copyright 2017-present Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Implementation of the MathOperation<MathEngine::SSE2> template
// specializations.
#include <folly/experimental/crypto/detail/LtHashInternal.h>
#ifdef __SSE2__
#include <emmintrin.h>
#include <sodium.h>
#include <folly/lang/Bits.h>
#endif // __SSE2__
#include <folly/Memory.h>
namespace folly {
namespace crypto {
namespace detail {
#ifdef __SSE2__
// static
template <>
bool MathOperation<MathEngine::SSE2>::isImplemented() {
return true;
}
// static
template <>
void MathOperation<MathEngine::SSE2>::add(
uint64_t dataMask,
size_t bitsPerElement,
ByteRange b1,
ByteRange b2,
MutableByteRange out) {
DCHECK_EQ(b1.size(), b2.size());
DCHECK_EQ(b1.size(), out.size());
DCHECK_EQ(0, b1.size() % kCacheLineSize);
static_assert(
kCacheLineSize % sizeof(__m128i) == 0,
"kCacheLineSize must be a multiple of sizeof(__m128i)");
static constexpr size_t kValsPerCacheLine = kCacheLineSize / sizeof(__m128i);
static_assert(
kValsPerCacheLine > 0, "kCacheLineSize must be >= sizeof(__m128i)");
// gcc issues 'ignoring attributes on template argument' warning if
// __m128i is used below, so have to type explicitly
alignas(kCacheLineSize) std::array<
long long __attribute__((__vector_size__(sizeof(__m128i)))),
kValsPerCacheLine>
results;
// Note: SSE2 is Intel x86(_64) only which is little-endian, so we don't need
// the Endian::little() conversions when loading or storing data.
if (bitsPerElement == 16 || bitsPerElement == 32) {
for (size_t pos = 0; pos < b1.size(); pos += kCacheLineSize) {
const __m128i* v1p = reinterpret_cast<const __m128i*>(b1.data() + pos);
const __m128i* v2p = reinterpret_cast<const __m128i*>(b2.data() + pos);
for (size_t i = 0; i < kValsPerCacheLine; ++i) {
__m128i v1 = _mm_load_si128(v1p + i);
__m128i v2 = _mm_load_si128(v2p + i);
if (bitsPerElement == 16) {
results[i] = _mm_add_epi16(v1, v2);
} else { // bitsPerElement == 32
results[i] = _mm_add_epi32(v1, v2);
}
}
std::memcpy(out.data() + pos, results.data(), sizeof(results));
}
} else {
__m128i mask = _mm_set_epi64x(dataMask, dataMask);
for (size_t pos = 0; pos < b1.size(); pos += kCacheLineSize) {
const __m128i* v1p = reinterpret_cast<const __m128i*>(b1.data() + pos);
const __m128i* v2p = reinterpret_cast<const __m128i*>(b2.data() + pos);
for (size_t i = 0; i < kValsPerCacheLine; ++i) {
__m128i v1 = _mm_load_si128(v1p + i);
__m128i v2 = _mm_load_si128(v2p + i);
results[i] = _mm_and_si128(_mm_add_epi64(v1, v2), mask);
}
std::memcpy(out.data() + pos, results.data(), sizeof(results));
}
}
}
// static
template <>
void MathOperation<MathEngine::SSE2>::sub(
uint64_t dataMask,
size_t bitsPerElement,
ByteRange b1,
ByteRange b2,
MutableByteRange out) {
DCHECK_EQ(b1.size(), b2.size());
DCHECK_EQ(b1.size(), out.size());
DCHECK_EQ(0, b1.size() % kCacheLineSize);
static_assert(
kCacheLineSize % sizeof(__m128i) == 0,
"kCacheLineSize must be a multiple of sizeof(__m128i)");
static constexpr size_t kValsPerCacheLine = kCacheLineSize / sizeof(__m128i);
static_assert(
kValsPerCacheLine > 0, "kCacheLineSize must be >= sizeof(__m128i)");
// gcc issues 'ignoring attributes on template argument' warning if
// __m128i is used below, so have to type explicitly
alignas(kCacheLineSize) std::array<
long long __attribute__((__vector_size__(sizeof(__m128i)))),
kValsPerCacheLine>
results;
// Note: SSE2 is Intel x86(_64) only which is little-endian, so we don't need
// the Endian::little() conversions when loading or storing data.
if (bitsPerElement == 16 || bitsPerElement == 32) {
for (size_t pos = 0; pos < b1.size(); pos += kCacheLineSize) {
const __m128i* v1p = reinterpret_cast<const __m128i*>(b1.data() + pos);
const __m128i* v2p = reinterpret_cast<const __m128i*>(b2.data() + pos);
for (size_t i = 0; i < kValsPerCacheLine; ++i) {
__m128i v1 = _mm_load_si128(v1p + i);
__m128i v2 = _mm_load_si128(v2p + i);
if (bitsPerElement == 16) {
results[i] = _mm_sub_epi16(v1, v2);
} else { // bitsPerElement == 32
results[i] = _mm_sub_epi32(v1, v2);
}
}
std::memcpy(out.data() + pos, results.data(), sizeof(results));
}
} else {
__m128i mask = _mm_set_epi64x(dataMask, dataMask);
__m128i paddingMask = _mm_set_epi64x(~dataMask, ~dataMask);
for (size_t pos = 0; pos < b1.size(); pos += kCacheLineSize) {
const __m128i* v1p = reinterpret_cast<const __m128i*>(b1.data() + pos);
const __m128i* v2p = reinterpret_cast<const __m128i*>(b2.data() + pos);
for (size_t i = 0; i < kValsPerCacheLine; ++i) {
__m128i v1 = _mm_load_si128(v1p + i);
__m128i v2 = _mm_load_si128(v2p + i);
__m128i negV2 = _mm_and_si128(_mm_sub_epi64(paddingMask, v2), mask);
results[i] = _mm_and_si128(_mm_add_epi64(v1, negV2), mask);
}
std::memcpy(out.data() + pos, results.data(), sizeof(results));
}
}
}
template <>
void MathOperation<MathEngine::SSE2>::clearPaddingBits(
uint64_t dataMask,
MutableByteRange buf) {
if (dataMask == 0xffffffffffffffffULL) {
return;
}
DCHECK_EQ(0, buf.size() % kCacheLineSize);
static_assert(
kCacheLineSize % sizeof(__m128i) == 0,
"kCacheLineSize must be a multiple of sizeof(__m128i)");
static constexpr size_t kValsPerCacheLine = kCacheLineSize / sizeof(__m128i);
static_assert(
kValsPerCacheLine > 0, "kCacheLineSize must be >= sizeof(__m128i)");
// gcc issues 'ignoring attributes on template argument' warning if
// __m128i is used below, so have to type explicitly
alignas(kCacheLineSize) std::array<
long long __attribute__((__vector_size__(sizeof(__m128i)))),
kValsPerCacheLine>
results;
__m128i mask = _mm_set_epi64x(dataMask, dataMask);
for (size_t pos = 0; pos < buf.size(); pos += kCacheLineSize) {
const __m128i* p = reinterpret_cast<const __m128i*>(buf.data() + pos);
for (size_t i = 0; i < kValsPerCacheLine; ++i) {
results[i] = _mm_and_si128(_mm_load_si128(p + i), mask);
}
std::memcpy(buf.data() + pos, results.data(), sizeof(results));
}
}
template <>
bool MathOperation<MathEngine::SSE2>::checkPaddingBits(
uint64_t dataMask,
ByteRange buf) {
if (dataMask == 0xffffffffffffffffULL) {
return true;
}
DCHECK_EQ(0, buf.size() % sizeof(__m128i));
__m128i paddingMask = _mm_set_epi64x(~dataMask, ~dataMask);
static const __m128i kZero = _mm_setzero_si128();
for (size_t pos = 0; pos < buf.size(); pos += sizeof(__m128i)) {
__m128i val =
_mm_load_si128(reinterpret_cast<const __m128i*>(buf.data() + pos));
__m128i paddingBits = _mm_and_si128(val, paddingMask);
if (sodium_memcmp(&paddingBits, &kZero, sizeof(kZero)) != 0) {
return false;
}
}
return true;
}
#else // !__SSE2__
// static
template <>
bool MathOperation<MathEngine::SSE2>::isImplemented() {
return false;
}
// static
template <>
void MathOperation<MathEngine::SSE2>::add(
uint64_t /* dataMask */,
size_t bitsPerElement,
ByteRange /* b1 */,
ByteRange /* b2 */,
MutableByteRange /* out */) {
if (bitsPerElement != 0) { // hack to defeat [[noreturn]] compiler warning
LOG(FATAL) << "Unimplemented function MathOperation<MathEngine::SSE2>::"
<< "add() called";
}
}
// static
template <>
void MathOperation<MathEngine::SSE2>::sub(
uint64_t /* dataMask */,
size_t bitsPerElement,
ByteRange /* b1 */,
ByteRange /* b2 */,
MutableByteRange /* out */) {
if (bitsPerElement != 0) { // hack to defeat [[noreturn]] compiler warning
LOG(FATAL) << "Unimplemented function MathOperation<MathEngine::SSE2>::"
<< "sub() called";
}
}
template <>
void MathOperation<MathEngine::SSE2>::clearPaddingBits(
uint64_t /* dataMask */,
MutableByteRange buf) {
if (buf.data() != nullptr) { // hack to defeat [[noreturn]] compiler warning
LOG(FATAL) << "Unimplemented function MathOperation<MathEngine::SSE2>::"
<< "clearPaddingBits() called";
}
return; // not reached
}
template <>
bool MathOperation<MathEngine::SSE2>::checkPaddingBits(
uint64_t /* dataMask */,
ByteRange buf) {
if (buf.data() != nullptr) { // hack to defeat [[noreturn]] compiler warning
LOG(FATAL) << "Unimplemented function MathOperation<MathEngine::SSE2>::"
<< "checkPaddingBits() called";
}
return false;
}
#endif // __SSE2__
template struct MathOperation<MathEngine::SSE2>;
} // namespace detail
} // namespace crypto
} // namespace folly
/*
* Copyright 2017-present Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Implementation of the MathOperation<MathEngine::SIMPLE> template
// specializations.
#include <folly/experimental/crypto/detail/LtHashInternal.h>
#include <folly/Memory.h>
#include <folly/lang/Bits.h>
namespace folly {
namespace crypto {
namespace detail {
// static
template <>
bool MathOperation<MathEngine::SIMPLE>::isImplemented() {
return true;
}
// static
template <>
void MathOperation<MathEngine::SIMPLE>::add(
uint64_t dataMask,
size_t bitsPerElement,
ByteRange b1,
ByteRange b2,
MutableByteRange out) {
DCHECK_EQ(b1.size(), b2.size());
DCHECK_EQ(b1.size(), out.size());
DCHECK_EQ(0, b1.size() % kCacheLineSize);
static_assert(
kCacheLineSize % sizeof(uint64_t) == 0,
"kCacheLineSize must be a multiple of sizeof(uint64_t)");
static constexpr size_t kValsPerCacheLine = kCacheLineSize / sizeof(uint64_t);
static_assert(
kValsPerCacheLine > 0, "kCacheLineSize must be >= sizeof(uint64_t)");
alignas(kCacheLineSize) std::array<uint64_t, kValsPerCacheLine> results;
if (bitsPerElement == 16 || bitsPerElement == 32) {
// When bitsPerElement is 16:
// There are no padding bits, 4x 16-bit values fit exactly into a uint64_t:
// uint64_t U = [ uint16_t W, uint16_t X, uint16_t Y, uint16_t Z ].
// We break them up into A and B groups, with each group containing
// alternating elements, such that A | B = the original number:
// uint64_t A = [ uint16_t W, 0, uint16_t Y, 0 ]
// uint64_t B = [ 0, uint16_t X, 0, uint16_t Z ]
// Then we add the A group and B group independently, and bitwise-OR
// the results.
// When bitsPerElement is 32:
// There are no padding bits, 2x 32-bit values fit exactly into a uint64_t.
// We independently add the high and low halves and then bitwise-OR them
// together.
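// Worked example (hypothetical values, bitsPerElement == 16, lanes
// [W, X, Y, Z] from most to least significant):
//   v1 = [0x0001, 0xffff, 0x0003, 0x0004]
//   v2 = [0x0001, 0x0001, 0x0003, 0x0004]
// A naive 64-bit add would carry out of lane X into lane W. With the
// split, (v1b + v2b) & kMaskB wraps lane X to 0x0000 without touching
// lane W, and (v1a + v2a) & kMaskA adds lanes W and Y independently,
// so the bitwise-OR yields [0x0002, 0x0000, 0x0006, 0x0008].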
const uint64_t kMaskA =
bitsPerElement == 16 ? 0xffff0000ffff0000ULL : 0xffffffff00000000ULL;
const uint64_t kMaskB = ~kMaskA;
for (size_t pos = 0; pos < b1.size(); pos += kCacheLineSize) {
const uint64_t* v1p = reinterpret_cast<const uint64_t*>(b1.data() + pos);
const uint64_t* v2p = reinterpret_cast<const uint64_t*>(b2.data() + pos);
for (size_t i = 0; i < kValsPerCacheLine; ++i) {
uint64_t v1 = Endian::little(*(v1p + i));
uint64_t v2 = Endian::little(*(v2p + i));
uint64_t v1a = v1 & kMaskA;
uint64_t v1b = v1 & kMaskB;
uint64_t v2a = v2 & kMaskA;
uint64_t v2b = v2 & kMaskB;
uint64_t v3a = (v1a + v2a) & kMaskA;
uint64_t v3b = (v1b + v2b) & kMaskB;
results[i] = Endian::little(v3a | v3b);
}
std::memcpy(out.data() + pos, results.data(), sizeof(results));
}
} else {
for (size_t pos = 0; pos < b1.size(); pos += kCacheLineSize) {
const uint64_t* v1p = reinterpret_cast<const uint64_t*>(b1.data() + pos);
const uint64_t* v2p = reinterpret_cast<const uint64_t*>(b2.data() + pos);
for (size_t i = 0; i < kValsPerCacheLine; ++i) {
uint64_t v1 = Endian::little(*(v1p + i));
uint64_t v2 = Endian::little(*(v2p + i));
results[i] = Endian::little((v1 + v2) & dataMask);
}
std::memcpy(out.data() + pos, results.data(), sizeof(results));
}
}
}
// static
template <>
void MathOperation<MathEngine::SIMPLE>::sub(
uint64_t dataMask,
size_t bitsPerElement,
ByteRange b1,
ByteRange b2,
MutableByteRange out) {
DCHECK_EQ(b1.size(), b2.size());
DCHECK_EQ(b1.size(), out.size());
DCHECK_EQ(0, b1.size() % kCacheLineSize);
static_assert(
kCacheLineSize % sizeof(uint64_t) == 0,
"kCacheLineSize must be a multiple of sizeof(uint64_t)");
static constexpr size_t kValsPerCacheLine = kCacheLineSize / sizeof(uint64_t);
static_assert(
kValsPerCacheLine > 0, "kCacheLineSize must be >= sizeof(uint64_t)");
alignas(kCacheLineSize) std::array<uint64_t, kValsPerCacheLine> results;
if (bitsPerElement == 16 || bitsPerElement == 32) {
// When bitsPerElement is 16:
// There are no padding bits, 4x 16-bit values fit exactly into a uint64_t:
// uint64_t U = [ uint16_t W, uint16_t X, uint16_t Y, uint16_t Z ].
// We break them up into A and B groups, with each group containing
// alternating elements, such that A | B = the original number:
// uint64_t A = [ uint16_t W, 0, uint16_t Y, 0 ]
// uint64_t B = [ 0, uint16_t X, 0, uint16_t Z ]
// Then we subtract the A group and B group independently (using the
// identity a - b == a + (M - b) to avoid underflow) and bitwise-OR
// the results.
// When bitsPerElement is 32:
// There are no padding bits, 2x 32-bit values fit exactly into a uint64_t.
// We independently subtract the high and low halves and then bitwise-OR
// them together.
const uint64_t kMaskA =
bitsPerElement == 16 ? 0xffff0000ffff0000ULL : 0xffffffff00000000ULL;
const uint64_t kMaskB = ~kMaskA;
for (size_t pos = 0; pos < b1.size(); pos += kCacheLineSize) {
const uint64_t* v1p = reinterpret_cast<const uint64_t*>(b1.data() + pos);
const uint64_t* v2p = reinterpret_cast<const uint64_t*>(b2.data() + pos);
for (size_t i = 0; i < kValsPerCacheLine; ++i) {
uint64_t v1 = Endian::little(*(v1p + i));
uint64_t v2 = Endian::little(*(v2p + i));
uint64_t v1a = v1 & kMaskA;
uint64_t v1b = v1 & kMaskB;
uint64_t v2a = v2 & kMaskA;
uint64_t v2b = v2 & kMaskB;
uint64_t v3a = (v1a + (kMaskB - v2a)) & kMaskA;
uint64_t v3b = (v1b + (kMaskA - v2b)) & kMaskB;
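// Note: (kMaskB - v2a) and (kMaskA - v2b) wrap modulo 2^64, leaving each
// subtracted lane holding (2^bitsPerElement - lane value) mod 2^bitsPerElement
// and the other group's lanes holding all-ones (minus any propagated
// borrow). Those all-ones lanes absorb carries out of the lanes below them
// during the add, so lanes never contaminate each other; the final mask
// discards the other group's bits.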
results[i] = Endian::little(v3a | v3b);
}
std::memcpy(out.data() + pos, results.data(), sizeof(results));
}
} else {
for (size_t pos = 0; pos < b1.size(); pos += kCacheLineSize) {
const uint64_t* v1p = reinterpret_cast<const uint64_t*>(b1.data() + pos);
const uint64_t* v2p = reinterpret_cast<const uint64_t*>(b2.data() + pos);
for (size_t i = 0; i < kValsPerCacheLine; ++i) {
uint64_t v1 = Endian::little(*(v1p + i));
uint64_t v2 = Endian::little(*(v2p + i));
results[i] =
Endian::little((v1 + ((~dataMask - v2) & dataMask)) & dataMask);
}
std::memcpy(out.data() + pos, results.data(), sizeof(results));
}
}
}
template <>
void MathOperation<MathEngine::SIMPLE>::clearPaddingBits(
uint64_t dataMask,
MutableByteRange buf) {
if (dataMask == 0xffffffffffffffffULL) {
return;
}
DCHECK_EQ(0, buf.size() % kCacheLineSize);
static_assert(
kCacheLineSize % sizeof(uint64_t) == 0,
"kCacheLineSize must be a multiple of sizeof(uint64_t)");
static constexpr size_t kValsPerCacheLine = kCacheLineSize / sizeof(uint64_t);
static_assert(
kValsPerCacheLine > 0, "kCacheLineSize must be >= sizeof(uint64_t)");
alignas(kCacheLineSize) std::array<uint64_t, kValsPerCacheLine> results;
for (size_t pos = 0; pos < buf.size(); pos += kCacheLineSize) {
const uint64_t* p = reinterpret_cast<const uint64_t*>(buf.data() + pos);
for (size_t i = 0; i < kValsPerCacheLine; ++i) {
results[i] = Endian::little(Endian::little(*(p + i)) & dataMask);
}
std::memcpy(buf.data() + pos, results.data(), sizeof(results));
}
}
template <>
bool MathOperation<MathEngine::SIMPLE>::checkPaddingBits(
uint64_t dataMask,
ByteRange buf) {
if (dataMask == 0xffffffffffffffffULL) {
return true;
}
DCHECK_EQ(0, buf.size() % sizeof(uint64_t));
for (size_t pos = 0; pos < buf.size(); pos += sizeof(uint64_t)) {
uint64_t val =
Endian::little(*reinterpret_cast<const uint64_t*>(buf.data() + pos));
if ((val & ~dataMask) != 0ULL) {
return false;
}
}
return true;
}
template struct MathOperation<MathEngine::SIMPLE>;
} // namespace detail
} // namespace crypto
} // namespace folly
/*
* Copyright 2017-present Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <folly/Benchmark.h>
#include <folly/Random.h>
#include <folly/experimental/crypto/LtHash.h>
#include <folly/init/Init.h>
#include <folly/io/IOBuf.h>
#include <glog/logging.h>
#include <sodium.h>
using namespace ::folly::crypto;
namespace {
constexpr size_t kObjectCount = 1000;
constexpr size_t kObjectSize = 150;
std::vector<std::unique_ptr<const folly::IOBuf>> kObjects;
} // namespace
std::unique_ptr<folly::IOBuf> makeRandomData(size_t length) {
auto data = std::make_unique<folly::IOBuf>(
folly::crypto::detail::allocateCacheAlignedIOBuf(length));
data->append(length);
randombytes_buf(data->writableData(), data->length());
return data;
}
template <std::size_t B, std::size_t N>
void runBenchmark(size_t n) {
LtHash<B, N> ltHash;
for (size_t i = 0; i < static_cast<size_t>(n); ++i) {
const folly::IOBuf& obj = *(kObjects[i % kObjects.size()]);
ltHash.addObject({obj.data(), obj.length()});
}
}
BENCHMARK(single_blake2b, n) {
std::array<unsigned char, crypto_generichash_blake2b_BYTES_MAX> result;
for (size_t i = 0; i < static_cast<size_t>(n); ++i) {
const folly::IOBuf& obj = *(kObjects[i % kObjects.size()]);
int res = crypto_generichash_blake2b(
result.data(), sizeof(result), obj.data(), obj.length(), nullptr, 0);
if (res != 0) {
throw std::runtime_error("blake2b hash failed");
}
}
}
BENCHMARK_RELATIVE(LtHash_element_count_1024_length_16, n) {
runBenchmark<16, 1024>(static_cast<size_t>(n));
}
BENCHMARK_RELATIVE(LtHash_element_count_1008_length_20, n) {
runBenchmark<20, 1008>(static_cast<size_t>(n));
}
BENCHMARK_RELATIVE(LtHash_element_count_1024_length_32, n) {
runBenchmark<32, 1024>(static_cast<size_t>(n));
}
BENCHMARK_RELATIVE(LtHash_element_count_2048_length_32, n) {
runBenchmark<32, 2048>(static_cast<size_t>(n));
}
BENCHMARK(calculateChecksumFor100KObjects_B20_N1008) {
LtHash<20, 1008> ltHash;
for (auto i = 0; i < 100000; ++i) {
const folly::IOBuf& obj = *(kObjects[i % kObjects.size()]);
ltHash.addObject({obj.data(), obj.length()});
}
}
BENCHMARK_RELATIVE(calculateChecksumFor100KObjects_B16_N1024) {
LtHash<16, 1024> ltHash;
for (auto i = 0; i < 100000; ++i) {
const folly::IOBuf& obj = *(kObjects[i % kObjects.size()]);
ltHash.addObject({obj.data(), obj.length()});
}
}
BENCHMARK_RELATIVE(calculateChecksumFor100KObjects_B32_N1024) {
LtHash<32, 1024> ltHash;
for (auto i = 0; i < 100000; ++i) {
const folly::IOBuf& obj = *(kObjects[i % kObjects.size()]);
ltHash.addObject({obj.data(), obj.length()});
}
}
BENCHMARK(subtractChecksumFor100KObjects_B20_N1008) {
LtHash<20, 1008> ltHash;
for (auto i = 0; i < 100000; ++i) {
const folly::IOBuf& obj = *(kObjects[i % kObjects.size()]);
ltHash.removeObject({obj.data(), obj.length()});
}
}
BENCHMARK_RELATIVE(subtractChecksumFor100KObjects_B16_N1024) {
LtHash<16, 1024> ltHash;
for (auto i = 0; i < 100000; ++i) {
const folly::IOBuf& obj = *(kObjects[i % kObjects.size()]);
ltHash.removeObject({obj.data(), obj.length()});
}
}
BENCHMARK_RELATIVE(subtractChecksumFor100KObjects_B32_N1024) {
LtHash<32, 1024> ltHash;
for (auto i = 0; i < 100000; ++i) {
const folly::IOBuf& obj = *(kObjects[i % kObjects.size()]);
ltHash.removeObject({obj.data(), obj.length()});
}
}
int main(int argc, char** argv) {
folly::init(&argc, &argv);
if (sodium_init() < 0) {
throw std::runtime_error("Failed to initialize libsodium");
}
// pre-generate objects with random length to hash
for (size_t i = 0; i < kObjectCount; i++) {
kObjects.push_back(makeRandomData(kObjectSize));
}
// Trigger the implementation selection of AUTO math operations before
// starting the benchmark, so log messages don't pollute the output table.
LtHash<20, 1008> ltHash;
ltHash.addObject(folly::range("hello world"));
ltHash.removeObject(folly::range("hello world"));
folly::runBenchmarks();
return 0;
}
This diff is collapsed.