Commit 0ec02d48, authored by Ilya Albrecht, committed by Facebook GitHub Bot

asm memset implementation

Summary: asm memset implementation based on the https://github.com/nadavrot/memset_benchmark implementation by Nadav Rotem

Reviewed By: luciang, magedm

Differential Revision: D32888087

fbshipit-source-id: 3678950ed5178d2a46ba833502d1a6aef4f2c731
parent f5e28344
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cstring>
#if !defined(__AVX2__)
// Portable fallback used when the translation unit is built without AVX2.
// It simply forwards to std::memset and returns its result (dest).
//
// The function is given C linkage so that the emitted symbol is the
// unmangled `__folly_memset`, which is what the `extern "C"` declaration in
// the public header refers to. With C++ linkage the symbol would be mangled
// and callers of the header declaration would fail to link.
extern "C" void* __folly_memset(void* dest, int ch, std::size_t count) {
  return std::memset(dest, ch, count);
}
#endif
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <cstddef>
namespace folly {
// memset-compatible routine: same parameter list and return type as
// std::memset (destination pointer, fill value, byte count).
// Declared with C linkage, so the symbol name is exactly `__folly_memset`.
extern "C" void* __folly_memset(void* dest, int ch, std::size_t count);
} // namespace folly
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
 * void* __folly_memset(void* dest, int ch, size_t count)
 *
 * AVX2 memset. GNU as, AT&T syntax, run through the C preprocessor.
 *
 * In:    rdi = dest, esi = ch (only the low byte is used), rdx = count
 * Out:   rax = dest
 * Clobb: rcx, rdx, rsi, rdi, ymm0, flags
 * Stack: not used.
 *
 * The fill byte is broadcast into ymm0 and the length is dispatched into
 * size classes. Most classes use a pair (or a few) of possibly overlapping
 * stores anchored at the start and at the end of the buffer, so no byte
 * loop is ever needed:
 *
 *   count        strategy
 *   0..1         nothing / one byte store
 *   2..4         two 2-byte stores
 *   5..7         two 4-byte stores
 *   8..15        two 8-byte stores
 *   16..31       two 16-byte stores
 *   32..63       two 32-byte stores
 *   64..128      four 32-byte stores
 *   129..176     three 32-byte stores, a 32-byte loop, one trailing store
 *   177..4095    one unaligned head store, aligned 5x32-byte loop,
 *                aligned tail stores, one unaligned trailing store
 *   >= 4096      rep stosq over the 8-byte aligned middle, plus an 8-byte
 *                head store and a 16-byte trailing store
 *
 * Every path that has touched ymm0 executes vzeroupper before returning.
 */
#if defined(__AVX2__)

#define LABEL(x) .L##x

        .text
        .p2align 5, 0x90
        .global __folly_memset
        .type   __folly_memset, @function
__folly_memset:
        .cfi_startproc
        // ymm0 = fill byte replicated 32 times.
        vmovd        %esi, %xmm0
        vpbroadcastb %xmm0, %ymm0
        // rax = dest. rax is the return value and is left untouched by every
        // path except large_stosq, which restores it before returning.
        mov     %rdi, %rax
        cmp     $0x40, %rdx
        jae     LABEL(above_64)

LABEL(below_64):
        cmp     $0x20, %rdx
        jb      LABEL(below_32)
        // 32..63 bytes: head and tail 32-byte stores (may overlap).
        vmovdqu %ymm0, (%rdi)
        vmovdqu %ymm0, -0x20(%rdi,%rdx)
        vzeroupper
        ret

        .align 32
LABEL(below_32):
        cmp     $0x10, %rdx
        jae     LABEL(in_16_to_32)

LABEL(below_16):
        cmp     $0x4, %rdx
        jbe     LABEL(below_4)

LABEL(in_4_to_16):
        // Scalar stores from this point: rsi = 8 copies of the fill byte.
        vmovq   %xmm0, %rsi
        cmp     $0x7, %rdx
        jbe     LABEL(in_4_to_8)
        // 8..15 bytes: two 8-byte stores (may overlap).
        mov     %rsi, -0x8(%rdi,%rdx)
        mov     %rsi, (%rdi)
        vzeroupper
        ret

        .align 32
LABEL(below_4):
        // 0..4 bytes. vzeroupper is issued here once for all exits below.
        vmovq   %xmm0, %rsi
        vzeroupper
        cmp     $0x1, %rdx
        jbe     LABEL(none_or_one)
        // 2..4 bytes: two 2-byte stores (may overlap).
        mov     %si, (%rdi)
        mov     %si, -0x2(%rdi,%rdx)
LABEL(exit):
        ret

        .align 16
LABEL(in_4_to_8):
        // 5..7 bytes: two 4-byte stores (may overlap).
        mov     %esi, -0x4(%rdi,%rdx)
        mov     %esi, (%rdi)
        vzeroupper
        ret

        .align 32
LABEL(in_16_to_32):
        // 16..31 bytes: two 16-byte stores (may overlap).
        vmovups %xmm0, (%rdi)
        vmovups %xmm0, -0x10(%rdi,%rdx)
        vzeroupper
        ret

LABEL(above_64):
        // Note: despite the label name, the cut-over to the aligned loop
        // happens for count > 0xb0 (176).
        cmp     $0xb0, %rdx
        ja      LABEL(above_192)
        cmp     $0x80, %rdx
        jbe     LABEL(in_64_to_128)
        // 129..176 bytes.
        // rsi = address of the last (unaligned) 32-byte chunk.
        lea     -0x20(%rdi,%rdx), %rsi
        // rdi is the fill pointer. count > 128, so the three head stores
        // plus at least one loop store are always inside the buffer.
        vmovdqu %ymm0, (%rdi)
        vmovdqu %ymm0, 0x20(%rdi)
        vmovdqu %ymm0, 0x40(%rdi)
        add     $0x60, %rdi
        .align 32
LABEL(fill_32):
        vmovdqu %ymm0, (%rdi)
        add     $0x20, %rdi
        cmp     %rdi, %rsi
        ja      LABEL(fill_32)          // loop while fill pointer < last chunk
        // Stamp the last unaligned 32-byte chunk.
        vmovdqu %ymm0, (%rsi)
        vzeroupper
        ret

        .align 32
LABEL(in_64_to_128):
        // 64..128 bytes: two head and two tail 32-byte stores (may overlap).
        vmovdqu %ymm0, (%rdi)
        vmovdqu %ymm0, 0x20(%rdi)
        vmovdqu %ymm0, -0x40(%rdi,%rdx)
        vmovdqu %ymm0, -0x20(%rdi,%rdx)
        vzeroupper
        ret

        .align 32
LABEL(above_192):
        // rdi = dest, rdx = count (> 176), rax = dest.
        cmp     $0x1000, %rdx
        jae     LABEL(large_stosq)
        // Store the first unaligned 32 bytes.
        vmovdqu %ymm0, (%rdi)
        // rsi = first 32-byte aligned address strictly above dest.
        // (rax already holds dest from function entry, so it is not
        // reloaded here.)
        mov     %rdi, %rsi
        and     $0xffffffffffffffe0, %rsi
        lea     0x20(%rsi), %rsi
        // rdi = address of the last unaligned 32-byte chunk (dest+count-32).
        lea     -0x20(%rdx), %rdx
        add     %rdx, %rdi
        // Check if we can do a full 5x32-byte stamp.
        lea     0xa0(%rsi), %rcx
        cmp     %rcx, %rdi
        jb      LABEL(stamp_4)

LABEL(fill_192):
        // Invariant: rsi + 0xa0 <= rdi, so five aligned stores fit.
        vmovdqa %ymm0, (%rsi)
        vmovdqa %ymm0, 0x20(%rsi)
        vmovdqa %ymm0, 0x40(%rsi)
        vmovdqa %ymm0, 0x60(%rsi)
        vmovdqa %ymm0, 0x80(%rsi)
        add     $0xa0, %rsi
        lea     0xa0(%rsi), %rcx
        cmp     %rcx, %rdi
        ja      LABEL(fill_192)

LABEL(fill_192_tail):
        // Up to five more aligned stores; each is issued only while its
        // start address does not pass the last chunk (rdi).
        cmp     %rsi, %rdi
        jb      LABEL(fill_192_done)
        vmovdqa %ymm0, (%rsi)
        lea     0x20(%rsi), %rcx
        cmp     %rcx, %rdi
        jb      LABEL(fill_192_done)
        vmovdqa %ymm0, 0x20(%rsi)
        lea     0x40(%rsi), %rcx
        cmp     %rcx, %rdi
        jb      LABEL(fill_192_done)
        vmovdqa %ymm0, 0x40(%rsi)
        lea     0x60(%rsi), %rcx
        cmp     %rcx, %rdi
        jb      LABEL(fill_192_done)
        vmovdqa %ymm0, 0x60(%rsi)
LABEL(last_wide_store):
        lea     0x80(%rsi), %rcx
        cmp     %rcx, %rdi
        jb      LABEL(fill_192_done)
        vmovdqa %ymm0, 0x80(%rsi)
        .align 16
LABEL(fill_192_done):
        // Stamp the last unaligned 32-byte chunk.
        vmovdqu %ymm0, (%rdi)
        vzeroupper
        // rax still holds dest: nothing on this path writes rax after entry.
        ret

LABEL(stamp_4):
        // Fewer than five aligned chunks before the last chunk. Four always
        // fit because count > 176; the fifth is decided at last_wide_store.
        vmovdqa %ymm0, (%rsi)
        vmovdqa %ymm0, 0x20(%rsi)
        vmovdqa %ymm0, 0x40(%rsi)
        vmovdqa %ymm0, 0x60(%rsi)
        jmp     LABEL(last_wide_store)

LABEL(large_stosq):
        // rdi = dest, rdx = count (>= 4096).
        // rax = 8 copies of the fill byte (this overwrites the saved dest;
        // it is restored from rsi before returning).
        vmovq   %xmm0, %rax
        // Unaligned 8-byte head store.
        mov     %rax, (%rdi)
        mov     %rdi, %rsi              // rsi = dest, kept for tail and return
        // rdi = first 8-byte aligned address strictly above dest.
        and     $0xfffffffffffffff8, %rdi
        lea     0x8(%rdi), %rdi
        // rcx = (count - 8) / 8 quadwords. Since rdi <= dest + 8 this never
        // writes past dest + count.
        mov     %rdx, %rcx
        sub     $0x8, %rcx
        shrq    $0x3, %rcx
        // rcx = number of quadwords, rax = value, rdi = store pointer.
        // Stores ascend, which relies on the direction flag being clear, as
        // the System V ABI requires at function entry.
        rep stosq
        // Fill the last 16 bytes (covers whatever stosq left at the end).
        vmovdqu %xmm0, -0x10(%rsi,%rdx)
        vzeroupper
        mov     %rsi, %rax              // return dest
        ret

        .align 16
LABEL(none_or_one):
        // 0 or 1 byte; vzeroupper was already issued in below_4.
        test    %rdx, %rdx
        je      LABEL(exit)
        // Store one byte and exit.
        mov     %sil, (%rdi)
        ret
        .cfi_endproc
        .size   __folly_memset, .-__folly_memset

#ifdef FOLLY_MEMSET_IS_MEMSET
        // Optionally export this routine as a weak `memset` as well.
        .weak   memset
        memset = __folly_memset
#endif // FOLLY_MEMSET_IS_MEMSET

#endif // __AVX2__

// Mark the object as not needing an executable stack. Without this note GNU
// ld treats the object as requesting one. Emitted outside the AVX2 guard so
// that the object carries the note even when it is otherwise empty.
#if defined(__linux__) && defined(__ELF__)
        .section .note.GNU-stack,"",@progbits
#endif
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <folly/FollyMemset.h>
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <deque>
#include <string>
#include <fmt/core.h>
#include <folly/Benchmark.h>
#include <folly/Preprocessor.h>
#include <folly/portability/GFlags.h>
// Command-line flags selecting which buffer sizes get a benchmark entry.
// In the default mode only powers of two between min_size and max_size are
// registered; with --linear every size in [min_size, max_size] is registered,
// advancing by --step.
DEFINE_uint32(min_size, 1, "Minimum size to benchmark");
DEFINE_uint32(max_size, 32768, "Maximum size to benchmark");
DEFINE_bool(linear, false, "Test all sizes [min_size, max_size]");
DEFINE_uint32(step, 1, "Test sizes step");
// Byte offset added to the start of the buffer for every benchmarked call.
DEFINE_uint32(page_offset, 0, "Buffer offset from page aligned size");
// Destination buffer shared by all benchmarks; allocated and freed in main().
uint8_t* temp_buf;
// Returns the largest power of two that is less than or equal to v
// (for example 5 -> 4, 8 -> 8).
//
// v must be non-zero: count-leading-zeros is undefined for 0.
//
// The computation is done entirely in unsigned long long so that the bit
// width used for the shift matches the width the builtin operates on. The
// previous form combined sizeof(size_t) with the unsigned long builtin and a
// 1UL constant, which is wrong on targets where unsigned long is narrower
// than size_t.
size_t getPow2(size_t v) {
  assert(v != 0);
  constexpr int kTopBit = sizeof(unsigned long long) * 8 - 1;
  const int highestSetBit =
      kTopBit - __builtin_clzll(static_cast<unsigned long long>(v));
  return static_cast<size_t>(1) << highestSetBit;
}
// Benchmark kernel: invokes memset_impl(buf, 0xFF, length) exactly `iters`
// times. The implementation under test is a template parameter, so each
// instantiation calls its memset directly.
template <void* memset_impl(void*, int, size_t)>
void bmMemset(void* buf, size_t length, size_t iters) {
  // Emit an alignment directive at this point in the generated code
  // (presumably to keep loop placement comparable between instantiations).
  __asm__ volatile(".align 64\n");
  // Ask the compiler not to unroll the measurement loop.
#pragma unroll(1)
  for (size_t remaining = iters; remaining != 0; --remaining) {
    memset_impl(buf, 0xFF, length);
  }
}
// Registers one folly benchmark per buffer size for the memset implementation
// given as the template argument. `name` is the label prefix; each entry is
// named "<name>: size=<n>". The sizes come from the command-line flags:
// every size from min_size to max_size in steps of `step` when --linear is
// set, otherwise the powers of two from getPow2(min_size) to
// getPow2(max_size). A "-" entry is registered last as a separator.
template <void* memset_impl(void*, int, size_t)>
void addMemsetBenchmark(const std::string& name) {
  // Benchmark names are handed to folly as raw C strings, so the backing
  // strings are kept in a function-local static container for the lifetime of
  // the program (std::deque::emplace_back leaves existing elements in place).
  static std::deque<std::string> names;

  const auto registerSize = [&name](size_t size) {
    names.emplace_back(fmt::format("{}: size={}", name, size));
    const char* label = names.back().c_str();
    folly::addBenchmark(__FILE__, label, [size](unsigned iters) {
      bmMemset<memset_impl>(temp_buf + FLAGS_page_offset, size, iters);
      return iters;
    });
  };

  if (FLAGS_linear) {
    size_t size = FLAGS_min_size;
    while (size <= FLAGS_max_size) {
      registerSize(size);
      size += FLAGS_step;
    }
  } else {
    const size_t firstPow2 = getPow2(FLAGS_min_size);
    const size_t lastPow2 = getPow2(FLAGS_max_size);
    for (size_t size = firstPow2; size <= lastPow2; size *= 2) {
      registerSize(size);
    }
  }

  /* Draw line. */
  folly::addBenchmark(__FILE__, "-", []() { return 0; });
}
int main(int argc, char** argv) {
gflags::ParseCommandLineFlags(&argc, &argv, true);
google::InitGoogleLogging(argv[0]);
assert(FLAGS_min_size <= FLAGS_max_size);
assert(FLAGS_page_offset < 4096);
assert(FLAGS_step > 0);
size_t totalBufSize = (FLAGS_max_size + FLAGS_page_offset + 4095) & ~4095;
temp_buf = (uint8_t*)aligned_alloc(4096, totalBufSize);
// Make sure all pages are allocated
for (size_t i = 0; i < totalBufSize; i++) {
temp_buf[i] = 0;
}
#define BENCHMARK_MEMSET(MEMSET) \
addMemsetBenchmark<MEMSET>(FOLLY_PP_STRINGIZE(MEMSET));
BENCHMARK_MEMSET(memset);
BENCHMARK_MEMSET(folly::__folly_memset);
folly::runBenchmarks();
free(temp_buf);
return 0;
}
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <stdlib.h>
#include <cstddef>
#include <folly/FollyMemset.h>
#include <folly/portability/GTest.h>
// Alignment used for the test allocations, and the unit the buffer sizes are
// expressed in.
constexpr size_t kPageSize = 4096;
// Upper bound (exclusive) on the fill lengths exercised by each test.
constexpr size_t kMaxSize = 2 * kPageSize;
// Sentinel written just past the filled range to detect writes beyond it.
constexpr uint8_t kBufEnd = 0xDB;
// Exercises folly::__folly_memset with fill value 0xFF for every length in
// [0, maxLen).
//
// For each length the first maxLen bytes of buf are cleared and a sentinel
// (kBufEnd) is placed at buf[len]. After the call the test expects:
//   - the returned pointer equals buf,
//   - bytes [0, len) are all 0xFF,
//   - the sentinel at buf[len] is intact.
// buf must be writable for at least maxLen bytes.
void testMemsetImpl(uint8_t* buf, size_t maxLen) {
  for (size_t len = 0; len < maxLen; len++) {
    // Reset the whole window, then plant the sentinel right after the range
    // about to be filled.
    uint8_t* const windowEnd = buf + maxLen;
    for (uint8_t* cursor = buf; cursor != windowEnd; ++cursor) {
      *cursor = 0x0;
    }
    buf[len] = kBufEnd;

    auto* p = folly::__folly_memset(buf, 0xFF, len);
    EXPECT_EQ(buf, reinterpret_cast<uint8_t*>(p));

    // Scan the whole filled range; a single combined expectation avoids one
    // failure message per byte.
    bool isEq = true;
    for (size_t pos = 0; pos < len; pos++) {
      if (buf[pos] != 0xFF) {
        isEq = false;
      }
    }
    EXPECT_TRUE(isEq);
    EXPECT_EQ(buf[len], kBufEnd);
  }
}
// Runs testMemsetImpl on destinations at power-of-two offsets from a
// page-aligned allocation: 16, 32, ..., kPageSize / 2, and finally offset 0
// (alignment == kPageSize wraps to 0 through the modulo).
TEST(MemsetAsmTest, alignedBuffer) {
  uint8_t* buf = static_cast<uint8_t*>(
      aligned_alloc(kPageSize, kMaxSize + 2 * kPageSize));
  // Stop here if the allocation failed instead of writing through null.
  ASSERT_NE(nullptr, buf);
  // Get buffer aligned power of 2 from 16 all the way up to a page size
  for (size_t alignment = 16; alignment <= kPageSize; alignment <<= 1) {
    testMemsetImpl(buf + (alignment % kPageSize), kMaxSize);
  }
  free(buf);
}
// Runs testMemsetImpl on destinations at every byte offset from 1 to 192
// past a page-aligned allocation, covering misaligned starts.
TEST(MemsetAsmTest, unalignedBuffer) {
  uint8_t* buf = static_cast<uint8_t*>(
      aligned_alloc(kPageSize, kMaxSize + 2 * kPageSize));
  // Stop here if the allocation failed instead of writing through null.
  ASSERT_NE(nullptr, buf);
  for (size_t alignment = 1; alignment <= 192; alignment++) {
    testMemsetImpl(buf + alignment, kMaxSize);
  }
  free(buf);
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment