Commit 1ec38b40 authored by Bin Liu, committed by facebook-github-bot-0

Put optimized memcpy into folly

Summary: There is an optimized assembly version of memcpy that shows a 1.5% gain on TAO; add it to folly.

Reviewed By: yfeldblum

Differential Revision: D2218473

fb-gh-sync-id: d5ac7f5ab30ff6febe7e94b017766c68dbd8934d
parent f05cdbc1
/*
 * Copyright 2015 Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * memcpy: An optimized memcpy implementation for x86_64. It uses AVX when
 * __AVX__ is defined, and uses SSE2 otherwise.
 *
 * @author Bin Liu <binliu@fb.com>
 */
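/*
 * Note on the AVX/SSE2 selection above: __AVX__ is a predefined compiler
 * macro, so the choice is made at build time, not at run time. With GCC or
 * Clang it is defined when this file is preprocessed with AVX code
 * generation enabled (for example, -mavx or an -march value that includes
 * AVX); otherwise the SSE2 path is assembled.
 */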
#if defined(__x86_64__) && defined(__linux__) && !defined(__CYGWIN__)
        .file   "memcpy.S"
        .text

/*
 * _memcpy_short is a local helper used when length < 8. It cannot be called
 * from outside, because it expects a non-standard calling convention:
 *
 *   %rax: destination buffer address.
 *   %rsi: source buffer address.
 *   %edx: length, in the range of [0, 7]
 */
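/*
 * A rough C sketch of the logic below, for illustration only (not assembled;
 * load32/store32/load16/store16 are hypothetical helpers standing in for the
 * unaligned fixed-size mov instructions):
 *
 *   if (len == 0) return;
 *   if (len >= 4) {                          // len in [4, 7]
 *     uint32_t head = load32(src);
 *     uint32_t tail = load32(src + len - 4);
 *     store32(dst, head);
 *     store32(dst + len - 4, tail);          // the two stores may overlap
 *   } else {                                 // len in [1, 3]
 *     dst[0] = src[0];
 *     if (len >= 2)
 *       store16(dst + len - 2, load16(src + len - 2));
 *   }
 */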
        .type   _memcpy_short, @function
_memcpy_short:
.LSHORT:
        .cfi_startproc
        // if (length == 0) return;
        test    %edx, %edx
        jz      .LEND

        movzbl  (%rsi), %ecx
        // if (length - 4 < 0) goto LS4;
        sub     $4, %edx
        jb      .LS4

        mov     (%rsi), %ecx
        mov     (%rsi, %rdx), %edi
        mov     %ecx, (%rax)
        mov     %edi, (%rax, %rdx)
.LEND:
        rep
        ret
        nop

.LS4:
        // At this point, length can be 1 or 2 or 3, and $cl contains
        // the first byte.
        mov     %cl, (%rax)
        // if (length - 4 + 2 < 0) return;
        add     $2, %edx
        jnc     .LEND

        // length is 2 or 3 here. In either case, just copy the last
        // two bytes.
        movzwl  (%rsi, %rdx), %ecx
        mov     %cx, (%rax, %rdx)
        ret
        .cfi_endproc
        .size   _memcpy_short, .-_memcpy_short

/*
 * void* memcpy(void* dst, const void* src, size_t length);
 *
 * The length is passed in %rdx and treated as a full 64-bit size_t.
 */
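/*
 * High-level sketch of the copy strategy (as read from the code below):
 *
 * - length < 8: delegate to _memcpy_short.
 * - length >= 8: copy the last 8 bytes and, when (length & 24) != 0, the
 *   first 8 bytes right away; the overlapping stores cover the low 3 bits
 *   of the length. One unaligned 16-byte move covers the rest of
 *   (length & 24). The remaining multiple of 32 bytes is copied with
 *   unaligned 16-byte SSE2 or 32-byte AVX moves, unrolled to 64 bytes per
 *   iteration. After the `shr $6`, CF says whether an odd 32-byte block
 *   remains and the quotient is the number of 64-byte iterations; `lea`
 *   is used to advance %rsi because it does not clobber those flags.
 */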
        .align  16
        .globl  memcpy
        .type   memcpy, @function
memcpy:
        .cfi_startproc

        mov     %rdx, %rcx
        mov     %rdi, %rax
        cmp     $8, %rdx
        jb      .LSHORT

        mov     -8(%rsi, %rdx), %r8
        mov     (%rsi), %r9
        mov     %r8, -8(%rdi, %rdx)
        and     $24, %rcx
        jz      .L32

        mov     %r9, (%rdi)
        mov     %rcx, %r8
        sub     $16, %rcx
        jb      .LT32
#ifndef __AVX__
        movdqu  (%rsi, %rcx), %xmm1
        movdqu  %xmm1, (%rdi, %rcx)
#else
        vmovdqu (%rsi, %rcx), %xmm1
        vmovdqu %xmm1, (%rdi, %rcx)
#endif

        // Test if there are 32-byte groups
.LT32:
        add     %r8, %rsi
        and     $-32, %rdx
        jnz     .L32_adjDI
        ret

        .align  16
.L32_adjDI:
        add     %r8, %rdi
.L32:
#ifndef __AVX__
        movdqu  (%rsi), %xmm0
        movdqu  16(%rsi), %xmm1
#else
        vmovdqu (%rsi), %ymm0
#endif
        shr     $6, %rdx
        jnc     .L64_32read
#ifndef __AVX__
        movdqu  %xmm0, (%rdi)
        movdqu  %xmm1, 16(%rdi)
#else
        vmovdqu %ymm0, (%rdi)
#endif
        lea     32(%rsi), %rsi
        jnz     .L64_adjDI
#ifdef __AVX__
        vzeroupper
#endif
        ret

.L64_adjDI:
        add     $32, %rdi

.L64:
#ifndef __AVX__
        movdqu  (%rsi), %xmm0
        movdqu  16(%rsi), %xmm1
#else
        vmovdqu (%rsi), %ymm0
#endif

.L64_32read:
#ifndef __AVX__
        movdqu  32(%rsi), %xmm2
        movdqu  48(%rsi), %xmm3
        add     $64, %rsi
        movdqu  %xmm0, (%rdi)
        movdqu  %xmm1, 16(%rdi)
        movdqu  %xmm2, 32(%rdi)
        movdqu  %xmm3, 48(%rdi)
#else
        vmovdqu 32(%rsi), %ymm1
        add     $64, %rsi
        vmovdqu %ymm0, (%rdi)
        vmovdqu %ymm1, 32(%rdi)
#endif
        add     $64, %rdi
        dec     %rdx
        jnz     .L64
#ifdef __AVX__
        vzeroupper
#endif
        ret

        .cfi_endproc
        .size   memcpy, .-memcpy
#endif
/*
 * Copyright 2015 Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <cstddef>
#include <cstring>

#include <gtest/gtest.h>

namespace {

constexpr size_t SIZE = 4096 * 4;
char src[SIZE];
char dst[SIZE];

void init() {
  for (size_t i = 0; i < SIZE; ++i) {
    src[i] = static_cast<char>(i);
    dst[i] = static_cast<char>(255 - i);
  }
}

}
TEST(memcpy, zero_len) {
  // If length is 0, we shouldn't touch any memory. So this should
  // not crash.
  char* srcNull = nullptr;
  char* dstNull = nullptr;
  memcpy(dstNull, srcNull, 0);
}
// Copy `len' bytes and verify that exactly `len' bytes are copied:
// the source is untouched and the destination is modified only up to `len'.
void testLen(size_t len) {
  if (len > SIZE) {
    return;
  }
  init();
  memcpy(dst, src, len);
  for (size_t i = 0; i < len; ++i) {
    EXPECT_EQ(src[i], static_cast<char>(i));
    EXPECT_EQ(src[i], dst[i]);
  }
  if (len < SIZE) {
    EXPECT_EQ(src[len], static_cast<char>(len));
    EXPECT_EQ(dst[len], static_cast<char>(255 - len));
  }
}
TEST(memcpy, small) {
  for (size_t len = 1; len < 8; ++len) {
    testLen(len);
  }
}

TEST(memcpy, main) {
  for (size_t len = 8; len < 128; ++len) {
    testLen(len);
  }

  for (size_t len = 128; len < SIZE; len += 128) {
    testLen(len);
  }

  for (size_t len = 128; len < SIZE; len += 73) {
    testLen(len);
  }
}