Commit e19669d3 authored by Lucian Grijincu, committed by Facebook GitHub Bot

__folly_memcpy: allow overlapping buffers of any size and provide drop-in replacement for memmove

Summary:
`__folly_memcpy` already behaved as `memmove` for n <= 256B.

For n > 256B and overlapping buffers, `__folly_memcpy` did some work:
- it determined that the size is large enough for 128B AVX loads/stores
- it had already loaded the first 128B and the last 128B into YMM registers

but it then discarded that work and fell back on `memmove`.

Instead of wasting this work, do a forward copy (dst < src) or a backward copy (dst > src); a C sketch of both paths follows the list below.
- use unaligned loads + aligned stores, but not non-temporal stores
- for dst < src forward copy in 128 byte batches:
-- unaligned load the first 32 bytes & last 4 x 32 bytes
-- forward copy (unaligned load + aligned stores) 4 x 32 bytes at a time
-- unaligned store the first 32 bytes & last 4 x 32 bytes
- for dst > src backward copy in 128 byte batches:
-- unaligned load the first 4 x 32 bytes & last 32 bytes
-- backward copy (unaligned load + aligned stores) 4 x 32 bytes at a time
-- unaligned store the first 4 x 32 bytes & last 32 bytes
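
To make the new control flow concrete, here is a minimal C sketch of the two paths, assuming AVX2 intrinsics. The function name overlap_copy_sketch is illustrative and this is not the shipped routine: it uses unaligned stores in the main loop (the assembly aligns the destination so it can use vmovdqa), keeps prefetching only in the forward loop, and defers n <= 256 to libc memmove.

#include <immintrin.h>
#include <stddef.h>
#include <string.h>

/* Illustration only: save head/tail in YMM registers up front, run the
 * 128-byte main loop in the safe direction, write the saved head/tail last. */
static void *overlap_copy_sketch(void *dst, const void *src, size_t n) {
  unsigned char *d = (unsigned char *)dst;
  const unsigned char *s = (const unsigned char *)src;
  if (n <= 256 || d == s) {
    /* The real routine handles n <= 256 with overlapping loads/stores;
     * defer to libc here to keep the sketch short. */
    return memmove(d, s, n);
  }
  if (d < s) {
    /* Forward copy: save the first 32 bytes and the last 4 x 32 bytes,
     * since the main loop may overwrite them in the source. */
    __m256i head = _mm256_loadu_si256((const __m256i *)s);
    __m256i t0 = _mm256_loadu_si256((const __m256i *)(s + n - 128));
    __m256i t1 = _mm256_loadu_si256((const __m256i *)(s + n - 96));
    __m256i t2 = _mm256_loadu_si256((const __m256i *)(s + n - 64));
    __m256i t3 = _mm256_loadu_si256((const __m256i *)(s + n - 32));
    size_t off = 32;                 /* stands in for the alignment skip */
    while (n - off > 128) {          /* 4 x 32 bytes per iteration */
      __builtin_prefetch(d + off + 128, 1);  /* asm uses prefetchw */
      __m256i a = _mm256_loadu_si256((const __m256i *)(s + off));
      __m256i b = _mm256_loadu_si256((const __m256i *)(s + off + 32));
      __m256i c = _mm256_loadu_si256((const __m256i *)(s + off + 64));
      __m256i e = _mm256_loadu_si256((const __m256i *)(s + off + 96));
      _mm256_storeu_si256((__m256i *)(d + off), a);
      _mm256_storeu_si256((__m256i *)(d + off + 32), b);
      _mm256_storeu_si256((__m256i *)(d + off + 64), c);
      _mm256_storeu_si256((__m256i *)(d + off + 96), e);
      off += 128;
    }
    /* At most 128 bytes remain; the saved tail covers them. */
    _mm256_storeu_si256((__m256i *)(d + n - 128), t0);
    _mm256_storeu_si256((__m256i *)(d + n - 96), t1);
    _mm256_storeu_si256((__m256i *)(d + n - 64), t2);
    _mm256_storeu_si256((__m256i *)(d + n - 32), t3);
    _mm256_storeu_si256((__m256i *)d, head);
  } else {
    /* Backward copy: save the first 4 x 32 bytes and the last 32 bytes. */
    __m256i h0 = _mm256_loadu_si256((const __m256i *)s);
    __m256i h1 = _mm256_loadu_si256((const __m256i *)(s + 32));
    __m256i h2 = _mm256_loadu_si256((const __m256i *)(s + 64));
    __m256i h3 = _mm256_loadu_si256((const __m256i *)(s + 96));
    __m256i tail = _mm256_loadu_si256((const __m256i *)(s + n - 32));
    size_t end = n - 32;             /* the saved tail covers [n-32, n) */
    while (end > 128) {              /* 4 x 32 bytes per iteration */
      end -= 128;
      /* (prefetchw of the next batch omitted here for brevity) */
      __m256i a = _mm256_loadu_si256((const __m256i *)(s + end));
      __m256i b = _mm256_loadu_si256((const __m256i *)(s + end + 32));
      __m256i c = _mm256_loadu_si256((const __m256i *)(s + end + 64));
      __m256i e = _mm256_loadu_si256((const __m256i *)(s + end + 96));
      _mm256_storeu_si256((__m256i *)(d + end), a);
      _mm256_storeu_si256((__m256i *)(d + end + 32), b);
      _mm256_storeu_si256((__m256i *)(d + end + 64), c);
      _mm256_storeu_si256((__m256i *)(d + end + 96), e);
    }
    /* At most 128 bytes remain at the front; the saved head covers them. */
    _mm256_storeu_si256((__m256i *)d, h0);
    _mm256_storeu_si256((__m256i *)(d + 32), h1);
    _mm256_storeu_si256((__m256i *)(d + 64), h2);
    _mm256_storeu_si256((__m256i *)(d + 96), h3);
    _mm256_storeu_si256((__m256i *)(d + n - 32), tail);
  }
  return dst;
}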

Reviewed By: yfeldblum

Differential Revision: D31915389

fbshipit-source-id: 2c0197b2bddc102a7fb8f70a6f43e79ac994dc73
parent 62fa4e65
@@ -18,37 +18,55 @@
* __folly_memcpy: An optimized memcpy implementation that uses prefetch and
* AVX2 instructions.
*
* This implementation of memcpy acts as a memmove, but it is not optimized for
* this purpose. While overlapping copies are undefined in memcpy, this
* implementation acts like memmove for sizes up through 256 bytes and will
* detect overlapping copies and call memmove for overlapping copies of 257 or
* more bytes.
* This implementation of memcpy acts as a memmove: while overlapping copies
* are undefined in memcpy, in some implementations they're the same function and
* legacy programs rely on this behavior.
*
* This implementation uses prefetch to avoid dtlb misses. This can
* substantially reduce dtlb store misses in cases where the destination
* location is absent from L1 cache and where the copy size is small enough
* that the hardware prefetcher doesn't have a large impact.
*
* The number of branches is limited by the use of overlapping copies. This
* helps with copies where the source and destination cache lines are already
* The number of branches is limited by the use of overlapping loads & stores.
* This helps with copies where the source and destination cache lines are already
* present in L1 because there are fewer instructions to execute and fewer
* branches to potentially mispredict.
* e.g. to copy the last 4 <= n <= 7 bytes: copy the first & last 4 bytes (overlapped):
* movl (%rsi), %r8d
* movl -4(%rsi,%rdx), %r9d
* movl %r8d, (%rdi)
* movl %r9d, -4(%rdi,%rdx)
*
* Vector operations up to 32-bytes are used (avx2 instruction set). Larger
* mov operations (avx512) are not used.
*
* Large copies make use of aligned store operations. This operation is
* observed to always be faster than rep movsb, so the rep movsb instruction
* is not used.
* For sizes up to 256 all source data is first read into registers and then written:
* - n <= 16: overlapping movs
* - n <= 32: overlapping unaligned 16-byte SSE XMM load/stores
* - n <= 256: overlapping unaligned 32-byte AVX YMM load/stores
*
* If the copy size is humongous and the source and destination are both
* aligned, this memcpy will use non-temporal operations. This can have
* Large copies (> 256 bytes) use unaligned loads + aligned stores.
* This is observed to always be faster than rep movsb, so the rep movsb
* instruction is not used.
* - The head & tail may be unaligned => they're always written using unaligned stores.
*
* If the copy size is humongous (> 32 KiB) and the source and destination are both
* aligned, this memcpy will use non-temporal operations (AVX2). This can have
* a substantial speedup for copies where data is absent from L1, but it
* is significantly slower if the source and destination data were already
* in L1. The use of non-temporal operations also has the effect that after
* the copy is complete, the data will be moved out of L1, even if the data was
* present before the copy started.
*
* For n > 256 and overlapping src & dst buffers (memmove):
* - use unaligned loads + aligned stores, but not non-temporal stores
* - for dst < src forward copy in 128 byte batches:
* - unaligned load the first 32 bytes & last 4 x 32 bytes
* - forward copy (unaligned load + aligned stores) 4 x 32 bytes at a time
* - unaligned store the first 32 bytes & last 4 x 32 bytes
* - for dst > src backward copy in 128 byte batches:
* - unaligned load the first 4 x 32 bytes & last 32 bytes
* - backward copy (unaligned load + aligned stores) 4 x 32 bytes at a time
* - unaligned store the first 4 x 32 bytes & last 32 bytes
*
* @author Logan Evans <lpe@fb.com>
*/
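
For readers following along in C, the overlapping-load/store idea the comment describes for small sizes can be sketched as below; the helper names are illustrative, not part of folly. Because every load completes before the first store, a single branchless pair of moves covers every length in the range, and the same property is what makes n <= 256 safe for overlapping buffers.

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* 4 <= n <= 7: mirror of the movl example above, two overlapping 4-byte moves. */
static void copy_4_to_7(void *dst, const void *src, size_t n) {
  uint32_t head, tail;
  memcpy(&head, src, 4);                          /* movl  (%rsi), %r8d         */
  memcpy(&tail, (const char *)src + n - 4, 4);    /* movl  -4(%rsi,%rdx), %r9d  */
  memcpy(dst, &head, 4);                          /* movl  %r8d, (%rdi)         */
  memcpy((char *)dst + n - 4, &tail, 4);          /* movl  %r9d, -4(%rdi,%rdx)  */
}

/* 16 < n <= 32: two overlapping unaligned 16-byte SSE XMM load/stores. */
static void copy_17_to_32(void *dst, const void *src, size_t n) {
  __m128i head = _mm_loadu_si128((const __m128i *)src);
  __m128i tail = _mm_loadu_si128((const __m128i *)((const char *)src + n - 16));
  _mm_storeu_si128((__m128i *)dst, head);
  _mm_storeu_si128((__m128i *)((char *)dst + n - 16), tail);
}
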
@@ -111,7 +129,7 @@ __folly_memcpy_short:
__folly_memcpy:
.cfi_startproc
mov %rdi, %rax
mov %rdi, %rax # return: $rdi
test %rdx, %rdx
je .L_EQ0
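
Read as C, the prologue in this hunk records the return value and takes the early exit for zero-length copies. A hedged sketch (the function name is mine, and memmove stands in for the rest of the body):

#include <stddef.h>
#include <string.h>

static void *prologue_sketch(void *dst, const void *src, size_t n) {
  void *ret = dst;       /* mov %rdi, %rax: memcpy returns its destination */
  if (n == 0) {          /* test %rdx, %rdx ; je .L_EQ0 */
    return ret;
  }
  memmove(dst, src, n);  /* the real routine dispatches on n from here */
  return ret;
}
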
@@ -215,14 +233,19 @@ __folly_memcpy:
//
// %r9 will be used after .L_ALIGNED_DST_LOOP to calculate how many
// bytes remain to be copied.
// (%rsi + %rdx <= %rdi) => no overlap
lea (%rsi,%rdx), %r9
cmp %rdi, %r9
jbe .L_NO_OVERLAP
// (%rdi + %rdx <= %rsi) => no overlap
lea (%rdi,%rdx), %r8
cmp %rsi, %r8
// This is a forward jump so that the branch predictor will not predict
// a memmove.
ja .L_MEMMOVE
// If no info is available in branch predictor's cache, Intel CPUs assume
// forward jumps are not taken. Use a forward jump as overlapping buffers
// are unlikely.
ja .L_OVERLAP
.align 2
.L_NO_OVERLAP:
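
In C terms, the overlap test in this hunk reduces to the predicate below (an illustrative rendering; the assembly keeps src + n in %r9 because it is reused after the copy loop, and lays out the jumps so the common no-overlap case falls through):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* [src, src+n) and [dst, dst+n) are disjoint iff one buffer ends at or
 * before the other begins. */
static bool buffers_overlap(const void *dst, const void *src, size_t n) {
  uintptr_t d = (uintptr_t)dst;
  uintptr_t s = (uintptr_t)src;
  bool no_overlap = (s + n <= d) || (d + n <= s);
  return !no_overlap;
}
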
@@ -309,16 +332,120 @@ __folly_memcpy:
sfence
jmp .L_ALIGNED_DST_LOOP_END
.L_MEMMOVE:
call memmove
.L_OVERLAP:
.align 2
cmp %rdi, %rsi
jb .L_OVERLAP_BWD // %rsi < %rdi => backward-copy
je .L_RET // %rsi == %rdi => return, nothing to copy
// Source & destination buffers overlap. Forward copy.
vmovdqu (%rsi), %ymm8
// Align %rdi to a 32 byte boundary.
// %rcx = 32 - 31 & %rdi
mov $32, %rcx
and $31, %rdi
sub %rdi, %rcx
lea (%rsi,%rcx), %rsi
lea (%rax,%rcx), %rdi
sub %rcx, %rdx
// %r8 is the end condition for the loop.
lea -128(%rsi,%rdx), %r8
.L_OVERLAP_FWD_ALIGNED_DST_LOOP:
prefetchw 128(%rdi)
prefetchw 192(%rdi)
vmovdqu (%rsi), %ymm0
vmovdqu 32(%rsi), %ymm1
vmovdqu 64(%rsi), %ymm2
vmovdqu 96(%rsi), %ymm3
add $128, %rsi
vmovdqa %ymm0, (%rdi)
vmovdqa %ymm1, 32(%rdi)
vmovdqa %ymm2, 64(%rdi)
vmovdqa %ymm3, 96(%rdi)
add $128, %rdi
cmp %r8, %rsi
jb .L_OVERLAP_FWD_ALIGNED_DST_LOOP
sub %rsi, %r9
mov %r9, %rdx
vmovdqu %ymm4, -128(%rdi,%rdx)
vmovdqu %ymm5, -96(%rdi,%rdx)
vmovdqu %ymm6, -64(%rdi,%rdx)
vmovdqu %ymm7, -32(%rdi,%rdx)
vmovdqu %ymm8, (%rax) // %rax == the original (unaligned) %rdi
vzeroupper
.L_RET:
ret
.L_OVERLAP_BWD:
# Save last 32 bytes.
vmovdqu -32(%rsi, %rdx), %ymm8
lea -32(%rdi, %rdx), %r9
// %r8 is the end condition for the loop.
lea 128(%rsi), %r8
// Align %rdi+%rdx (destination end) to a 32 byte boundary.
// %rcx = (%rdi + %rdx - 32) & 31
mov %r9, %rcx
and $31, %rcx
// Set %rsi & %rdi to the end of the 32 byte aligned range.
sub %rcx, %rdx
add %rdx, %rsi
add %rdx, %rdi
.L_OVERLAP_BWD_ALIGNED_DST_LOOP:
prefetchw -128(%rdi)
prefetchw -192(%rdi)
vmovdqu -32(%rsi), %ymm4
vmovdqu -64(%rsi), %ymm5
vmovdqu -96(%rsi), %ymm6
vmovdqu -128(%rsi), %ymm7
sub $128, %rsi
vmovdqa %ymm4, -32(%rdi)
vmovdqa %ymm5, -64(%rdi)
vmovdqa %ymm6, -96(%rdi)
vmovdqa %ymm7, -128(%rdi)
sub $128, %rdi
cmp %r8, %rsi
ja .L_OVERLAP_BWD_ALIGNED_DST_LOOP
vmovdqu %ymm0, (%rax) // %rax == the original unaligned %rdi
vmovdqu %ymm1, 32(%rax)
vmovdqu %ymm2, 64(%rax)
vmovdqu %ymm3, 96(%rax)
vmovdqu %ymm8, (%r9)
vzeroupper
ret
.cfi_endproc
.size __folly_memcpy, .-__folly_memcpy
#ifdef FOLLY_MEMCPY_IS_MEMCPY
.weak memcpy
memcpy = __folly_memcpy
.weak memmove
memmove = __folly_memcpy
#endif
.ident "GCC: (GNU) 4.8.2"
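
With FOLLY_MEMCPY_IS_MEMCPY defined, the weak aliases above make ordinary memcpy/memmove calls resolve to __folly_memcpy, which is what makes this a drop-in replacement. A hypothetical smoke test for the overlapping case (illustrative only, not part of the commit):

#include <assert.h>
#include <string.h>

int main(void) {
  char buf[512];
  for (int i = 0; i < 512; ++i) {
    buf[i] = (char)i;
  }
  /* Overlapping, > 256 bytes, dst > src: exercises the backward-copy path. */
  memmove(buf + 64, buf, 400);
  for (int i = 0; i < 400; ++i) {
    assert(buf[64 + i] == (char)i);
  }
  return 0;
}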