Commit e19669d3 authored by Lucian Grijincu, committed by Facebook GitHub Bot

__folly_memcpy: allow overlapping buffers of any size and provide drop-in replacement for memmove

Summary:
`__folly_memcpy` already behaved as `memmove` for n <= 256B.

For n > 256B and overlapping buffers, `__folly_memcpy` did some work:
- it determined that the size is large enough for 128B AVX loads/stores
- it had already loaded the first 128B and the last 128B into YMM registers

but it then discarded that work and fell back on `memmove`.

Instead of wasting this work, do a forward copy (dst < src) or a backward copy (dst > src); a C sketch of both paths follows the list below.
- use unaligned loads + aligned stores, but not non-temporal stores
- for dst < src forward copy in 128 byte batches:
-- unaligned load the first 32 bytes & last 4 x 32 bytes
-- forward copy (unaligned load + aligned stores) 4 x 32 bytes at a time
-- unaligned store the first 32 bytes & last 4 x 32 bytes
- for dst > src backward copy in 128 byte batches:
-- unaligned load the first 4 x 32 bytes & last 32 bytes
-- backward copy (unaligned load + aligned stores) 4 x 32 bytes at a time
-- unaligned store the first 4 x 32 bytes & last 32 bytes
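
To make the new control flow concrete, here is a minimal C sketch of the two paths, assuming AVX2 intrinsics. The function name overlap_copy_sketch is illustrative and this is not the shipped routine: it uses unaligned stores in the main loop (the assembly aligns the destination so it can use vmovdqa), keeps prefetching only in the forward loop, and defers n <= 256 to libc memmove.

#include <immintrin.h>
#include <stddef.h>
#include <string.h>

/* Illustration only: save head/tail in YMM registers up front, run the
 * 128-byte main loop in the safe direction, write the saved head/tail last. */
static void *overlap_copy_sketch(void *dst, const void *src, size_t n) {
  unsigned char *d = (unsigned char *)dst;
  const unsigned char *s = (const unsigned char *)src;
  if (n <= 256 || d == s) {
    /* The real routine handles n <= 256 with overlapping loads/stores;
     * defer to libc here to keep the sketch short. */
    return memmove(d, s, n);
  }
  if (d < s) {
    /* Forward copy: save the first 32 bytes and the last 4 x 32 bytes,
     * since the main loop may overwrite them in the source. */
    __m256i head = _mm256_loadu_si256((const __m256i *)s);
    __m256i t0 = _mm256_loadu_si256((const __m256i *)(s + n - 128));
    __m256i t1 = _mm256_loadu_si256((const __m256i *)(s + n - 96));
    __m256i t2 = _mm256_loadu_si256((const __m256i *)(s + n - 64));
    __m256i t3 = _mm256_loadu_si256((const __m256i *)(s + n - 32));
    size_t off = 32;                 /* stands in for the alignment skip */
    while (n - off > 128) {          /* 4 x 32 bytes per iteration */
      __builtin_prefetch(d + off + 128, 1);  /* asm uses prefetchw */
      __m256i a = _mm256_loadu_si256((const __m256i *)(s + off));
      __m256i b = _mm256_loadu_si256((const __m256i *)(s + off + 32));
      __m256i c = _mm256_loadu_si256((const __m256i *)(s + off + 64));
      __m256i e = _mm256_loadu_si256((const __m256i *)(s + off + 96));
      _mm256_storeu_si256((__m256i *)(d + off), a);
      _mm256_storeu_si256((__m256i *)(d + off + 32), b);
      _mm256_storeu_si256((__m256i *)(d + off + 64), c);
      _mm256_storeu_si256((__m256i *)(d + off + 96), e);
      off += 128;
    }
    /* At most 128 bytes remain; the saved tail covers them. */
    _mm256_storeu_si256((__m256i *)(d + n - 128), t0);
    _mm256_storeu_si256((__m256i *)(d + n - 96), t1);
    _mm256_storeu_si256((__m256i *)(d + n - 64), t2);
    _mm256_storeu_si256((__m256i *)(d + n - 32), t3);
    _mm256_storeu_si256((__m256i *)d, head);
  } else {
    /* Backward copy: save the first 4 x 32 bytes and the last 32 bytes. */
    __m256i h0 = _mm256_loadu_si256((const __m256i *)s);
    __m256i h1 = _mm256_loadu_si256((const __m256i *)(s + 32));
    __m256i h2 = _mm256_loadu_si256((const __m256i *)(s + 64));
    __m256i h3 = _mm256_loadu_si256((const __m256i *)(s + 96));
    __m256i tail = _mm256_loadu_si256((const __m256i *)(s + n - 32));
    size_t end = n - 32;             /* the saved tail covers [n-32, n) */
    while (end > 128) {              /* 4 x 32 bytes per iteration */
      end -= 128;
      /* (prefetchw of the next batch omitted here for brevity) */
      __m256i a = _mm256_loadu_si256((const __m256i *)(s + end));
      __m256i b = _mm256_loadu_si256((const __m256i *)(s + end + 32));
      __m256i c = _mm256_loadu_si256((const __m256i *)(s + end + 64));
      __m256i e = _mm256_loadu_si256((const __m256i *)(s + end + 96));
      _mm256_storeu_si256((__m256i *)(d + end), a);
      _mm256_storeu_si256((__m256i *)(d + end + 32), b);
      _mm256_storeu_si256((__m256i *)(d + end + 64), c);
      _mm256_storeu_si256((__m256i *)(d + end + 96), e);
    }
    /* At most 128 bytes remain at the front; the saved head covers them. */
    _mm256_storeu_si256((__m256i *)d, h0);
    _mm256_storeu_si256((__m256i *)(d + 32), h1);
    _mm256_storeu_si256((__m256i *)(d + 64), h2);
    _mm256_storeu_si256((__m256i *)(d + 96), h3);
    _mm256_storeu_si256((__m256i *)(d + n - 32), tail);
  }
  return dst;
}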

Reviewed By: yfeldblum

Differential Revision: D31915389

fbshipit-source-id: 2c0197b2bddc102a7fb8f70a6f43e79ac994dc73
parent 62fa4e65
@@ -18,37 +18,55 @@
* __folly_memcpy: An optimized memcpy implementation that uses prefetch and
* AVX2 instructions.
*
* This implementation of memcpy acts as a memmove, but it is not optimized for
* this purpose. While overlapping copies are undefined in memcpy, this
* implementation acts like memmove for sizes up through 256 bytes and will
* detect overlapping copies and call memmove for overlapping copies of 257 or
* more bytes.
* This implementation of memcpy acts as a memmove: while overlapping copies
* are undefined in memcpy, in some implementations they're the same function and
* legacy programs rely on this behavior.
*
* This implementation uses prefetch to avoid dtlb misses. This can
* substantially reduce dtlb store misses in cases where the destination
* location is absent from L1 cache and where the copy size is small enough
* that the hardware prefetcher doesn't have a large impact.
*
* The number of branches is limited by the use of overlapping copies. This
* helps with copies where the source and destination cache lines are already
* The number of branches is limited by the use of overlapping loads & stores.
* This helps with copies where the source and destination cache lines are already
* present in L1 because there are fewer instructions to execute and fewer
* branches to potentially mispredict.
* e.g. to copy the last 4 <= n <= 7 bytes: copy the first & last 4 bytes (overlapped):
* movl (%rsi), %r8d
* movl -4(%rsi,%rdx), %r9d
* movl %r8d, (%rdi)
* movl %r9d, -4(%rdi,%rdx)
*
* Vector operations up to 32-bytes are used (avx2 instruction set). Larger
* mov operations (avx512) are not used.
*
* Large copies make use of aligned store operations. This operation is
* observed to always be faster than rep movsb, so the rep movsb instruction
* is not used.
* For sizes up to 256 all source data is first read into registers and then written:
* - n <= 16: overlapping movs
* - n <= 32: overlapping unaligned 16-byte SSE XMM load/stores
* - n <= 256: overlapping unaligned 32-byte AVX YMM load/stores
*
* If the copy size is humongous and the source and destination are both
* aligned, this memcpy will use non-temporal operations. This can have
* Large copies (> 256 bytes) use unaligned loads + aligned stores.
* This is observed to always be faster than rep movsb, so the rep movsb
* instruction is not used.
* - The head & tail may be unaligned => they're always written using unaligned stores.
*
* If the copy size is humongous (> 32 KiB) and the source and destination are both
* aligned, this memcpy will use non-temporal operations (AVX2). This can have
* a substantial speedup for copies where data is absent from L1, but it
* is significantly slower if the source and destination data were already
* in L1. The use of non-temporal operations also has the effect that after
* the copy is complete, the data will be moved out of L1, even if the data was
* present before the copy started.
*
* For n > 256 and overlapping src & dst buffers (memmove):
* - use unaligned loads + aligned stores, but not non-temporal stores
* - for dst < src forward copy in 128 byte batches:
* - unaligned load the first 32 bytes & last 4 x 32 bytes
* - forward copy (unaligned load + aligned stores) 4 x 32 bytes at a time
* - unaligned store the first 32 bytes & last 4 x 32 bytes
* - for dst > src backward copy in 128 byte batches:
* - unaligned load the first 4 x 32 bytes & last 32 bytes
* - backward copy (unaligned load + aligned stores) 4 x 32 bytes at a time
* - unaligned store the first 4 x 32 bytes & last 32 bytes
*
* @author Logan Evans <lpe@fb.com>
*/
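
For readers following along in C, the overlapping-load/store idea the comment describes for small sizes can be sketched as below; the helper names are illustrative, not part of folly. Because every load completes before the first store, a single branchless pair of moves covers every length in the range, and the same property is what makes n <= 256 safe for overlapping buffers.

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* 4 <= n <= 7: mirror of the movl example above, two overlapping 4-byte moves. */
static void copy_4_to_7(void *dst, const void *src, size_t n) {
  uint32_t head, tail;
  memcpy(&head, src, 4);                          /* movl  (%rsi), %r8d         */
  memcpy(&tail, (const char *)src + n - 4, 4);    /* movl  -4(%rsi,%rdx), %r9d  */
  memcpy(dst, &head, 4);                          /* movl  %r8d, (%rdi)         */
  memcpy((char *)dst + n - 4, &tail, 4);          /* movl  %r9d, -4(%rdi,%rdx)  */
}

/* 16 < n <= 32: two overlapping unaligned 16-byte SSE XMM load/stores. */
static void copy_17_to_32(void *dst, const void *src, size_t n) {
  __m128i head = _mm_loadu_si128((const __m128i *)src);
  __m128i tail = _mm_loadu_si128((const __m128i *)((const char *)src + n - 16));
  _mm_storeu_si128((__m128i *)dst, head);
  _mm_storeu_si128((__m128i *)((char *)dst + n - 16), tail);
}
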
@@ -111,7 +129,7 @@ __folly_memcpy_short:
__folly_memcpy:
.cfi_startproc
mov %rdi, %rax
mov %rdi, %rax # return: $rdi
test %rdx, %rdx
je .L_EQ0
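
Read as C, the prologue in this hunk records the return value and takes the early exit for zero-length copies. A hedged sketch (the function name is mine, and memmove stands in for the rest of the body):

#include <stddef.h>
#include <string.h>

static void *prologue_sketch(void *dst, const void *src, size_t n) {
  void *ret = dst;       /* mov %rdi, %rax: memcpy returns its destination */
  if (n == 0) {          /* test %rdx, %rdx ; je .L_EQ0 */
    return ret;
  }
  memmove(dst, src, n);  /* the real routine dispatches on n from here */
  return ret;
}
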
@@ -215,14 +233,19 @@ __folly_memcpy:
//
// %r9 will be used after .L_ALIGNED_DST_LOOP to calculate how many
// bytes remain to be copied.
// (%rsi + %rdx <= %rdi) => no overlap
lea (%rsi,%rdx), %r9
cmp %rdi, %r9
jbe .L_NO_OVERLAP
// (%rdi + %rdx <= %rsi) => no overlap
lea (%rdi,%rdx), %r8
cmp %rsi, %r8
// This is a forward jump so that the branch predictor will not predict
// a memmove.
ja .L_MEMMOVE
// If no info is available in branch predictor's cache, Intel CPUs assume
// forward jumps are not taken. Use a forward jump as overlapping buffers
// are unlikely.
ja .L_OVERLAP
.align 2
.L_NO_OVERLAP:
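
In C terms, the overlap test in this hunk reduces to the predicate below (an illustrative rendering; the assembly keeps src + n in %r9 because it is reused after the copy loop, and lays out the jumps so the common no-overlap case falls through):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* [src, src+n) and [dst, dst+n) are disjoint iff one buffer ends at or
 * before the other begins. */
static bool buffers_overlap(const void *dst, const void *src, size_t n) {
  uintptr_t d = (uintptr_t)dst;
  uintptr_t s = (uintptr_t)src;
  bool no_overlap = (s + n <= d) || (d + n <= s);
  return !no_overlap;
}
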
@@ -309,16 +332,120 @@ __folly_memcpy:
sfence
jmp .L_ALIGNED_DST_LOOP_END
.L_MEMMOVE:
call memmove
.L_OVERLAP:
.align 2
cmp %rdi, %rsi
jb .L_OVERLAP_BWD // %rsi < %rdi => backward-copy
je .L_RET // %rsi == %rdi => return, nothing to copy
// Source & destination buffers overlap. Forward copy.
vmovdqu (%rsi), %ymm8
// Align %rdi to a 32 byte boundary.
// %rcx = 32 - 31 & %rdi
mov $32, %rcx
and $31, %rdi
sub %rdi, %rcx
lea (%rsi,%rcx), %rsi
lea (%rax,%rcx), %rdi
sub %rcx, %rdx
// %r8 is the end condition for the loop.
lea -128(%rsi,%rdx), %r8
.L_OVERLAP_FWD_ALIGNED_DST_LOOP:
prefetchw 128(%rdi)
prefetchw 192(%rdi)
vmovdqu (%rsi), %ymm0
vmovdqu 32(%rsi), %ymm1
vmovdqu 64(%rsi), %ymm2
vmovdqu 96(%rsi), %ymm3
add $128, %rsi
vmovdqa %ymm0, (%rdi)
vmovdqa %ymm1, 32(%rdi)
vmovdqa %ymm2, 64(%rdi)
vmovdqa %ymm3, 96(%rdi)
add $128, %rdi
cmp %r8, %rsi
jb .L_OVERLAP_FWD_ALIGNED_DST_LOOP
sub %rsi, %r9
mov %r9, %rdx
vmovdqu %ymm4, -128(%rdi,%rdx)
vmovdqu %ymm5, -96(%rdi,%rdx)
vmovdqu %ymm6, -64(%rdi,%rdx)
vmovdqu %ymm7, -32(%rdi,%rdx)
vmovdqu %ymm8, (%rax) // %rax == the original (unaligned) %rdi
vzeroupper
.L_RET:
ret
.L_OVERLAP_BWD:
# Save last 32 bytes.
vmovdqu -32(%rsi, %rdx), %ymm8
lea -32(%rdi, %rdx), %r9
// %r8 is the end condition for the loop.
lea 128(%rsi), %r8
// Align %rdi+%rdx (destination end) to a 32 byte boundary.
// %rcx = (%rdi + %rdx - 32) & 31
mov %r9, %rcx
and $31, %rcx
// Set %rsi & %rdi to the end of the 32 byte aligned range.
sub %rcx, %rdx
add %rdx, %rsi
add %rdx, %rdi
.L_OVERLAP_BWD_ALIGNED_DST_LOOP:
prefetchw -128(%rdi)
prefetchw -192(%rdi)
vmovdqu -32(%rsi), %ymm4
vmovdqu -64(%rsi), %ymm5
vmovdqu -96(%rsi), %ymm6
vmovdqu -128(%rsi), %ymm7
sub $128, %rsi
vmovdqa %ymm4, -32(%rdi)
vmovdqa %ymm5, -64(%rdi)
vmovdqa %ymm6, -96(%rdi)
vmovdqa %ymm7, -128(%rdi)
sub $128, %rdi
cmp %r8, %rsi
ja .L_OVERLAP_BWD_ALIGNED_DST_LOOP
vmovdqu %ymm0, (%rax) // %rax == the original unaligned %rdi
vmovdqu %ymm1, 32(%rax)
vmovdqu %ymm2, 64(%rax)
vmovdqu %ymm3, 96(%rax)
vmovdqu %ymm8, (%r9)
vzeroupper
ret
.cfi_endproc
.size __folly_memcpy, .-__folly_memcpy
#ifdef FOLLY_MEMCPY_IS_MEMCPY
.weak memcpy
memcpy = __folly_memcpy
.weak memmove
memmove = __folly_memcpy
#endif
.ident "GCC: (GNU) 4.8.2"
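
With FOLLY_MEMCPY_IS_MEMCPY defined, the weak aliases above make ordinary memcpy/memmove calls resolve to __folly_memcpy, which is what makes this a drop-in replacement. A hypothetical smoke test for the overlapping case (illustrative only, not part of the commit):

#include <assert.h>
#include <string.h>

int main(void) {
  char buf[512];
  for (int i = 0; i < 512; ++i) {
    buf[i] = (char)i;
  }
  /* Overlapping, > 256 bytes, dst > src: exercises the backward-copy path. */
  memmove(buf + 64, buf, 400);
  for (int i = 0; i < 400; ++i) {
    assert(buf[64 + i] == (char)i);
  }
  return 0;
}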