[PATCH] ntdll: Optimize memcpy for x86-64.

Elaine Lefler elaineclefler at gmail.com
Tue Mar 22 20:33:33 CDT 2022


Signed-off-by: Elaine Lefler <elaineclefler at gmail.com>
---

New vectorized SSE2 implementation improves memcpy performance by up to 65%.
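
For reviewers who want to poke at the core trick outside the Wine tree: the
fastcpy_<n> helpers below combine two aligned 16-byte loads with byte shifts
so a misaligned source can still be written through aligned stores. The
standalone sketch below is illustrative only and is not part of the patch;
the fixed SHIFT constant stands in for the per-offset helpers, and
demo_realign_copy is a made-up name.

/* --- Illustration only, not part of the patch ---
 * Standalone sketch of the shift-and-merge trick used by the fastcpy_<n>
 * helpers: two aligned 16-byte loads are combined with byte shifts so a
 * source offset by SHIFT bytes from a 16-byte boundary can be stored with
 * a single aligned write. */
#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define SHIFT 3  /* must be a compile-time constant: the shift intrinsics take immediates */

/* Copy 16 bytes to an aligned destination d from the misaligned source
 * (as + SHIFT), where as is the 16-byte-aligned address just below it. */
static void demo_realign_copy(const uint8_t *as, uint8_t *d)
{
    __m128i lo = _mm_load_si128((const __m128i *)as);        /* as[0..15]  */
    __m128i hi = _mm_load_si128((const __m128i *)(as + 16)); /* as[16..31] */
    /* Drop the SHIFT bytes of lo that precede the real source (they sit in
     * the low lanes on little endian), then pull the missing SHIFT bytes of
     * hi into the freed upper lanes. */
    __m128i merged = _mm_or_si128(_mm_srli_si128(lo, SHIFT),
                                  _mm_slli_si128(hi, 16 - SHIFT));
    _mm_store_si128((__m128i *)d, merged);
}

int main(void)
{
    _Alignas(16) uint8_t src[32];  /* logical source starts at src + SHIFT */
    _Alignas(16) uint8_t dst[16];
    int i;

    for (i = 0; i < 32; i++) src[i] = (uint8_t)i;
    demo_realign_copy(src, dst);
    printf("%s\n", memcmp(dst, src + SHIFT, 16) ? "mismatch" : "ok");
    return 0;
}

Building this on x86-64 (where SSE2 is guaranteed) and running it should
print "ok"; the fastcpy_<n> loop performs the same merge per 16-byte chunk
while carrying the high load over to the next iteration.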
---
 dlls/ntdll/string.c | 162 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 159 insertions(+), 3 deletions(-)
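
Since most of the new lines deal with alignment edge cases, here is a
throwaway sweep harness (again not part of the patch, and not a Wine
conformance test) that compares any memcpy-compatible function against a
byte-wise reference across small sizes and all 16 source/destination
offsets, covering every shift value and both slow-copy tails. memcpy_fn,
check_one and sweep are made-up names.

/* --- Illustration only, not part of the patch or the Wine test suite --- */
#include <stdio.h>
#include <string.h>

typedef void *(*memcpy_fn)(void *, const void *, size_t);

static int check_one(memcpy_fn fn, size_t n, size_t soff, size_t doff)
{
    unsigned char src[512], dst[512], ref[512];
    size_t i;

    for (i = 0; i < sizeof(src); i++) src[i] = (unsigned char)(i * 7 + 1);
    memset(dst, 0xAA, sizeof(dst));
    memcpy(ref, dst, sizeof(ref));                 /* expected destination image */
    for (i = 0; i < n; i++) ref[doff + i] = src[soff + i];

    if (fn(dst + doff, src + soff, n) != dst + doff) return 0;  /* must return dst */
    return !memcmp(dst, ref, sizeof(dst));
}

static int sweep(memcpy_fn fn)
{
    size_t n, soff, doff;

    for (n = 0; n <= 96; n++)
        for (soff = 0; soff < 16; soff++)
            for (doff = 0; doff < 16; doff++)
                if (!check_one(fn, n, soff, doff))
                {
                    printf("FAIL n=%zu soff=%zu doff=%zu\n", n, soff, doff);
                    return 1;
                }
    printf("all cases ok\n");
    return 0;
}

int main(void)
{
    return sweep(memcpy);  /* point this at the patched function when linked in */
}

As written it only checks the system memcpy; wiring it up to the patched
function is left to whatever build setup the reviewer prefers.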

diff --git a/dlls/ntdll/string.c b/dlls/ntdll/string.c
index 0fa83821d21..443fc98418a 100644
--- a/dlls/ntdll/string.c
+++ b/dlls/ntdll/string.c
@@ -33,6 +33,16 @@
 #include "winternl.h"
 #include "ntdll_misc.h"
 
+#ifdef __x86_64__
+
+#include <x86intrin.h>
+
+/* Enable vectorized memcpy implementation (all x86-64 CPUs have SSE2).
+ * TODO: This could be enabled for x86 with a cpuid check. */
+#define SSE2_MEMCPY
+
+#endif
+
 
 /* same as wctypes except for TAB, which doesn't have C1_BLANK for some reason... */
 static const unsigned short ctypes[257] =
@@ -96,10 +106,154 @@ int __cdecl memcmp( const void *ptr1, const void *ptr2, size_t n )
 
 /*********************************************************************
  *                  memcpy   (NTDLL.@)
- *
- * NOTES
- *  Behaves like memmove.
  */
+#ifdef SSE2_MEMCPY
+
+#define declare_fastcpy(n) \
+static void fastcpy_ ## n \
+( uintptr_t as, const uintptr_t as_end, uintptr_t d ) \
+{ \
+    __m128i x, y; \
+    x = *(const __m128i*)as; \
+    /* Each iteration reads 32 source bytes and writes 16 destination bytes.
+     * Carry y over into x at the end so the overlapping half isn't re-read. */ \
+    while (as < as_end) \
+    { \
+        /* Prefetch hint improves performance by minimizing cache pollution */ \
+        _mm_prefetch((const void*)(as + 16), _MM_HINT_NTA); \
+        _mm_prefetch((const void*)d, _MM_HINT_NTA); \
+        y = *(const __m128i*)(as + 16);  \
+        /* (n) is the number of bytes in *as that don't go to *d. Little endian
+         * means the first bytes appear on the right, so srl to remove them */ \
+        x = _mm_srli_si128(x, (n)); \
+        /* Take same number of bytes from *(as + 16) and push them to the upper
+         * part of the register */ \
+        x = _mm_or_si128(x, _mm_slli_si128(y, 16 - (n))); \
+        *(__m128i*)d = x; \
+        d += 16; \
+        as += 16; \
+        x = y; \
+    } \
+}
+
+declare_fastcpy(1)
+declare_fastcpy(2)
+declare_fastcpy(3)
+declare_fastcpy(4)
+declare_fastcpy(5)
+declare_fastcpy(6)
+declare_fastcpy(7)
+declare_fastcpy(8)
+declare_fastcpy(9)
+declare_fastcpy(10)
+declare_fastcpy(11)
+declare_fastcpy(12)
+declare_fastcpy(13)
+declare_fastcpy(14)
+declare_fastcpy(15)
+
+typedef void (*fastcpy_ptr) ( uintptr_t, const uintptr_t, uintptr_t );
+
+static const fastcpy_ptr fastcpy_table[16] = {
+    NULL,       /* special case, different code path */
+    fastcpy_1,
+    fastcpy_2,
+    fastcpy_3,
+    fastcpy_4,
+    fastcpy_5,
+    fastcpy_6,
+    fastcpy_7,
+    fastcpy_8,
+    fastcpy_9,
+    fastcpy_10,
+    fastcpy_11,
+    fastcpy_12,
+    fastcpy_13,
+    fastcpy_14,
+    fastcpy_15
+};
+
+void * __cdecl memcpy( void *dst, const void *src, size_t n )
+{
+    uintptr_t s = (uintptr_t)src;
+    uintptr_t d = (uintptr_t)dst;
+    uintptr_t as;
+
+    _mm_prefetch((const void*)s, _MM_HINT_NTA);
+    _mm_prefetch((const void*)d, _MM_HINT_NTA);
+
+    /* Ensure aligned destination */
+    while (d & 15)
+    {
+        if (n-- == 0)
+            return dst;
+        *(BYTE*)d++ = *(const BYTE*)s++;
+    }
+
+    if (n < 16)
+    {
+        /* Too small to vectorize */
+        while (n--) *(BYTE*)d++ = *(const BYTE*)s++;
+        return dst;
+    }
+
+    as = s & ~15;
+    if (as == s)
+    {
+        /* Fastest path: both pointers aligned */
+        while (n >= 16)
+        {
+            _mm_prefetch((const void*)s, _MM_HINT_NTA);
+            _mm_prefetch((const void*)d, _MM_HINT_NTA);
+            *(__m128i*)d = *(const __m128i*)s;
+
+            d += 16;
+            s += 16;
+            n -= 16;
+        }
+    }
+    else
+    {
+        /* Read from aligned s by rounding down. If as < src, we need to slow
+         * copy another 16 bytes to avoid OOB reads. */
+        ptrdiff_t shift = s - as;
+        uintptr_t as_end = ((s + n) & ~15) - 16;
+
+        if (as < (uintptr_t)src)
+        {
+            uintptr_t target_n = n - 16;
+            while (n > target_n)
+            {
+                if (n-- == 0)
+                    return dst;
+                *(BYTE*)d++ = *(const BYTE*)s++;
+            }
+
+            as += 16;
+        }
+
+        /* Copy 16-byte chunks if any are possible. Since s is misaligned, we
+         * need to read one chunk ahead of what we're writing, so as_end must
+         * point to the _beginning_ of the last readable chunk. This also
+         * rules out overruns: reads stay below s + n and delta < n. */
+        if (as_end > as)
+        {
+            ptrdiff_t delta = as_end - as;
+            fastcpy_table[shift](as, as_end, d);
+            s += delta;
+            d += delta;
+            n -= delta;
+        }
+    }
+
+    /* Slow copy anything that remains */
+    while (n--) *(BYTE*)d++ = *(const BYTE*)s++;
+    return dst;
+}
+
+#else   /* defined(SSE2_MEMCPY) */
+
+/* Note: Behaves like memmove */
 void * __cdecl memcpy( void *dst, const void *src, size_t n )
 {
     volatile unsigned char *d = dst;  /* avoid gcc optimizations */
@@ -118,6 +272,8 @@ void * __cdecl memcpy( void *dst, const void *src, size_t n )
     return dst;
 }
 
+#endif  /* !defined(SSE2_MEMCPY) */
+
 
 /*********************************************************************
  *                  memmove   (NTDLL.@)
-- 
2.32.0 (Apple Git-132)



