[PATCH] ntdll: Optimize memcpy for x86-64.
Jinoh Kang
jinoh.kang.kr at gmail.com
Wed Mar 30 10:27:42 CDT 2022
On 3/23/22 10:33, Elaine Lefler wrote:
> Signed-off-by: Elaine Lefler <elaineclefler at gmail.com>
> ---
>
> New vectorized implementation improves performance by up to 65%.
MSVCRT already has a vectorized memcpy implementation. Maybe the two could be deduplicated rather than maintaining a second copy here?
> ---
> dlls/ntdll/string.c | 162 +++++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 159 insertions(+), 3 deletions(-)
>
> diff --git a/dlls/ntdll/string.c b/dlls/ntdll/string.c
> index 0fa83821d21..443fc98418a 100644
> --- a/dlls/ntdll/string.c
> +++ b/dlls/ntdll/string.c
> @@ -33,6 +33,16 @@
> #include "winternl.h"
> #include "ntdll_misc.h"
>
> +#ifdef __x86_64__
> +
> +#include <x86intrin.h>
> +
> +/* Enable vectorized memcpy implementation (all x86-64 CPUs have SSE2).
> + * TODO: This could be enabled for x86 with a cpuid check. */
> +#define SSE2_MEMCPY
> +
> +#endif
> +
>
> /* same as wctypes except for TAB, which doesn't have C1_BLANK for some reason... */
> static const unsigned short ctypes[257] =
> @@ -96,10 +106,154 @@ int __cdecl memcmp( const void *ptr1, const void *ptr2, size_t n )
>
> /*********************************************************************
> * memcpy (NTDLL.@)
> - *
> - * NOTES
> - * Behaves like memmove.
> */
> +#ifdef SSE2_MEMCPY
> +
> +#define declare_fastcpy(n) \
> +static void fastcpy_ ## n \
> +( uintptr_t as, const uintptr_t as_end, uintptr_t d ) \
> +{ \
> + __m128i x, y; \
> + x = *(const __m128i*)as; \
> + /* Read 32 bytes in, 16 bytes out. Shuffle variables when done so we don't
> + * re-read the first part. */ \
> + while (as < as_end) \
> + { \
> + /* Prefetch hint improves performance by minimizing cache pollution */ \
> + _mm_prefetch((const void*)(as + 16), _MM_HINT_NTA); \
> + _mm_prefetch((const void*)d, _MM_HINT_NTA); \
> + y = *(const __m128i*)(as + 16); \
> + /* (n) is the number of bytes in *as that don't go to *d. Little endian
> + * means the first bytes appear on the right, so srl to remove them */ \
> + x = _mm_srli_si128(x, (n)); \
> + /* Take same number of bytes from *(as + 16) and push them to the upper
> + * part of the register */ \
> + x = _mm_or_si128(x, _mm_slli_si128(y, 16 - (n))); \
> + *(__m128i*)d = x; \
> + d += 16; \
> + as += 16; \
> + x = y; \
> + } \
> +}
> +
> +declare_fastcpy(1)
> +declare_fastcpy(2)
> +declare_fastcpy(3)
> +declare_fastcpy(4)
> +declare_fastcpy(5)
> +declare_fastcpy(6)
> +declare_fastcpy(7)
> +declare_fastcpy(8)
> +declare_fastcpy(9)
> +declare_fastcpy(10)
> +declare_fastcpy(11)
> +declare_fastcpy(12)
> +declare_fastcpy(13)
> +declare_fastcpy(14)
> +declare_fastcpy(15)
> +
> +typedef void (*fastcpy_ptr) ( uintptr_t, const uintptr_t, uintptr_t );
> +
> +static const fastcpy_ptr fastcpy_table[16] = {
> + NULL, /* special case, different code path */
> + fastcpy_1,
> + fastcpy_2,
> + fastcpy_3,
> + fastcpy_4,
> + fastcpy_5,
> + fastcpy_6,
> + fastcpy_7,
> + fastcpy_8,
> + fastcpy_9,
> + fastcpy_10,
> + fastcpy_11,
> + fastcpy_12,
> + fastcpy_13,
> + fastcpy_14,
> + fastcpy_15
> +};
> +
> +void * __cdecl memcpy( void *dst, const void *src, size_t n )
> +{
> + uintptr_t s = (uintptr_t)src;
> + uintptr_t d = (uintptr_t)dst;
> + uintptr_t as;
> +
> + _mm_prefetch((const void*)s, _MM_HINT_NTA);
> + _mm_prefetch((const void*)d, _MM_HINT_NTA);
> +
> + /* Ensure aligned destination */
> + while (d & 15)
> + {
> + if (n-- == 0)
> + return dst;
> + *(BYTE*)d++ = *(const BYTE*)s++;
> + }
> +
> + if (n < 16)
> + {
> + /* Too small to vectorize */
> + while (n--) *(BYTE*)d++ = *(const BYTE*)s++;
> + return dst;
> + }
> +
> + as = s & ~15;
> + if (as == s)
> + {
> + /* Fastest path: both pointers aligned */
> + while (n >= 16)
> + {
> + _mm_prefetch((const void*)s, _MM_HINT_NTA);
> + _mm_prefetch((const void*)d, _MM_HINT_NTA);
> + *(__m128i*)d = *(const __m128i*)s;
> +
> + d += 16;
> + s += 16;
> + n -= 16;
> + }
> + }
> + else
> + {
> + /* Read from aligned s by rounding down. If as < src, we need to slow
> + * copy another 16 bytes to avoid OOB reads. */
> + ptrdiff_t shift = s - as;
> + uintptr_t as_end = ((s + n) & ~15) - 16;
> +
> + if (as < (uintptr_t)src)
> + {
> + uintptr_t target_n = n - 16;
> + while (n > target_n)
> + {
> + if (n-- == 0)
> + return dst;
> + *(BYTE*)d++ = *(const BYTE*)s++;
> + }
> +
> + as += 16;
> + }
> +
> + /* Copy 16-byte chunks if any are possible. Since s is misaligned, we
> + * need to read one chunk ahead of what we're writing, which means
> + * as_end must point to the _beginning_ of the last readable chunk.
> + * This also guarantees there is no overrun, since delta < n - 16. */
> + if (as_end > as)
> + {
> + ptrdiff_t delta = as_end - as;
> + fastcpy_table[shift](as, as_end, d);
> + s += delta;
> + d += delta;
> + n -= delta;
> + }
> + }
> +
> + /* Slow copy anything that remains */
> + while (n--) *(BYTE*)d++ = *(const BYTE*)s++;
> + return dst;
> +}
> +
> +#else /* defined(SSE2_MEMCPY) */
> +
> +/* Note: Behaves like memmove */
> void * __cdecl memcpy( void *dst, const void *src, size_t n )
> {
> volatile unsigned char *d = dst; /* avoid gcc optimizations */
> @@ -118,6 +272,8 @@ void * __cdecl memcpy( void *dst, const void *src, size_t n )
> return dst;
> }
>
> +#endif /* !defined(SSE2_MEMCPY) */
> +
>
> /*********************************************************************
> * memmove (NTDLL.@)
--
Sincerely,
Jinoh Kang
More information about the wine-devel
mailing list