[PATCH v2] msvcrt: Add an ermsb-based memset for x86/64.

Rémi Bernon rbernon at codeweavers.com
Tue Aug 31 05:52:58 CDT 2021


On 8/31/21 12:14 PM, Guillaume Charifi wrote:
> Signed-off-by: Guillaume Charifi <guillaume.charifi at sfr.fr>
> ---
>   dlls/msvcrt/string.c | 63 ++++++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 63 insertions(+)
> 
> diff --git a/dlls/msvcrt/string.c b/dlls/msvcrt/string.c
> index 4d09405094d..6d1500cb194 100644
> --- a/dlls/msvcrt/string.c
> +++ b/dlls/msvcrt/string.c
> @@ -2527,6 +2527,7 @@ int __cdecl memcmp(const void *ptr1, const void *ptr2, size_t n)
>       __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") \
>       "popq " SRC_REG "\n\t" \
>       __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t")
> +
>   #endif
>   
>   void * __cdecl sse2_memmove(void *dst, const void *src, size_t n);
> @@ -2732,6 +2733,64 @@ __ASM_GLOBAL_FUNC( sse2_memmove,
>           MEMMOVE_CLEANUP
>           "ret" )
>   
> +#undef DEST_REG
> +#undef SRC_REG
> +#undef LEN_REG
> +#undef TMP_REG
> +
> +#ifdef __i386__
> +
> +#define DEST_REG "%edi"
> +#define SRC_REG "%eax"
> +#define LEN_REG "%ecx"
> +#define TMP_REG
> +
> +#define MEMSET_INIT \
> +    "pushl " DEST_REG "\n\t" \
> +    __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \
> +    "movl 8(%esp), " DEST_REG "\n\t" \
> +    "movl 12(%esp), " SRC_REG "\n\t" \
> +    "movl 16(%esp), " LEN_REG "\n\t"
> +
> +#define MEMSET_CLEANUP \
> +    "movl 8(%esp), %eax\n\t" \
> +    "popl " DEST_REG "\n\t" \
> +    __ASM_CFI(".cfi_adjust_cfa_offset -4\n\t")
> +
> +#else
> +
> +#define DEST_REG "%rdi"
> +#define SRC_REG "%rax"
> +#define LEN_REG "%rcx"
> +#define TMP_REG "%r9"
> +
> +#define MEMSET_INIT \
> +    "pushq " DEST_REG "\n\t" \
> +    __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \
> +    "movq %rcx, " DEST_REG "\n\t" \
> +    "movq %rdx, " SRC_REG "\n\t" \
> +    "movq %r8, " LEN_REG "\n\t" \
> +    "movq " DEST_REG ", " TMP_REG "\n\t"
> +
> +#define MEMSET_CLEANUP \
> +    "movq " TMP_REG ", %rax\n\t" \
> +    "popq " DEST_REG "\n\t" \
> +    __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t")
> +
> +#endif
> +
> +void * __cdecl ermsb_memset(void *dst, int c, size_t n);
> +__ASM_GLOBAL_FUNC( ermsb_memset,
> +        MEMSET_INIT
> +        "rep stosb\n\t"
> +        MEMSET_CLEANUP
> +        "ret" )
> +
> +#undef DEST_REG
> +#undef SRC_REG
> +#undef LEN_REG
> +#undef TMP_REG
> +
>   #endif
>   
>   /*********************************************************************
> @@ -2860,9 +2919,13 @@ void * __cdecl memcpy(void *dst, const void *src, size_t n)
>    */
>   void* __cdecl memset(void *dst, int c, size_t n)
>   {
> +#if defined(__i386__) || defined(__x86_64__)
> +    return ermsb_memset(dst, c, n);
> +#else
>       volatile unsigned char *d = dst;  /* avoid gcc optimizations */
>       while (n--) *d++ = c;
>       return dst;
> +#endif
>   }
>   
>   /*********************************************************************
> 

In my opinion we should instead get rid of the volatile, it's what 
really kills the performance. GCC has the bad idea of replacing the loop 
with a builtin memset call, which doesn't work in Wine, but we can solve 
this in various ways I think.

Then rep stosb/movsb aren't guaranteed to be fast on all CPUs. Even on 
modern CPUs, which advertise the ERMSB ("it's really fast") feature bit, 
as far as I could see it only really makes a difference for large sizes 
(i.e. >= 4096), which I don't think is the common case.

For a start we could add -ffreestanding / -fno-builtin compilation flags 
to msvcrt instead; although it won't be much faster, it will at least 
fix the issue with the builtin replacement.

Then we can also unroll the loop a bit, still writing it in C, which 
makes it much faster, especially on small sizes. I actually have 
something like that locally (and in Proton) in ntdll, where it 
helps for zeroed heap allocations (which call ntdll memset instead of 
msvcrt):

> 
> static inline void memset_c_unaligned_32( char *d, uint64_t v, size_t n )
> {
>     if (n >= 24)
>     {
>         *(uint64_t *)d = v;
>         *(uint64_t *)(d + 8) = v;
>         *(uint64_t *)(d + 16) = v;
>         *(uint64_t *)(d + n - 8) = v;
>     }
>     else if (n >= 16)
>     {
>         *(uint64_t *)d = v;
>         *(uint64_t *)(d + 8) = v;
>         *(uint64_t *)(d + n - 8) = v;
>     }
>     else if (n >= 8)
>     {
>         *(uint64_t *)d = v;
>         *(uint64_t *)(d + n - 8) = v;
>     }
>     else if (n >= 4)
>     {
>         *(uint32_t *)d = v;
>         *(uint32_t *)(d + n - 4) = v;
>     }
>     else if (n >= 2)
>     {
>         *(uint16_t *)d = v;
>         *(uint16_t *)(d + n - 2) = v;
>     }
>     else if (n >= 1)
>     {
>         *(uint8_t *)d = v;
>     }
> }
> 
> /*********************************************************************
>  *                  memset   (NTDLL.@)
>  */
> void *__cdecl memset(void *dst, int c, size_t n)
> {
>     uint16_t tmp16 = ((uint16_t)c << 8) | c;
>     uint32_t tmp32 = ((uint32_t)tmp16 << 16) | tmp16;
>     uint64_t v = ((uint64_t)tmp32 << 32) | tmp32;
> 
>     if (n <= 32)
>     {
>         memset_c_unaligned_32( dst, v, n );
>         return dst;
>     }
> 
>     while (n >= 48)
>     {
>         *(uint64_t*)((char *)dst + n -  8) = v;
>         *(uint64_t*)((char *)dst + n - 16) = v;
>         *(uint64_t*)((char *)dst + n - 24) = v;
>         *(uint64_t*)((char *)dst + n - 32) = v;
>         *(uint64_t*)((char *)dst + n - 40) = v;
>         *(uint64_t*)((char *)dst + n - 48) = v;
>         n -= 48;
>     }
> 
>     while (n >= 24)
>     {
>         *(uint64_t*)((char *)dst + n -  8) = v;
>         *(uint64_t*)((char *)dst + n - 16) = v;
>         *(uint64_t*)((char *)dst + n - 24) = v;
>         n -= 24;
>     }
> 
>     memset_c_unaligned_32( dst, v, n );
>     return dst;
> }

Cheers,
-- 
Rémi Bernon <rbernon at codeweavers.com>



More information about the wine-devel mailing list