[PATCH v2] msvcrt: Add an ermsb-based memset for x86/64.
Rémi Bernon
rbernon at codeweavers.com
Tue Aug 31 05:52:58 CDT 2021
On 8/31/21 12:14 PM, Guillaume Charifi wrote:
> Signed-off-by: Guillaume Charifi <guillaume.charifi at sfr.fr>
> ---
> dlls/msvcrt/string.c | 63 ++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 63 insertions(+)
>
> diff --git a/dlls/msvcrt/string.c b/dlls/msvcrt/string.c
> index 4d09405094d..6d1500cb194 100644
> --- a/dlls/msvcrt/string.c
> +++ b/dlls/msvcrt/string.c
> @@ -2527,6 +2527,7 @@ int __cdecl memcmp(const void *ptr1, const void *ptr2, size_t n)
> __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") \
> "popq " SRC_REG "\n\t" \
> __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t")
> +
> #endif
>
> void * __cdecl sse2_memmove(void *dst, const void *src, size_t n);
> @@ -2732,6 +2733,64 @@ __ASM_GLOBAL_FUNC( sse2_memmove,
> MEMMOVE_CLEANUP
> "ret" )
>
> +#undef DEST_REG
> +#undef SRC_REG
> +#undef LEN_REG
> +#undef TMP_REG
> +
> +#ifdef __i386__
> +
> +#define DEST_REG "%edi"
> +#define SRC_REG "%eax"
> +#define LEN_REG "%ecx"
> +#define TMP_REG
> +
> +#define MEMSET_INIT \
> + "pushl " DEST_REG "\n\t" \
> + __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \
> + "movl 8(%esp), " DEST_REG "\n\t" \
> + "movl 12(%esp), " SRC_REG "\n\t" \
> + "movl 16(%esp), " LEN_REG "\n\t"
> +
> +#define MEMSET_CLEANUP \
> + "movl 8(%esp), %eax\n\t" \
> + "popl " DEST_REG "\n\t" \
> + __ASM_CFI(".cfi_adjust_cfa_offset -4\n\t")
> +
> +#else
> +
> +#define DEST_REG "%rdi"
> +#define SRC_REG "%rax"
> +#define LEN_REG "%rcx"
> +#define TMP_REG "%r9"
> +
> +#define MEMSET_INIT \
> + "pushq " DEST_REG "\n\t" \
> + __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \
> + "movq %rcx, " DEST_REG "\n\t" \
> + "movq %rdx, " SRC_REG "\n\t" \
> + "movq %r8, " LEN_REG "\n\t" \
> + "movq " DEST_REG ", " TMP_REG "\n\t"
> +
> +#define MEMSET_CLEANUP \
> + "movq " TMP_REG ", %rax\n\t" \
> + "popq " DEST_REG "\n\t" \
> + __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t")
> +
> +#endif
> +
> +void * __cdecl ermsb_memset(void *dst, int c, size_t n);
> +__ASM_GLOBAL_FUNC( ermsb_memset,
> + MEMSET_INIT
> + "rep stosb\n\t"
> + MEMSET_CLEANUP
> + "ret" )
> +
> +#undef DEST_REG
> +#undef SRC_REG
> +#undef LEN_REG
> +#undef TMP_REG
> +
> #endif
>
> /*********************************************************************
> @@ -2860,9 +2919,13 @@ void * __cdecl memcpy(void *dst, const void *src, size_t n)
> */
> void* __cdecl memset(void *dst, int c, size_t n)
> {
> +#if defined(__i386__) || defined(__x86_64__)
> + return ermsb_memset(dst, c, n);
> +#else
> volatile unsigned char *d = dst; /* avoid gcc optimizations */
> while (n--) *d++ = c;
> return dst;
> +#endif
> }
>
> /*********************************************************************
>
In my opinion we should instead get rid of the volatile; that is what
really kills the performance. GCC has the bad habit of replacing the loop
with a builtin memset call, which doesn't work in Wine, but we can solve
this in various ways, I think.
Also, rep stosb/movsb isn't guaranteed to be fast on all CPUs. Even on
modern CPUs, which have the "it's really fast" (ERMSB) feature bit, as far
as I could see it only really makes a difference for large sizes (i.e. >= 4096),
which I don't think is the common case.
For a start we could add the -ffreestanding / -fno-builtin compilation flags
to msvcrt instead; although that won't make it much faster, it will at least
fix the issue with the builtin replacement.
Then we can also unroll the loop a bit, still writing it in C, which
makes it much faster, especially on small sizes. I actually have
something like that locally (and in Proton) in ntdll, where it actually
helps for zeroed heap allocations (which call ntdll's memset instead of
msvcrt's):
>
> /* Fill the first n bytes at d with the byte pattern replicated in v.
>  * Intended for n <= 32 (the caller only passes n <= 32 or the loop
>  * remainder n < 24).  Each branch issues fixed head store(s) plus one
>  * unaligned tail store at (d + n - width); head and tail may overlap,
>  * so every size within the branch's range is covered without a loop.
>  * NOTE(review): the pointer casts technically violate strict aliasing
>  * and rely on unaligned stores being cheap — fine on x86, but confirm
>  * the build uses -fno-strict-aliasing or equivalent. */
> static inline void memset_c_unaligned_32( char *d, uint64_t v, size_t n )
> {
> if (n >= 24)
> {
> /* 24..32 bytes: three head stores + one overlapping tail store. */
> *(uint64_t *)d = v;
> *(uint64_t *)(d + 8) = v;
> *(uint64_t *)(d + 16) = v;
> *(uint64_t *)(d + n - 8) = v;
> }
> else if (n >= 16)
> {
> /* 16..23 bytes. */
> *(uint64_t *)d = v;
> *(uint64_t *)(d + 8) = v;
> *(uint64_t *)(d + n - 8) = v;
> }
> else if (n >= 8)
> {
> /* 8..15 bytes. */
> *(uint64_t *)d = v;
> *(uint64_t *)(d + n - 8) = v;
> }
> else if (n >= 4)
> {
> /* 4..7 bytes; v is truncated to its low 32 bits, which still hold
>  * the replicated pattern. */
> *(uint32_t *)d = v;
> *(uint32_t *)(d + n - 4) = v;
> }
> else if (n >= 2)
> {
> /* 2..3 bytes. */
> *(uint16_t *)d = v;
> *(uint16_t *)(d + n - 2) = v;
> }
> else if (n >= 1)
> {
> /* Exactly 1 byte. */
> *(uint8_t *)d = v;
> }
> /* n == 0: nothing to write. */
> }
>
> /*********************************************************************
> * memset (NTDLL.@)
> *
> * Fill n bytes at dst with the byte value c; returns dst.
> * Small sizes (<= 32) are handled entirely by the branchy helper;
> * larger sizes are filled from the tail downward with unrolled 8-byte
> * stores, leaving a head remainder (< 24 bytes) for the helper.
> */
> void *__cdecl memset(void *dst, int c, size_t n)
> {
> /* Replicate the fill byte into all 8 bytes of a 64-bit pattern.
>  * NOTE(review): c is not masked to unsigned char first, so values
>  * outside 0..255 yield a pattern that differs from ISO memset's
>  * (unsigned char)c — presumably callers always pass a byte; confirm. */
> uint16_t tmp16 = ((uint16_t)c << 8) | c;
> uint32_t tmp32 = ((uint32_t)tmp16 << 16) | tmp16;
> uint64_t v = ((uint64_t)tmp32 << 32) | tmp32;
>
> /* Fast path: small fills are covered by overlapping stores alone. */
> if (n <= 32)
> {
> memset_c_unaligned_32( dst, v, n );
> return dst;
> }
>
> /* Fill from the end of the buffer downward, 48 bytes per iteration. */
> while (n >= 48)
> {
> *(uint64_t*)((char *)dst + n - 8) = v;
> *(uint64_t*)((char *)dst + n - 16) = v;
> *(uint64_t*)((char *)dst + n - 24) = v;
> *(uint64_t*)((char *)dst + n - 32) = v;
> *(uint64_t*)((char *)dst + n - 40) = v;
> *(uint64_t*)((char *)dst + n - 48) = v;
> n -= 48;
> }
>
> /* At most one iteration runs here (n < 48 on entry). */
> while (n >= 24)
> {
> *(uint64_t*)((char *)dst + n - 8) = v;
> *(uint64_t*)((char *)dst + n - 16) = v;
> *(uint64_t*)((char *)dst + n - 24) = v;
> n -= 24;
> }
>
> /* The first n (< 24) bytes are still unwritten; finish them. */
> memset_c_unaligned_32( dst, v, n );
> return dst;
> }
Cheers,
--
Rémi Bernon <rbernon at codeweavers.com>
More information about the wine-devel
mailing list