[PATCH] msvcrt: Avoid disallowed unaligned writes in memset on ARM
Rémi Bernon
rbernon at codeweavers.com
Wed Sep 15 16:55:21 CDT 2021
On 9/15/21 10:27 PM, Martin Storsjö wrote:
> From: Martin Storsjö <martin at martin.st>
>
> This fixes a regression in memset on ARM since
> 7b17d7081512db52ef852705445762ac4016c29f.
>
> ARM can do 64 bit writes with the STRD instruction, but that
> instruction requires a 32 bit aligned address - while these stores
> are unaligned.
>
> Two consecutive stores to uint32_t* pointers can also be fused
> into one single STRD, as a uint32_t* is supposed to be properly
> aligned - therefore, do these stores as stores to volatile uint32_t*
> to avoid fusing them.
>
> Signed-off-by: Martin Storsjö <martin at martin.st>
> ---
> dlls/msvcrt/string.c | 29 +++++++++++++++++++++++++++++
> 1 file changed, 29 insertions(+)
>
> diff --git a/dlls/msvcrt/string.c b/dlls/msvcrt/string.c
> index f2b1b4a5b11..bf491a91f40 100644
> --- a/dlls/msvcrt/string.c
> +++ b/dlls/msvcrt/string.c
> @@ -2878,15 +2878,37 @@ void *__cdecl memset(void *dst, int c, size_t n)
>
> if (n >= 16)
> {
> +#ifdef __arm__
> + *(volatile uint32_t *)(d + 0) = v;
> + *(volatile uint32_t *)(d + 4) = v;
> + *(volatile uint32_t *)(d + 8) = v;
> + *(volatile uint32_t *)(d + 12) = v;
> + *(volatile uint32_t *)(d + n - 16) = v;
> + *(volatile uint32_t *)(d + n - 12) = v;
> + *(volatile uint32_t *)(d + n - 8) = v;
> + *(volatile uint32_t *)(d + n - 4) = v;
> +#else
> *(uint64_t *)(d + 0) = v;
> *(uint64_t *)(d + 8) = v;
> *(uint64_t *)(d + n - 16) = v;
> *(uint64_t *)(d + n - 8) = v;
> +#endif
> if (n <= 32) return dst;
> +#ifdef __arm__
> + *(volatile uint32_t *)(d + 16) = v;
> + *(volatile uint32_t *)(d + 20) = v;
> + *(volatile uint32_t *)(d + 24) = v;
> + *(volatile uint32_t *)(d + 28) = v;
> + *(volatile uint32_t *)(d + n - 32) = v;
> + *(volatile uint32_t *)(d + n - 28) = v;
> + *(volatile uint32_t *)(d + n - 24) = v;
> + *(volatile uint32_t *)(d + n - 20) = v;
> +#else
> *(uint64_t *)(d + 16) = v;
> *(uint64_t *)(d + 24) = v;
> *(uint64_t *)(d + n - 32) = v;
> *(uint64_t *)(d + n - 24) = v;
> +#endif
> if (n <= 64) return dst;
>
> n = (n - a) & ~0x1f;
> @@ -2895,8 +2917,15 @@ void *__cdecl memset(void *dst, int c, size_t n)
> }
> if (n >= 8)
> {
> +#ifdef __arm__
> + *(volatile uint32_t *)d = v;
> + *(volatile uint32_t *)(d + 4) = v;
> + *(volatile uint32_t *)(d + n - 4) = v;
> + *(volatile uint32_t *)(d + n - 8) = v;
> +#else
> *(uint64_t *)d = v;
> *(uint64_t *)(d + n - 8) = v;
> +#endif
> return dst;
> }
> if (n >= 4)
>
I'm confused that it causes trouble when I thought it could benefit more
than just Intel architectures...
Maybe it could be made not too ugly with some macro to wrap the 64-bit
stores, defined differently for arm?
--
Rémi Bernon <rbernon at codeweavers.com>
More information about the wine-devel
mailing list