Rémi Bernon : msvcrt: Improve memset performance using overlapping stores.

Alexandre Julliard julliard at winehq.org
Tue Sep 14 16:00:14 CDT 2021


Module: wine
Branch: master
Commit: 7b17d7081512db52ef852705445762ac4016c29f
URL:    https://source.winehq.org/git/wine.git/?a=commit;h=7b17d7081512db52ef852705445762ac4016c29f

Author: Rémi Bernon <rbernon at codeweavers.com>
Date:   Tue Sep 14 16:28:14 2021 +0200

msvcrt: Improve memset performance using overlapping stores.

For n of at least 16 we store 16 bytes on each end of the buffer,
possibly overlapping, and then 16 additional bytes on each end for n > 32.

Then we can find a 32-byte aligned range overlapping the remaining part
of the destination buffer, which is filled 32 bytes at a time in a loop.

Signed-off-by: Rémi Bernon <rbernon at codeweavers.com>
Signed-off-by: Piotr Caban <piotr at codeweavers.com>
Signed-off-by: Alexandre Julliard <julliard at winehq.org>

---

 dlls/msvcrt/string.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 57 insertions(+), 3 deletions(-)

diff --git a/dlls/msvcrt/string.c b/dlls/msvcrt/string.c
index 4d09405094d..f2b1b4a5b11 100644
--- a/dlls/msvcrt/string.c
+++ b/dlls/msvcrt/string.c
@@ -2855,13 +2855,67 @@ void * __cdecl memcpy(void *dst, const void *src, size_t n)
     return memmove(dst, src, n);
 }
 
+static inline void memset_aligned_32(unsigned char *d, uint64_t v, size_t n) /* store the 64-bit pattern v over the end of [d, d + n) in 32-byte steps; memset below passes a 32-byte aligned d and n as a multiple of 32 */
+{
+    while (n >= 32) /* fewer than 32 remaining bytes are left unwritten by this helper */
+    {
+        *(uint64_t *)(d + n - 32) = v; /* four 8-byte stores cover the last 32 bytes of the remaining range, so the fill proceeds from the end toward d */
+        *(uint64_t *)(d + n - 24) = v;
+        *(uint64_t *)(d + n - 16) = v;
+        *(uint64_t *)(d + n -  8) = v;
+        n -= 32; /* drop the 32 bytes just written */
+    }
+}
+
 /*********************************************************************
  *		    memset (MSVCRT.@)
  */
-void* __cdecl memset(void *dst, int c, size_t n)
+void *__cdecl memset(void *dst, int c, size_t n) /* set the n bytes at dst to (unsigned char)c and return dst */
 {
-    volatile unsigned char *d = dst;  /* avoid gcc optimizations */
-    while (n--) *d++ = c;
+    uint64_t v = 0x101010101010101ull * (unsigned char)c; /* replicate the byte value into all eight bytes of v */
+    unsigned char *d = (unsigned char *)dst; /* NOTE(review): the wide stores below go through casted pointers at addresses that need not be aligned — confirm every target tolerates unaligned access */
+    size_t a = 0x20 - ((uintptr_t)d & 0x1f); /* bytes from d up to the next 32-byte boundary, in 1..32 (32 when d is already aligned) */
+
+    if (n >= 16)
+    {
+        *(uint64_t *)(d + 0) = v; /* first 16 bytes */
+        *(uint64_t *)(d + 8) = v;
+        *(uint64_t *)(d + n - 16) = v; /* last 16 bytes; overlaps the stores above when n < 32 */
+        *(uint64_t *)(d + n - 8) = v;
+        if (n <= 32) return dst; /* 16 bytes from each end already cover the whole buffer */
+        *(uint64_t *)(d + 16) = v; /* extend the head to the first 32 bytes */
+        *(uint64_t *)(d + 24) = v;
+        *(uint64_t *)(d + n - 32) = v; /* extend the tail to the last 32 bytes */
+        *(uint64_t *)(d + n - 24) = v;
+        if (n <= 64) return dst; /* 32 bytes from each end already cover the whole buffer */
+
+        n = (n - a) & ~0x1f; /* length remaining past the aligned start d + a, rounded down to a multiple of 32 */
+        memset_aligned_32(d + a, v, n); /* the a <= 32 leading bytes and the < 32 trailing bytes dropped by the rounding were written by the stores above */
+        return dst;
+    }
+    if (n >= 8) /* 8..15 bytes: two 8-byte stores that may overlap */
+    {
+        *(uint64_t *)d = v;
+        *(uint64_t *)(d + n - 8) = v;
+        return dst;
+    }
+    if (n >= 4) /* 4..7 bytes: two 4-byte stores that may overlap */
+    {
+        *(uint32_t *)d = v; /* the assignment keeps only the low 32 bits of v */
+        *(uint32_t *)(d + n - 4) = v;
+        return dst;
+    }
+    if (n >= 2) /* 2..3 bytes: two 2-byte stores that may overlap */
+    {
+        *(uint16_t *)d = v; /* the assignment keeps only the low 16 bits of v */
+        *(uint16_t *)(d + n - 2) = v;
+        return dst;
+    }
+    if (n >= 1) /* exactly one byte */
+    {
+        *(uint8_t *)d = v;
+        return dst;
+    }
     return dst;
 }
 




More information about the wine-cvs mailing list