[PATCH vkd3d 3/4] vkd3d: Replace descriptor mutex with per-descriptor spinlocks.

Conor McCarthy cmccarthy at codeweavers.com
Tue Sep 21 01:00:09 CDT 2021


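Greatly improves performance in games that update or copy a large
number of descriptors per frame. The overhead of pthread_mutex_lock()
and pthread_mutex_unlock() is high when they are called once per
descriptor, so the hashed descriptor mutexes are replaced with a
lightweight spinlock embedded in each descriptor.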

Based on vkd3d-proton patches by Hans-Kristian Arntzen, Philip
Rebohle and Georg Lehmann.

Signed-off-by: Conor McCarthy <cmccarthy at codeweavers.com>
---
 configure.ac                   |  1 +
 include/private/vkd3d_common.h | 33 +++++++++++++++++++++++++++++++++
 libs/vkd3d/device.c            | 10 +---------
 libs/vkd3d/resource.c          | 18 ++++++++----------
 libs/vkd3d/vkd3d_private.h     | 17 ++---------------
 5 files changed, 45 insertions(+), 34 deletions(-)

diff --git a/configure.ac b/configure.ac
index d296dfd4..546b6c57 100644
--- a/configure.ac
+++ b/configure.ac
@@ -131,6 +131,7 @@ VKD3D_CHECK_FUNC([HAVE_BUILTIN_POPCOUNT], [__builtin_popcount], [__builtin_popco
 VKD3D_CHECK_FUNC([HAVE_BUILTIN_ADD_OVERFLOW], [__builtin_add_overflow], [__builtin_add_overflow(0, 0, (int *)0)])
 VKD3D_CHECK_FUNC([HAVE_SYNC_ADD_AND_FETCH], [__sync_add_and_fetch], [__sync_add_and_fetch((int *)0, 0)])
 VKD3D_CHECK_FUNC([HAVE_SYNC_SUB_AND_FETCH], [__sync_sub_and_fetch], [__sync_sub_and_fetch((int *)0, 0)])
+VKD3D_CHECK_FUNC([HAVE_SYNC_LOCK_TEST_AND_SET], [__sync_lock_test_and_set], [__sync_lock_test_and_set((int *)0, 0)])
 
 VKD3D_CHECK_PTHREAD_SETNAME_NP
 
diff --git a/include/private/vkd3d_common.h b/include/private/vkd3d_common.h
index 8d1ca397..1ae43d8f 100644
--- a/include/private/vkd3d_common.h
+++ b/include/private/vkd3d_common.h
@@ -28,6 +28,10 @@
 #include <stdbool.h>
 #include <stdint.h>
 
+#ifdef __SSE2__
+#include <emmintrin.h>
+#endif
+
 #ifdef _MSC_VER
 #include <intrin.h>
 #endif
@@ -211,6 +215,18 @@ static inline LONG InterlockedDecrement(LONG volatile *x)
 # else
 #  error "InterlockedDecrement() not implemented for this platform"
 # endif
+
+# if HAVE_SYNC_LOCK_TEST_AND_SET
+static inline LONG InterlockedExchange(LONG volatile *ptr, LONG val)
+{
+    return __sync_lock_test_and_set(ptr, val);
+}
+#  define vkd3d_spinlock_unlock(lock) __sync_lock_release(lock)
+# else
+#  error "spinlocks not implemented for this platform"
+# endif /* HAVE_SYNC_LOCK_TEST_AND_SET */
+#else
+# define vkd3d_spinlock_unlock(lock) InterlockedExchange(lock, 0)
 #endif  /* _WIN32 */
 
 #if HAVE_SYNC_ADD_AND_FETCH
@@ -222,6 +238,23 @@ static inline LONG InterlockedDecrement(LONG volatile *x)
 # error "atomic_add_fetch() not implemented for this platform"
 #endif  /* HAVE_SYNC_ADD_AND_FETCH */
 
+typedef LONG vkd3d_spinlock_t;
+
+static inline void vkd3d_spinlock_acquire(vkd3d_spinlock_t *lock)
+{
+    while (InterlockedExchange(lock, 1))
+    {
+#ifdef __SSE2__
+    _mm_pause();
+#endif
+    }
+}
+
+static inline void vkd3d_spinlock_release(vkd3d_spinlock_t *lock)
+{
+    vkd3d_spinlock_unlock(lock);
+}
+
 static inline void vkd3d_parse_version(const char *version, int *major, int *minor)
 {
     *major = atoi(version);
diff --git a/libs/vkd3d/device.c b/libs/vkd3d/device.c
index 0fadb521..28a01331 100644
--- a/libs/vkd3d/device.c
+++ b/libs/vkd3d/device.c
@@ -2213,7 +2213,6 @@ static ULONG STDMETHODCALLTYPE d3d12_device_Release(ID3D12Device *iface)
 {
     struct d3d12_device *device = impl_from_ID3D12Device(iface);
     ULONG refcount = InterlockedDecrement(&device->refcount);
-    size_t i;
 
     TRACE("%p decreasing refcount to %u.\n", device, refcount);
 
@@ -2231,8 +2230,6 @@ static ULONG STDMETHODCALLTYPE d3d12_device_Release(ID3D12Device *iface)
         vkd3d_fence_worker_stop(&device->fence_worker, device);
         d3d12_device_destroy_pipeline_cache(device);
         d3d12_device_destroy_vkd3d_queues(device);
-        for (i = 0; i < ARRAY_SIZE(device->desc_mutex); ++i)
-            pthread_mutex_destroy(&device->desc_mutex[i]);
         VK_CALL(vkDestroyDevice(device->vk_device, NULL));
         if (device->parent)
             IUnknown_Release(device->parent);
@@ -3135,8 +3132,7 @@ static void STDMETHODCALLTYPE d3d12_device_CopyDescriptors(ID3D12Device *iface,
     struct d3d12_device *device = impl_from_ID3D12Device(iface);
     unsigned int dst_range_idx, dst_idx, src_range_idx, src_idx;
     unsigned int dst_range_size, src_range_size;
-    const struct d3d12_desc *src;
-    struct d3d12_desc *dst;
+    struct d3d12_desc *src, *dst;
 
     TRACE("iface %p, dst_descriptor_range_count %u, dst_descriptor_range_offsets %p, "
             "dst_descriptor_range_sizes %p, src_descriptor_range_count %u, "
@@ -3692,7 +3688,6 @@ static HRESULT d3d12_device_init(struct d3d12_device *device,
 {
     const struct vkd3d_vk_device_procs *vk_procs;
     HRESULT hr;
-    size_t i;
 
     device->ID3D12Device_iface.lpVtbl = &d3d12_device_vtbl;
     device->refcount = 1;
@@ -3731,9 +3726,6 @@ static HRESULT d3d12_device_init(struct d3d12_device *device,
     vkd3d_render_pass_cache_init(&device->render_pass_cache);
     vkd3d_gpu_va_allocator_init(&device->gpu_va_allocator);
 
-    for (i = 0; i < ARRAY_SIZE(device->desc_mutex); ++i)
-        pthread_mutex_init(&device->desc_mutex[i], NULL);
-
     if ((device->parent = create_info->parent))
         IUnknown_AddRef(device->parent);
 
diff --git a/libs/vkd3d/resource.c b/libs/vkd3d/resource.c
index 2c6c07c7..f3cbb684 100644
--- a/libs/vkd3d/resource.c
+++ b/libs/vkd3d/resource.c
@@ -2122,21 +2122,21 @@ void d3d12_desc_write_atomic(struct d3d12_desc *dst, const struct d3d12_desc *sr
         struct d3d12_device *device)
 {
     struct d3d12_desc destroy_desc;
-    pthread_mutex_t *mutex;
 
     destroy_desc.u.view = NULL;
 
-    mutex = d3d12_device_get_descriptor_mutex(device, dst);
-    pthread_mutex_lock(mutex);
+    vkd3d_spinlock_acquire(&dst->spinlock);
 
     /* Nothing to do for VKD3D_DESCRIPTOR_MAGIC_CBV. */
     if ((dst->magic & VKD3D_DESCRIPTOR_MAGIC_HAS_VIEW)
             && !InterlockedDecrement(&dst->u.view->refcount))
         destroy_desc = *dst;
 
-    *dst = *src;
+    dst->magic = src->magic;
+    dst->vk_descriptor_type = src->vk_descriptor_type;
+    dst->u = src->u;
 
-    pthread_mutex_unlock(mutex);
+    vkd3d_spinlock_release(&dst->spinlock);
 
     /* Destroy the view after unlocking to reduce wait time. */
     if (destroy_desc.u.view)
@@ -2150,12 +2150,11 @@ static void d3d12_desc_destroy(struct d3d12_desc *descriptor, struct d3d12_devic
     d3d12_desc_write_atomic(descriptor, &null_desc, device);
 }
 
-void d3d12_desc_copy(struct d3d12_desc *dst, const struct d3d12_desc *src,
+void d3d12_desc_copy(struct d3d12_desc *dst, struct d3d12_desc *src,
         struct d3d12_device *device)
 {
     bool needs_update = true;
     struct d3d12_desc tmp;
-    pthread_mutex_t *mutex;
 
     assert(dst != src);
 
@@ -2181,14 +2180,13 @@ void d3d12_desc_copy(struct d3d12_desc *dst, const struct d3d12_desc *src,
 
     if (needs_update)
     {
-        mutex = d3d12_device_get_descriptor_mutex(device, src);
-        pthread_mutex_lock(mutex);
+        vkd3d_spinlock_acquire(&src->spinlock);
 
         if (src->magic & VKD3D_DESCRIPTOR_MAGIC_HAS_VIEW)
             vkd3d_view_incref(src->u.view);
         tmp = *src;
 
-        pthread_mutex_unlock(mutex);
+        vkd3d_spinlock_release(&src->spinlock);
 
         d3d12_desc_write_atomic(dst, &tmp, device);
     }
diff --git a/libs/vkd3d/vkd3d_private.h b/libs/vkd3d/vkd3d_private.h
index 9829e0aa..78e503ae 100644
--- a/libs/vkd3d/vkd3d_private.h
+++ b/libs/vkd3d/vkd3d_private.h
@@ -517,6 +517,7 @@ struct d3d12_desc
 {
     uint32_t magic;
     VkDescriptorType vk_descriptor_type;
+    vkd3d_spinlock_t spinlock;
     union
     {
         VkDescriptorBufferInfo vk_cbv_info;
@@ -534,7 +535,7 @@ static inline struct d3d12_desc *d3d12_desc_from_gpu_handle(D3D12_GPU_DESCRIPTOR
     return (struct d3d12_desc *)(intptr_t)gpu_handle.ptr;
 }
 
-void d3d12_desc_copy(struct d3d12_desc *dst, const struct d3d12_desc *src, struct d3d12_device *device);
+void d3d12_desc_copy(struct d3d12_desc *dst, struct d3d12_desc *src, struct d3d12_device *device);
 void d3d12_desc_create_cbv(struct d3d12_desc *descriptor,
         struct d3d12_device *device, const D3D12_CONSTANT_BUFFER_VIEW_DESC *desc);
 void d3d12_desc_create_srv(struct d3d12_desc *descriptor,
@@ -1134,7 +1135,6 @@ struct d3d12_device
     struct vkd3d_fence_worker fence_worker;
 
     pthread_mutex_t mutex;
-    pthread_mutex_t desc_mutex[8];
     struct vkd3d_render_pass_cache render_pass_cache;
     VkPipelineCache vk_pipeline_cache;
 
@@ -1200,19 +1200,6 @@ static inline unsigned int d3d12_device_get_descriptor_handle_increment_size(str
     return ID3D12Device_GetDescriptorHandleIncrementSize(&device->ID3D12Device_iface, descriptor_type);
 }
 
-static inline pthread_mutex_t *d3d12_device_get_descriptor_mutex(struct d3d12_device *device,
-        const struct d3d12_desc *descriptor)
-{
-    STATIC_ASSERT(!(ARRAY_SIZE(device->desc_mutex) & (ARRAY_SIZE(device->desc_mutex) - 1)));
-    uintptr_t idx = (uintptr_t)descriptor;
-
-    idx ^= idx >> 12;
-    idx ^= idx >> 6;
-    idx ^= idx >> 3;
-
-    return &device->desc_mutex[idx & (ARRAY_SIZE(device->desc_mutex) - 1)];
-}
-
 /* utils */
 enum vkd3d_format_type
 {
-- 
2.32.0




More information about the wine-devel mailing list