Testing: gcc -foutline-msabi-xlogues patch
Daniel Santos
daniel.santos at pobox.com
Sat Oct 22 23:41:51 CDT 2016
I've been working on a gcc patch targeting 64-bit Wine to reduce the
impact of the implicit register clobbers when a Microsoft x64 ABI function
calls a System V ABI function and I appear to finally have a working
implementation. However, I cannot get recent Wine tests to complete with
an unpatched compiler (I have 30 failures, 4 of which are crashes, and I
even get some dmesgs from drivers.) While the same Wine tests pass both
with and without the new optimization, I could use some external testing
to make sure there's no flaws that I've missed (especially a full test
with no failures).
The patch set is not actually complete as my unit tests didn't catch
many of the flaws (now fixed) that I caught when building real Wine, so
I'm re-working them. I have my current version up on github:
https://github.com/daniel-santos/gcc/tree/gcc-5_4_0-outline-msabi-xlogues.
Background
For those unfamiliar, the differences between the two ABIs requires an
ms_abi function to save and restore 12 registers: RSI, RDI and XMM6-15.
The size of pushing/popping RSI and RDI is insignificant, but saving and
restoring 10 SSE registers takes between 94 and 174 bytes per function
(dependent upon the SP offset), thereby bloating ".text" size quite a bit.
Details
This patch uses out-of-line stubs for register saves/restores to
reduce .text size. The *theory* (not yet proven) is that the reduction
of instruction cache misses will offset the extra few instructions
required to facilitate this. But while we're going this far, the
optimization also saves/restores an additional 6 non-volatile ms_abi
registers if they are clobbered in the function: RBX, RBP and R12-15.
This extra step actually saves very little ".text" code, but the stub
implements them using MOVs which the CPU is better able to parallelize.
While MOV instructions are larger (4 bytes vs 1 or 2), they only appear
in the stubs, theoretically gaining the performance benefits of MOVs
without paying for the cost of the extra size.
This optimization works both with and without forced stack realignment
(https://bugs.winehq.org/show_bug.cgi?id=27680), although more can be
done to produce better code in the realigned case. In the forced
realignment case, this optimization reduces the .text size of Wine 64 by 19%
and in the normal (aligned stack) case by 22%. As an example, here is a
test function (with stubs) built with -foutline-msabi-xlogues (but no
realignment):
__attribute__ ((noinline)) long sysv_fn0 ()
{
return 42;
}
__attribute__ ((ms_abi, noinline)) long msabifn0_6 ()
{
__asm__ __volatile__ ("" ::: "rbx", "rbp", "r12", "r13", "r14", "r15");
return sysv_fn0 ();
}
0000000000401000 <msabifn0_6>:
401000: 48 8d 44 24 88 lea -0x78(%rsp),%rax
401005: 48 81 ec f8 00 00 00 sub $0xf8,%rsp
40100c: e8 09 e8 01 00 callq 41f81a <__msabi_save_18>
401011: 31 c0 xor %eax,%eax
401013: e8 48 fc ff ff callq 400c60 <sysv_fn0>
401018: 48 8d b4 24 80 00 00 lea 0x80(%rsp),%rsi
40101f: 00
401020: 4c 8d 56 78 lea 0x78(%rsi),%r10
401024: e9 41 e8 01 00 jmpq 41f86a
<__msabi_restore_ret_18>
000000000041f81a <__msabi_save_18>:
41f81a: 4c 89 78 90 mov %r15,-0x70(%rax)
000000000041f81e <__msabi_save_17>:
41f81e: 4c 89 70 98 mov %r14,-0x68(%rax)
000000000041f822 <__msabi_save_16>:
41f822: 4c 89 68 a0 mov %r13,-0x60(%rax)
000000000041f826 <__msabi_save_15>:
41f826: 4c 89 60 a8 mov %r12,-0x58(%rax)
000000000041f82a <__msabi_save_14>:
41f82a: 48 89 68 b0 mov %rbp,-0x50(%rax)
000000000041f82e <__msabi_save_13>:
41f82e: 48 89 58 b8 mov %rbx,-0x48(%rax)
000000000041f832 <__msabi_save_12>:
41f832: 48 89 78 c0 mov %rdi,-0x40(%rax)
41f836: 48 89 70 c8 mov %rsi,-0x38(%rax)
41f83a: 44 0f 29 78 d0 movaps %xmm15,-0x30(%rax)
41f83f: 44 0f 29 70 e0 movaps %xmm14,-0x20(%rax)
41f844: 44 0f 29 68 f0 movaps %xmm13,-0x10(%rax)
41f849: 44 0f 29 20 movaps %xmm12,(%rax)
41f84d: 44 0f 29 58 10 movaps %xmm11,0x10(%rax)
41f852: 44 0f 29 50 20 movaps %xmm10,0x20(%rax)
41f857: 44 0f 29 48 30 movaps %xmm9,0x30(%rax)
41f85c: 44 0f 29 40 40 movaps %xmm8,0x40(%rax)
41f861: 0f 29 78 50 movaps %xmm7,0x50(%rax)
41f865: 0f 29 70 60 movaps %xmm6,0x60(%rax)
41f869: c3 retq
000000000041f86a <__msabi_restore_ret_18>:
41f86a: 4c 8b 7e 90 mov -0x70(%rsi),%r15
000000000041f86e <__msabi_restore_ret_17>:
41f86e: 4c 8b 76 98 mov -0x68(%rsi),%r14
000000000041f872 <__msabi_restore_ret_16>:
41f872: 4c 8b 6e a0 mov -0x60(%rsi),%r13
000000000041f876 <__msabi_restore_ret_15>:
41f876: 4c 8b 66 a8 mov -0x58(%rsi),%r12
000000000041f87a <__msabi_restore_ret_14>:
41f87a: 48 8b 6e b0 mov -0x50(%rsi),%rbp
000000000041f87e <__msabi_restore_ret_13>:
41f87e: 48 8b 5e b8 mov -0x48(%rsi),%rbx
000000000041f882 <__msabi_restore_ret_12>:
41f882: 48 8b 7e c0 mov -0x40(%rsi),%rdi
41f886: 44 0f 28 7e d0 movaps -0x30(%rsi),%xmm15
41f88b: 44 0f 28 76 e0 movaps -0x20(%rsi),%xmm14
41f890: 44 0f 28 6e f0 movaps -0x10(%rsi),%xmm13
41f895: 44 0f 28 26 movaps (%rsi),%xmm12
41f899: 44 0f 28 5e 10 movaps 0x10(%rsi),%xmm11
41f89e: 44 0f 28 56 20 movaps 0x20(%rsi),%xmm10
41f8a3: 44 0f 28 4e 30 movaps 0x30(%rsi),%xmm9
41f8a8: 44 0f 28 46 40 movaps 0x40(%rsi),%xmm8
41f8ad: 0f 28 7e 50 movaps 0x50(%rsi),%xmm7
41f8b1: 0f 28 76 60 movaps 0x60(%rsi),%xmm6
41f8b5: 48 8b 76 c8 mov -0x38(%rsi),%rsi
41f8b9: 4c 89 d4 mov %r10,%rsp
41f8bc: c3 retq
Here is a similar function with forced realignment. This can be improved
with an additional "have frame pointer" set of stubs.
__attribute__ ((ms_abi, noinline, force_align_arg_pointer)) long
msabifn0_5 ()
{
__asm__ __volatile__ ( "" ::: "rbx", "r12", "r13", "r14", "r15" );
return sysv_fn0 ();
}
0000000000400fc0 <msabifn0_5>:
400fc0: 55 push %rbp
400fc1: 48 8d 44 24 90 lea -0x70(%rsp),%rax
400fc6: 48 89 e5 mov %rsp,%rbp
400fc9: 48 83 e0 f0 and $0xfffffffffffffff0,%rax
400fcd: 48 8d 60 b0 lea -0x50(%rax),%rsp
400fd1: e8 78 e8 01 00 callq 41f84e <__msabi_save_13>
400fd6: 41 57 push %r15
400fd8: 41 56 push %r14
400fda: 41 55 push %r13
400fdc: 41 54 push %r12
400fde: 31 c0 xor %eax,%eax
400fe0: e8 7b fc ff ff callq 400c60 <sysv_fn0>
400fe5: 48 8d a5 20 ff ff ff lea -0xe0(%rbp),%rsp
400fec: 41 5c pop %r12
400fee: 41 5d pop %r13
400ff0: 41 5e pop %r14
400ff2: 41 5f pop %r15
400ff4: 48 8d 74 24 50 lea 0x50(%rsp),%rsi
400ff9: e8 a0 e8 01 00 callq 41f89e <__msabi_restore_13>
400ffe: 48 81 c4 c0 00 00 00 add $0xc0,%rsp
401005: 5d pop %rbp
401006: c3 retq
Daniel
More information about the wine-devel
mailing list