diff --git a/ChangeLog b/ChangeLog
index 6ee1bb0aa943444f9e83fee64d6d2bb414b3250f..5f718808a3785a5bfbc61e53b7da31e95cbfa319 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,13 @@
+2014-06-01  Niels Möller  <nisse@lysator.liu.se>
+
+	* x86_64/gcm-hash8.asm: Pass correct argument count to W64_EXIT.
+	* x86_64/camellia-crypt-internal.asm: Pass correct argument count
+	to W64_ENTRY and W64_EXIT.
+
+	* x86_64/machine.m4 [W64_ABI]: Fix for the case of 6 function
+	arguments. Also push %rdi unconditionally, and use aligned
+	accesses for save and restore of %xmm registers (movdqa).
+
 2014-05-31  Niels Möller  <nisse@lysator.liu.se>
 
 	* configure.ac: Check for COFF type directives.
diff --git a/x86_64/README b/x86_64/README
index ae693be55b3591be1542d2ba87b1c7fb332655e7..d04e5dfc8d9481c9685dda77fb64db6f48bae934 100644
--- a/x86_64/README
+++ b/x86_64/README
@@ -4,6 +4,8 @@ Up to 6 integer and pointer arguments are passed in registers. Nine
 registers, %rax, %rcx, %rdx, %rsi, %rdi and %r8-%r11 can be used
 freely. Integers and pointers are returned in %rax.
 
+At entry, it is required that %rsp == 8 (mod 16).
+
 Registers	May be		Argument
 		clobbered	number
 
@@ -51,14 +53,19 @@ Additional arguments are passed on the stack. "backing store" on the
 stack for the four register arguments is also required. %xmm6 to
 %xmm15 are callee-saved. The "long" type is just 32 bits.
 
-If we have five arguments, and push the additional callee-save
+If we have six arguments, and push the additional callee-save
 registers %rdi and %rsi on the stack, we get a stack frame like
 
+64(%rsp): Sixth argument
 56(%rsp): Fifth argument
 48(%rsp): Space for fourth argument
 40(%rsp): Space for third argument
 32(%rsp): Space for second argument
 24(%rsp): Space for first argument
 16(%rsp): Return address
-8(%rsp): Saved %rsi
-(%rsp) : Saved %rdi
+8(%rsp) : Saved %rdi
+(%rsp): Saved %rsi
+
+If, in addition, we use more than 6 %xmm registers, we push them
+*after* %rdi (but before %rsi), so that they are stored at 16-byte
+aligned addresses.
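For a six-argument function that saves no extra %xmm registers, the frame
described in the README corresponds to an entry sequence of roughly the
following shape. This is a hand-written sketch of what the patched
W64_ENTRY(6, 0) is intended to emit, not a verbatim m4 expansion:

	push	%rdi		C %rsp was 8 (mod 16), now 16-byte aligned
	mov	%rcx, %rdi	C first argument
	push	%rsi		C breaks 16-byte alignment again
	mov	%rdx, %rsi	C second argument
	mov	%r8, %rdx	C third argument
	mov	%r9, %rcx	C fourth argument
	mov	56(%rsp), %r8	C fifth argument, from the caller's stack
	mov	64(%rsp), %r9	C sixth argument, from the caller's stack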
diff --git a/x86_64/camellia-crypt-internal.asm b/x86_64/camellia-crypt-internal.asm
index c6032a5d11d024377c7e7c7769de8403111a64e8..040e030f168f4094e5fb26579ca09c1d7bab03c1 100644
--- a/x86_64/camellia-crypt-internal.asm
+++ b/x86_64/camellia-crypt-internal.asm
@@ -138,7 +138,7 @@ C xorl XREG(TMP), XREG($1)
 
 	ALIGN(16)
 PROLOGUE(_nettle_camellia_crypt)
-	W64_ENTRY(5, 0)
+	W64_ENTRY(6, 0)
 	test	LENGTH, LENGTH
 	jz	.Lend
 
@@ -197,6 +197,6 @@ PROLOGUE(_nettle_camellia_crypt)
 	pop	%rbp
 	pop	%rbx
 .Lend:
-	W64_EXIT(5, 0)
+	W64_EXIT(6, 0)
 	ret
 EPILOGUE(_nettle_camellia_crypt)
diff --git a/x86_64/gcm-hash8.asm b/x86_64/gcm-hash8.asm
index f74f2f4b061648a3fd3eaa6c899016770f474999..6dec3b8c54a700b6f7779a36d9e11d2163dec3a2 100644
--- a/x86_64/gcm-hash8.asm
+++ b/x86_64/gcm-hash8.asm
@@ -162,7 +162,7 @@ ALIGN(16)
 	pop	%r12
 	pop	%rbp
 	pop	%rbx
-	W64_EXIT(2, 0)
+	W64_EXIT(4, 0)
 	ret
 
 .Lpartial:
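_nettle_camellia_crypt takes six arguments, so the old W64_ENTRY(5, 0) never
loaded the sixth one on W64: %r9 still held the caller's fourth argument when
the function body read it as the sixth. With the patched machine.m4 below, the
expansions of W64_ENTRY(5, 0) and W64_ENTRY(6, 0) (with no %xmm saves) differ
by exactly one instruction, sketched here:

	mov	64(%rsp), %r9	C sixth argument -> %r9

The gcm-hash8.asm change is about consistency rather than emitted code: the
function takes four arguments (so presumably W64_ENTRY(4, 0) at its entry
point), and with xmm_used = 0 the expansions of W64_EXIT(2, 0) and
W64_EXIT(4, 0) happen to be identical (pop %rsi; pop %rdi).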
diff --git a/x86_64/machine.m4 b/x86_64/machine.m4
index b9556a27b26005629209097503e8dcfbf7f621de..397e9b25840b4d47bbd122c466bd9caa3ddc5da8 100644
--- a/x86_64/machine.m4
+++ b/x86_64/machine.m4
@@ -67,44 +67,48 @@ define(<XREG>,<ifelse(
 dnl W64_ENTRY(nargs, xmm_used)
 define(<W64_ENTRY>, <
   changequote([,])dnl
-  ifelse(<<<<<<<<<<<<<<<< ignored; only for balancing)
+  ifelse(<<<<<<<<<<<<<<<<<< ignored; only for balancing)
   ifelse(W64_ABI,yes,[
+  dnl unconditionally push %rdi, making %rsp 16-byte aligned
+  push	%rdi
+  dnl Save %xmm6, ..., if needed
   ifelse(eval($2 > 6), 1, [
-  sub	[$]eval(8 + 16*($2 - 6)), %rsp
-  movdqu	%xmm6, 0(%rsp)
+  sub	[$]eval(16*($2 - 6)), %rsp
+  movdqa	%xmm6, 0(%rsp)
   ])
   ifelse(eval($2 > 7), 1, [
-  movdqu	%xmm7, 16(%rsp)
+  movdqa	%xmm7, 16(%rsp)
   ])
   ifelse(eval($2 > 8), 1, [
-  movdqu	%xmm8, 32(%rsp)
+  movdqa	%xmm8, 32(%rsp)
   ])
   ifelse(eval($2 > 9), 1, [
-  movdqu	%xmm9, 48(%rsp)
+  movdqa	%xmm9, 48(%rsp)
   ])
   ifelse(eval($2 > 10), 1, [
-  movdqu	%xmm10, 64(%rsp)
+  movdqa	%xmm10, 64(%rsp)
   ])
   ifelse(eval($2 > 11), 1, [
-  movdqu	%xmm11, 80(%rsp)
+  movdqa	%xmm11, 80(%rsp)
   ])
   ifelse(eval($2 > 12), 1, [
-  movdqu	%xmm12, 96(%rsp)
+  movdqa	%xmm12, 96(%rsp)
   ])
   ifelse(eval($2 > 13), 1, [
-  movdqu	%xmm13, 112(%rsp)
+  movdqa	%xmm13, 112(%rsp)
   ])
   ifelse(eval($2 > 14), 1, [
-  movdqu	%xmm14, 128(%rsp)
+  movdqa	%xmm14, 128(%rsp)
   ])
   ifelse(eval($2 > 15), 1, [
-  movdqu	%xmm15, 144(%rsp)
+  movdqa	%xmm15, 144(%rsp)
   ])
+  dnl Move around arguments
   ifelse(eval($1 >= 1), 1, [
-  push	%rdi
   mov	%rcx, %rdi
   ])
   ifelse(eval($1 >= 2), 1, [
+  dnl NOTE: Breaks 16-byte %rsp alignment
   push	%rsi
   mov	%rdx, %rsi
   ])
@@ -115,11 +119,10 @@ define(<W64_ENTRY>, <
   mov	%r9, %rcx
   ])
   ifelse(eval($1 >= 5), 1, [
-  ifelse(eval($2 > 6), 1, [
-  mov	eval(8 + 16*($2 - 6) + 56)(%rsp), %r8
-  ], [
-  mov	56(%rsp), %r8
-  ])
+  mov	ifelse(eval($2 > 6), 1, eval(16*($2-6)+56),56)(%rsp), %r8
+  ])
+  ifelse(eval($1 >= 6), 1, [
+  mov	ifelse(eval($2 > 6), 1, eval(16*($2-6)+64),64)(%rsp), %r9
   ])
   ])
 changequote(<,>)dnl
@@ -128,45 +131,43 @@ define(<W64_EXIT>, <
   changequote([,])dnl
-  ifelse(<<<<<<<<<<<< ignored; only for balancing)
+  ifelse(<<<<<<<<<<< ignored; only for balancing)
   ifelse(W64_ABI,yes,[
   ifelse(eval($1 >= 2), 1, [
   pop	%rsi
-  ])
-  ifelse(eval($1 >= 1), 1, [
-  pop	%rdi
-  ])
+  ])
   ifelse(eval($2 > 15), 1, [
-  movdqu	144(%rsp), %xmm15
+  movdqa	144(%rsp), %xmm15
   ])
   ifelse(eval($2 > 14), 1, [
-  movdqu	128(%rsp), %xmm14
+  movdqa	128(%rsp), %xmm14
   ])
   ifelse(eval($2 > 13), 1, [
-  movdqu	112(%rsp), %xmm13
+  movdqa	112(%rsp), %xmm13
   ])
   ifelse(eval($2 > 12), 1, [
-  movdqu	96(%rsp), %xmm12
+  movdqa	96(%rsp), %xmm12
   ])
   ifelse(eval($2 > 11), 1, [
-  movdqu	80(%rsp), %xmm11
+  movdqa	80(%rsp), %xmm11
   ])
   ifelse(eval($2 > 10), 1, [
-  movdqu	64(%rsp), %xmm10
+  movdqa	64(%rsp), %xmm10
   ])
   ifelse(eval($2 > 9), 1, [
-  movdqu	48(%rsp), %xmm9
+  movdqa	48(%rsp), %xmm9
   ])
   ifelse(eval($2 > 8), 1, [
-  movdqu	32(%rsp), %xmm8
+  movdqa	32(%rsp), %xmm8
   ])
   ifelse(eval($2 > 7), 1, [
-  movdqu	16(%rsp), %xmm7
+  movdqa	16(%rsp), %xmm7
   ])
   ifelse(eval($2 > 6), 1, [
-  movdqu	0(%rsp), %xmm6
-  add	[$]eval(8 + 16*($2 - 6)), %rsp
+  movdqa	(%rsp), %xmm6
+  add	[$]eval(16*($2 - 6)), %rsp
   ])
+  pop	%rdi
   ])
 changequote(<,>)dnl
 >)
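Taken together, for a three-argument function that uses %xmm6 and %xmm7 the
patched macros give prologue and epilogue code of roughly the following shape
(an illustrative, hand-expanded W64_ENTRY(3, 8) / W64_EXIT(3, 8); after m4
expansion, [$]eval(...) becomes a plain immediate):

	C W64_ENTRY(3, 8)
	push	%rdi			C %rsp now 16-byte aligned
	sub	$32, %rsp		C room for %xmm6-%xmm7, alignment kept
	movdqa	%xmm6, 0(%rsp)		C aligned stores, hence movdqa
	movdqa	%xmm7, 16(%rsp)
	mov	%rcx, %rdi		C first argument
	push	%rsi			C NOTE: breaks 16-byte alignment
	mov	%rdx, %rsi		C second argument
	mov	%r8, %rdx		C third argument

	C W64_EXIT(3, 8)
	pop	%rsi			C restores 16-byte alignment
	movdqa	16(%rsp), %xmm7		C aligned loads
	movdqa	(%rsp), %xmm6
	add	$32, %rsp
	pop	%rdi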