diff --git a/ChangeLog b/ChangeLog index 34f9ac436ef1bb9785d669ad15572a5e5eb65910..e36bc8217e4b76bfb34f78820fba5d11ad6e247b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,13 @@ +2014-11-24 Niels Möller <nisse@lysator.liu.se> + + * x86_64/memxor3.asm (memxor3): New file, code moved from old + memxor.asm. + * x86_64/memxor.asm (memxor): Rewritten, no longer jumps into + memxor3. + + * configure.ac (asm_replace_list): Added memxor.asm and + memxor3.asm. + 2014-10-23 Niels Möller <nisse@lysator.liu.se> * configure.ac (IF_ASM): New substituted variable. diff --git a/configure.ac b/configure.ac index 78bcce2eec92b4e8c1cf6717181aa3cf3a389d99..76beb3160853abde4dfaafcb90fcc35a558c6d2f 100644 --- a/configure.ac +++ b/configure.ac @@ -272,7 +272,8 @@ fi # to a new object file). asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \ arcfour-crypt.asm camellia-crypt-internal.asm \ - md5-compress.asm poly1305-internal.asm \ + md5-compress.asm memxor.asm memxor3.asm \ + poly1305-internal.asm \ chacha-core-internal.asm \ salsa20-crypt.asm salsa20-core-internal.asm \ serpent-encrypt.asm serpent-decrypt.asm \ diff --git a/x86_64/memxor.asm b/x86_64/memxor.asm index e14e31a13f4617bc202bb66239703caedee4f47d..69d6cfe382a12e8a0d2ace1d1019a9db44d23f24 100644 --- a/x86_64/memxor.asm +++ b/x86_64/memxor.asm @@ -1,7 +1,7 @@ C x86_64/memxor.asm ifelse(< - Copyright (C) 2010, Niels Möller + Copyright (C) 2010, 2014, Niels Möller This file is part of GNU Nettle. @@ -32,9 +32,8 @@ ifelse(< C Register usage: define(<DST>, <%rax>) C Originally in %rdi -define(<AP>, <%rsi>) -define(<BP>, <%rdx>) -define(<N>, <%r10>) +define(<SRC>, <%rsi>) +define(<N>, <%rdx>) define(<TMP>, <%r8>) define(<TMP2>, <%r9>) define(<CNT>, <%rdi>) @@ -53,20 +52,7 @@ define(<USE_SSE2>, <no>) PROLOGUE(nettle_memxor) W64_ENTRY(3, 0) - mov %rdx, %r10 - mov %rdi, %rdx - jmp .Lmemxor3_entry -EPILOGUE(nettle_memxor) - C memxor3(void *dst, const void *a, const void *b, size_t n) - C %rdi %rsi %rdx %rcx - ALIGN(16) - -PROLOGUE(nettle_memxor3) - W64_ENTRY(4, 0) - C %cl needed for shift count, so move away N - mov %rcx, N -.Lmemxor3_entry: test N, N C Get number of unaligned bytes at the end C %rdi is used as CNT, %rax as DST and as return value @@ -87,9 +73,8 @@ PROLOGUE(nettle_memxor3) .Lalign_loop: sub $1, N - movb (AP, N), LREG(TMP) - xorb (BP, N), LREG(TMP) - movb LREG(TMP), (DST, N) + movb (SRC, N), LREG(TMP) + xorb LREG(TMP), (DST, N) sub $1, CNT jnz .Lalign_loop @@ -98,83 +83,7 @@ ifelse(USE_SSE2, yes, < cmp $16, N jnc .Lsse2_case >) - C Check for the case that AP and BP have the same alignment, - C but different from DST. 
- mov AP, TMP - sub BP, TMP - test $7, TMP - jnz .Lno_shift_case - mov AP, %rcx - sub DST, %rcx - and $7, %rcx - jz .Lno_shift_case - sub %rcx, AP - sub %rcx, BP - shl $3, %rcx - - C Unrolling, with aligned values alternating in S0 and S1 - test $8, N - jnz .Lshift_odd - mov (AP, N), S1 - xor (BP, N), S1 - jmp .Lshift_next - -.Lshift_odd: - mov -8(AP, N), S1 - mov (AP, N), S0 - xor -8(BP, N), S1 - xor (BP, N), S0 - mov S1, TMP - shr %cl, TMP - neg %cl - shl %cl, S0 - neg %cl - - or S0, TMP - mov TMP, -8(DST, N) - sub $8, N - jz .Ldone - jmp .Lshift_next - - ALIGN(16) -.Lshift_loop: - mov 8(AP, N), S0 - xor 8(BP, N), S0 - mov S0, TMP - shr %cl, TMP - neg %cl - shl %cl, S1 - neg %cl - or S1, TMP - mov TMP, 8(DST, N) - - mov (AP, N), S1 - xor (BP, N), S1 - mov S1, TMP - shr %cl, TMP - neg %cl - shl %cl, S0 - neg %cl - or S0, TMP - mov TMP, (DST, N) -.Lshift_next: - sub $16, N - C FIXME: Handle the case N == 16 specially, - C like in the non-shifted case? -C ja .Lshift_loop -C jz .Ldone - jnc .Lshift_loop - - add $15, N - jnc .Ldone - - shr $3, %rcx - add %rcx, AP - add %rcx, BP - jmp .Lfinal_loop - -.Lno_shift_case: C Next destination word is -8(DST, N) C Setup for unrolling test $8, N @@ -183,21 +92,18 @@ C jz .Ldone sub $8, N jz .Lone_word - mov (AP, N), TMP - xor (BP, N), TMP - mov TMP, (DST, N) + mov (SRC, N), TMP + xor TMP, (DST, N) jmp .Lword_next ALIGN(16) .Lword_loop: - mov 8(AP, N), TMP - mov (AP, N), TMP2 - xor 8(BP, N), TMP - xor (BP, N), TMP2 - mov TMP, 8(DST, N) - mov TMP2, (DST, N) + mov 8(SRC, N), TMP + mov (SRC, N), TMP2 + xor TMP, 8(DST, N) + xor TMP2, (DST, N) .Lword_next: sub $16, N @@ -205,33 +111,28 @@ C jz .Ldone jnz .Lfinal C Final operation is word aligned - mov 8(AP, N), TMP - xor 8(BP, N), TMP - mov TMP, 8(DST, N) + mov 8(SRC, N), TMP + xor TMP, 8(DST, N) .Lone_word: - mov (AP, N), TMP - xor (BP, N), TMP - mov TMP, (DST, N) + mov (SRC, N), TMP + xor TMP, (DST, N) - C ENTRY might have been 3 args, too, but it doesn't matter for the exit - W64_EXIT(4, 0) + W64_EXIT(3, 0) ret .Lfinal: add $15, N .Lfinal_loop: - movb (AP, N), LREG(TMP) - xorb (BP, N), LREG(TMP) - movb LREG(TMP), (DST, N) + movb (SRC, N), LREG(TMP) + xorb LREG(TMP), (DST, N) .Lfinal_next: sub $1, N jnc .Lfinal_loop .Ldone: - C ENTRY might have been 3 args, too, but it doesn't matter for the exit - W64_EXIT(4, 0) + W64_EXIT(3, 0) ret ifelse(USE_SSE2, yes, < @@ -241,15 +142,14 @@ ifelse(USE_SSE2, yes, < test $8, TMP jz .Lsse2_next sub $8, N - mov (AP, N), TMP - xor (BP, N), TMP - mov TMP, (DST, N) + mov (SRC, N), TMP + xor TMP, (DST, N) jmp .Lsse2_next ALIGN(16) .Lsse2_loop: - movdqu (AP, N), %xmm0 - movdqu (BP, N), %xmm1 + movdqu (SRC, N), %xmm0 + movdqa (DST, N), %xmm1 pxor %xmm0, %xmm1 movdqa %xmm1, (DST, N) .Lsse2_next: @@ -261,14 +161,13 @@ ifelse(USE_SSE2, yes, < jnz .Lfinal C Final operation is aligned - movdqu (AP), %xmm0 - movdqu (BP), %xmm1 + movdqu (SRC), %xmm0 + movdqa (DST), %xmm1 pxor %xmm0, %xmm1 movdqa %xmm1, (DST) - C ENTRY might have been 3 args, too, but it doesn't matter for the exit - W64_EXIT(4, 0) + + W64_EXIT(3, 0) ret >) - -EPILOGUE(nettle_memxor3) +EPILOGUE(nettle_memxor) diff --git a/x86_64/memxor3.asm b/x86_64/memxor3.asm new file mode 100644 index 0000000000000000000000000000000000000000..8ff3e79cf14ef942f78438faac5263b1d9a48d67 --- /dev/null +++ b/x86_64/memxor3.asm @@ -0,0 +1,263 @@ +C x86_64/memxor3.asm + +ifelse(< + Copyright (C) 2010, 2014 Niels Möller + + This file is part of GNU Nettle. 
+ + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +>) + +C Register usage: +define(<DST>, <%rax>) C Originally in %rdi +define(<AP>, <%rsi>) +define(<BP>, <%rdx>) +define(<N>, <%r10>) +define(<TMP>, <%r8>) +define(<TMP2>, <%r9>) +define(<CNT>, <%rdi>) +define(<S0>, <%r11>) +define(<S1>, <%rdi>) C Overlaps with CNT + +define(<USE_SSE2>, <no>) + + .file "memxor3.asm" + + .text + + C memxor3(void *dst, const void *a, const void *b, size_t n) + C %rdi %rsi %rdx %rcx + ALIGN(16) + +PROLOGUE(nettle_memxor3) + W64_ENTRY(4, 0) + C %cl needed for shift count, so move away N + mov %rcx, N +.Lmemxor3_entry: + test N, N + C Get number of unaligned bytes at the end + C %rdi is used as CNT, %rax as DST and as return value + mov %rdi, %rax + jz .Ldone + add N, CNT + and $7, CNT + + jz .Laligned + + cmp $8, N + jc .Lfinal_next + + C FIXME: Instead of this loop, could try cmov with memory + C destination, as a sequence of one 8-bit, one 16-bit and one + C 32-bit operations. (Except that cmov can't do 8-bit ops, so + C that step has to use a conditional). +.Lalign_loop: + + sub $1, N + movb (AP, N), LREG(TMP) + xorb (BP, N), LREG(TMP) + movb LREG(TMP), (DST, N) + sub $1, CNT + jnz .Lalign_loop + +.Laligned: +ifelse(USE_SSE2, yes, < + cmp $16, N + jnc .Lsse2_case +>) + C Check for the case that AP and BP have the same alignment, + C but different from DST. + mov AP, TMP + sub BP, TMP + test $7, TMP + jnz .Lno_shift_case + mov AP, %rcx + sub DST, %rcx + and $7, %rcx + jz .Lno_shift_case + sub %rcx, AP + sub %rcx, BP + shl $3, %rcx + + C Unrolling, with aligned values alternating in S0 and S1 + test $8, N + jnz .Lshift_odd + mov (AP, N), S1 + xor (BP, N), S1 + jmp .Lshift_next + +.Lshift_odd: + mov -8(AP, N), S1 + mov (AP, N), S0 + xor -8(BP, N), S1 + xor (BP, N), S0 + mov S1, TMP + shr %cl, TMP + neg %cl + shl %cl, S0 + neg %cl + + or S0, TMP + mov TMP, -8(DST, N) + sub $8, N + jz .Ldone + jmp .Lshift_next + + ALIGN(16) + +.Lshift_loop: + mov 8(AP, N), S0 + xor 8(BP, N), S0 + mov S0, TMP + shr %cl, TMP + neg %cl + shl %cl, S1 + neg %cl + or S1, TMP + mov TMP, 8(DST, N) + + mov (AP, N), S1 + xor (BP, N), S1 + mov S1, TMP + shr %cl, TMP + neg %cl + shl %cl, S0 + neg %cl + or S0, TMP + mov TMP, (DST, N) +.Lshift_next: + sub $16, N + C FIXME: Handle the case N == 16 specially, + C like in the non-shifted case? 
+C ja .Lshift_loop +C jz .Ldone + jnc .Lshift_loop + + add $15, N + jnc .Ldone + + shr $3, %rcx + add %rcx, AP + add %rcx, BP + jmp .Lfinal_loop + +.Lno_shift_case: + C Next destination word is -8(DST, N) + C Setup for unrolling + test $8, N + jz .Lword_next + + sub $8, N + jz .Lone_word + + mov (AP, N), TMP + xor (BP, N), TMP + mov TMP, (DST, N) + + jmp .Lword_next + + ALIGN(16) + +.Lword_loop: + mov 8(AP, N), TMP + mov (AP, N), TMP2 + xor 8(BP, N), TMP + xor (BP, N), TMP2 + mov TMP, 8(DST, N) + mov TMP2, (DST, N) + +.Lword_next: + sub $16, N + ja .Lword_loop C Not zero and no carry + jnz .Lfinal + + C Final operation is word aligned + mov 8(AP, N), TMP + xor 8(BP, N), TMP + mov TMP, 8(DST, N) + +.Lone_word: + mov (AP, N), TMP + xor (BP, N), TMP + mov TMP, (DST, N) + + C ENTRY might have been 3 args, too, but it doesn't matter for the exit + W64_EXIT(4, 0) + ret + +.Lfinal: + add $15, N + +.Lfinal_loop: + movb (AP, N), LREG(TMP) + xorb (BP, N), LREG(TMP) + movb LREG(TMP), (DST, N) +.Lfinal_next: + sub $1, N + jnc .Lfinal_loop + +.Ldone: + C ENTRY might have been 3 args, too, but it doesn't matter for the exit + W64_EXIT(4, 0) + ret + +ifelse(USE_SSE2, yes, < + +.Lsse2_case: + lea (DST, N), TMP + test $8, TMP + jz .Lsse2_next + sub $8, N + mov (AP, N), TMP + xor (BP, N), TMP + mov TMP, (DST, N) + jmp .Lsse2_next + + ALIGN(16) +.Lsse2_loop: + movdqu (AP, N), %xmm0 + movdqu (BP, N), %xmm1 + pxor %xmm0, %xmm1 + movdqa %xmm1, (DST, N) +.Lsse2_next: + sub $16, N + ja .Lsse2_loop + + C FIXME: See if we can do a full word first, before the + C byte-wise final loop. + jnz .Lfinal + + C Final operation is aligned + movdqu (AP), %xmm0 + movdqu (BP), %xmm1 + pxor %xmm0, %xmm1 + movdqa %xmm1, (DST) + C ENTRY might have been 3 args, too, but it doesn't matter for the exit + W64_EXIT(4, 0) + ret +>) + + +EPILOGUE(nettle_memxor3)
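
For readers of the patch: after the split, nettle_memxor (memxor.asm) implements the two-operand
operation dst ^= src in place, while nettle_memxor3 (the new memxor3.asm) keeps the three-operand
form dst = a ^ b. Both return DST in %rax and index the buffers from the end (offset N) down
towards the start, handling the unaligned tail bytes first. The C sketch below illustrates only
these semantics; it is not the optimized implementation, it ignores the word-wise unrolling, the
alignment-shift path and the optional SSE2 path above, and the *_ref names are illustrative. The
prototypes follow the argument comments in the assembly (memxor takes 3 arguments, memxor3 takes 4,
both return dst).

    /* Reference semantics only: a byte-wise sketch of what the two
       assembly routines compute.  Illustrative names, not Nettle API. */
    #include <stddef.h>

    /* dst[i] ^= src[i] for 0 <= i < n; returns dst (DST / %rax above).
       Walks from the end of the buffers, like the assembly does. */
    void *
    memxor_ref(void *dst, const void *src, size_t n)
    {
      unsigned char *d = dst;
      const unsigned char *s = src;
      while (n-- > 0)
        d[n] ^= s[n];
      return dst;
    }

    /* dst[i] = a[i] ^ b[i] for 0 <= i < n; returns dst.
       Also iterates from high addresses towards low ones, matching the
       (AP, N) / (BP, N) / (DST, N) addressing with decreasing N. */
    void *
    memxor3_ref(void *dst, const void *a, const void *b, size_t n)
    {
      unsigned char *d = dst;
      const unsigned char *ap = a;
      const unsigned char *bp = b;
      while (n-- > 0)
        d[n] = ap[n] ^ bp[n];
      return dst;
    }

The byte loop in the sketch corresponds to the .Lalign_loop / .Lfinal_loop byte paths in the
assembly; everything between them (the word loops, the shift case in memxor3, the SSE2 case) is
purely an optimization of the same operation.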