From feb4bd7a65782ff43a499c7a4f81eece7c5f347f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se>
Date: Thu, 10 Feb 2011 23:57:39 +0100
Subject: [PATCH] New file. Improves performance by 22% for the unaligned01
 case and 35% for the unaligned12 case, benchmarked on Intel SU1400.

Rev: nettle/x86_64/memxor.asm:1.1
---
 x86_64/memxor.asm | 212 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 212 insertions(+)
 create mode 100644 x86_64/memxor.asm

diff --git a/x86_64/memxor.asm b/x86_64/memxor.asm
new file mode 100644
index 00000000..17a22bbd
--- /dev/null
+++ b/x86_64/memxor.asm
@@ -0,0 +1,212 @@
+C -*- mode: asm; asm-comment-char: ?C; -*-
+C nettle, low-level cryptographic library
+C
+C Copyright (C) 2010, Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB. If not, write to
+C the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+C MA 02111-1307, USA.
+
+C Register usage:
+define(<DST>, <%rax>)	C Originally in %rdi
+define(<AP>, <%rsi>)
+define(<BP>, <%rdx>)
+define(<N>, <%r10>)
+define(<TMP>, <%r8>)
+define(<TMP2>, <%r9>)
+define(<CNT>, <%rdi>)
+define(<S0>, <%r11>)
+define(<S1>, <%rdi>)	C Overlaps with CNT
+
+	.file "memxor.asm"
+
+	.text
+
+	C memxor(uint8_t *dst, const uint8_t *src, size_t n)
+	C	  %rdi		     %rsi	       %rdx
+	ALIGN(4)
+
+PROLOGUE(memxor)
+	mov	%rdx, %r10
+	mov	%rdi, %rdx
+	jmp	.Lmemxor3_entry
+EPILOGUE(memxor)
+
+	C memxor3(uint8_t *dst, const uint8_t *a, const uint8_t *b, size_t n)
+	C	   %rdi		      %rsi	        %rdx	      %rcx
+	ALIGN(4)
+
+PROLOGUE(memxor3)
+	C %cl needed for shift count, so move away N
+	mov	%rcx, N
+.Lmemxor3_entry:
+	test	N, N
+	C Get number of unaligned bytes at the end
+	C %rdi is used as CNT, %rax as DST and as return value
+	mov	%rdi, %rax
+	jz	.Ldone
+	add	N, CNT
+	and	$7, CNT
+
+	jz	.Laligned
+
+	cmp	$8, N
+	jc	.Lfinal_next
+
+	C FIXME: Instead of this loop, could try cmov with memory
+	C destination, as a sequence of one 8-bit, one 16-bit and one
+	C 32-bit operation. (Except that cmov can't do 8-bit ops, so
+	C that step has to use a conditional).
+.Lalign_loop:
+
+	sub	$1, N
+	movb	(AP, N), LREG(TMP)
+	xorb	(BP, N), LREG(TMP)
+	movb	LREG(TMP), (DST, N)
+	sub	$1, CNT
+	jnz	.Lalign_loop
+
+.Laligned:
+ifelse(x,x,<
+	C Check for the case that AP and BP have the same alignment,
+	C but different from DST.
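+	C If so, AP and BP are adjusted down so that the word loads
+	C below are 8-byte aligned, with the bit offset kept in %cl.
+	C Each destination word is then built from two consecutive
+	C aligned source words: the word at the lower address shifted
+	C right by %cl, or:ed with the next word shifted left by
+	C 64-%cl. The left shift is done as neg %cl; shl %cl; neg %cl,
+	C which works because x86-64 masks 64-bit shift counts to
+	C their low 6 bits, so -%cl acts as 64-%cl.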
+	mov	AP, TMP
+	sub	BP, TMP
+	test	$7, TMP
+	jnz	.Lno_shift_case
+	mov	AP, %rcx
+	sub	DST, %rcx
+	and	$7, %rcx
+	jz	.Lno_shift_case
+	sub	%rcx, AP
+	sub	%rcx, BP
+	shl	$3, %rcx
+
+	C Unrolling, with aligned values alternating in S0 and S1
+	test	$8, N
+	jnz	.Lshift_odd
+	mov	(AP, N), S1
+	xor	(BP, N), S1
+	jmp	.Lshift_next
+
+.Lshift_odd:
+	mov	-8(AP, N), S1
+	mov	(AP, N), S0
+	xor	-8(BP, N), S1
+	xor	(BP, N), S0
+	mov	S1, TMP
+	shr	%cl, TMP
+	neg	%cl
+	shl	%cl, S0
+	neg	%cl
+
+	or	S0, TMP
+	mov	TMP, -8(DST, N)
+	sub	$8, N
+	jz	.Ldone
+	jmp	.Lshift_next
+
+.Lshift_loop:
+	mov	8(AP, N), S0
+	xor	8(BP, N), S0
+	mov	S0, TMP
+	shr	%cl, TMP
+	neg	%cl
+	shl	%cl, S1
+	neg	%cl
+	or	S1, TMP
+	mov	TMP, 8(DST, N)
+
+	mov	(AP, N), S1
+	xor	(BP, N), S1
+	mov	S1, TMP
+	shr	%cl, TMP
+	neg	%cl
+	shl	%cl, S0
+	neg	%cl
+	or	S0, TMP
+	mov	TMP, (DST, N)
+.Lshift_next:
+	sub	$16, N
+	C FIXME: Handle the case N == 16 specially,
+	C like in the non-shifted case?
+C	ja	.Lshift_loop
+C	jz	.Ldone
+	jnc	.Lshift_loop
+
+	add	$15, N
+	jnc	.Ldone
+
+	shr	$3, %rcx
+	add	%rcx, AP
+	add	%rcx, BP
+	jmp	.Lfinal_loop
+
+.Lno_shift_case:
+>)
+	C Next destination word is -8(DST, N)
+	C Setup for unrolling
+	test	$8, N
+	jz	.Lword_next
+
+	sub	$8, N
+	jz	.Lone_word
+
+	mov	(AP, N), TMP
+	xor	(BP, N), TMP
+	mov	TMP, (DST, N)
+
+	jmp	.Lword_next
+
+	ALIGN(4)
+
+.Lword_loop:
+	mov	8(AP, N), TMP
+	mov	(AP, N), TMP2
+	xor	8(BP, N), TMP
+	xor	(BP, N), TMP2
+	mov	TMP, 8(DST, N)
+	mov	TMP2, (DST, N)
+
+.Lword_next:
+	sub	$16, N
+	ja	.Lword_loop	C Not zero and no carry
+	jnz	.Lfinal
+
+	C Final operation is word aligned
+	mov	8(AP, N), TMP
+	xor	8(BP, N), TMP
+	mov	TMP, 8(DST, N)
+
+.Lone_word:
+	mov	(AP, N), TMP
+	xor	(BP, N), TMP
+	mov	TMP, (DST, N)
+
+	ret
+
+.Lfinal:
+	add	$15, N
+
+.Lfinal_loop:
+	movb	(AP, N), LREG(TMP)
+	xorb	(BP, N), LREG(TMP)
+	movb	LREG(TMP), (DST, N)
+.Lfinal_next:
+	sub	$1, N
+	jnc	.Lfinal_loop
+
+.Ldone:
+	ret
+EPILOGUE(memxor3)
--
GitLab
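For reference, both entry points compute a plain byte-wise xor. The C below is
a minimal sketch of the semantics only, not nettle's actual C implementation
in memxor.c; the *_ref names are illustrative. Like the assembly, it walks the
buffers from the end towards the start and returns dst:

#include <stddef.h>
#include <stdint.h>

/* dst[i] = a[i] ^ b[i] for 0 <= i < n. memxor is the a == dst special
   case, which is why the assembly memxor only shuffles registers and
   jumps to .Lmemxor3_entry; both return dst, kept in %rax above. */
static uint8_t *
memxor3_ref(uint8_t *dst, const uint8_t *a, const uint8_t *b, size_t n)
{
  while (n-- > 0)
    dst[n] = a[n] ^ b[n];
  return dst;
}

static uint8_t *
memxor_ref(uint8_t *dst, const uint8_t *src, size_t n)
{
  return memxor3_ref(dst, dst, src, n);
}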