Commit feb4bd7a authored by Niels Möller

New file. Improves performance by 22% for the unaligned01 case and 35% for
the unaligned12 case, benchmarked on Intel SU1400.

Rev: nettle/x86_64/memxor.asm:1.1
parent 1f6409c0
C -*- mode: asm; asm-comment-char: ?C; -*-
C nettle, low-level cryptographics library
C
C Copyright (C) 2010, Niels Möller
C
C The nettle library is free software; you can redistribute it and/or modify
C it under the terms of the GNU Lesser General Public License as published by
C the Free Software Foundation; either version 2.1 of the License, or (at your
C option) any later version.
C
C The nettle library is distributed in the hope that it will be useful, but
C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
C License for more details.
C
C You should have received a copy of the GNU Lesser General Public License
C along with the nettle library; see the file COPYING.LIB. If not, write to
C the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
C MA 02111-1307, USA.
C Register usage:
C Each name below is an m4 macro expanding to the register on its right.
C Destination pointer; copied from %rdi on entry and also the return value.
define(<DST>, <%rax>) C Originally in %rdi
C First source pointer (second argument of both entry points).
define(<AP>, <%rsi>)
C Second source pointer (third argument of memxor3; memxor sets it to dst).
define(<BP>, <%rdx>)
C Number of bytes still to process; also the index of the next byte or
C word, since the buffers are processed from the end towards the start.
define(<N>, <%r10>)
C Scratch registers for the xored data.
define(<TMP>, <%r8>)
define(<TMP2>, <%r9>)
C Count of bytes past the last 8-byte boundary of the destination.
define(<CNT>, <%rdi>)
C Aligned source words, used alternately in the shift case.
define(<S0>, <%r11>)
C S1 reuses %rdi; CNT is no longer needed once the alignment loop is done.
define(<S1>, <%rdi>) C Overlaps with CNT
.file "memxor.asm"
C All code below is placed in the text section.
.text
C memxor(uint8_t *dst, const uint8_t *src, size_t n)
C %rdi %rsi %rdx
C Two-operand variant: xors src into dst by entering memxor3 with dst
C as the second source operand.
ALIGN(4)
PROLOGUE(memxor)
C Save the length first, since %rdx is about to be reused for BP.
mov %rdx, N
C The second source operand is the destination itself.
mov %rdi, BP
jmp .Lmemxor3_entry
EPILOGUE(memxor)
C memxor3(uint8_t *dst, const uint8_t *a, const uint8_t *b, size_t n)
C %rdi %rsi %rdx %rcx
C Stores a[i] xor b[i] in dst[i] for each of the n bytes, working from
C the highest offset down to offset zero. Returns dst in %rax.
C memxor jumps in at .Lmemxor3_entry with N and BP already set up.
ALIGN(4)
PROLOGUE(memxor3)
C %cl needed for shift count, so move away N
mov %rcx, N
.Lmemxor3_entry:
test N, N
C Get number of unaligned bytes at the end
C %rdi is used as CNT, %rax as DST and as return value
C The mov below does not modify the flags, so the jz still acts on the
C result of the test above.
mov %rdi, %rax
jz .Ldone
C CNT = (dst + n) mod 8, the number of bytes above the last 8-byte
C boundary of the destination.
add N, CNT
and $7, CNT
jz .Laligned
C With fewer than 8 bytes in total, do everything in the byte loop.
cmp $8, N
jc .Lfinal_next
C FIXME: Instead of this loop, could try cmov with memory
C destination, as a sequence of one 8-bit, one 16-bit and one
C 32-bit operations. (Except that cmov can't do 8-bit ops, so
C that step has to use a conditional).
C Process CNT single bytes from the end, after which DST + N is
C 8-byte aligned.
.Lalign_loop:
sub $1, N
movb (AP, N), LREG(TMP)
xorb (BP, N), LREG(TMP)
movb LREG(TMP), (DST, N)
sub $1, CNT
jnz .Lalign_loop
.Laligned:
C The two compared strings are equal, so the quoted block that follows
C is always included. Presumably a switch for disabling the shift case;
C confirm before relying on that.
ifelse(x,x,<
C Check for the case that AP and BP have the same alignment,
C but different from DST.
C TMP = AP - BP; nonzero low three bits mean a and b are aligned
C differently, which the shift code cannot handle.
mov AP, TMP
sub BP, TMP
test $7, TMP
jnz .Lno_shift_case
C %rcx = (AP - DST) mod 8; zero means the sources are aligned like
C the destination and the plain word loop can be used.
mov AP, %rcx
sub DST, %rcx
and $7, %rcx
jz .Lno_shift_case
C Move both source pointers back by that many bytes, so that
C (AP, N) and (BP, N) get the same alignment as (DST, N), then
C convert the byte offset to a bit count for the shifts.
sub %rcx, AP
sub %rcx, BP
shl $3, %rcx
C Unrolling, with aligned values alternating in S0 and S1
C Bit 3 of N set: handle one word first, so that the loop can
C produce two words per iteration.
test $8, N
jnz .Lshift_odd
C Preload S1 with the xored source word at offset N. It supplies
C the high part of the first destination word written by the loop.
mov (AP, N), S1
xor (BP, N), S1
jmp .Lshift_next
.Lshift_odd:
C S1 = xored word at offset N - 8, S0 = xored word at offset N.
mov -8(AP, N), S1
mov (AP, N), S0
xor -8(BP, N), S1
xor (BP, N), S0
C Combine: low word shifted right by %cl bits, high word shifted
C left by the negated count. See the shift instruction reference
C for how the count in %cl is reduced for 64-bit operands.
mov S1, TMP
shr %cl, TMP
neg %cl
shl %cl, S0
neg %cl
or S0, TMP
mov TMP, -8(DST, N)
C S1 now holds the word at the new offset N, as the loop expects.
sub $8, N
jz .Ldone
jmp .Lshift_next
.Lshift_loop:
C First half: destination word at offset N + 8, built from the
C new word S0 (low part) and the previous word S1 (high part).
mov 8(AP, N), S0
xor 8(BP, N), S0
mov S0, TMP
shr %cl, TMP
neg %cl
shl %cl, S1
neg %cl
or S1, TMP
mov TMP, 8(DST, N)
C Second half: destination word at offset N, with the roles of
C S0 and S1 swapped. S1 is left holding the word at offset N.
mov (AP, N), S1
xor (BP, N), S1
mov S1, TMP
shr %cl, TMP
neg %cl
shl %cl, S0
neg %cl
or S0, TMP
mov TMP, (DST, N)
.Lshift_next:
C Loop while at least 16 bytes remained, i.e. no borrow.
sub $16, N
C FIXME: Handle the case N == 16 specially,
C like in the non-shifted case?
C ja .Lshift_loop
C jz .Ldone
jnc .Lshift_loop
C N becomes the remaining byte count minus one. No carry from
C the add means nothing remained.
add $15, N
jnc .Ldone
C Undo the pointer adjustment made above (bit count back to
C bytes) and finish the remaining bytes one at a time.
shr $3, %rcx
add %rcx, AP
add %rcx, BP
jmp .Lfinal_loop
.Lno_shift_case:
>)
C Next destination word is -8(DST, N)
C Setup for unrolling
C Bit 3 of N set: handle one word first, so that the loop can
C process two words per iteration.
test $8, N
jz .Lword_next
sub $8, N
C Exactly one word remained in total.
jz .Lone_word
mov (AP, N), TMP
xor (BP, N), TMP
mov TMP, (DST, N)
jmp .Lword_next
ALIGN(4)
C Main loop: two 8-byte words per iteration.
.Lword_loop:
mov 8(AP, N), TMP
mov (AP, N), TMP2
xor 8(BP, N), TMP
xor (BP, N), TMP2
mov TMP, 8(DST, N)
mov TMP2, (DST, N)
.Lword_next:
sub $16, N
ja .Lword_loop C Not zero and no carry
C A nonzero result here implies a borrow: fewer than 16 bytes
C remained, and they are handled bytewise. Execution falls through
C only when exactly 16 bytes remained.
jnz .Lfinal
C Final operation is word aligned
mov 8(AP, N), TMP
xor 8(BP, N), TMP
mov TMP, 8(DST, N)
.Lone_word:
mov (AP, N), TMP
xor (BP, N), TMP
mov TMP, (DST, N)
ret
.Lfinal:
C N becomes the remaining byte count minus one, which is the
C offset of the highest byte still to be processed.
add $15, N
C Byte loop, running until the decrement of N borrows.
.Lfinal_loop:
movb (AP, N), LREG(TMP)
xorb (BP, N), LREG(TMP)
movb LREG(TMP), (DST, N)
.Lfinal_next:
sub $1, N
jnc .Lfinal_loop
.Ldone:
ret
EPILOGUE(memxor3)
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment