From feb4bd7a65782ff43a499c7a4f81eece7c5f347f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se>
Date: Thu, 10 Feb 2011 23:57:39 +0100
Subject: [PATCH] New file. Improves performance by 22% for the unaligned01
 case and 35% for the unaligned12 case, benchmarked on Intel SU1400.

Rev: nettle/x86_64/memxor.asm:1.1
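
For reference, the semantics implemented below, as a minimal C sketch
(the _ref names, the #includes and the uint8_t * return type are
illustrative assumptions, not part of this patch):

  #include <stddef.h>
  #include <stdint.h>

  /* dst[i] ^= src[i], processed from the end, returning dst. */
  static uint8_t *
  memxor_ref(uint8_t *dst, const uint8_t *src, size_t n)
  {
    while (n-- > 0)
      dst[n] ^= src[n];
    return dst;
  }

  /* dst[i] = a[i] ^ b[i], processed from the end, returning dst. */
  static uint8_t *
  memxor3_ref(uint8_t *dst, const uint8_t *a, const uint8_t *b, size_t n)
  {
    while (n-- > 0)
      dst[n] = a[n] ^ b[n];
    return dst;
  }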
---
 x86_64/memxor.asm | 212 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 212 insertions(+)
 create mode 100644 x86_64/memxor.asm

diff --git a/x86_64/memxor.asm b/x86_64/memxor.asm
new file mode 100644
index 00000000..17a22bbd
--- /dev/null
+++ b/x86_64/memxor.asm
@@ -0,0 +1,212 @@
+C -*- mode: asm; asm-comment-char: ?C; -*-  
+C nettle, low-level cryptographic library
+C 
+C Copyright (C) 2010, Niels Möller
+C  
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C 
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+C License for more details.
+C 
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB.  If not, write to
+C the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+C MA 02111-1307, USA.
+
+C Register usage:
+define(<DST>, <%rax>) C Originally in %rdi
+define(<AP>, <%rsi>)
+define(<BP>, <%rdx>)
+define(<N>, <%r10>)
+define(<TMP>, <%r8>)
+define(<TMP2>, <%r9>)
+define(<CNT>, <%rdi>)
+define(<S0>, <%r11>)
+define(<S1>, <%rdi>) C Overlaps with CNT 
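+C CNT is live only during the initial alignment; %rdi is then free
+C for reuse as S1 in the shifting loop.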
+	
+	.file "memxor.asm"
+
+	.text
+
+	C memxor(uint8_t *dst, const uint8_t *src, size_t n)
+	C 	          %rdi               %rsi      %rdx
+	ALIGN(4)
+
+PROLOGUE(memxor)
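+	C Equivalent to memxor3(dst, src, dst, n): reuse the three-operand
+	C code with b = dst, so that dst[i] ^= src[i].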
+	mov	%rdx, %r10
+	mov	%rdi, %rdx
+	jmp 	.Lmemxor3_entry
+EPILOGUE(memxor)
+
+	C memxor3(uint8_t *dst, const uint8_t *a, const uint8_t *b, size_t n)
+	C 	          %rdi              %rsi              %rdx      %rcx
+	ALIGN(4)
+	
+PROLOGUE(memxor3)
+	C %cl is needed for the shift count, so move N out of %rcx
+	mov	%rcx, N
+.Lmemxor3_entry:
+	test	N, N
+	C Get number of unaligned bytes at the end
+	C %rdi is used as CNT, %rax as DST and as return value
+	mov	%rdi, %rax
+	jz	.Ldone
+	add 	N, CNT
+	and	$7, CNT
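+	C CNT = (dst + n) % 8. The buffers are processed from the end,
+	C and these trailing bytes are xored one at a time below, so
+	C that DST + N becomes 8-byte aligned for the word loops.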
+	
+	jz	.Laligned
+
+	cmp	$8, N
+	jc	.Lfinal_next
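+	C Less than 8 bytes in total; handle everything in the byte loop.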
+
+	C FIXME: Instead of this loop, could try cmov with memory
+	C destination, as a sequence of one 8-bit, one 16-bit and one
+	C 32-bit operations. (Except that cmov can't do 8-bit ops, so
+	C that step has to use a conditional).
+.Lalign_loop:
+	
+	sub	$1, N
+	movb	(AP, N), LREG(TMP)
+	xorb	(BP, N), LREG(TMP)
+	movb	LREG(TMP), (DST, N)
+	sub	$1, CNT
+	jnz	.Lalign_loop
+
+.Laligned:
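+	C The ifelse(x,x,<...>) below is an always-true m4 conditional;
+	C it wraps the shifted-operand code up to .Lno_shift_case.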
+ifelse(x,x,<
+	C Check for the case that AP and BP have the same alignment,
+	C but different from DST.
+	mov	AP, TMP
+	sub	BP, TMP
+	test	$7, TMP
+	jnz	.Lno_shift_case
+	mov	AP, %rcx
+	sub	DST, %rcx
+	and	$7, %rcx
+	jz	.Lno_shift_case
+	sub	%rcx, AP
+	sub	%rcx, BP
+	shl	$3, %rcx
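+	C AP and BP are moved down so that they share DST's word
+	C alignment; %cl holds the bit offset of the original sources
+	C relative to DST. Each destination word is then combined from
+	C two adjacent aligned source words, shifted right by %cl and
+	C left by 64-%cl bits (neg %cl works because shift counts are
+	C masked to 6 bits).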
+
+	C Unrolling, with aligned values alternating in S0 and S1
+	test	$8, N
+	jnz	.Lshift_odd
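+	C Even number of whole words: preload S1 with the aligned word
+	C just above the next destination word; the loop then keeps
+	C alternating the roles of S0 and S1.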
+	mov	(AP, N), S1
+	xor	(BP, N), S1
+	jmp	.Lshift_next
+
+.Lshift_odd:
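+	C Odd number of whole words: produce one destination word here,
+	C so that the main loop can process two words per iteration.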
+	mov	-8(AP, N), S1
+	mov	(AP, N), S0
+	xor	-8(BP, N), S1
+	xor	(BP, N), S0
+	mov	S1, TMP
+	shr	%cl, TMP
+	neg	%cl
+	shl	%cl, S0
+	neg	%cl
+	
+	or	S0, TMP
+	mov	TMP, -8(DST, N)
+	sub	$8, N
+	jz	.Ldone
+	jmp 	.Lshift_next
+
+.Lshift_loop:
+	mov	8(AP, N), S0
+	xor	8(BP, N), S0
+	mov	S0, TMP
+	shr	%cl, TMP
+	neg	%cl
+	shl	%cl, S1
+	neg	%cl
+	or	S1, TMP
+	mov	TMP, 8(DST, N)
+
+	mov	(AP, N), S1
+	xor	(BP, N), S1
+	mov	S1, TMP
+	shr	%cl, TMP
+	neg	%cl
+	shl	%cl, S0
+	neg 	%cl
+	or	S0, TMP
+	mov	TMP, (DST, N)
+.Lshift_next:
+	sub	$16, N
+	C FIXME: Handle the case N == 16 specially,
+	C like in the non-shifted case? 
+C 	ja	.Lshift_loop
+C 	jz	.Ldone
+	jnc	.Lshift_loop
+
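+	C At most 7 head bytes remain. Make N the index of the last
+	C one; if there are none, the add does not carry and we are done.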
+	add	$15, N
+	jnc	.Ldone
+
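+	C Undo the adjustment of AP and BP (%rcx converted back from
+	C bits to bytes) and finish in the byte loop.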
+	shr	$3, %rcx
+	add	%rcx, AP
+	add	%rcx, BP
+	jmp	.Lfinal_loop
+	
+.Lno_shift_case:
+>)
+	C Next destination word is -8(DST, N)
+	C Setup for unrolling
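+	C The loop below handles 16 bytes per iteration; if bit 3 of N
+	C is set, xor a single word first to make the remaining word
+	C count even.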
+	test	$8, N
+	jz	.Lword_next
+
+	sub	$8, N
+	jz	.Lone_word
+
+	mov	(AP, N), TMP
+	xor	(BP, N), TMP
+	mov	TMP, (DST, N)
+	
+	jmp	.Lword_next
+
+	ALIGN(4)
+
+.Lword_loop:
+	mov	8(AP, N), TMP
+	mov	(AP, N), TMP2
+	xor	8(BP, N), TMP
+	xor	(BP, N), TMP2
+	mov	TMP, 8(DST, N)
+	mov	TMP2, (DST, N)
+
+.Lword_next:
+	sub	$16, N
+	ja	.Lword_loop	C Not zero and no carry
+	jnz	.Lfinal
+
+	C Exactly 16 bytes left; the final two operations are word aligned
+	mov	8(AP, N), TMP
+	xor	8(BP, N), TMP
+	mov	TMP, 8(DST, N)
+	
+.Lone_word:
+	mov	(AP, N), TMP
+	xor	(BP, N), TMP
+	mov	TMP, (DST, N)
+
+	ret
+
+.Lfinal:
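+	C N went below zero in the word loop above; adding back 15
+	C yields the index of the last remaining head byte.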
+	add	$15, N
+
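+	C Xor the remaining bytes one at a time, at decreasing indices,
+	C until N wraps below zero.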
+.Lfinal_loop:
+	movb	(AP, N), LREG(TMP)
+	xorb	(BP, N), LREG(TMP)
+	movb	LREG(TMP), (DST, N)
+.Lfinal_next:
+	sub	$1, N
+	jnc	.Lfinal_loop
+
+.Ldone:
+	ret
+EPILOGUE(memxor3)
-- 
GitLab