Implemented sse2-loop. Configured at compile time, and currently

disabled. Rev: nettle/x86_64/memxor.asm:1.3

Implemented sse2-loop. Configured at compile time, and currently
dd8652d4 · Niels Möller · af1d0e1c · dd8652d4
Commit dd8652d4 authored Oct 3, 2011 by Niels Möller
--- a/x86_64/memxor.asm
+++ b/x86_64/memxor.asm
@@ -29,6 +29,8 @@ define(<CNT>, <%rdi>)
 define(<S0>, <%r11>)
 define(<S1>, <%rdi>) C Overlaps with CNT 

+define(<USE_SSE2>, <no>)
+
 	.file "memxor.asm"

 	.text
@@ -78,6 +80,10 @@ PROLOGUE(memxor3)
 	jnz	.Lalign_loop

 .Laligned:
+ifelse(USE_SSE2, yes, <
+	cmp	$16, N
+	jnc	.Lsse2_case
+>)
 	C Check for the case that AP and BP have the same alignment,
 	C but different from DST.
 	mov	AP, TMP
@@ -209,4 +215,40 @@ C 	jz	.Ldone

 .Ldone:
 	ret
+
+ifelse(USE_SSE2, yes, <
+
+.Lsse2_case:
+	lea	(DST, N), TMP
+	test	$8, TMP
+	jz	.Lsse2_next
+	sub	$8, N
+	mov	(AP, N), TMP
+	xor	(BP, N), TMP
+	mov	TMP, (DST, N)
+	jmp	.Lsse2_next
+
+	ALIGN(4)
+.Lsse2_loop:
+	movdqu	(AP, N), %xmm0
+	movdqu	(BP, N), %xmm1
+	pxor	%xmm0, %xmm1
+	movdqa	%xmm1, (DST, N)
+.Lsse2_next:
+	sub	$16, N
+	ja	.Lsse2_loop
+	
+	C FIXME: See if we can do a full word first, before the
+	C byte-wise final loop.
+	jnz	.Lfinal		
+
+	C Final operation is aligned
+	movdqu	(AP), %xmm0
+	movdqu	(BP), %xmm1
+	pxor	%xmm0, %xmm1
+	movdqa	%xmm1, (DST)
+	ret
+>)	
+	
+
 EPILOGUE(memxor3)