From dd8652d4b27173f2adefa77922b5fd3e18ddc8ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se> Date: Mon, 3 Oct 2011 09:43:08 +0200 Subject: [PATCH] Implemented sse2-loop. Configured at compile time, and currently disabled. Rev: nettle/x86_64/memxor.asm:1.3 --- x86_64/memxor.asm | 44 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/x86_64/memxor.asm b/x86_64/memxor.asm index d9b05b18..7a5a23b6 100644 --- a/x86_64/memxor.asm +++ b/x86_64/memxor.asm @@ -28,7 +28,9 @@ define(<TMP2>, <%r9>) define(<CNT>, <%rdi>) define(<S0>, <%r11>) define(<S1>, <%rdi>) C Overlaps with CNT - + +define(<USE_SSE2>, <no>) + .file "memxor.asm" .text @@ -78,6 +80,10 @@ PROLOGUE(memxor3) jnz .Lalign_loop .Laligned: +ifelse(USE_SSE2, yes, < + cmp $16, N + jnc .Lsse2_case +>) C Check for the case that AP and BP have the same alignment, C but different from DST. mov AP, TMP @@ -209,4 +215,40 @@ C jz .Ldone .Ldone: ret + +ifelse(USE_SSE2, yes, < + +.Lsse2_case: + lea (DST, N), TMP + test $8, TMP + jz .Lsse2_next + sub $8, N + mov (AP, N), TMP + xor (BP, N), TMP + mov TMP, (DST, N) + jmp .Lsse2_next + + ALIGN(4) +.Lsse2_loop: + movdqu (AP, N), %xmm0 + movdqu (BP, N), %xmm1 + pxor %xmm0, %xmm1 + movdqa %xmm1, (DST, N) +.Lsse2_next: + sub $16, N + ja .Lsse2_loop + + C FIXME: See if we can do a full word first, before the + C byte-wise final loop. + jnz .Lfinal + + C Final operation is aligned + movdqu (AP), %xmm0 + movdqu (BP), %xmm1 + pxor %xmm0, %xmm1 + movdqa %xmm1, (DST) + ret +>) + + EPILOGUE(memxor3) -- GitLab