Commit ac0b533d authored by Niels Möller's avatar Niels Möller

Implemented sse2-loop. Configured at compile time, and currently

disabled.

Rev: nettle/x86_64/memxor.asm:1.3
parent b36a6056
......@@ -28,7 +28,9 @@ define(<TMP2>, <%r9>)
define(<CNT>, <%rdi>)
define(<S0>, <%r11>)
define(<S1>, <%rdi>) C Overlaps with CNT
define(<USE_SSE2>, <no>)
.file "memxor.asm"
.text
......@@ -78,6 +80,10 @@ PROLOGUE(memxor3)
jnz .Lalign_loop
.Laligned:
ifelse(USE_SSE2, yes, <
cmp $16, N
jnc .Lsse2_case
>)
C Check for the case that AP and BP have the same alignment,
C but different from DST.
mov AP, TMP
......@@ -209,4 +215,40 @@ C jz .Ldone
.Ldone:
ret
ifelse(USE_SSE2, yes, <
.Lsse2_case:
lea (DST, N), TMP
test $8, TMP
jz .Lsse2_next
sub $8, N
mov (AP, N), TMP
xor (BP, N), TMP
mov TMP, (DST, N)
jmp .Lsse2_next
ALIGN(4)
.Lsse2_loop:
movdqu (AP, N), %xmm0
movdqu (BP, N), %xmm1
pxor %xmm0, %xmm1
movdqa %xmm1, (DST, N)
.Lsse2_next:
sub $16, N
ja .Lsse2_loop
C FIXME: See if we can do a full word first, before the
C byte-wise final loop.
jnz .Lfinal
C Final operation is aligned
movdqu (AP), %xmm0
movdqu (BP), %xmm1
pxor %xmm0, %xmm1
movdqa %xmm1, (DST)
ret
>)
EPILOGUE(memxor3)
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment