Commit dd8652d4 authored by Niels Möller
Browse files

Implemented sse2-loop. Configured at compile time, and currently disabled.


Rev: nettle/x86_64/memxor.asm:1.3
parent af1d0e1c
......@@ -28,7 +28,9 @@ define(<TMP2>, <%r9>)
define(<CNT>, <%rdi>)
define(<S0>, <%r11>)
define(<S1>, <%rdi>) C Overlaps with CNT
define(<USE_SSE2>, <no>)
.file "memxor.asm"
......@@ -78,6 +80,10 @@ PROLOGUE(memxor3)
jnz .Lalign_loop
ifelse(USE_SSE2, yes, <
cmp $16, N
jnc .Lsse2_case
C Check for the case that AP and BP have the same alignment,
C but different from DST.
mov AP, TMP
......@@ -209,4 +215,40 @@ C jz .Ldone
ifelse(USE_SSE2, yes, <
lea (DST, N), TMP
test $8, TMP
jz .Lsse2_next
sub $8, N
mov (AP, N), TMP
xor (BP, N), TMP
mov TMP, (DST, N)
jmp .Lsse2_next
movdqu (AP, N), %xmm0
movdqu (BP, N), %xmm1
pxor %xmm0, %xmm1
movdqa %xmm1, (DST, N)
sub $16, N
ja .Lsse2_loop
C FIXME: See if we can do a full word first, before the
C byte-wise final loop.
jnz .Lfinal
C Final operation is aligned
movdqu (AP), %xmm0
movdqu (BP), %xmm1
pxor %xmm0, %xmm1
movdqa %xmm1, (DST)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment