diff --git a/x86_64/memxor.asm b/x86_64/memxor.asm
index d9b05b18e5c5aa50c891e29ef182e4139fc45ea3..7a5a23b688708103d730dba3c5cd78e4bcd5b6ca 100644
--- a/x86_64/memxor.asm
+++ b/x86_64/memxor.asm
@@ -28,7 +28,9 @@ define(<TMP2>, <%r9>)
 define(<CNT>, <%rdi>)
 define(<S0>, <%r11>)
 define(<S1>, <%rdi>)	C Overlaps with CNT
-
+
+define(<USE_SSE2>, <no>)
+
 	.file "memxor.asm"
 
 	.text
@@ -78,6 +80,10 @@ PROLOGUE(memxor3)
 	jnz	.Lalign_loop
 
 .Laligned:
+ifelse(USE_SSE2, yes, <
+	cmp	$16, N
+	jnc	.Lsse2_case
+>)
 	C Check for the case that AP and BP have the same alignment,
 	C but different from DST.
 	mov	AP, TMP
@@ -209,4 +215,40 @@ C 	jz	.Ldone
 
 .Ldone:
 	ret
+
+ifelse(USE_SSE2, yes, <
+
+.Lsse2_case:
+	lea	(DST, N), TMP
+	test	$8, TMP
+	jz	.Lsse2_next
+	sub	$8, N
+	mov	(AP, N), TMP
+	xor	(BP, N), TMP
+	mov	TMP, (DST, N)
+	jmp	.Lsse2_next
+
+	ALIGN(4)
+.Lsse2_loop:
+	movdqu	(AP, N), %xmm0
+	movdqu	(BP, N), %xmm1
+	pxor	%xmm0, %xmm1
+	movdqa	%xmm1, (DST, N)
+.Lsse2_next:
+	sub	$16, N
+	ja	.Lsse2_loop
+
+	C FIXME: See if we can do a full word first, before the
+	C byte-wise final loop.
+	jnz	.Lfinal
+
+	C Final operation is aligned
+	movdqu	(AP), %xmm0
+	movdqu	(BP), %xmm1
+	pxor	%xmm0, %xmm1
+	movdqa	%xmm1, (DST)
+	ret
+>)
+
+
 EPILOGUE(memxor3)
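
Note (not part of the patch): the new .Lsse2_loop walks the buffers from the end, XORing 16 bytes per iteration with unaligned loads (movdqu), pxor, and an aligned store (movdqa); the 8-byte step at .Lsse2_case makes DST+N 16-byte aligned so the aligned store is safe. Below is a minimal C sketch of the same idea using SSE2 intrinsics; the function name and the use of unaligned _mm_storeu_si128 stores are illustrative simplifications, not taken from the patch.

/* Hypothetical sketch, not part of the patch: the 16-bytes-at-a-time XOR
   expressed with SSE2 intrinsics.  Unaligned stores are used here for
   simplicity; the assembly instead peels an 8-byte block so that it can
   use an aligned movdqa store. */
#include <emmintrin.h>
#include <stddef.h>
#include <stdint.h>

static void
memxor3_sse2_sketch(uint8_t *dst, const uint8_t *a, const uint8_t *b, size_t n)
{
  /* Process from the end of the buffers, 16 bytes per iteration,
     mirroring the movdqu/pxor loop at .Lsse2_loop. */
  while (n >= 16)
    {
      n -= 16;
      __m128i x = _mm_loadu_si128((const __m128i *)(a + n));
      __m128i y = _mm_loadu_si128((const __m128i *)(b + n));
      _mm_storeu_si128((__m128i *)(dst + n), _mm_xor_si128(x, y));
    }

  /* Byte-wise tail, corresponding to the branch to .Lfinal in the
     assembly version. */
  while (n-- > 0)
    dst[n] = a[n] ^ b[n];
}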