diff --git a/x86_64/memxor.asm b/x86_64/memxor.asm
index d9b05b18e5c5aa50c891e29ef182e4139fc45ea3..7a5a23b688708103d730dba3c5cd78e4bcd5b6ca 100644
--- a/x86_64/memxor.asm
+++ b/x86_64/memxor.asm
@@ -28,7 +28,10 @@ define(<TMP2>, <%r9>)
 define(<CNT>, <%rdi>)
 define(<S0>, <%r11>)
 define(<S1>, <%rdi>) C Overlaps with CNT 
-	
+
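+C Set to yes to enable the SSE2 loop (used for 16 bytes or more).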
+define(<USE_SSE2>, <no>)
+
 	.file "memxor.asm"
 
 	.text
@@ -78,6 +80,11 @@ PROLOGUE(memxor3)
 	jnz	.Lalign_loop
 
 .Laligned:
+ifelse(USE_SSE2, yes, <
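+	C At least 16 bytes left, use the SSE2 loop.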
+	cmp	$16, N
+	jnc	.Lsse2_case
+>)
 	C Check for the case that AP and BP have the same alignment,
 	C but different from DST.
 	mov	AP, TMP
@@ -209,4 +215,46 @@ C 	jz	.Ldone
 
 .Ldone:
 	ret
+
+ifelse(USE_SSE2, yes, <
+
+.Lsse2_case:
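+	C The movdqa store in the loop below needs DST + N to be
+	C 16-byte aligned. If it is only 8-byte aligned, xor a single
+	C 8-byte word at the end first.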
+	lea	(DST, N), TMP
+	test	$8, TMP
+	jz	.Lsse2_next
+	sub	$8, N
+	mov	(AP, N), TMP
+	xor	(BP, N), TMP
+	mov	TMP, (DST, N)
+	jmp	.Lsse2_next
+
+	ALIGN(4)
+.Lsse2_loop:
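+	C xor 16 bytes at a time, moving from the end of the area
+	C towards the start. Loads may be unaligned, the store is aligned.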
+	movdqu	(AP, N), %xmm0
+	movdqu	(BP, N), %xmm1
+	pxor	%xmm0, %xmm1
+	movdqa	%xmm1, (DST, N)
+.Lsse2_next:
+	sub	$16, N
+	ja	.Lsse2_loop
+
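+	C If fewer than 16 bytes remain, finish in the byte-wise loop.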
+	C FIXME: See if we can do a full word first, before the
+	C byte-wise final loop.
+	jnz	.Lfinal
+
+	C Final operation is aligned
+	movdqu	(AP), %xmm0
+	movdqu	(BP), %xmm1
+	pxor	%xmm0, %xmm1
+	movdqa	%xmm1, (DST)
+	ret
+>)
+
+
 EPILOGUE(memxor3)