diff --git a/ChangeLog b/ChangeLog
index 34f9ac436ef1bb9785d669ad15572a5e5eb65910..e36bc8217e4b76bfb34f78820fba5d11ad6e247b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,13 @@
+2014-11-24  Niels Möller  <nisse@lysator.liu.se>
+
+	* x86_64/memxor3.asm (memxor3): New file, code moved from old
+	memxor.asm.
+	* x86_64/memxor.asm (memxor): Rewritten, no longer jumps into
+	memxor3.
+
+	* configure.ac (asm_replace_list): Added memxor.asm and
+	memxor3.asm.
+
 2014-10-23  Niels Möller  <nisse@lysator.liu.se>
 
 	* configure.ac (IF_ASM): New substituted variable.
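The split reflects the two entry points' semantics: memxor XORs a source area into its destination in place, while memxor3 writes the XOR of two source areas into a separate destination. As a reference for those semantics, here is a minimal byte-wise C sketch, assuming the prototypes implied by the argument lists in this patch and that both functions return dst (matching how the assembly keeps the destination in %rax); it is not the optimized implementation:

#include <stddef.h>
#include <stdint.h>

/* Byte-wise reference semantics only; the assembly works on larger,
   aligned chunks and counts n down from the end, as sketched here. */

/* dst[i] ^= src[i] for 0 <= i < n */
void *
memxor(void *dst, const void *src, size_t n)
{
  uint8_t *d = dst;
  const uint8_t *s = src;
  while (n-- > 0)
    d[n] ^= s[n];
  return dst;
}

/* dst[i] = a[i] ^ b[i] for 0 <= i < n */
void *
memxor3(void *dst, const void *a, const void *b, size_t n)
{
  uint8_t *d = dst;
  const uint8_t *ap = a;
  const uint8_t *bp = b;
  while (n-- > 0)
    d[n] = ap[n] ^ bp[n];
  return dst;
}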
diff --git a/configure.ac b/configure.ac
index 78bcce2eec92b4e8c1cf6717181aa3cf3a389d99..76beb3160853abde4dfaafcb90fcc35a558c6d2f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -272,7 +272,8 @@ fi
 # to a new object file).
 asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \
 		arcfour-crypt.asm camellia-crypt-internal.asm \
-		md5-compress.asm poly1305-internal.asm \
+		md5-compress.asm memxor.asm memxor3.asm \
+		poly1305-internal.asm \
 		chacha-core-internal.asm \
 		salsa20-crypt.asm salsa20-core-internal.asm \
 		serpent-encrypt.asm serpent-decrypt.asm \
diff --git a/x86_64/memxor.asm b/x86_64/memxor.asm
index e14e31a13f4617bc202bb66239703caedee4f47d..69d6cfe382a12e8a0d2ace1d1019a9db44d23f24 100644
--- a/x86_64/memxor.asm
+++ b/x86_64/memxor.asm
@@ -1,7 +1,7 @@
 C x86_64/memxor.asm
 
 ifelse(<
-   Copyright (C) 2010, Niels Möller
+   Copyright (C) 2010, 2014, Niels Möller
 
    This file is part of GNU Nettle.
 
@@ -32,9 +32,8 @@ ifelse(<
 
 C Register usage:
 define(<DST>, <%rax>) C Originally in %rdi
-define(<AP>, <%rsi>)
-define(<BP>, <%rdx>)
-define(<N>, <%r10>)
+define(<SRC>, <%rsi>)
+define(<N>, <%rdx>)
 define(<TMP>, <%r8>)
 define(<TMP2>, <%r9>)
 define(<CNT>, <%rdi>)
@@ -53,20 +52,7 @@ define(<USE_SSE2>, <no>)
 
 PROLOGUE(nettle_memxor)
 	W64_ENTRY(3, 0)
-	mov	%rdx, %r10
-	mov	%rdi, %rdx
-	jmp 	.Lmemxor3_entry
-EPILOGUE(nettle_memxor)
 
-	C memxor3(void *dst, const void *a, const void *b, size_t n)
-	C 	          %rdi              %rsi              %rdx      %rcx
-	ALIGN(16)
-	
-PROLOGUE(nettle_memxor3)
-	W64_ENTRY(4, 0)
-	C %cl needed for shift count, so move away N
-	mov	%rcx, N
-.Lmemxor3_entry:
 	test	N, N
 	C Get number of unaligned bytes at the end
 	C %rdi is used as CNT, %rax as DST and as return value
@@ -87,9 +73,8 @@ PROLOGUE(nettle_memxor3)
 .Lalign_loop:
 	
 	sub	$1, N
-	movb	(AP, N), LREG(TMP)
-	xorb	(BP, N), LREG(TMP)
-	movb	LREG(TMP), (DST, N)
+	movb	(SRC, N), LREG(TMP)
+	xorb	LREG(TMP), (DST, N)
 	sub	$1, CNT
 	jnz	.Lalign_loop
 
@@ -98,83 +83,7 @@ ifelse(USE_SSE2, yes, <
 	cmp	$16, N
 	jnc	.Lsse2_case
 >)
-	C Check for the case that AP and BP have the same alignment,
-	C but different from DST.
-	mov	AP, TMP
-	sub	BP, TMP
-	test	$7, TMP
-	jnz	.Lno_shift_case
-	mov	AP, %rcx
-	sub	DST, %rcx
-	and	$7, %rcx
-	jz	.Lno_shift_case
-	sub	%rcx, AP
-	sub	%rcx, BP
-	shl	$3, %rcx
-
-	C Unrolling, with aligned values alternating in S0 and S1
-	test	$8, N
-	jnz	.Lshift_odd
-	mov	(AP, N), S1
-	xor	(BP, N), S1
-	jmp	.Lshift_next
-
-.Lshift_odd:
-	mov	-8(AP, N), S1
-	mov	(AP, N), S0
-	xor	-8(BP, N), S1
-	xor	(BP, N), S0
-	mov	S1, TMP
-	shr	%cl, TMP
-	neg	%cl
-	shl	%cl, S0
-	neg	%cl
-	
-	or	S0, TMP
-	mov	TMP, -8(DST, N)
-	sub	$8, N
-	jz	.Ldone
-	jmp 	.Lshift_next
-
-	ALIGN(16)
 
-.Lshift_loop:
-	mov	8(AP, N), S0
-	xor	8(BP, N), S0
-	mov	S0, TMP
-	shr	%cl, TMP
-	neg	%cl
-	shl	%cl, S1
-	neg	%cl
-	or	S1, TMP
-	mov	TMP, 8(DST, N)
-
-	mov	(AP, N), S1
-	xor	(BP, N), S1
-	mov	S1, TMP
-	shr	%cl, TMP
-	neg	%cl
-	shl	%cl, S0
-	neg 	%cl
-	or	S0, TMP
-	mov	TMP, (DST, N)
-.Lshift_next:
-	sub	$16, N
-	C FIXME: Handle the case N == 16 specially,
-	C like in the non-shifted case? 
-C 	ja	.Lshift_loop
-C 	jz	.Ldone
-	jnc	.Lshift_loop
-
-	add	$15, N
-	jnc	.Ldone
-
-	shr	$3, %rcx
-	add	%rcx, AP
-	add	%rcx, BP
-	jmp	.Lfinal_loop
-	
-.Lno_shift_case:
 	C Next destination word is -8(DST, N)
 	C Setup for unrolling
 	test	$8, N
@@ -183,21 +92,18 @@ C 	jz	.Ldone
 	sub	$8, N
 	jz	.Lone_word
 
-	mov	(AP, N), TMP
-	xor	(BP, N), TMP
-	mov	TMP, (DST, N)
+	mov	(SRC, N), TMP
+	xor	TMP, (DST, N)
 	
 	jmp	.Lword_next
 
 	ALIGN(16)
 
 .Lword_loop:
-	mov	8(AP, N), TMP
-	mov	(AP, N), TMP2
-	xor	8(BP, N), TMP
-	xor	(BP, N), TMP2
-	mov	TMP, 8(DST, N)
-	mov	TMP2, (DST, N)
+	mov	8(SRC, N), TMP
+	mov	(SRC, N), TMP2
+	xor	TMP, 8(DST, N)
+	xor	TMP2, (DST, N)
 
 .Lword_next:
 	sub	$16, N
@@ -205,33 +111,28 @@ C 	jz	.Ldone
 	jnz	.Lfinal
 
 	C Final operation is word aligned
-	mov	8(AP, N), TMP
-	xor	8(BP, N), TMP
-	mov	TMP, 8(DST, N)
+	mov	8(SRC, N), TMP
+	xor	TMP, 8(DST, N)
 	
 .Lone_word:
-	mov	(AP, N), TMP
-	xor	(BP, N), TMP
-	mov	TMP, (DST, N)
+	mov	(SRC, N), TMP
+	xor	TMP, (DST, N)
 
-	C ENTRY might have been 3 args, too, but it doesn't matter for the exit
-	W64_EXIT(4, 0)
+	W64_EXIT(3, 0)
 	ret
 
 .Lfinal:
 	add	$15, N
 
 .Lfinal_loop:
-	movb	(AP, N), LREG(TMP)
-	xorb	(BP, N), LREG(TMP)
-	movb	LREG(TMP), (DST, N)
+	movb	(SRC, N), LREG(TMP)
+	xorb	LREG(TMP), (DST, N)
 .Lfinal_next:
 	sub	$1, N
 	jnc	.Lfinal_loop
 
 .Ldone:
-	C ENTRY might have been 3 args, too, but it doesn't matter for the exit
-	W64_EXIT(4, 0)
+	W64_EXIT(3, 0)
 	ret
 
 ifelse(USE_SSE2, yes, <
@@ -241,15 +142,14 @@ ifelse(USE_SSE2, yes, <
 	test	$8, TMP
 	jz	.Lsse2_next
 	sub	$8, N
-	mov	(AP, N), TMP
-	xor	(BP, N), TMP
-	mov	TMP, (DST, N)
+	mov	(SRC, N), TMP
+	xor	TMP, (DST, N)
 	jmp	.Lsse2_next
 
 	ALIGN(16)
 .Lsse2_loop:
-	movdqu	(AP, N), %xmm0
-	movdqu	(BP, N), %xmm1
+	movdqu	(SRC, N), %xmm0
+	movdqa	(DST, N), %xmm1
 	pxor	%xmm0, %xmm1
 	movdqa	%xmm1, (DST, N)
 .Lsse2_next:
@@ -261,14 +161,13 @@ ifelse(USE_SSE2, yes, <
 	jnz	.Lfinal		
 
 	C Final operation is aligned
-	movdqu	(AP), %xmm0
-	movdqu	(BP), %xmm1
+	movdqu	(SRC), %xmm0
+	movdqa	(DST), %xmm1
 	pxor	%xmm0, %xmm1
 	movdqa	%xmm1, (DST)
-	C ENTRY might have been 3 args, too, but it doesn't matter for the exit
-	W64_EXIT(4, 0)
+
+	W64_EXIT(3, 0)
 	ret
 >)	
-	
 
-EPILOGUE(nettle_memxor3)
+EPILOGUE(nettle_memxor)
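With the three-operand code gone, the rewritten memxor can XOR the source directly into the destination using read-modify-write instructions (xorb/xor with a memory operand) instead of the old load-load-xor-store sequence. A hedged C sketch of the resulting control flow follows; memxor_sketch is an illustrative name rather than a Nettle symbol, the two-word unrolling and the W64 prologue/epilogue are omitted, and memcpy stands in for the unaligned source loads that the assembly does with plain mov:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void *
memxor_sketch(void *dst, const void *src, size_t n)
{
  uint8_t *d = dst;
  const uint8_t *s = src;

  /* Byte loop until d + n is 8-byte aligned (cf. .Lalign_loop). */
  while (n > 0 && ((uintptr_t)(d + n) & 7))
    {
      n--;
      d[n] ^= s[n];
    }
  /* Word loop from high addresses down (cf. .Lword_loop / .Lone_word;
     the assembly unrolls this two words per iteration). */
  while (n >= 8)
    {
      uint64_t w, x;
      n -= 8;
      memcpy(&w, s + n, 8);
      memcpy(&x, d + n, 8);
      x ^= w;
      memcpy(d + n, &x, 8);
    }
  /* Remaining bytes, n < 8 (cf. .Lfinal_loop). */
  while (n-- > 0)
    d[n] ^= s[n];
  return dst;
}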
diff --git a/x86_64/memxor3.asm b/x86_64/memxor3.asm
new file mode 100644
index 0000000000000000000000000000000000000000..8ff3e79cf14ef942f78438faac5263b1d9a48d67
--- /dev/null
+++ b/x86_64/memxor3.asm
@@ -0,0 +1,263 @@
+C x86_64/memxor3.asm
+
+ifelse(<
+   Copyright (C) 2010, 2014 Niels Möller
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+>)
+
+C Register usage:
+define(<DST>, <%rax>) C Originally in %rdi
+define(<AP>, <%rsi>)
+define(<BP>, <%rdx>)
+define(<N>, <%r10>)
+define(<TMP>, <%r8>)
+define(<TMP2>, <%r9>)
+define(<CNT>, <%rdi>)
+define(<S0>, <%r11>)
+define(<S1>, <%rdi>) C Overlaps with CNT 
+
+define(<USE_SSE2>, <no>)
+
+	.file "memxor3.asm"
+
+	.text
+
+	C memxor3(void *dst, const void *a, const void *b, size_t n)
+	C 	          %rdi              %rsi              %rdx      %rcx
+	ALIGN(16)
+	
+PROLOGUE(nettle_memxor3)
+	W64_ENTRY(4, 0)
+	C %cl needed for shift count, so move away N
+	mov	%rcx, N
+.Lmemxor3_entry:
+	test	N, N
+	C Get number of unaligned bytes at the end
+	C %rdi is used as CNT, %rax as DST and as return value
+	mov	%rdi, %rax
+	jz	.Ldone
+	add 	N, CNT
+	and	$7, CNT
+	
+	jz	.Laligned
+
+	cmp	$8, N
+	jc	.Lfinal_next
+
+	C FIXME: Instead of this loop, could try cmov with memory
+	C destination, as a sequence of one 8-bit, one 16-bit and one
+	C 32-bit operations. (Except that cmov can't do 8-bit ops, so
+	C that step has to use a conditional).
+.Lalign_loop:
+	
+	sub	$1, N
+	movb	(AP, N), LREG(TMP)
+	xorb	(BP, N), LREG(TMP)
+	movb	LREG(TMP), (DST, N)
+	sub	$1, CNT
+	jnz	.Lalign_loop
+
+.Laligned:
+ifelse(USE_SSE2, yes, <
+	cmp	$16, N
+	jnc	.Lsse2_case
+>)
+	C Check for the case that AP and BP have the same alignment,
+	C but different from DST.
+	mov	AP, TMP
+	sub	BP, TMP
+	test	$7, TMP
+	jnz	.Lno_shift_case
+	mov	AP, %rcx
+	sub	DST, %rcx
+	and	$7, %rcx
+	jz	.Lno_shift_case
+	sub	%rcx, AP
+	sub	%rcx, BP
+	shl	$3, %rcx
+
+	C Unrolling, with aligned values alternating in S0 and S1
+	test	$8, N
+	jnz	.Lshift_odd
+	mov	(AP, N), S1
+	xor	(BP, N), S1
+	jmp	.Lshift_next
+
+.Lshift_odd:
+	mov	-8(AP, N), S1
+	mov	(AP, N), S0
+	xor	-8(BP, N), S1
+	xor	(BP, N), S0
+	mov	S1, TMP
+	shr	%cl, TMP
+	neg	%cl
+	shl	%cl, S0
+	neg	%cl
+	
+	or	S0, TMP
+	mov	TMP, -8(DST, N)
+	sub	$8, N
+	jz	.Ldone
+	jmp 	.Lshift_next
+
+	ALIGN(16)
+
+.Lshift_loop:
+	mov	8(AP, N), S0
+	xor	8(BP, N), S0
+	mov	S0, TMP
+	shr	%cl, TMP
+	neg	%cl
+	shl	%cl, S1
+	neg	%cl
+	or	S1, TMP
+	mov	TMP, 8(DST, N)
+
+	mov	(AP, N), S1
+	xor	(BP, N), S1
+	mov	S1, TMP
+	shr	%cl, TMP
+	neg	%cl
+	shl	%cl, S0
+	neg 	%cl
+	or	S0, TMP
+	mov	TMP, (DST, N)
+.Lshift_next:
+	sub	$16, N
+	C FIXME: Handle the case N == 16 specially,
+	C like in the non-shifted case? 
+C 	ja	.Lshift_loop
+C 	jz	.Ldone
+	jnc	.Lshift_loop
+
+	add	$15, N
+	jnc	.Ldone
+
+	shr	$3, %rcx
+	add	%rcx, AP
+	add	%rcx, BP
+	jmp	.Lfinal_loop
+	
+.Lno_shift_case:
+	C Next destination word is -8(DST, N)
+	C Setup for unrolling
+	test	$8, N
+	jz	.Lword_next
+
+	sub	$8, N
+	jz	.Lone_word
+
+	mov	(AP, N), TMP
+	xor	(BP, N), TMP
+	mov	TMP, (DST, N)
+	
+	jmp	.Lword_next
+
+	ALIGN(16)
+
+.Lword_loop:
+	mov	8(AP, N), TMP
+	mov	(AP, N), TMP2
+	xor	8(BP, N), TMP
+	xor	(BP, N), TMP2
+	mov	TMP, 8(DST, N)
+	mov	TMP2, (DST, N)
+
+.Lword_next:
+	sub	$16, N
+	ja	.Lword_loop	C Not zero and no carry
+	jnz	.Lfinal
+
+	C Final operation is word aligned
+	mov	8(AP, N), TMP
+	xor	8(BP, N), TMP
+	mov	TMP, 8(DST, N)
+	
+.Lone_word:
+	mov	(AP, N), TMP
+	xor	(BP, N), TMP
+	mov	TMP, (DST, N)
+
+	C ENTRY might have been 3 args, too, but it doesn't matter for the exit
+	W64_EXIT(4, 0)
+	ret
+
+.Lfinal:
+	add	$15, N
+
+.Lfinal_loop:
+	movb	(AP, N), LREG(TMP)
+	xorb	(BP, N), LREG(TMP)
+	movb	LREG(TMP), (DST, N)
+.Lfinal_next:
+	sub	$1, N
+	jnc	.Lfinal_loop
+
+.Ldone:
+	C ENTRY might have been 3 args, too, but it doesn't matter for the exit
+	W64_EXIT(4, 0)
+	ret
+
+ifelse(USE_SSE2, yes, <
+
+.Lsse2_case:
+	lea	(DST, N), TMP
+	test	$8, TMP
+	jz	.Lsse2_next
+	sub	$8, N
+	mov	(AP, N), TMP
+	xor	(BP, N), TMP
+	mov	TMP, (DST, N)
+	jmp	.Lsse2_next
+
+	ALIGN(16)
+.Lsse2_loop:
+	movdqu	(AP, N), %xmm0
+	movdqu	(BP, N), %xmm1
+	pxor	%xmm0, %xmm1
+	movdqa	%xmm1, (DST, N)
+.Lsse2_next:
+	sub	$16, N
+	ja	.Lsse2_loop
+	
+	C FIXME: See if we can do a full word first, before the
+	C byte-wise final loop.
+	jnz	.Lfinal		
+
+	C Final operation is aligned
+	movdqu	(AP), %xmm0
+	movdqu	(BP), %xmm1
+	pxor	%xmm0, %xmm1
+	movdqa	%xmm1, (DST)
+	C ENTRY might have been 3 args, too, but it doesn't matter for the exit
+	W64_EXIT(4, 0)
+	ret
+>)	
+	
+
+EPILOGUE(nettle_memxor3)
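The subtle part of the moved memxor3 code is the shift case: when a and b share the same alignment residue but differ from dst, the code rounds a and b down by (a - dst) & 7 so their loads fall on the same 8-byte grid as dst, then rebuilds each destination word from two consecutive aligned words of the XORed stream with a shr/shl pair (the neg %cl trick turning the count into 64 minus the count). A hedged C illustration of that combining step for little-endian 64-bit words; combine_shifted is a hypothetical helper, not a Nettle function:

#include <stdint.h>

/* Little-endian: the low part of the misaligned word comes from the
   lower-addressed aligned word, the high part from the next one. */
uint64_t
combine_shifted(uint64_t lo_word, uint64_t hi_word, unsigned shift_bytes)
{
  unsigned s = 8 * shift_bytes;	/* 1 <= shift_bytes <= 7 */
  /* The assembly computes this with shr %cl / neg %cl / shl %cl /
     neg %cl; because the zero-offset case already branched to
     .Lno_shift_case, 64 - s never reaches 64 and the shift stays
     well defined. */
  return (lo_word >> s) | (hi_word << (64 - s));
}

Keeping the shift count in %cl is also why the prologue moves n out of %rcx into N before anything else, as the comment at the top of nettle_memxor3 notes.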